#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
The elastic agent is torchelastic's control plane: a process that launches
and manages the underlying worker processes. Its responsibilities are:

1. Working with distributed torch: workers are started with all the
   information they need to successfully and trivially call
   ``torch.distributed.init_process_group()``.

2. Fault tolerance: it monitors the workers and, upon detecting a worker
   failure or unhealthiness, tears down all workers and restarts everyone.

3. Elasticity: it reacts to membership changes and restarts the workers
   with the new members.

The simplest agents are deployed per node and work with local processes.
A more advanced agent can launch and manage workers remotely. Agents can
be completely decentralized, each making decisions based on the workers
it manages, or coordinated, communicating with other agents (that manage
workers in the same job) to reach a collective decision.
"""

# Re-exported so that callers can import these names directly from this
# package; the imports are intentionally unused here, hence ``noqa: F401``.
from .api import (  # noqa: F401
    ElasticAgent,
    RunResult,
    SimpleElasticAgent,
    Worker,
    WorkerGroup,
    WorkerSpec,
    WorkerState,
)
from .local_elastic_agent import (  # noqa: F401
    TORCHELASTIC_ENABLE_FILE_TIMER,
    TORCHELASTIC_TIMER_FILE,
)