U
    Jc)                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZmZmZ d dlmZmZmZ d dlmZmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZmZ e e!Z"G dd deZ#eedddZ$eedddZ%eee#ef dddZ&dS )    N)	b64decode	b64encode)	timedelta)AnyOptionalTuplecast)	FileStoreStoreTCPStore)	NodeStateconstruct_and_record_rdzv_event   )RendezvousConnectionErrorRendezvousErrorRendezvousParametersRendezvousStateError)RendezvousBackendToken)_matches_machine_hostnameparse_rendezvous_endpointc                   @   s   e Zd ZU dZdZeed< eed< eeddddZe	ed	d
dZ
eeeef  d	ddZdeee eeeeef  dddZeedddZeeeeef  dddZdS )C10dRendezvousBackendzRepresents a C10d-backed rendezvous backend.

    Args:
        store:
            The :py:class:`torch.distributed.Store` instance to use to
            communicate with the C10d store.
        run_id:
            The run id of the rendezvous.
    ZY2FuaW1hZGFt_store_keyN)storerun_idreturnc                 C   s4   |st d|| _d| | _| d| jd| j d S )Nz&The run id must be a non-empty string.ztorch.rendezvous.compare_set )
ValueErrorr   r   _call_store_NULL_SENTINEL)selfr   r    r#   `/tmp/pip-unpacked-wheel-gikjz4vx/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py__init__2   s
    
zC10dRendezvousBackend.__init__)r   c                 C   s   dS )See base class.Zc10dr#   )r"   r#   r#   r$   nameB   s    zC10dRendezvousBackend.namec                 C   s   |  d| j}| |S )r&   get)r    r   _decode_state)r"   base64_stater#   r#   r$   	get_stateG   s    zC10dRendezvousBackend.get_state)statetokenr   c           
      C   s   t | }|rDt|ts:|  }|dk	r6|d}|S dS | }n| j}| d| j||}| |}|dkrrdS |\}}	||	||kfS )r&   NFr   )F)	r   decode
isinstancebytesr+   r!   r    r   r)   )
r"   r,   r-   Zbase64_state_strresulttmpr*   Zstate_token_pairZ	new_state	new_tokenr#   r#   r$   	set_stateM   s     


zC10dRendezvousBackend.set_state)store_opr   c              
   O   sL   zt | j|||W S  tttfk
rF } ztd|W 5 d }~X Y nX d S )NMThe connection to the C10d store has failed. See inner exception for details.)getattrr   r   RuntimeErrorTimeoutErrorr   )r"   r5   argskwargsexcr#   r#   r$   r    o   s    z!C10dRendezvousBackend._call_store)r*   r   c              
   C   sV   || j  krd S zt|}W n. tjk
rL } ztd|W 5 d }~X Y nX ||fS )Nz=The state object is corrupt. See inner exception for details.)r!   encoder   binasciiErrorr   )r"   r*   r,   r<   r#   r#   r$   r)   w   s    z#C10dRendezvousBackend._decode_state)N)__name__
__module____qualname____doc__r!   r
   __annotations__strr%   propertyr'   r   r   r0   r   r+   boolr4   r   r    r)   r#   r#   r#   r$   r   !   s    
  "r   )paramsr   c           
      C   s   t | jdd\}}| d}|d k	r*|}nt|}tt| dd}|dkrTtd|dfD ]}zPt|||t	|d	d
}|rdt
  d}t| j|tjd t| W  qW q\ tttfk
r }	 z|r|d k	rtd|	W 5 d }	~	X Y q\X q\|S )Nir  )default_portis_hostread_timeout<   r   z,The read timeout must be a positive integer.F)seconds)Z	is_mastertimeoutzProcess z5 hosts the TCP store for the C10d rendezvous backend.)r   message
node_stater6   )r   endpointZget_as_boolr   r   intZ
get_as_intr   r   r   osgetpidr   r   r   ZINITloginfor8   r9   r   )
rH   hostportZcfg_is_hostrJ   rK   Z	is_serverr   msgr<   r#   r#   r$   _create_tcp_store   sB    
     

rZ   c              
   C   s   | j r| j }n>zt \}}W n, tk
rJ } ztd|W 5 d }~X Y nX zt|}W n0 ttfk
r } ztd|W 5 d }~X Y nX |S )NzMThe file creation for C10d store has failed. See inner exception for details.r6   )	rQ   tempfilemkstempOSErrorr   r	   r   r8   r   )rH   path_r<   r   r#   r#   r$   _create_file_store   s&    r`   c              
   C   s   |  dd  }z<|dkr(t| }n|dkr:t| }ntdt|| j}W nJ tk
r } z,t	t
|j dt| | jtjd  W 5 d}~X Y nX ||fS )a	  Creates a new :py:class:`C10dRendezvousBackend` from the specified
    parameters.

    +--------------+-----------------------------------------------------------+
    | Parameter    | Description                                               |
    +==============+===========================================================+
    | store_type   | The type of the C10d store. The currently supported types |
    |              | are "tcp" and "file" which correspond to                  |
    |              | :py:class:`torch.distributed.TCPStore` and                |
    |              | :py:class:`torch.distributed.FileStore`, respectively.    |
    |              | Defaults to "tcp".                                        |
    +--------------+-----------------------------------------------------------+
    | read_timeout | The read timeout, in seconds, for store operations.       |
    |              | Defaults to 60 seconds.                                   |
    |              |                                                           |
    |              | Note this only applies to                                 |
    |              | :py:class:`torch.distributed.TCPStore`. It is not relevant|
    |              | to :py:class:`torch.distributed.FileStore` which does not |
    |              | take in timeout as a parameter.                           |
    +--------------+-----------------------------------------------------------+
    | is_host      | A boolean value indicating whether this backend instance  |
    |              | will host the C10d store. If not specified it will be     |
    |              | inferred heuristically by matching the hostname or the IP |
    |              | address of this machine against the specified rendezvous  |
    |              | endpoint. Defaults to ``None``.                           |
    |              |                                                           |
    |              | Note that this configuration option only applies to       |
    |              | :py:class:`torch.distributed.TCPStore`. In normal         |
    |              | circumstances you can safely skip it; the only time when  |
    |              | it is needed is if its value cannot be correctly          |
    |              | determined (e.g. the rendezvous endpoint has a CNAME as   |
    |              | the hostname or does not match the FQDN of the machine).  |
    +--------------+-----------------------------------------------------------+
    
store_typeZtcpfilez?Invalid store type given. Currently only supports file and tcp.z: )rO   r   rP   N)r(   striplowerr`   rZ   r   r   r   	Exceptionr   typer@   rE   r   ZFAILED)rH   ra   r   backender#   r#   r$   create_backend   s     %

ri   )'r>   loggingrS   r[   base64r   r   datetimer   typingr   r   r   r   Ztorch.distributedr	   r
   r   Z torch.distributed.elastic.eventsr   r   apir   r   r   r   Zdynamic_rendezvousr   r   utilsr   r   	getLoggerr@   rU   r   rZ   r`   ri   r#   r#   r#   r$   <module>   s    
d1