"""PyTorch REFORMER model. """

import logging
import sys
from collections import namedtuple
from functools import reduce
from operator import mul

import numpy as np
import torch
from torch import nn
from torch.autograd.function import Function
from torch.nn import CrossEntropyLoss

from .activations import gelu, gelu_fast, gelu_new, swish
from .configuration_reformer import ReformerConfig
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, apply_chunking_to_forward


logger = logging.getLogger(__name__)

REFORMER_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/pytorch_model.bin",
    "google/reformer-enwik8": "https://cdn.huggingface.co/google/reformer-enwik8/pytorch_model.bin",
}


def mish(x):
    return x * torch.tanh(nn.functional.softplus(x))


ACT2FN = {
    "gelu": gelu,
    "relu": torch.nn.functional.relu,
    "swish": swish,
    "gelu_new": gelu_new,
    "gelu_fast": gelu_fast,
    "mish": mish,
}

# Lightweight output containers used throughout this module.
LSHSelfAttentionOutput = namedtuple("LSHSelfAttentionOutput", ["hidden_states", "attention_probs", "buckets"])
LocalSelfAttentionOutput = namedtuple("LocalSelfAttentionOutput", ["hidden_states", "attention_probs"])
AttentionOutput = namedtuple("AttentionOutput", ["hidden_states", "attention_probs", "buckets"])
ReformerOutput = namedtuple("ReformerOutput", ["hidden_states", "attn_output", "attention_probs", "buckets"])
ReformerBackwardOutput = namedtuple(
    "ReformerBackwardOutput", ["attn_output", "hidden_states", "grad_attn_output", "grad_hidden_states"]
)
ReformerEncoderOutput = namedtuple("ReformerEncoderOutput", ["hidden_states", "all_hidden_states", "all_attentions"])


def _get_least_common_mult_chunk_len(config):
    attn_types = config.attn_layers
    attn_types_set = set(attn_types)
    if len(attn_types_set) == 1 and attn_types[0] == "lsh":
        return config.lsh_attn_chunk_length
    elif len(attn_types_set) == 1 and attn_types[0] == "local":
        return config.local_attn_chunk_length
    elif len(attn_types_set) == 2 and attn_types_set == set(["lsh", "local"]):
        return np.lcm(config.lsh_attn_chunk_length, config.local_attn_chunk_length)
    else:
        raise NotImplementedError(
            "Only attn layer types 'lsh' and 'local' exist, but `config.attn_layers`: {}. Select "
            "attn layer types from ['lsh', 'local'] only.".format(config.attn_layers)
        )
class AxialPositionEmbeddings(nn.Module):
    """Constructs axial position embeddings. Useful for very long input
    sequences to save memory and time.
    """

    def __init__(self, config):
        super().__init__()
        self.axial_pos_shape = config.axial_pos_shape
        self.axial_pos_embds_dim = config.axial_pos_embds_dim
        self.dropout = config.hidden_dropout_prob

        self.least_common_mult_chunk_length = _get_least_common_mult_chunk_len(config)
        self.weights = nn.ParameterList()

        assert sum(self.axial_pos_embds_dim) == config.hidden_size, (
            "Make sure that config.axial_pos_embds factors: {} sum to "
            "config.hidden_size: {}".format(self.axial_pos_embds_dim, config.hidden_size)
        )

        # one broadcastable weight tensor per axis of `axial_pos_shape`
        for axis, axial_pos_embd_dim in enumerate(self.axial_pos_embds_dim):
            ax_shape = [1] * len(self.axial_pos_shape)
            ax_shape[axis] = self.axial_pos_shape[axis]
            ax_shape = tuple(ax_shape) + (axial_pos_embd_dim,)
            self.weights.append(nn.Parameter(torch.ones(ax_shape, dtype=torch.float32)))

    def forward(self, position_ids):
        # Broadcast each axial weight over the full `axial_pos_shape`, concatenate along the
        # feature dimension and reshape to `[batch_size, sequence_length, hidden_size]`.
        # During training the product of `config.axial_pos_shape` must equal the sequence
        # length (otherwise an assertion suggests padding the input or changing
        # `config.axial_pos_shape`), and dropout is applied with `nn.functional.dropout2d` so
        # that whole positions are dropped together. During evaluation the product only has to
        # be at least as large as the sequence length and the encodings are truncated to it.
        ...


class PositionEmbeddings(nn.Module):
    """Constructs conventional position embeddings of shape `[max_pos_embeddings, hidden_size]`.
    """

    def __init__(self, config):
        super().__init__()
        self.dropout = config.hidden_dropout_prob
        self.embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)

    def forward(self, position_ids):
        position_embeddings = self.embedding(position_ids)
        position_embeddings = nn.functional.dropout(position_embeddings, p=self.dropout, training=self.training)
        return position_embeddings


class ReformerEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config):
        super().__init__()
        self.max_position_embeddings = config.max_position_embeddings
        self.dropout = config.hidden_dropout_prob

        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = (
            AxialPositionEmbeddings(config) if config.axial_pos_embds else PositionEmbeddings(config)
        )

    def forward(self, input_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        seq_length = input_shape[1]
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        assert position_ids.shape[-1] <= self.max_position_embeddings, (
            "Sequence Length: {} has to be less or equal than "
            "config.max_position_embeddings: {}".format(position_ids.shape[-1], self.max_position_embeddings)
        )

        embeddings = nn.functional.dropout(inputs_embeds, p=self.dropout, training=self.training)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = embeddings + position_embeddings
        return embeddings
ZdS )EfficientAttentionMixinzL
    A few utilities for nn.Modules in Reformer, to be used as a mixin.
    c              
   C   s   |dkr|dkr|S g }t | |d D ]`}|dkr@|| q(|tj|dddd|ddf |ddddd|df gdd q(tj|ddS )a   Used to implement attention between consecutive chunks.

            Args:
                vectors: array of shape [batch_size, num_attention_heads, n_chunks, chunk_len, ...]
                num_chunks_before: chunks before current chunk to include in attention
                num_chunks_after: chunks after current chunk to include in attention

            Returns:
                tensor of shape [num_chunks, N * chunk_length, ...], where
                N = (1 + num_chunks_before + num_chunks_after).
        r   r   N.r-   rT      )rangerF   r   r\   )rI   vectorsnum_chunks_beforenum_chunks_afterZslicesir   r   r   _look_adjacent   s    Jz&EfficientAttentionMixin._look_adjacentc                 C   s.   |  dd ||f }|j| }|ddS )zS
            splits hidden_size dim into attn_head_size and num_attn_heads
        NrL   r-   r   )rs   r^   r]   )rI   r   num_attn_headsattn_head_sizeZnew_x_shaper   r   r   _split_hidden_size_dim   s    
z.EfficientAttentionMixin._split_hidden_size_dimc                 C   s.   | dddd}t|| d d|| fS )zW
            merges attn_head_size dim and num_attn_heads dim into hidden_size
        r   r-   r   r|   rL   )Zpermuter   rZ   rs   )rI   r   r   r   r   r   r   _merge_hidden_size_dims  s    z/EfficientAttentionMixin._merge_hidden_size_dimsNc                 C   sh   |j d }||||f}t|j dkr6t|||f S t|j dkrPt||S tdt|j dS )zg
            splits sequence length dim of vectors into `dim_factor_1` and `dim_factor_2` dims
        r      r|   z5Input vector rank should be one of [3, 4], but is: {}N)rN   r0   r   rZ   
ValueErrorr5   )rI   r~   Zdim_factor_1Zdim_factor_2r   r   rR   Zsplit_dim_shaper   r   r   _split_seq_length_dim_to  s    
z0EfficientAttentionMixin._split_seq_length_dim_to)N)rb   rc   rd   re   r   r   r   r   r   r   r   r   r{      s
   r{   c                       sh   e Zd Z fddZdddZdd Zd	d
 Zdd Zdd Zdd Z	dd Z
dddZdd Z  ZS )LSHSelfAttentionc                    s   t    |j| _|j| _|j| _|j| _|j| _	|j
| _
|j| _|j| _|j| _|j| _|j| _| j| j | _|j| _tj| j| jdd| _tj| j| jdd| _| dtd | dtd | dtd | d	td
 d S )NFbiasself_mask_value_float16g     @self_mask_value_float32g     jmask_value_float16     mask_value_float32    e)r:   r;   r1   chunk_length
num_hashesnum_bucketsZlsh_num_chunks_beforer   Zlsh_num_chunks_afterr   	hash_seed
is_decoderri   Z lsh_attention_probs_dropout_probr>   num_attention_headsattention_head_sizeall_head_sizerB   r   Linear	query_keyvalueregister_bufferr   tensorrk   rJ   r   r   r;     s(    
zLSHSelfAttention.__init__NFc                 K   s  |j d }|j d }	|d k	r |n| j}| |}
| |}~| |
| j| j}
| || j| j}|
j d | jkstd|
j d | j|j d | jkstd|j d | j| j	d kr| 
| |d kr| |
|}t|j d || ks
td|j d || | |||\}}|| }| |
||}
| |||}| |
d| j| j| j}
| |d| j| j| j}| jd kr| jdkr| jdkstd| |
}| j|
|||||d\}}}~
~~t||||| j\}}|dkrJ| |||| j| j}| |||| j| jd}t|tj|d	d
d }tj|| d	d}~~|j |	| j|| jfksltd| || j| j}|dkrd}t|||dS )Nr   r   rL   5last dim of query_key_vectors is {} but should be {}.z1last dim of value_vectors is {} but should be {}.z+last dim of buckets is {}, but should be {}tIf `config.chunk_length` is `None`, make sure `config.num_chunks_after` and `config.num_chunks_before` are set to 0.)query_vectorskey_vectorsvalue_vectorssorted_bucket_idxattention_mask	head_maskr-   TrU   keepdimrT   zuout_vectors have be of shape `[batch_size, config.num_attention_heads, sequence_length, config.attention_head_size]`.Fr   r   r   r    )rN   r   r   r   r   r   r   rC   r5   r   _set_num_buckets_hash_vectorsint1_get_sorted_bucket_idx_and_undo_sorted_bucket_idx_gather_by_expansionr   r   r   r   _len_and_dim_norm_attendReverseSortapplyrv   r   exp	logsumexprA   r   r   )rI   r   r   r   r   do_output_attentionsr    kwargsr[   rR   Zquery_key_vectorsr   r   undo_sorted_bucket_idxr   out_vectorslogitsr   Zprobs_vectorsr   r   r   r`   8  s    




    

          
	    
        

zLSHSelfAttention.forwardc                 C   s  |j d }t| jtrB| jd dks4td| j| j}| j}n>d\}}| jD ].}|d dksntd||| }|| }qP| }| jd k	rt	| j | j
|j d ||d f}tj||j|jd}td||}	t| jtst| jd	krtj|	|	 gdd
}	tj|	dd
}
nd\}
}}| jD ]v}|	d|||d  f }||d  }tj|| gdd
}|
d kr~tj|dd
}
n|
|tj|dd
  }
|| }q(tj||jd}|| d}||| j
f|j dd   }|
| jddd}|S )Nr   r-   zEThere should be an even number of bucktes, but `self.num_bucktes`: {})r   r   z:The number of buckets should be even, but `num_bucket`: {}rL   rr   r9   zbmtd,mdhr->bmhtrr   rT   )Nr   r   .rr   )r   r   rL   r   r|   Z	start_dimZend_dim)rN   
isinstancer   r   rC   r5   detachr   r   manual_seedr   Zrandnrr   r9   Zeinsumr0   r\   Zargmaxrt   r^   rM   flatten)rI   r~   r   rR   Zrotation_sizer   Zbucket_factorZrotations_shapeZrandom_rotationsZrotated_vectorsr    Zcur_sumZcur_productZrotated_vectors_factoroffsetsZoffset_bucketsr   r   r   r     sL    







zLSHSelfAttention._hash_vectorsc           
   	   C   s   t   |jd }t j|| |jdddd}||| j|jd }|| ||  }| }t j	|dd}t j|jd |jdddd|j}|j
|  }	|	d|| W 5 Q R X ||	fS )Nr   r   r   rL   rT   )r   no_gradrN   rt   rr   r^   rM   r   r   Zargsortnewrs   Zscatter_)
rI   r[   r    r   rR   Zorig_indicesZscaled_bucketsr   indicesr   r   r   r   r     s$    

  zBLSHSelfAttention._get_sorted_bucket_idx_and_undo_sorted_bucket_idxc                 C   s`   d| | j  }tt| j| j  d | j }|d| krF||| d g}td| || _d S )Nr-   g      ?r   zBconfig.num_buckets is not set. Setting config.num_buckets to {}...)r   maxr   ri   loggerwarningr5   r   )rI   r[   r   Znum_buckets_limitr   r   r   r     s    z!LSHSelfAttention._set_num_bucketsc                 C   sh  |  || j| j}|  || j| j}t||dd}~~| |d| j| j}|  || j| j}	|j	tj
kr| j }
| j }n| j}
| j}| ||	|}|d k	rt|||}~t|d|	d|j}t|||
}~tj|ddd}t|| }~tjj|| j| jd}|d k	r.|| }t||}~|jdddd}|jddd}|||fS )	NrL   r   Tr   rV   r-   r|   r   )r   r   r   r   matmulr]   r   r   r   r9   float16r   halfr   r   r   _compute_attn_maskwherenerv   torr   r   r   r   r   r>   rX   r   Zsqueeze)rI   r   r   r   r   r   r   query_key_dotsZquery_bucket_idxZkey_value_bucket_idxZself_mask_value
mask_valuemaskZ	self_maskr   r   r   r   r   r   r   "  sH       

zLSHSelfAttention._attendc                 C   s   d }| j r*t|d|d|j}|d k	r|tjd d d d d d f }||jd d d }t	|d|}t	|d|}|d|d }~~~|d k	r|| }n|}|S )NrL   r   rY   )
r   r   gerv   r   rr   uint8rM   rN   gather)rI   query_indiceskey_indicesr   r   Zkey_attn_maskZquery_attn_mask	attn_maskr   r   r   r   q  s      
z#LSHSelfAttention._compute_attn_maskc                 C   s.   |  |}|ttj| j|j|jd }|S )zF
            length and attention head size dim normalization
        r   )	_len_normr   rsqrtr   r   rr   r9   )rI   r~   r   r   r   r     s
    
z"LSHSelfAttention._len_and_dim_normư>c                 C   s*   t j|d ddd}|t ||  }|S )z*
            length normalization
        r-   rL   T)r   )r   meanr   )rI   r   epsilonZvarianceZnorm_xr   r   r   r     s    zLSHSelfAttention._len_normc                 C   s6   | dddd| j}|dd|d}t|d|S )zO
            expand dims of idxs and vectors for all hashes and gather
        rL   r   r-   )rv   rM   r   repeatr   r   )rI   r~   Zidxsr   Zexpanded_idxsr   r   r   r     s    z%LSHSelfAttention._gather_by_expansion)NNNFN)r   )rb   rc   rd   r;   r`   r   r   r   r   r   r   r   r   rf   r   r   rJ   r   r     s        
~AO

r   c                   @   s(   e Zd ZdZedd Zedd ZdS )r   a  
        After chunked attention is applied which sorted clusters,
        original ordering has to be restored.
        Since customized backward function is used for Reformer,
        the gradients of the output vectors have to be explicitely
        sorted here.
    c              	   C   sV   t  @ || _|| _|d|j}t |d|}t |d|}W 5 Q R X ||fS )NrL   r-   )r   r   r   r   rv   rM   rN   r   )ctxr   r   r   r   r   Zexpanded_undo_sort_indicesr   r   r   r`     s    
zReverseSort.forwardc                 C   s   | j }| j}|j}|j}||d d |df }||d d |df |dd   }t||jd d |df }|d|j}t|d|}t|d|}t||}t||}||d d d fS )Nr-   rL   r|   )	r   r   rN   r^   r   rZ   rv   rM   r   )r   Zgrad_out_vectorsZgrad_logitsr   r   Zgrad_logits_shapeZgrad_out_vectors_shapeZexpanded_sort_indicesr   r   r   backward  s    zReverseSort.backwardNrb   rc   rd   re   staticmethodr`   r   r   r   r   r   r     s
   
r   c                       s.   e Zd Z fddZd	ddZdd Z  ZS )
class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.chunk_length = config.local_attn_chunk_length
        self.num_chunks_before = config.local_num_chunks_before
        self.num_chunks_after = config.local_num_chunks_after
        self.is_decoder = config.is_decoder
        self.pad_token_id = config.pad_token_id

        self.attention_head_size = config.attention_head_size
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.hidden_size = config.hidden_size

        # local attention uses separate query, key and value projections
        self.query = nn.Linear(self.hidden_size, self.all_head_size, bias=False)
        self.key = nn.Linear(self.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=False)

        self.dropout = config.local_attention_probs_dropout_prob

        self.register_buffer("mask_value_float16", torch.tensor(-1e4))
        self.register_buffer("mask_value_float32", torch.tensor(-1e9))

    def forward(self, hidden_states, attention_mask=None, head_mask=None, do_output_attentions=False, **kwargs):
        # 1. Project the hidden states to query, key and value vectors and scale the keys by
        #    1 / sqrt(attention_head_size).
        # 2. If `config.local_attn_chunk_length` is set, split the sequence into chunks and let
        #    every chunk attend to itself and to `num_chunks_before` / `num_chunks_after`
        #    neighbouring chunks (`_split_seq_length_dim_to` + `_look_adjacent`); otherwise both
        #    neighbour counts must be 0 and full attention is used.
        # 3. Mask padded (and, for decoders, future) positions with the large negative
        #    `mask_value`, apply softmax, dropout and the optional head mask, and merge the
        #    attention heads again.
        # Returns LocalSelfAttentionOutput(hidden_states, attention_probs).
        ...

    def _compute_attn_mask(self, query_indices, key_indices, attention_mask, query_key_dots_shape):
        # Chunk the padding mask like the key vectors and, when the model is a decoder, combine
        # it with a causal mask (query index >= key index).
        ...
class ReformerSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        all_head_size = config.num_attention_heads * config.attention_head_size
        self.dropout = config.hidden_dropout_prob
        self.dense = nn.Linear(all_head_size, config.hidden_size, bias=False)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        return hidden_states


class ReformerAttention(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.layer_id = layer_id
        self.attn_layers = config.attn_layers

        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        if len(set(self.attn_layers)) == 1 and self.attn_layers[0] == "lsh":
            self.self_attention = LSHSelfAttention(config)
        elif len(set(self.attn_layers)) == 1 and self.attn_layers[0] == "local":
            self.self_attention = LocalSelfAttention(config)
        elif len(set(self.attn_layers)) == 2 and set(self.attn_layers) == set(["lsh", "local"]):
            # alternate between the two attention types according to `config.attn_layers`
            if self.attn_layers[self.layer_id] == "lsh":
                self.self_attention = LSHSelfAttention(config)
            else:
                self.self_attention = LocalSelfAttention(config)
        else:
            raise NotImplementedError(
                "Only attn layer types 'lsh' and 'local' exist, but got `config.attn_layers`: {}. Select "
                "attn layer types from ['lsh', 'local'] only.".format(self.attn_layers)
            )
        self.output = ReformerSelfOutput(config)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, num_hashes=None,
                do_output_attentions=False, buckets=None):
        hidden_states = self.layer_norm(hidden_states)
        self_attention_outputs = self.self_attention(
            hidden_states=hidden_states,
            head_mask=head_mask,
            attention_mask=attention_mask,
            num_hashes=num_hashes,
            do_output_attentions=do_output_attentions,
            buckets=buckets,
        )
        attention_output = self.output(self_attention_outputs.hidden_states)

        # buckets only exist for the LSH attention variant
        buckets = self_attention_outputs.buckets if hasattr(self_attention_outputs, "buckets") else None
        return AttentionOutput(
            hidden_states=attention_output,
            attention_probs=self_attention_outputs.attention_probs,
            buckets=buckets,
        )


class ReformerFeedForwardDense(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dropout = config.hidden_dropout_prob
        if isinstance(config.hidden_act, str):
            self.act_fn = ACT2FN[config.hidden_act]
        else:
            self.act_fn = config.hidden_act
        self.dense = nn.Linear(config.hidden_size, config.feed_forward_size)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = self.act_fn(hidden_states)
        return hidden_states


class ReformerFeedForwardOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dropout = config.hidden_dropout_prob
        self.dense = nn.Linear(config.feed_forward_size, config.hidden_size)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        return hidden_states


class ChunkReformerFeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1

        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dense = ReformerFeedForwardDense(config)
        self.output = ReformerFeedForwardOutput(config)

    def forward(self, attention_output):
        return apply_chunking_to_forward(
            self.chunk_size_feed_forward, self.seq_len_dim, self.forward_chunk, attention_output
        )

    def forward_chunk(self, hidden_states):
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.dense(hidden_states)
        return self.output(hidden_states)
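# Illustrative sketch, not part of the original module: what chunking the feed forward buys.
# Instead of materialising the `feed_forward_size` activation for the whole sequence at once,
# the sequence dimension is processed in slices of `chunk_size` and the results concatenated;
# the output is mathematically identical because the feed forward acts position-wise.
def _example_chunked_feed_forward(forward_fn, hidden_states, chunk_size):
    if chunk_size == 0:
        return forward_fn(hidden_states)
    chunks = hidden_states.split(chunk_size, dim=1)  # split along the sequence dimension
    return torch.cat([forward_fn(chunk) for chunk in chunks], dim=1)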
dZdddZ  ZS )ReformerLayerr   c                    s0   t    t||| _d | _d | _t|| _d S r   )r:   r;   r   	attentionattention_seedfeed_forward_seedr   feed_forwardr   rJ   r   r   r;     s
    
zReformerLayer.__init__c                 C   sd   t |  jjdkr@tj }tjj|  | _	tj
| j	 n tt tj | _	t
| j	 dS )a  
            This function sets a new seed for the
            attention layer to make dropout deterministic
            for both forward calls: 1 normal forward
            call and 1 forward call in backward
            to recalculate activations.
        cudaN)next
parametersrr   typer   r	  current_devicedefault_generatorsseedr  r   r   sysmaxsizerI   Z
device_idxr   r   r   _init_attention_seed  s    

z"ReformerLayer._init_attention_seedc                 C   sd   t |  jjdkr@tj }tjj|  | _	tj
| j	 n tt tj | _	t
| j	 dS )a  
            This function sets a new seed for the
            feed forward layer to make dropout deterministic
            for both forward calls: 1 normal forward
            call and 1 forward call in backward
            to recalculate activations.
        r	  N)r
  r  rr   r  r   r	  r  r  r  r  r   r   r  r  r  r   r   r   _init_feed_forward_seed$  s    

z%ReformerLayer._init_feed_forward_seedNFc           	   	   C   sj   t  H |   | j|||||d}|j}|| }~|   || | }W 5 Q R X t|||j|j	dS )N)r   r   r   r   r   )r$   r   r   r    )
r   r   r  r  r   r  r  r#   r   r    )	rI   prev_attn_outputr   r   r   r   r   Zattn_outputsr$   r   r   r   r`   8  s(    	
zReformerLayer.forwardc              	   C   s   t  0 d|_t | j | |}|j|dd W 5 Q R X t    || }~||j }d |_W 5 Q R X t  : d|_t | j	 | j
||||dj}	|	j|dd W 5 Q R X t  * ||	 }
~	~||j }d |_| }W 5 Q R X t|
|||dS )NT)Zretain_graph)r   r   r   r    r$   r   r&   r'   )r   Zenable_gradZrequires_gradr   r  r  r   r   Zgradr  r  r   r   r%   )rI   next_attn_outputr   r&   r'   r   r   r    Zres_hidden_statesr   r$   r   r   r   backward_passd  s@    




   

zReformerLayer.backward_pass)r   )NNNF)NNN)	rb   rc   rd   r;   r  r  r`   r  rf   r   r   rJ   r   r    s   
    
2   r  c                   @   s(   e Zd ZdZedd Zedd ZdS )_ReversibleFunctionak  
    To prevent PyTorch from performing the usual backpropagation,
    a customized backward function is implemented here. This way
    it is made sure that no memory expensive activations are
    saved during the forward pass.
    This function is heavily inspired by https://github.com/lucidrains/reformer-pytorch/blob/master/reformer_pytorch/reversible.py
    c
              	   C   s   d}
t j|ddd\}}t||D ]V\}}|dkr<|| |||||||	d}|j}|j}|
|jf }
|	r"||j q"|dkr|| | |	 |	  || _
|
| _|| _|| _t j||gddS )Nr   r-   rL   rT   T)r  r   r   r   r   r   )r   chunkziprF   r$   r   r    r   Zsave_for_backwardr   layersall_bucketsr   r   r\   )r   r   r  r   r   r   r)   r*   do_output_hidden_statesr   r  r$   layerZlayer_head_maskZlayer_outputsr   r   r   r`     s4    

z_ReversibleFunction.forwardc              
   C   s   t j|ddd\}}| j\}}t||||d}~~~~| j}| j}| j}| j}	t|d d d D ]L\}
}|d }|d d }|j	|j
|j|j|j|t||
 d  |	|d}q`|dkstdt j|j|jgdd}|d d d d d d d d f	S )	Nr-   rL   rT   r  r   )r  r   r&   r'   r   r   r    r   z.buckets have to be empty after backpropagation)r   r  Zsaved_tensorsr%   r  r  r   r   rD   r  r$   r   r&   r'   r0   rC   r\   )r   r'   r&   r$   r   r   r  r  r   r   idxr  r    r   r   r   r     s8    

z_ReversibleFunction.backwardNr   r   r   r   r   r    s
   
2r  c                       s&   e Zd Z fddZdddZ  ZS )ReformerEncoderc                    sN   t     j| _t fddt jD | _tj	d j
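# Illustrative sketch, not part of the original module: the reversible residual identity the
# encoder relies on. Given the two streams X1, X2 and the layer outputs
#     Y1 = X1 + Attn(X2)        Y2 = X2 + FeedForward(Y1)
# the inputs can be recomputed exactly from the outputs, so activations never need to be
# stored during the forward pass:
#     X2 = Y2 - FeedForward(Y1)  X1 = Y1 - Attn(X2)
def _example_reversible_round_trip(attn_fn, ff_fn, x1, x2):
    y1 = x1 + attn_fn(x2)
    y2 = x2 + ff_fn(y1)
    # reconstruction (assumes attn_fn / ff_fn are deterministic, e.g. dropout seeds replayed)
    x2_rec = y2 - ff_fn(y1)
    x1_rec = y1 - attn_fn(x2_rec)
    return torch.allclose(x1, x1_rec) and torch.allclose(x2, x2_rec)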
  jd| _d S )Nc                    s   g | ]}t  |qS r   )r  )rP   r   r6   r   r   rS     s     z,ReformerEncoder.__init__.<locals>.<listcomp>r-   r   )r:   r;   r=   r>   r   Z
ModuleListr}   num_hidden_layersr  r   rB   r   r   rk   rJ   r"  r   r;     s    
 zReformerEncoder.__init__NFc           	      C   sd   g }g }t j||gdd}t|| j|||||||	}| |}tjj|| j| j	d}t
|||dS )NrL   rT   rV   )r   r)   r*   )r   r\   r  r   r  r   r   r   r>   rX   r(   )	rI   r   r   r   r   r  r   r)   r*   r   r   r   r`     s*    

  zReformerEncoder.forward)NNNFFr   r   r   rJ   r   r!    s        r!  c                       s,   e Zd Z fddZdd Zdd Z  ZS )ReformerOnlyLMHeadc                    sT   t    d| _|j| _tjd|j |jdd| _t	t
|j| _| j| j_d S )Nr   r-   Fr   )r:   r;   r  chunk_size_lm_headr   r   rB   ro   decoderrG   r   zerosr   rk   rJ   r   r   r;   =  s    
zReformerOnlyLMHead.__init__c                 C   s   t | j| j| j|S r   )r   r%  r  r  r   r   r   r   r`   I  s    zReformerOnlyLMHead.forwardc                 C   s   |  |}|S r   )r&  r   r   r   r   r  L  s    
z ReformerOnlyLMHead.forward_chunkr  r   r   rJ   r   r$  <  s   r$  c                   @   s0   e Zd ZdZeZeZdZe	dd Z
dd ZdS )ReformerPreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    reformerc                 C   s"   t t}t t}||d}|S )N)rw   r   )r   r   r   r   )rI   rw   Z
input_maskdummy_inputsr   r   r   r*  Z  s    

z$ReformerPreTrainedModel.dummy_inputsc                 C   s   t |tr.|jD ]}tjjj|| jjd qnnt |tj	rR|j
jjd| jjd nJt |tjrv|j
jjd| jjd n&t |tjr|jj  |j
jd t |tjr|jdk	r|jj  dS )z Initialize the weights )stdg        )r   r+  g      ?N)r   r8   r@   r   r   initZnormal_r6   Zaxial_norm_stdrh   rQ   dataZinitializer_ranger   r   r   Zzero_Zfill_)rI   modulerQ   r   r   r   _init_weightsd  s    

z%ReformerPreTrainedModel._init_weightsN)rb   rc   rd   re   r   Zconfig_class%REFORMER_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZbase_model_prefixpropertyr*  r/  r   r   r   r   r(  Q  s   
	r(  uJ  
    Reformer was proposed in
    `Reformer: The Efficient Transformer`_
    by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.

    .. _`Reformer: The Efficient Transformer`:
        https://arxiv.org/abs/2001.04451

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.ReformerConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
a
  
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.
            During training the input_ids sequence_length has to be a multiple of the relevant model's
            chunk lengths (lsh's, local's or both). During evaluation, the indices are automatically
            padded to be a multiple of the chunk length.

            Indices can be obtained using :class:`transformers.ReformerTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        num_hashes (:obj:`int`, `optional`, defaults to :obj:`None`):
            `num_hashes` is the number of hashing rounds that should be performed during
            bucketing. Setting `num_hashes` overwrites the default `num_hashes` defined
            in `config.num_hashes`.
            For more information, see `num_hashes` in :class:`transformers.ReformerConfig`.
zaThe bare Reformer Model transformer outputting raw hidden-stateswithout any specific head on top.c                	       sP   e Zd Z fddZdd Zdd Zdd ZeedddZ	dddZ
  ZS )ReformerModelc                    sF   t  | || _| jjdks&tdt|| _t|| _| 	  d S )Nr   zS`config.attn_layers` is empty. Select at least one attn layer form ['lsh', 'local'])
r:   r;   r6   r#  rC   rn   rz   r!  encoderinit_weightsrk   rJ   r   r   r;     s    


zReformerModel.__init__c                 C   s   | j jS r   rz   rp   rI   r   r   r   get_input_embeddings  s    z"ReformerModel.get_input_embeddingsc                 C   s   || j _d S r   r5  )rI   r   r   r   r   set_input_embeddings  s    z"ReformerModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        N)itemsr3  r  r  Zprune_heads)rI   Zheads_to_pruner  Zheadsr   r   r   _prune_heads  s    zReformerModel._prune_headsNFc	              
   C   s  | j j}| j j}|dk	r*|dk	r*tdn@|dk	rB| }	|j}
n(|dk	rb| dd }	|j}
ntdt|	dkstd|	| j	|| j j
dd}|	d }t| j }|	d | d	k}|r||	d |  }| jdkrtd
|	d ||	d | | j|||||	|||
d\}}}}}	| j|||d}| j||||||d}|j}|rb|ddd|f }|f}|dkr~||jf }|dkr||jf }|S )a  
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``do_output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import ReformerModel, ReformerTokenizer
        import torch

        tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
        model =  ReformerModel.from_pretrained('google/reformer-crime-and-punishment')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)

        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        NzDYou cannot specify both input_ids and inputs_embeds at the same timerL   z5You have to specify either input_ids or inputs_embedsr-   zO`input_ids` have be of shape `[batch_size, sequence_length]`, but got shape: {}T)Zis_attention_chunkedr   zIf training, sequence Length {} has to be a multiple of least common multiple chunk_length {}. Please consider padding the input to a length of {}.)rx   r   r_   ry   padding_lengthpadded_seq_lengthrr   )rw   r_   rx   )r   r   r   r   r  r   )r6   Zoutput_attentionsZoutput_hidden_statesr   rs   rr   r0   rC   r5   Zget_head_maskr#  r7   rX   _pad_to_mult_of_chunk_lengthrz   r3  r   r)   r*   )rI   rw   r   r_   r   rx   r   r  r   ry   rr   Zorig_sequence_lengthr?   Zmust_pad_to_match_chunk_lengthr;  Zembedding_outputZencoder_outputssequence_outputoutputsr   r   r   r`     sr    ,



  


zReformerModel.forwardc	                 C   sR  t d|d |d | | tj|d |f| jj|tjd}	|d k	rptj|tj	|d |||j
dgdd}n6tjtj||tjdtj	|d |f|tjdgdd}|d k	rtj||	gdd}| }|d k	rtj|d |tj|d}
|d|d |}
tj||
gdd}|d k	rD| |	|}tj||gdd}| }|||||fS )Nz^Input ids are automatically padded from {} to {} to be a multiple of `config.chunk_length`: {}rL   r   r   rT   rq   r   )r   infor5   r   fullr6   r   ru   r\   r'  r9   rH   r   rs   rt   rv   rM   rz   )rI   rw   rx   r   r_   ry   r;  r<  rr   Zpadded_input_idsZpadded_position_idsZpadded_inputs_embedsr   r   r   r=  F  sL     
 
   	


z*ReformerModel._pad_to_mult_of_chunk_length)NNNNNNFF)NNNNNNN)rb   rc   rd   r;   r7  r8  r:  r   REFORMER_INPUTS_DOCSTRINGr`   r=  rf   r   r   rJ   r   r2    s,           x       r2  z7Reformer Model with a `language modeling` head on top. c                
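# Illustrative sketch, not part of the original module: how much padding `ReformerModel` adds
# at inference time so that the sequence length becomes a multiple of the least common
# multiple of the configured chunk lengths. The default chunk lengths are hypothetical.
def _example_required_padding_length(sequence_length, lsh_chunk_length=64, local_chunk_length=128):
    least_common_mult = int(np.lcm(lsh_chunk_length, local_chunk_length))
    remainder = sequence_length % least_common_mult
    return 0 if remainder == 0 else least_common_mult - remainder  # e.g. 100 -> 28 for lcm 128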
@add_start_docstrings("""Reformer Model with a `language modeling` head on top. """, REFORMER_START_DOCSTRING)
class ReformerModelWithLMHead(ReformerPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.reformer = ReformerModel(config)
        self.lm_head = ReformerOnlyLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def tie_weights(self):
        # the LM head operates on the concatenated (2 * hidden_size) encoder output, so the
        # word embeddings cannot be tied to it
        pass

    @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        position_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        num_hashes=None,
        labels=None,
        do_output_hidden_states=False,
        do_output_attentions=False,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
                Labels for computing the language modeling loss.
                Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`.
                All labels set to ``-100`` are ignored (masked), the loss is only
                computed for labels in ``[0, ..., config.vocab_size - 1]``

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ReformerConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Classification loss (cross entropy).
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``do_output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import ReformerModelWithLMHead, ReformerTokenizer
        import torch

        tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
        model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=input_ids)

        loss, prediction_scores = outputs[:2]
        """
        reformer_outputs = self.reformer(
            input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            num_hashes=num_hashes,
            do_output_hidden_states=do_output_hidden_states,
            do_output_attentions=do_output_attentions,
        )

        sequence_output = reformer_outputs[0]
        logits = self.lm_head(sequence_output)
        outputs = (logits,) + reformer_outputs[1:]

        if labels is not None:
            # shift the logits and labels so that tokens < n predict token n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), lm_logits, (all_hidden_states), (all_attentions)

    def prepare_inputs_for_generation(self, input_ids, past, **kwargs):
        # only the input ids (and, if given, the number of hashing rounds) are needed for generation
        inputs_dict = {"input_ids": input_ids}
        if "num_hashes" in kwargs:
            inputs_dict["num_hashes"] = kwargs["num_hashes"]
        return inputs_dict
 
O-:   9 '; k0&( L