U
    &cyQ                     @   sL  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlZd dlZd dlmZ d dlmZmZmZ d dlmZ eedd	d
ZeedddZdejjedddZeee	e dddZdddZdd Z eeef edddZ!dZ"dd Z#d d! Z$d"Z%d#e%d$fd%d&d'd(d)gZ&d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;Z'd<Z(d=Z)d>d? Z*d@dA Z+dBdedCfeedDdEdFZ,dGdH Z-ddJdKZ.ddLdMZ/e	dNdOdPZ0dQdR Z1edSfdTdUZ2dVdW Z3edXdYdZZ4eee5f e	e d[d\d]Z6d^d_ Z7edd`dadbZ8dcdd Z9dedf Z:dgdh Z;diZ<djdkdldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~ddddddZ=G dd dZ>dd Z?edddZ@eAdkre B ZCeCjDdeddd eCjDdeddd eCE ZFeeFjGZHeHI stJeFjKdkrdeHjL neFjKZMe@eHeM dd ZNe
ee	f eddddZOeeddddZPdS )    N)Path)DictListUnion)ZipFile)tqdm)MarianConfigMarianMTModelMarianTokenizer)HfApitextprefixc                 C   s   |  |r| t|d  S | S N)
startswithlenr    r   J/tmp/pip-unpacked-wheel-ymerj3tt/transformers/convert_marian_to_pytorch.pyremove_prefix   s    
r   )layer_prefix	converterc                 C   sH   i }| D ]:}| |sqt||}| | j}t| ||| < q|S r   )r   r   TtorchtensorZsqueeze)	opus_dictr   r   sdkstrippedvr   r   r   convert_encoder_layer   s    


r   F)	layer_lst
opus_statec                 C   sT   t | D ]F\}}|r$d|d  dnd|d  d}t|||}|j|dd qd S )N	decoder_l   _	encoder_lT)strict)	enumerater   Zload_state_dict)r    r!   r   
is_decoderiZlayerZ	layer_tagr   r   r   r   load_layers_#   s    $r*   )src_langtgt_langreturnc                    sL   d t  }| }dd |D } fdd|D } fdd|D }|S )zLFind models that can accept src_lang as input and return tgt_lang as output.zHelsinki-NLP/opus-mt-c                 S   s   g | ]}|j d r|j qS )zHelsinki-NLP)ZmodelIdr   .0xr   r   r   
<listcomp>/   s      z)find_pretrained_model.<locals>.<listcomp>c                    s(   g | ] }d |krt |  dqS )+-)r   lowersplit)r/   m)r   r   r   r1   0   s     c                    s2   g | ]*\}}|kr|kr  | d | qS )r3   r   )r/   abr   r+   r,   r   r   r1   3   s       )r   
model_list)r+   r,   apir:   Z	model_idsZsrc_and_targZmatchingr   r9   r   find_pretrained_model*   s    
r<   r#   c           	      C   sN   | j \}}t||f}t| |g}t|df}tj||fdd}||fS )Nr#   )Zaxis)shapenpzerosZconcatenate)	wemb
final_biasZn_special_tokensZvsized_modelZembs_to_addZnew_embsZbias_to_addZnew_biasr   r   r   add_emb_entries7   s    
rC   c              	   C   sT   ddd}t | ts| S | |kr(||  S z
t| W S  ttfk
rN   |  Y S X d S )NTF)truefalse)
isinstancestrint	TypeError
ValueError)r   Zbool_dctr   r   r   _cast_yaml_str@   s    


rK   )raw_cfgr-   c                 C   s   dd |   D S )Nc                 S   s   i | ]\}}|t |qS r   )rK   r/   r   r   r   r   r   
<dictcomp>M   s      z&cast_marian_config.<locals>.<dictcomp>)items)rL   r   r   r   cast_marian_configL   s    rP   zspecial:model.ymlc                 C   s@   dd l }ddd | t D }|j|d d |jd}t|S )Nr    c                 S   s   g | ]}t |qS r   )chrr.   r   r   r   r1   V   s     z/load_config_from_state_dict.<locals>.<listcomp>Loader)yamljoin
CONFIG_KEYload
BaseLoaderrP   )r   rV   Zcfg_strZyaml_cfgr   r   r   load_config_from_state_dictS   s    r[   c                 C   s2   t t| d}t|dks&t||d }|S )Nz*.npzr#   r   )listr   globr   AssertionError)dest_dirZmodel_filesZ
model_filer   r   r   find_model_file[   s    r`   zfr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la)z@cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zhZZHZROMANCE)z de+nl+fy+af+da+fo+is+no+nb+nn+svZNORTH_EU)zda+fo+is+no+nb+nn+svZSCANDINAVIA)zse+sma+smj+smn+smsZSAMI)znb_NO+nb+nn_NO+nn+nog+no_nb+noZNORWAY)zga+cy+br+gd+kw+gvZCELTICzCcmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-dezCcmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fizCcmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-svz)da+fo+is+no+nb+nn+sv-da+fo+is+no+nb+nn+svzAde+nl+fy+af+da+fo+is+no+nb+nn+sv-de+nl+fy+af+da+fo+is+no+nb+nn+svzCde-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zhzen+el+es+fi-en+el+es+fizen-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+lazen-ga+cy+br+gd+kw+gvz!es-nb_NO+nb+nn_NO+nn+nog+no_nb+noz'fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+smszCfi-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zhz!fi-nb_NO+nb+nn_NO+nn+nog+no_nb+nozfr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-enzga+cy+br+gd+kw+gv-enzCsv-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zhz!sv-nb_NO+nb+nn_NO+nn+nog+no_nb+no)zopus-mt-ZH-dezopus-mt-ZH-fizopus-mt-ZH-svzopus-mt-SCANDINAVIA-SCANDINAVIAzopus-mt-NORTH_EU-NORTH_EUzopus-mt-de-ZHzopus-mt-en_el_es_fi-en_el_es_fizopus-mt-en-ROMANCEzopus-mt-en-CELTICzopus-mt-es-NORWAYz!opus-mt-fi_nb_no_nn_ru_sv_en-SAMIzopus-mt-fi-ZHzopus-mt-fi-NORWAYzopus-mt-ROMANCE-enzopus-mt-CELTIC-enzopus-mt-sv-ZHzopus-mt-sv-NORWAYzAhttps://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/zHelsinki-NLP/c                 C   s&   t D ]\}}| ||} q| ddS )Nr2   r$   )GROUPSreplace)r0   substrZgrp_namer   r   r   convert_opus_name_to_hf_name   s    rd   c                 C   s2   t | t} | tkrt|  }n| dd}t |dS )zrRelies on the assumption that there are no language codes like pt_br in models that are not in GROUP_TO_OPUS_NAME.r$   r2   opus-mt-)r   ORG_NAMEGROUP_TO_OPUS_NAMErb   )hf_model_nameZopus_w_prefixr   r   r   convert_hf_name_to_opus_name   s
    

ri   zOPUS-MT-train/models/z*marian_converted/model_cards/Helsinki-NLP/)rh   r-   c                 C   s  t | t} t| }dd |dD \}}t| d }d|d| }}	d|  d| d|	 d	| d
| d}
t| | d}| st||	 
 }|dd }d|ddd }|
d |dd }|r|S |jdd ||  }|jdd |d }|	d| |S )zCopy the most recent model's readme section from opus, and add metadata.
    upload command: s3cmd sync --recursive model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/
    c                 S   s   g | ]}| d qS )r2   )r5   r.   r   r   r   r1      s     z$write_model_card.<locals>.<listcomp>r3   z
/README.md,z### z

* source languages: z
* target languages: z
*  OPUS readme: [z](z)
z
# rS   *r#   Nz
* downloadzdownload original weightsTexist_ok	README.mdw)r   rf   ri   r5   OPUS_GITHUB_URLrW   r   existsr^   openreadrb   mkdirwrite)rh   	repo_pathdry_runZmodel_card_dirZ	opus_nameZopus_srcZopus_tgtZ
readme_urlstZextra_markdownZopus_readme_pathcontentZsub_dirdestr   r   r   write_model_card   s(    	
$r}   c                 C   s   dd | D S )Nc                 S   s   i | ]}|t |qS r   )rd   r.   r   r   r   rN      s      z.get_clean_model_id_mapping.<locals>.<dictcomp>r   )Zmultiling_model_idsr   r   r   get_clean_model_id_mapping   s    r~   Opus-MT-train/modelsc                 C   s   t | d d  s$td|  di }t |  D ]>}|jd}|dkrPq4q4tt|d  }t	|||j< q4dd |
 D S )	Nzfr-enro   z
repo_path:zf does not exist: You must run: git clone git@github.com:Helsinki-NLP/Opus-MT-train.git before calling.r3   r   c                 S   s4   g | ],\}}||d  |d |d dd d fqS )pre-processingrl   Nz	.test.txtr   rM   r   r   r   r1      s     z!make_registry.<locals>.<listcomp>)r   rr   rJ   lsnamecountr\   rs   	readlines_parse_readmerO   )rw   resultspZn_dashlnsr   r   r   make_registry   s    
r   c           	      C   s   t d}t d}|jdd | dkr.t|d} t| D ]X\}}}}d|krLq6tj|| d snt|||  t|}t	|| |d	|   q6dS )
zRequires 300GBZmarian_ckptZmarian_convertedTrm   N)rw   ZSentencePiecepytorch_model.binre   )
r   ru   r   r   ospathrr   download_and_unziprd   convert)	r:   rw   save_dirr_   r   Zpreprorl   test_set_urlZ	pair_namer   r   r    convert_all_sentencepiece_models   s    
r   r-   c                 C   s   t t| |S r   )r\   map)fr0   r   r   r   lmap   s    r   c                 C   s   dd l }|| d}t|  }ttj|d d d }ttj|dd d }ttj|dd d }t|t|  krt|ksn t	t
| |||fS )Nr   zopus_test.txt   r#      )wgetrl   r   rs   r   r   rG   stripr   r^   r   remove)r   r   fnamer   srcZgoldZ	mar_modelr   r   r   fetch_test_set   s    &
r   zmarian_ckpt/c                 C   s>   t t|  D ](}d|j }|d  r.qtt| qd S )Nzmarian_converted/r   )r   r\   r   r   rr   r   
source_dir)r   subdirr_   r   r   r   convert_whole_dir   s
    r   c                 C   s   i }dd | D D ]}| ds"q|dd  }dD ]}| |r6 qLq6q|dkrp|d}|\}}|||< q|d	kr|d
d dd }|||< q|S )z6Get link and metadata from opus model card equivalent.c                 S   s   g | ]}|  qS r   )r   r.   r   r   r   r1      s     z!_parse_readme.<locals>.<listcomp>rk   r#   N)rl   datasetmodelsmodelr   )r   r   r   :rl   (rS   )r   r   r5   )r   Zsubreslnr   Zsplatr$   r   r   r   r   r      s"    




r   )r_   c                 C   s<   | j d}t|d d|d d d}t|| d  d S )Nr3   rS   )Ztarget_langZsource_langtokenizer_config.json)r   r5   dictrW   	save_json)r_   Zdnamedctr   r   r   save_tokenizer_config  s    r   )vocabspecial_tokensc                 C   s@   t |  d }d}|D ]"}|| kr&q|| | |< |d7 }q|S )Nr#   r   )maxvalues)r   r   startaddedtokr   r   r   add_to_vocab_  s    
r   c                 C   s   t | dd S )Nz
*vocab.ymlr   )r\   r]   )	model_dirr   r   r   find_vocab_file  s    r   )r   r-   c                 C   sT   t t| }dd | D }t|dg}td| d t|| d  t|  d S )Nc                 S   s   i | ]\}}|t |qS r   )rH   rM   r   r   r   rN   %  s      z/add_special_tokens_to_vocab.<locals>.<dictcomp>z<pad>zadded z tokens to vocab
vocab.json)	load_yamlr   rO   r   printr   r   )r   r   Z	num_addedr   r   r   add_special_tokens_to_vocab#  s    r   c                 C   sJ   t |}t | jd }dD ]}t|j| ||  qt| j|d  d S )NZ
source_spm>   r   
source.spm
target.spmr   )r   Zinit_kwargsshutilcopyfileparentr   encoder)selfZsave_directoryr|   Zsrc_pathZ	dest_namer   r   r   save_tokenizer,  s
    r   c              	   C   s>   | | | |  }}||ks:t d| d| d| d| d S )Nzhparams rj   z	 differ: z != )r^   )
marian_cfgZk1Zk2Zv1Zv2r   r   r   check_equal5  s    r   c                 C   s   ddddddddddddd	dd
}|  D ]2\}}| | }||ks*td| d| d| q*t| dd t| dd t| dd d S )NTFr   dZdanrQ   Ztransformerr   r#   )ztied-embeddings-allzlayer-normalizationz
right-lefttransformer-ffn-depthtransformer-aan-depthztransformer-no-projectionztransformer-postprocess-embztransformer-postprocesstransformer-preprocesstypezulr-dim-embzdec-cell-base-depthzdec-cell-high-depthztransformer-aan-nogatezUnexpected config value for z
 expected z got ztransformer-ffn-activationtransformer-aan-activationr   r   transformer-dim-ffnztransformer-dim-aan)rO   r^   r   )r   Zassumed_settingsr   r   actualr   r   r   check_marian_cfg_assumptions:  s*    $r   decoder_ff_logit_out_bzself_attn.q_proj.weightzself_attn.k_proj.weightzself_attn.v_proj.weightzself_attn.out_proj.weightzself_attn.q_proj.biaszself_attn.k_proj.biaszself_attn.v_proj.biaszself_attn.out_proj.biaszself_attn_layer_norm.weightzself_attn_layer_norm.biasz
fc1.weightzfc1.biasz
fc2.weightzfc2.biaszfinal_layer_norm.weightzfinal_layer_norm.biaszencoder_attn.k_proj.weightzencoder_attn.out_proj.weightzencoder_attn.q_proj.weightzencoder_attn.v_proj.weightzencoder_attn.k_proj.biaszencoder_attn.out_proj.biaszencoder_attn.q_proj.biaszencoder_attn.v_proj.biaszencoder_attn_layer_norm.weightzencoder_attn_layer_norm.bias)Zself_WqZself_WkZself_WvZself_WoZself_bqZself_bkZself_bvZself_boZself_Wo_ln_scaleZself_Wo_ln_biasZffn_W1Zffn_b1Zffn_W2Zffn_b2Zffn_ffn_ln_scaleZffn_ffn_ln_biasZ
context_WkZ
context_WoZ
context_WqZ
context_WvZ
context_bkZ
context_boZ
context_bqZ
context_bvZcontext_Wo_ln_scaleZcontext_Wo_ln_biasc                   @   s>   e Zd Zdd Zdd Zedd Zdd Zed	d
dZ	dS )	OpusStatec                 C   s  t |}t|| _t| j}|d d |d d ks:td| jksHtt| j| _t| jd | jt d\| _	| _
| j	jd d | _| jd |d< t| j | _d| jkrtd|   || _|| _| jd	 j\}}||d
   krdksn ttt|d }t| t|d |d |d |d |d |d |d |d
 |d | jdd|d
 dd|d k|d  d|d | j| jggdd| _d S )Nz
dim-vocabsr   r#   WposWemb
vocab_sizeZWtypezfound Wtype keyZencoder_l1_ffn_W1zdim-embi   zdecoder.ymlz	dec-depthz	enc-depthztransformer-headsr   r   Tnr   z%transformer-train-position-embeddingsg?z	beam-size)r   Zdecoder_layersZencoder_layersZdecoder_attention_headsZencoder_attention_headsZdecoder_ffn_dimZencoder_ffn_dimrB   Zactivation_functionpad_token_idZeos_token_idZbos_token_idZmax_position_embeddingsZscale_embeddingnormalize_embeddingstatic_position_embeddingsZdropoutZ	num_beamsZdecoder_start_token_idZbad_words_ids
max_length)r`   r>   rY   
state_dictr[   r^   r   rC   BIAS_KEYr@   rA   r=   r   r\   keys
state_keysrJ   _check_layer_entriesr   cfgrP   r   r   r   	hf_config)r   r   Znpz_pathr   hidden_sizeZintermediate_shapeZdecoder_ymlr   r   r   __init__t  sT    
 

zOpusState.__init__c                 C   s   |  d| _|  d| _|  d| _t| jdkrHtdt| j  t| jdkrltdt| j  t| jdkrtdt| j  d S )N
encoder_l1
decoder_l1
decoder_l2   z-Expected 16 keys for each encoder layer, got    z-Expected 26 keys for each decoder layer, got )sub_keysr   r   r   r   warningswarn)r   r   r   r   r     s    zOpusState._check_layer_entriesc                 C   sF   g }| j D ]6}|ds
|ds
|tdddfkr6q
q
|| q
|S )Nr%   r"   r   r   r   )r   r   rX   append)r   extrar   r   r   r   
extra_keys  s    
zOpusState.extra_keysc                    s    fdd| j D S )Nc                    s    g | ]}|  rt| qS r   )r   r   )r/   r   r   r   r   r1     s     
 z&OpusState.sub_keys.<locals>.<listcomp>)r   )r   r   r   r   r   r     s    zOpusState.sub_keysr   c                 C   s*  | j | j }}|jstt|}d| ks0tt|jjj	|t
 t|jjj	|t
dd tjt| j}tjt| j}||jj_|jj |jj_|jj_||_d|krtd t|d }||jjj_||jjj_|jrd|ksttd| jrtd| j |jjj| jks&t|S )	Nr   T)r(   r   zUnexpected: got WposZencoder_emb_ln_scale_prez#Need to convert layernorm_embeddingzFailed to convert )r   r   r   r^   r	   Zto_dictr*   r   r   ZlayersBART_CONVERTERdecoderr   nn	ParameterZFloatTensorr@   rA   ZsharedZweightZembed_tokensZfinal_logits_biasr   r   Zembed_positionsr   NotImplementedErrorr   Zpadding_idxr   )r   r   r   r   Zwemb_tensorZbias_tensorZwpos_tensorr   r   r   load_marian_model  s4    
  
zOpusState.load_marian_modelN)
__name__
__module____qualname__r   r   propertyr   r   r	   r   r   r   r   r   r   s  s   0
r   c                 C   sL   zdd l }W n tk
r(   tdY nX || }t|| t| d S )Nr   zyou must pip install wget)r   ImportErrorrl   unzipr   r   )urlr_   r   filenamer   r   r   r     s    

r   )r   c                 C   st   t |}|jdd t|  tt| }t|| t| }|jd t	|j
ksTt| }|| || d S )NTrm   r   )r   ru   r   r
   Zfrom_pretrainedrG   r   r   r   r   r   r^   r   Zsave_pretrained)r   r_   	tokenizerr!   r   r   r   r   r     s    

r   __main__z--srczpath to marian model dirzen-de)r   helpdefaultz--destz!Path to the output PyTorch model.)r   r   r   z
converted-c              
   C   s8   dd l }t| }|j||jdW  5 Q R  S Q R X d S )Nr   rT   )rV   rs   rY   rZ   )r   rV   r   r   r   r   r     s    
r   )r{   r   r-   c              	   C   s&   t |d}t| | W 5 Q R X d S )Nrp   )rs   jsondump)r{   r   r   r   r   r   r     s    r   )zip_pathr_   r-   c              	   C   s$   t | d}|| W 5 Q R X d S )Nr)r   
extractall)r  r_   ZzipObjr   r   r   r     s    r   )F)r#   )r   )NN)Qargparser   r   r   r   pathlibr   typingr   r   r   zipfiler   Znumpyr>   r   r   Ztransformersr   r	   r
   Ztransformers.hf_apir   rG   r   r   r   r   Z
ModuleListr*   r<   rC   rK   rP   rX   r[   r`   Z	ROM_GROUPra   rg   rq   rf   rd   ri   r}   r~   r   r   r   r   r   r   r   rH   r   r   r   r   r   r   r   r   r   r   r   r   ArgumentParserparseradd_argument
parse_argsargsr   r   rr   r^   r|   r   r_   r   r   r   r   r   r   r   <module>   s   
	
!

		p


