U
    "c#                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlmZm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlZd dlZejd  d	k rd dlZejZeed
 eddgdZejdedejdddddejdddddejdddddejddddd d!e d"d# Zeej d$d  d%k r<ej!nej"Z"e" d&d' Z#d(d) Z$d*d+ Z%e&d,ejd-d.ddd/d0ejd1d2ddd3d0ejd4d5d6d7ejd8d9d:d7e$d;d< Z'e&d=ejd>d2ddd?d0e$d@dA Z(e&dBejdCddddDd0ejdEdFdddGd0ejdHd5dddId0ejdJd9dddKd0e$dLdM Z)e&dNejdOdPddQdRejdSd.dddTd0ejdUd5dddVd0e$dWdX Z*e&dYejdOdPddZdRejdSd.dddTd0ejdUd5ddd[d0e$d\d] Z+e&d^ejd_d.ddd`d0e$dadb Z,dS )c    N)deepcopypartialupdate_wrapper)MosesTokenizerMosesDetokenizer)MosesTruecaserMosesDetruecaser)MosesPunctNormalizerparallelize_preprocess   zTYou should really be using Python3!!! Tick tock, tick tock, https://pythonclock.org/z-hz--help)Zhelp_option_namesT)chainZcontext_settingsz
--languagez-lenz+Use language specific rules when tokenizing)defaulthelpz--processesz-j   zNo. of processes.z
--encodingz-eutf8zSpecify encoding of file.z--quietz-qFzDisable progress bar.)is_flagr   r   c                 C   s   d S N )languageencoding	processesquietr   r   2/tmp/pip-unpacked-wheel-43qgbez0/sacremoses/cli.pycli#   s    r   .   c              	   K   sT   t jd|d<}|}| D ]}|t|f|}q|rF|D ]}t | q6W 5 Q R X d S )Nstdinr   )clickZget_text_streamlistZecho)Z
processorsr   kwargsfiniteratorprocitemr   r   r   process_pipeline5   s    r)   c                    s    fdd}t | f|S )z\Helper decorator to rewrite a function so that
    it returns another function from it.
    c                     s    fdd}t |f| S )Nc                    s    | f|S r   r   )streamr$   fr   r   	processorF   s    z.processor.<locals>.new_func.<locals>.processorr   )r$   r-   r+   r   r   new_funcE   s    zprocessor.<locals>.new_funcr   )r,   r$   r.   r   r+   r   r-   @   s    r-   c                 c   s@   |dkr| D ]}||V  qnt || || dD ]
}|V  q0d S )Nr   )progress_barr   )r&   funcr   r   lineoutliner   r   r   parallel_or_notN   s       
r3   tokenizez--aggressive-dash-splitsz-azTriggers dash split rules.)r   r   r   z--xml-escapez-xz"Escape special characters for XML.z--protected-patternsz-pzXSpecify file with patters to be protected in tokenisation. Special values: :basic: :web:)r   z--custom-nb-prefixesz-czjSpecify a custom non-breaking prefixes file, add prefixes to the default ones from the specified language.c              	   C   s|   t ||d}|rZ|dkr |j}n:|dkr0|j}n*t|dd}	dd |	 D }W 5 Q R X t|jd|||d	}
t| |
||S )
N)langZ custom_nonbreaking_prefixes_filez:basic:z:web:r   r!   c                 S   s   g | ]}|  qS r   )strip).0patternr   r   r   
<listcomp>   s     z!tokenize_file.<locals>.<listcomp>T)
return_straggressive_dash_splitsescapeprotected_patterns)r   ZBASIC_PROTECTED_PATTERNSZWEB_PROTECTED_PATTERNSopen	readlinesr   r4   r3   )r&   r   r   r   Z
xml_escaper;   r=   Zcustom_nb_prefixesmosesr%   Zmoses_tokenizer   r   r   tokenize_file^   s&    $ rA   
detokenizez--xml-unescapez$Unescape special characters for XML.c                 C   s4   t |d}t|jd|d}ttttj| |||S )N)r5   T)r:   unescape)r   r   rB   r3   r#   mapstrsplit)r&   r   r   r   Zxml_unescaper@   Zmoses_detokenizer   r   r   detokenize_file   s    
   rG   	normalizez--normalize-quote-commasz Normalize quotations and commas.z--normalize-numbersz-dzNormalize number.z--replace-unicode-punctsz2Replace unicode punctuations BEFORE normalization.z--remove-control-charsz.Remove control characters AFTER normalization.c           
      C   s*   t |||||d}t|j}	t| |	||S )N)Znorm_quote_commasZnorm_numbersZpre_replace_unicode_punctZpost_remove_control_chars)r   r   rH   r3   )
r&   r   r   r   Znormalize_quote_commasZnormalize_numbersZreplace_unicode_punctsZremove_control_charsr@   Zmoses_normalizer   r   r   normalize_file   s    $
rI   ztrain-truecasez--modelfilez-mzFilename to save the modelfile.)requiredr   z--is-asrz)A flag to indicate that model is for ASR.z--possibly-use-first-tokenz*Use the first token as part of truecasing.c           	      C   s,   t |d}|j| ||| d}|| d S )Nis_asrpossibly_use_first_tokenr   r/   )r	   train
save_model)	r&   r   r   r   	modelfilerL   rN   r@   modelr   r   r   train_truecaser   s    
rS   truecasez$Filename to save/load the modelfile.z1Use the first token as part of truecase training.c                 C   sd   t j|s<t| }t|d}|j|||| d}	|| t||d}
t|
jdd}t	| |||S )NrK   rM   )Z	load_fromrL   T)r:   )
ospathisfiler   r	   rO   rP   r   rT   r3   )r&   r   r   r   rQ   rL   rN   Ziterator_copyZ	truecaserrR   r@   Zmoses_truecaser   r   r   truecase_file  s    

rX   
detruecasez--is-headlinezWhether the file are headlines.c                 C   s$   t  }t|jd|d}t| |||S )NT)r:   is_headline)r
   r   rY   r3   )r&   r   r   r   rZ   r@   Zmoses_detruecaser   r   r   detruecase_file@  s    
  r[   )-rU   copyr   	functoolsr   r   r"   Zsacremoses.tokenizer   r   Zsacremoses.truecaser	   r
   Zsacremoses.normalizer   Zsacremoses.utilr   syswarningsversion_infoior>   warnrE   dictZCONTEXT_SETTINGSgroupoptionZversion_optionr   int__version__rF   ZresultcallbackZresult_callbackr)   r-   r3   commandrA   rG   rI   rS   rX   r[   r   r   r   r   <module>   sL  	       &

&          