U
    "ºcùH  ã                   @   sÐ   d dl mZ d dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d dlZejd  d	k ržd dlZd dlZejZe ed
ƒ¡ eƒ ZG dd„ deƒZG dd„ deƒZddgZdS )é    )Úprint_functionN)ÚdefaultdictÚCounter)Úpartial)Úchain)Ú	text_type)ÚPerluniprops)Úparallelize_preprocessÚgrouperé   zTYou should really be using Python3!!! Tick tock, tick tock, https://pythonclock.org/c                       sÐ   e Zd ZdZed e d¡¡ƒZed e d¡¡ƒZ	ed e d¡¡ƒZ
d$‡ fdd„	Zd%d
d„Zd&dd„Zd'dd„Zd(dd„Zd)dd„Zd*dd„Zd+dd„Zedd„ ƒZdd„ Zdd„ Zd d!„ Zd"d#„ Z‡  ZS ),ÚMosesTruecaserzò
    This is a Python port of the Moses Truecaser from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl
    Ú ÚLowercase_LetterÚUppercase_LetterNÚutf8c                    sz   t t| ƒ ¡  t d | j| j| j¡¡| _	t d¡| _
ddddh| _ddd	d
ddddh| _|| _|| _|rv|  |¡| _dS )a.  
        :param load_from:
        :type load_from:

        :param is_asr: A flag to indicate that model is for ASR. ASR input has
            no case, make sure it is lowercase, and make sure known are cased
            eg. 'i' to be uppercased even if i is known.
        :type is_asr: bool
        z[{}{}{}]z(<.*(?<=>))(.*)((?=</)[^>]*>)Ú.ú:ú?ú!ú(ú[ú"ú'ú&apos;ú&quot;ú&#91;ú&#93;N)Úsuperr   Ú__init__ÚreÚcompileÚformatr   r   ÚTitlecase_LetterÚSKIP_LETTERS_REGEXZXML_SPLIT_REGXÚSENT_ENDÚDELAYED_SENT_STARTÚencodingÚis_asrÚ_load_modelÚmodel)ÚselfZ	load_fromr'   r&   ©Ú	__class__© ú7/tmp/pip-unpacked-wheel-43qgbez0/sacremoses/truecase.pyr   -   s.      ÿÿøzMosesTruecaser.__init__Fc                 C   s°   d}g }t |ƒD ]š\}}t d|¡r(qn|| jkr4q|sH|| jkrHd}q| j |¡sZd}qd}|shd}n"|rŠ|d  ¡ r~d}n|dkrŠd}d}|dkr| | ¡ ||f¡ q|S )z
        This function checks through each tokens in a sentence and returns the
        appropriate weight of each surface token form.
        Tú(<\S[^>]*>)Fr   é   gš™™™™™¹?)	Ú	enumerater   Úsearchr%   r$   r#   ÚislowerÚappendÚlower)r*   ÚtokensÚpossibly_use_first_tokenÚis_first_wordZtruecase_weightsÚiÚtokenZcurrent_word_weightr-   r-   r.   Úlearn_truecase_weightsT   s2    
z%MosesTruecaser.learn_truecase_weightsr0   c                 C   sh   t tƒ}t| j|d}tt||||dŽ }|D ]\}	}
}||	 |
  |7  < q.|r^|  ||¡ |  |¡S )aN  
        :param document_iterator: The input document, each outer list is a sentence,
                          the inner list is the list of tokens for each sentence.
        :type document_iterator: iter(list(str))

        :param possibly_use_first_token: When True, on the basis that the first
            word of a sentence is always capitalized; if this option is provided then:
            a) if a sentence-initial token is *not* capitalized, then it is counted, and
            b) if a capitalized sentence-initial token is the only token of the segment,
               then it is counted, but with only 10% of the weight of a normal token.
        :type possibly_use_first_token: bool

        :returns: A dictionary of the best, known objects as values from `_casing_to_model()`
        :rtype: {'best': dict, 'known': Counter}
        )r7   ©Úprogress_bar)r   r   r   r;   r   r	   Ú_save_model_from_casingÚ_casing_to_model)r*   Údocument_iteratorÚsave_tor7   Ú	processesr=   ÚcasingZtrain_truecaserZtoken_weightsZlowercase_tokenZsurface_tokenZweightr-   r-   r.   Ú_train‚   s$    þ   ÿÿzMosesTruecaser._trainc                 C   s"   d| _ | j|||||d| _ | j S )z\
        Default duck-type of _train(), accepts list(list(str)) as input documents.
        Nr<   )r)   rD   )r*   Z	documentsrA   r7   rB   r=   r-   r-   r.   Útrain¬   s    ûzMosesTruecaser.trainc              	   C   sN   t || jd}dd„ | ¡ D ƒ}W 5 Q R X d| _| j|||||d| _| jS )zj
        Duck-type of _train(), accepts a filename to read as a `iter(list(str))`
        object.
        ©r&   c                 s   s   | ]}|  ¡ V  qd S ©N©Úsplit©Ú.0Úliner-   r-   r.   Ú	<genexpr>Ï   s    z1MosesTruecaser.train_from_file.<locals>.<genexpr>Nr<   )Úopenr&   Ú	readlinesr)   rD   )r*   ÚfilenamerA   r7   rB   r=   Úfinr@   r-   r-   r.   Útrain_from_fileÁ   s    ÿûzMosesTruecaser.train_from_filec                 C   s4   dd„ |  ¡ D ƒ}d| _| j|||||d| _| jS )zm
        Duck-type of _train(), accepts a file object to read as a `iter(list(str))`
        object.
        c                 s   s   | ]}|  ¡ V  qd S rG   rH   rJ   r-   r-   r.   rM   é   s    z8MosesTruecaser.train_from_file_object.<locals>.<genexpr>Nr<   )rO   r)   rD   )r*   Zfile_objectrA   r7   rB   r=   r@   r-   r-   r.   Útrain_from_file_objectÜ   s    ÿûz%MosesTruecaser.train_from_file_objectc                 C   s$  t dƒ}t| dƒst|ƒ‚d}g }|  |¡}t|ƒD ]Ú\}}	t d|	¡rT| |	¡ q4|	dksf|	 d¡rr| |	¡ q4t d|	¡ 	¡ \}	}
| j
r”|	 ¡ }	| jd  |	 ¡ d¡}| jd	  |	d¡}|rÊ|rÊ|}	n|rÜ|rÖ|n|	}	n|rä|}	|	|
 }	| |	¡ |	| jk}|	| jkr4d
}q4|r d |¡S |S )a  
        Truecase a single sentence / line of text.

        :param text: A single string, i.e. sentence text.
        :type text: str

        :param use_known: Use the known case if a word is a known word but not the first word.
        :type use_known: bool
        zV
Use Truecaser.train() to train a model.
Or use Truecaser('modefile') to load a model.r)   Tr/   ú|z^([^\|]+)(.*)ÚbestNÚknownFú )ÚstrÚhasattrÚAssertionErrorÚ	split_xmlr1   r   r2   r4   Ú
startswithÚgroupsr'   r5   r)   Úgetr$   r%   Újoin)r*   ÚtextÚ
return_strZ	use_knownZcheck_model_messager8   Útruecased_tokensr6   r9   r:   Zother_factorsZ	best_caseZ
known_caser-   r-   r.   Útruecaseö   s@    
ÿ


ÿ


zMosesTruecaser.truecaseTc              	   c   sJ   t || jd2}|D ]&}|  | ¡ ¡}|r4d |¡n|V  qW 5 Q R X d S )NrF   rW   )rN   r&   rc   Ústripr_   )r*   rP   ra   rQ   rL   rb   r-   r-   r.   Útruecase_file>  s    zMosesTruecaser.truecase_filec                 C   s*  |   ¡ } g }| r&t d| ¡}t d| ¡}t d| ¡}|rÄ| ¡ \}}t d| ¡r°t|ƒdkr°t d|d ¡r°|d  |7  < t d|¡}|r¾|d  | d	¡7  < | d
¡}n| |d ¡ |} nP|rä| | d	¡¡ | d
¡} n0|r| | d	¡¡ | d
¡} ntd | ¡ƒ‚|d   ¡ |d< q|S )a
  
        Python port of split_xml function in Moses' truecaser:
        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecaser.perl

        :param line: Input string, should be tokenized, separated by space.
        :type line: str
        z^\s*(<\S[^>]*>)(.*)$z^\s*([^\s<>]+)(.*)$z^\s*(\S+)(.*)$z^\Sr   z\|$éÿÿÿÿz^(\|+)(.*)$r0   é   rW   zERROR: huh? {})	rd   r   r2   r]   ÚlenÚgroupr4   Ú	Exceptionr!   )rL   r6   Zhas_xmlZ
is_non_xmlZxml_cognatesZpotential_xmlZ	line_nextZ	is_factorr-   r-   r.   r[   E  sD    	
ÿ
þýÿÿzMosesTruecaser.split_xmlc           	      C   sl   i }t ƒ }|D ]L}||  ¡ }|d d ||< | js|dd… D ]\}}||  d7  < q@q|||dœ}|S )zg

        :returns: A tuple of the (best, known) objects.
        :rtype: tuple(dict, Counter)
        r   r0   N)rU   rV   rC   )r   Úmost_commonr'   )	r*   rC   rU   rV   Ztoken_lowerr6   r:   Úcountr)   r-   r-   r.   r?   x  s    zMosesTruecaser._casing_to_modelc                 C   s   |   | jd |¡ d S )NrC   )r>   r)   )r*   rP   r-   r-   r.   Ú
save_modelŽ  s    zMosesTruecaser.save_modelc           
   	   C   s    t |d| jd†}|D ]z}t||  ¡ ƒ}g }t||  ¡ ƒD ]<\}\}}|dkrbd |||¡}	nd |||¡}	| |	¡ q>td 	|¡d|d qW 5 Q R X d	S )
a"  
        Outputs the truecaser model file in the same output format as
        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl

        :param casing: The dictionary of tokens counter from `train()`.
        :type casing: default(Counter)
        ÚwrF   r   z
{} ({}/{})z{} ({})rW   Ú
)ÚendÚfileN)
rN   r&   ÚsumÚvaluesr1   rk   r!   r4   Úprintr_   )
r*   rC   rP   Zfoutr:   Ztotal_token_countZtokens_countsr9   rl   Z	out_tokenr-   r-   r.   r>   ‘  s    z&MosesTruecaser._save_model_from_casingc              	   C   s~   t tƒ}t|| jdX}|D ]L}| ¡  ¡ }t|dƒD ]0\}}| d¡d  d¡}t|ƒ|| ¡  |< q6qW 5 Q R X |  	|¡S )zÄ
        Loads pre-trained truecasing file.

        :returns: A dictionary of the best, known objects as values from `_casing_to_model()`
        :rtype: {'best': dict, 'known': Counter}
        rF   rg   ú/r   z())
r   r   rN   r&   rd   rI   r
   Úintr5   r?   )r*   rP   rC   rQ   rL   r:   rl   r-   r-   r.   r(   ¥  s    "zMosesTruecaser._load_model)NNr   )F)NFr0   F)NFr0   F)NFr0   F)NFr0   F)FF)T)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r_   ÚperlunipropsÚcharsr   r   r"   r   r;   rD   rE   rR   rS   rc   re   Ústaticmethodr[   r?   rm   r>   r(   Ú__classcell__r-   r-   r+   r.   r   !   sD   '
1    ú
-    ú
    ú
    ú

H

2r   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )ÚMosesDetruecaserc              #      s‚   t t| ƒ ¡  ddddh| _ddddd	d
ddh| _dddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/h#| _d S )0Nr   r   r   r   r   r   r   r   r   r   r   r   ÚaÚafterZagainstzal-.+ÚandÚanyÚasÚatÚbeZbecauseZbetweenZbyZduringzel-.+ÚforÚfromZhisÚinÚisZitsÚlastÚnotZofÚoffÚonZthanZtheZtheirÚthisÚtoÚwasÚwereÚwhichZwillÚwith)r   r   r   r$   r%   ÚALWAYS_LOWER©r*   r+   r-   r.   r   ¸  s^    øÝzMosesDetruecaser.__init__Fc                    sŠ   g }d}|  ¡ D ]P}|r4|dd…  ¡ |dd…  n|}| |¡ |ˆ jkrRd}q|ˆ jkrd}q|rx‡ fdd„|D ƒ}|r†d |¡S |S )z¼
        Detruecase the translated files from a model that learnt from truecased
        tokens.

        :param text: A single string, i.e. sentence text.
        :type text: str
        TNr0   Fc                    s6   g | ].}|ˆ j kr|n|d d…  ¡ |dd …  ‘qS )Nr0   )r•   Úupper)rK   r:   r–   r-   r.   Ú
<listcomp>  s   ÿz/MosesDetruecaser.detruecase.<locals>.<listcomp>rW   )rI   r—   r4   r$   r%   r_   )r*   r`   Zis_headlinera   Zcased_tokensZsentence_startr:   r-   r–   r.   Ú
detruecaseî  s    	$



þzMosesDetruecaser.detruecase)FF)rw   rx   ry   r   r™   r~   r-   r-   r+   r.   r   ·  s   6r   )Ú
__future__r   r   Úcollectionsr   r   Ú	functoolsr   Ú	itertoolsr   Úsixr   Zsacremoses.corpusr   Zsacremoses.utilr	   r
   ÚsysÚversion_infoÚioÚwarningsrN   ÚwarnrX   r{   Úobjectr   r   Ú__all__r-   r-   r-   r.   Ú<module>   s0   ÿÿ   T