# Recovered outline of a CPython bytecode artifact (compiled from
# /tmp/pip-unpacked-wheel-43qgbez0/sacremoses/tokenize.py inside an unpacked
# pip wheel). Only the names, docstrings, and signatures legible in the
# bytecode are reproduced; the regex tables and method bodies are omitted.

import re

from six import text_type

from sacremoses.corpus import Perluniprops, NonbreakingPrefixes
from sacremoses.util import is_cjk
from sacremoses.indic import VIRAMAS, NUKTAS

perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()


class MosesTokenizer(object):
    """
    This is a Python port of the Moses Tokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
    """

    # Class-level regex/substitution tables visible in the bytecode but not
    # reconstructed here: DEDUPLICATE_SPACE, ASCII_JUNK, PAD_NOT_ISALNUM,
    # AGGRESSIVE_HYPHEN_SPLIT, INTRATOKEN_SLASHES, COMMA_SEPARATE_1/2/3,
    # ENGLISH_SPECIFIC_APOSTROPHE, FR_IT_SPECIFIC_APOSTROPHE,
    # NON_SPECIFIC_APOSTROPHE, TRAILING_DOT_APOSTROPHE,
    # MOSES_PENN_REGEXES_1/2, MOSES_ESCAPE_XML_REGEXES,
    # BASIC_PROTECTED_PATTERNS, WEB_PROTECTED_PATTERNS.

    def __init__(self, lang="en", custom_nonbreaking_prefixes_file=None): ...

    def replace_multidots(self, text): ...

    def restore_multidots(self, text): ...

    def islower(self, text): ...

    def isanyalpha(self, text): ...

    def has_numeric_only(self, text): ...

    def handles_nonbreaking_prefixes(self, text): ...

    def escape_xml(self, text): ...

    def penn_tokenize(self, text, return_str=False):
        """
        This is a Python port of the Penn treebank tokenizer adapted by the Moses
        machine translation community.
        """

    def tokenize(
        self,
        text,
        aggressive_dash_splits=False,
        return_str=False,
        escape=True,
        protected_patterns=None,
    ):
        """
        Python port of the Moses tokenizer.

            :param tokens: A single string, i.e. sentence text.
            :type tokens: str
            :param aggressive_dash_splits: Option to trigger dash split rules.
            :type aggressive_dash_splits: bool
        """


class MosesDetokenizer(object):
    """
    This is a Python port of the Moses Detokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl
    """

    # Class-level tables visible in the bytecode but not reconstructed here:
    # MOSES_UNESCAPE_XML_REGEXES, FINNISH_MORPHSET_1/2/3, FINNISH_REGEX.

    def __init__(self, lang="en"): ...

    def unescape_xml(self, text): ...

    def tokenize(self, tokens, return_str=True, unescape=True):
        """
        Python port of the Moses detokenizer.
        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: list(str)
        :return: str
        """

    def detokenize(self, tokens, return_str=True, unescape=True):
        """Duck-typing the abstract *tokenize()*."""


__all__ = ["MosesTokenizer", "MosesDetokenizer"]
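# Since only an outline of the module survives, the snippet below is a minimal
# usage sketch of the two exported classes. It assumes the package is installed
# as `sacremoses`; the keyword defaults match the recovered signatures, but the
# sample sentence and the commented output are illustrative, not taken from the
# artifact.
#
#     from sacremoses import MosesTokenizer, MosesDetokenizer
#
#     mt = MosesTokenizer(lang="en")
#     md = MosesDetokenizer(lang="en")
#
#     text = "This ain't funny. It's actually hilarious."
#
#     # Tokenize; with escape=True (the default) apostrophes are XML-escaped,
#     # e.g. roughly ['This', 'ain', '&apos;t', 'funny', '.', 'It', '&apos;s', ...]
#     tokens = mt.tokenize(text)
#
#     # return_str=True gives the same tokens joined into a single string.
#     line = mt.tokenize(text, return_str=True)
#
#     # Round-trip: detokenize rejoins the tokens and, with unescape=True,
#     # restores &apos; back to a plain apostrophe.
#     restored = md.detokenize(tokens)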