U
    "c!                     @   s   zd dl mZ W n  ek
r0   d dl mZ Y nX d dl mZ d dlmZmZ d dlm	Z	m
Z
 d dlmZ G dd deZd	d
 Zdd Zdd Zdd ZdddZdddZdS )    )zip_longest)izip_longest)tee)escapeunescape)Paralleldelayed)tqdmc                   @   sZ   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZeeeeee	e
eeeegZdS )CJKCharsz
    An object that enumerates the code points of the CJK characters as listed on
    http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
    i   i  i.  iϤ  i@  i  i   i  i   i  i0  iO  ie  i  )io io i p i i  i/ ip i i   i N)__name__
__module____qualname____doc__ZHangul_JamoZCJK_RadicalsZPhags_PaZHangul_SyllablesZCJK_Compatibility_IdeographsZCJK_Compatibility_FormsZKatakana_Hangul_HalfwidthZ#Ideographic_Symbols_And_PunctuationZTangutZKana_SupplementZNushuZSupplementary_Ideographic_Planeranges r   r   3/tmp/pip-unpacked-wheel-43qgbez0/sacremoses/util.pyr
      s2   r
   c                    s   t  fdddD S )u  
    This checks for CJK character.

        >>> CJKChars().ranges
        [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (94208, 101119), (110592, 110895), (110960, 111359), (131072, 196607)]
        >>> is_cjk(u'㏾')
        True
        >>> is_cjk(u'﹟')
        False

    :param character: The character that needs to be checked.
    :type character: char
    :return: bool
    c                    s,   g | ]$\}}|t    ko"|kn  qS r   )ord).0startend	characterr   r   
<listcomp>s   s   zis_cjk.<locals>.<listcomp>)r   r   r   r   r   r   r   r   r   r   r   )anyr!   r   r!   r   is_cjkc   s
    
r%   c                 C   s   t | dddddddS )a  
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.
    Note that the default xml.sax.saxutils.escape() function don't escape
    some characters that Moses does so we have to manually add them to the
    entities dictionary.

        >>> input_str = ''')| & < > ' " ] ['''
        >>> expected_output =  ''')| &amp; &lt; &gt; ' " ] ['''
        >>> escape(input_str) == expected_output
        True
        >>> xml_escape(input_str)
        ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    &apos;&quot;&#124;&#91;&#93;)'"|[]entities)r   textr   r   r   
xml_escape   s    r4   c                 C   s   t | dddddddS )ai  
    This function transforms the "escaped" version suitable
    for well-formed XML formatting into humanly-readable string.
    Note that the default xml.sax.saxutils.unescape() function don't unescape
    some characters that Moses does so we have to manually add them to the
    entities dictionary.

        >>> from xml.sax.saxutils import unescape
        >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
        >>> expected = ''')| & < > ' " ] ['''
        >>> xml_unescape(s) == expected
        True

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    r+   r,   r-   r.   r/   )r&   r'   r(   r)   r*   r0   )r   r2   r   r   r   xml_unescape   s    r5   c                 C   s    t | \}}t|d t||S )zp
    From https://docs.python.org/3/library/itertools.html#recipes
    s -> (s0,s1), (s1,s2), (s2, s3), ...
    N)r   nextzip)iterableabr   r   r   pairwise   s    
r;   Nc                 C   s   t | g| }t|d|iS )ziCollect data into fixed-length chunks or blocks
    from https://stackoverflow.com/a/16789869/610569
    	fillvalue)iterr   )r8   nr<   argsr   r   r   grouper   s    r@   Fc                    s>   |rt |n|}|dkr"t |S t|d fdd|D S )N   )Zn_jobsc                 3   s   | ]}t  |V  qd S )N)r   )r   linefuncr   r   	<genexpr>   s     z)parallelize_preprocess.<locals>.<genexpr>)r	   mapr   )rD   iteratorZ	processesprogress_barr   rC   r   parallelize_preprocess   s    
rI   )N)F)	itertoolsr   ImportErrorr   r   xml.sax.saxutilsr   r   Zjoblibr   r   r	   objectr
   r%   r4   r5   r;   r@   rI   r   r   r   r   <module>   s   S#

	