
    9e                       d dl mZ d dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZmZmZ ddlmZ 	 d dlZg d	Z ed
      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z e       Z  e       Z! e       Z" e       Z# e       Z$ e       Z% e       Z& e       Z'y# e$ r dZY w xY w)    )annotationsN)Counter)Fraction)groupbypermutations)AnySequenceTypeVar   )Base)ArithNCDLZMANCDBZ2NCDRLENCD	BWTRLENCDZLIBNCDSqrtNCD
EntropyNCDbz2_ncdlzma_ncd	arith_ncdrle_ncd
bwtrle_ncdzlib_ncdsqrt_ncdentropy_ncdTc                  >    e Zd ZdZdZd	d
dZddZddZddZddZ	y)_NCDBasezNormalized compression distance (NCD)

    https://articles.orsinium.dev/other/ncd/
    https://en.wikipedia.org/wiki/Normalized_compression_distance#Normalized_compression_distance
    r   c                    || _         y Nqvalselfr#   s     K/usr/lib/python3/dist-packages/textdistance/algorithms/compression_based.py__init__z_NCDBase.__init__(   	    	    c                     y)Nr    )r%   	sequencess     r&   maximumz_NCDBase.maximum+   s    r)   c                6    t        | j                  |            S r!   )len	_compressr%   datas     r&   	_get_sizez_NCDBase._get_size.   s    4>>$'((r)   c                    t         r!   )NotImplementedErrorr1   s     r&   r0   z_NCDBase._compress1   s    !!r)   c                   |sy | j                   | }t        d      } t        |d                }t        |      D ]Q  }t	        |t
        t        f      r|j                  |      }nt        ||      }t        || j                  |            }S |D cg c]  }| j                  |       }}t        |      }|dk(  ry|t        |      t        |      dz
  z  z
  |z  S c c}w )Nr   Infr   )_get_sequencesfloattyper   
isinstancestrbytesjoinsumminr3   maxr/   )	r%   r,   
concat_lenemptymutationr2   scompressed_lensmax_lens	            r&   __call__z_NCDBase.__call__4   s    'D''3	5\
"Yq\"$$Y/ 	?H%#u.zz(+8U+Z)=>J	? 7@@4>>!,@@o&a<S1S^a5GHHGSS	 As   CNr   r#   intreturnNone)rL   rK   )r2   r<   rL   r9   )r2   r<   rL   r   rL   r9   )
__name__
__module____qualname____doc__r#   r'   r-   r3   r0   rH   r+   r)   r&   r   r       s(    
 D)"Tr)   r   c                  (     e Zd ZddZd fdZ xZS )_BinaryNCDBasec                     y r!   r+   )r%   s    r&   r'   z_BinaryNCDBase.__init__K   s    r)   c                l    |syt        |d   t              rt        d |D              }t        |   | S )Nr   c              3  >   K   | ]  }|j                  d         yw)zutf-8N)encode).0rE   s     r&   	<genexpr>z*_BinaryNCDBase.__call__.<locals>.<genexpr>R   s     CAahhw/Cs   )r;   r<   tuplesuperrH   )r%   r,   	__class__s     r&   rH   z_BinaryNCDBase.__call__N   s7    ilC(CCCIw++r)   )rL   rM   rN   )rO   rP   rQ   r'   rH   __classcell__r]   s   @r&   rT   rT   I   s    , ,r)   rT   c                  F    e Zd ZdZdd	dZd
dZ	 	 	 	 	 	 ddZddZddZy)r   zArithmetic coding

    https://github.com/gw-c/arith
    http://www.drdobbs.com/cpp/data-compression-with-arithmetic-encodin/240169251
    https://en.wikipedia.org/wiki/Arithmetic_coding
    Nc                .    || _         || _        || _        y r!   )base
terminatorr#   )r%   rb   rc   r#   s       r&   r'   zArithNCD.__init__^   s    	$	r)   c                0    | j                   | } | j                  | }| j                  d|| j                  <   t        |j	                               }i }d}|j                         D ]%  \  }}t        ||      t        ||      f||<   ||z  }' ||k(  sJ |S )zD
        https://github.com/gw-c/arith/blob/master/arith.py
        r   r   )_get_counters_sum_countersrc   r?   valuesmost_commonr   )r%   r,   countstotal_letters
prob_pairscumulative_countcharcurrent_counts           r&   _make_probszArithNCD._make_probsc   s     'D&&	2	###Y/??&&'F4??#FMMO,
#)#5#5#7 	.D-)=96 Jt -	.  =000r)   c                   | j                   9| j                   |v r|j                  | j                   d      }|| j                   z  }t        dd      }t        dd      }|D ]  }||   \  }}|||z  z  }||z  } |||z   fS )N r   r   )rc   replacer   )r%   r2   probsstartwidthrm   
prob_start
prob_widths           r&   
_get_rangezArithNCD._get_rangex   s    
 ??&$&||DOOR8DOO#DAA 	 D%*4["J
Z%''EZE	  eem##r)   c                
   | j                  |      }| j                  ||      \  }}t        dd      }d}||cxk  r|k  sBn d|j                  |z  |j                  z  z   }t        ||      }|dz  }||cxk  r|k  s<|S  @|S )N)r2   rs   r   r      )ro   rx   r   	numeratordenominator)r%   r2   rs   rt   endoutput_fractionoutput_denominatoroutput_numerators           r&   r0   zArithNCD._compress   s      &__$e_<
s"1a.O1c1 U__7I%IeN_N_$_`&'79KLO!# O1c1 	 2 r)   c                    | j                  |      j                  }|dk(  ryt        j                  t        j                  || j
                              S )Nr   )r0   r{   mathceillogrb   )r%   r2   r{   s      r&   r3   zArithNCD._get_size   s>    NN4(22	>yy)TYY788r)   )rz   Nr   )rb   rK   rc   z
str | Noner#   rK   rL   rM   )rL   $dict[str, tuple[Fraction, Fraction]])r2   r<   rs   r   rL   ztuple[Fraction, Fraction])r2   r<   rL   r   )r2   r<   rL   rK   )	rO   rP   rQ   rR   r'   ro   rx   r0   r3   r+   r)   r&   r   r   V   s<    
*$$ 4$ 
#	$$	9r)   r   c                      e Zd ZdZddZy)r   zORun-length encoding

    https://en.wikipedia.org/wiki/Run-length_encoding
    c                   g }t        |      D ]g  \  }}t        t        |            }|dkD  r|j                  t	        |      |z          =|dk(  r|j                  |       T|j                  d|z         i dj                  |      S )Nrz   r   rq   )r   r/   listappendr<   r>   )r%   r2   new_datakgns         r&   r0   zRLENCD._compress   sz    DM 	'DAqDGA1uA
+a"A&	' wwx  r)   N)r2   r	   rL   r<   rO   rP   rQ   rR   r0   r+   r)   r&   r   r      s    

!r)   r   c                  .     e Zd ZdZdddZd fdZ xZS )r   z
    https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform
    https://en.wikipedia.org/wiki/Run-length_encoding
    c                    || _         y r!   )rc   )r%   rc   s     r&   r'   zBWTRLENCD.__init__   s	    )r)   c                   s| j                   nk| j                   vr]| j                   z  t        fdt        t                    D              } t	                     }|j                  d |D              t        |         S )Nc              3  4   K   | ]  }|d  d | z     y wr!   r+   )rY   ir2   s     r&   rZ   z&BWTRLENCD._compress.<locals>.<genexpr>   s"     Lad12hbq1Ls   c              3  &   K   | ]	  }|d      yw)Nr+   )rY   subdatas     r&   rZ   z&BWTRLENCD._compress.<locals>.<genexpr>   s     BggbkBs   )rc   sortedranger/   r:   r>   r\   r0   )r%   r2   modifiedrC   r]   s    `  r&   r0   zBWTRLENCD._compress   ss    ??D__D(DOO#DL5T;KLLHDJLE::BBBDw &&r)   ) )rc   r<   rL   rM   )r2   r<   rL   r<   )rO   rP   rQ   rR   r'   r0   r^   r_   s   @r&   r   r      s    
*' 'r)   r   c                  *    e Zd ZdZdddZddZd	dZy)
r   zSquare Root based NCD

    Size of compressed data equals to sum of square roots of counts of every
    element in the input sequence.
    c                    || _         y r!   r"   r$   s     r&   r'   zSqrtNCD.__init__   r(   r)   c                    t        |      j                         D ci c]  \  }}|t        j                  |       c}}S c c}}w r!   )r   itemsr   sqrt)r%   r2   elementcounts       r&   r0   zSqrtNCD._compress   s5    @G@S@S@UVngu5))VVVs    Ac                R    t        | j                  |      j                               S r!   )r?   r0   rg   r1   s     r&   r3   zSqrtNCD._get_size   s    4>>$'..011r)   NrI   rJ   )r2   zSequence[T]rL   zdict[T, float]r2   r	   rL   r9   rO   rP   rQ   rR   r'   r0   r3   r+   r)   r&   r   r      s    W2r)   r   c                  *    e Zd ZdZdddZddZddZy)	r   zEntropy based NCD

    Get Entropy of input sequence as a size of compressed data.

    https://en.wikipedia.org/wiki/Entropy_(information_theory)
    https://en.wikipedia.org/wiki/Entropy_encoding
    c                .    || _         || _        || _        y r!   )r#   coefrb   )r%   r#   r   rb   s       r&   r'   zEntropyNCD.__init__   s    			r)   c                    t        |      }d}t        |      j                         D ]-  }||z  }||t        j                  || j
                        z  z  }/ |dk\  sJ |S )Ng        r   )r/   r   rg   r   r   rb   )r%   r2   total_countentropyelement_countps         r&   r0   zEntropyNCD._compress   sg    $i$T]113 	2M+Aq488Atyy111G	2 !||r)   c                >    | j                   | j                  |      z   S r!   )r   r0   r1   s     r&   r3   zEntropyNCD._get_size   s    yy4>>$///r)   N)r   r   rz   )r#   rK   r   rK   rb   rK   rL   rM   r   r   r+   r)   r&   r   r      s    
0r)   r   c                      e Zd ZdZddZy)r   z-
    https://en.wikipedia.org/wiki/Bzip2
    c                4    t        j                  |d      dd  S )N	bz2_codec   codecsrX   r1   s     r&   r0   zBZ2NCD._compress   s    }}T;/44r)   Nr2   zstr | bytesrL   r=   r   r+   r)   r&   r   r          5r)   r   c                      e Zd ZdZddZy)r   z,
    https://en.wikipedia.org/wiki/LZMA
    c                T    t         st        d      t        j                  |      dd  S )Nz$Please, install the PylibLZMA module   )lzmaImportErrorcompressr1   s     r&   r0   zLZMANCD._compress  s'    DEE}}T"23''r)   N)r2   r=   rL   r=   r   r+   r)   r&   r   r     s    (r)   r   c                      e Zd ZdZddZy)r   z,
    https://en.wikipedia.org/wiki/Zlib
    c                4    t        j                  |d      dd  S )N
zlib_codecrz   r   r1   s     r&   r0   zZLIBNCD._compress  s    }}T<044r)   Nr   r   r+   r)   r&   r   r     r   r)   r   )(
__future__r   r   r   collectionsr   	fractionsr   	itertoolsr   r   typingr   r	   r
   rb   r   _Baser   r   __all__r   r   rT   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r+   r)   r&   <module>r      s   "     + ) )  
 CL&Tu &TR
,X 
,C9x C9L!X !&' '.2h 2"0 0F5^ 5(n (5n 5 J	[

(9
(99lY  Ds   C+ +C54C5