
    vKgN              
          S SK Jr  S SKJr  S SKJrJr  SSKJrJ	r	J
r
  SSKJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJr   " S S5      r " S	 S
\5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\5      r# " S S\5      r$ " S S\5      r% " S S\5      r&\" SS9S\\'   S\\'   S\(4S  j5       r)\" S!S9 S'S"\'S#\*S$\(S\*4S% jj5       r+g&)(    )	lru_cache)	getLogger)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   ^    \ rS rSrSrS\S\4S jrS\SS4S jrSS jr	\
S\4S	 j5       rS
rg)MessDetectorPlugin   zm
Base abstract class used for mess detection plugins.
All detectors MUST extend and implement given methods.
	characterreturnc                     [         e)z0
Determine if given character should be fed in.
NotImplementedErrorselfr   s     U/var/www/highfloat_scraper/venv/lib/python3.13/site-packages/charset_normalizer/md.pyeligibleMessDetectorPlugin.eligible%   
     "!    Nc                     [         e)zq
The main routine to be executed upon character.
Insert the logic in witch the text would be considered chaotic.
r"   r$   s     r&   feedMessDetectorPlugin.feed+   s
    
 "!r*   c                     [         e)z2
Permit to reset the plugin to the initial state.
r"   r%   s    r&   resetMessDetectorPlugin.reset2   r)   r*   c                     [         e)zm
Compute the chaos ratio based on what your feed() has seen.
Must NOT be lower than 0.; No restriction gt 0.
r"   r/   s    r&   ratioMessDetectorPlugin.ratio8   s
     "!r*    r    N)__name__
__module____qualname____firstlineno____doc__strboolr'   r,   r0   propertyfloatr3   __static_attributes__r5   r*   r&   r   r      sM    
"# "$ ""c "d "" "u " "r*   r   c                   d    \ rS rSrSS jrS\S\4S jrS\SS4S jrSS jr	\
S\4S	 j5       rS
rg) TooManySymbolOrPunctuationPluginA   r    Nc                 J    SU l         SU l        SU l        S U l        SU l        g )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr/   s    r&   __init__)TooManySymbolOrPunctuationPlugin.__init__B   s*    '("#%&37!,1#r*   r   c                 "    UR                  5       $ Nisprintabler$   s     r&   r'   )TooManySymbolOrPunctuationPlugin.eligibleJ       $$&&r*   c                 D   U =R                   S-  sl         XR                  :w  av  U[        ;  al  [        U5      (       a  U =R                  S-  sl        OFUR                  5       SL a3  [        U5      (       a#  [        U5      SL a  U =R                  S-  sl        Xl        g )Nr   F   )	rG   rH   r   r   rE   isdigitr   r   rF   r$   s     r&   r,   %TooManySymbolOrPunctuationPlugin.feedM   s    " 222!==i((''1,'!!#u,i((	*e3""a'"$-!r*   c                 .    SU l         SU l        SU l        g Nr   )rE   rG   rF   r/   s    r&   r0   &TooManySymbolOrPunctuationPlugin.reset_   s    "# !r*   c                     U R                   S:X  a  gU R                  U R                  -   U R                   -  nUS:  a  U$ S$ )Nr           333333?)rG   rE   rF   )r%   ratio_of_punctuations     r&   r3   &TooManySymbolOrPunctuationPlugin.ratiod   sO      A% ##d&8&88!!'" (<s'B#KKr*   )rG   rI   rH   rE   rF   r6   r7   r8   r9   r:   rJ   r<   r=   r'   r,   r0   r>   r?   r3   r@   r5   r*   r&   rB   rB   A   sP    2'# '$ '.c .d .$
 Lu L Lr*   rB   c                   d    \ rS rSrSS jrS\S\4S jrS\SS4S jrSS jr	\
S\4S	 j5       rS
rg)TooManyAccentuatedPluginp   r    Nc                      SU l         SU l        g rW   rG   _accentuated_countr/   s    r&   rJ   !TooManyAccentuatedPlugin.__init__q   s    %&'(r*   r   c                 "    UR                  5       $ rM   )isalphar$   s     r&   r'   !TooManyAccentuatedPlugin.eligibleu   s      ""r*   c                 z    U =R                   S-  sl         [        U5      (       a  U =R                  S-  sl        g g Nr   )rG   r   rd   r$   s     r&   r,   TooManyAccentuatedPlugin.feedx   s4    ")$$##q(# %r*   c                      SU l         SU l        g rW   rc   r/   s    r&   r0   TooManyAccentuatedPlugin.reset~   s     !"#r*   c                 j    U R                   S:  a  gU R                  U R                   -  nUS:  a  U$ S$ )N   rZ   gffffff?rc   )r%   ratio_of_accentuations     r&   r3   TooManyAccentuatedPlugin.ratio   s=      1$'+'>'>AVAV'V(=(E$N3Nr*   )rd   rG   r6   r^   r5   r*   r&   r`   r`   p   sP    )## #$ #)c )d )$ Ou O Or*   r`   c                   d    \ rS rSrSS jrS\S\4S jrS\SS4S jrSS jr	\
S\4S	 j5       rS
rg)UnprintablePlugin   r    Nc                      SU l         SU l        g rW   )_unprintable_countrG   r/   s    r&   rJ   UnprintablePlugin.__init__   s    '(%&r*   r   c                     gNTr5   r$   s     r&   r'   UnprintablePlugin.eligible       r*   c                 x    [        U5      (       a  U =R                  S-  sl        U =R                  S-  sl        g rj   )r   rv   rG   r$   s     r&   r,   UnprintablePlugin.feed   s/    )$$##q(#"r*   c                     SU l         g rW   )rv   r/   s    r&   r0   UnprintablePlugin.reset   s
    "#r*   c                 \    U R                   S:X  a  gU R                  S-  U R                   -  $ )Nr   rZ   ro   rG   rv   r/   s    r&   r3   UnprintablePlugin.ratio   s/      A%''!+t/D/DDDr*   r   r6   r^   r5   r*   r&   rs   rs      sP    '# $ #c #d #
$ Eu E Er*   rs   c                   d    \ rS rSrSS jrS\S\4S jrS\SS4S jrSS jr	\
S\4S	 j5       rS
rg)SuspiciousDuplicateAccentPlugin   r    Nc                 .    SU l         SU l        S U l        g rW   _successive_countrG   _last_latin_characterr/   s    r&   rJ   (SuspiciousDuplicateAccentPlugin.__init__   s    &'%&48"r*   r   c                 F    UR                  5       =(       a    [        U5      $ rM   )rg   r   r$   s     r&   r'   (SuspiciousDuplicateAccentPlugin.eligible   s      ":x	'::r*   c                    U =R                   S-  sl         U R                  b  [        U5      (       a  [        U R                  5      (       a  UR                  5       (       a4  U R                  R                  5       (       a  U =R                  S-  sl        [        U5      [        U R                  5      :X  a  U =R                  S-  sl        Xl        g rj   )rG   r   r   isupperr   r   r$   s     r&   r,   $SuspiciousDuplicateAccentPlugin.feed   s    "&&2y))t99::  ""t'A'A'I'I'K'K&&!+&Y'=9S9S+TT&&!+&%."r*   c                 .    SU l         SU l        S U l        g rW   r   r/   s    r&   r0   %SuspiciousDuplicateAccentPlugin.reset   s    !" !%)"r*   c                 \    U R                   S:X  a  gU R                  S-  U R                   -  $ )Nr   rZ   rS   )rG   r   r/   s    r&   r3   %SuspiciousDuplicateAccentPlugin.ratio   s/      A%&&*d.C.CCCr*   )rG   r   r   r6   r^   r5   r*   r&   r   r      sP    9;# ;$ ;/c /d /*
 Du D Dr*   r   c                   d    \ rS rSrSS jrS\S\4S jrS\SS4S jrSS jr	\
S\4S	 j5       rS
rg)SuspiciousRange   r    Nc                 .    SU l         SU l        S U l        g rW   )"_suspicious_successive_range_countrG   _last_printable_seenr/   s    r&   rJ   SuspiciousRange.__init__   s    78/%&37!r*   r   c                 "    UR                  5       $ rM   rN   r$   s     r&   r'   SuspiciousRange.eligible   rQ   r*   c                 Z   U =R                   S-  sl         UR                  5       (       d  [        U5      (       d
  U[        ;   a  S U l        g U R                  c  Xl        g [        U R                  5      n[        U5      n[        X#5      (       a  U =R                  S-  sl        Xl        g rj   )rG   isspacer   r   r   r    is_suspiciously_successive_ranger   )r%   r   unicode_range_aunicode_range_bs       r&   r,   SuspiciousRange.feed   s    " i((88(,D%$$,(1%)6t7P7P)Q)6y)A+OMM33q83$-!r*   c                 .    SU l         SU l        S U l        g rW   )rG   r   r   r/   s    r&   r0   SuspiciousRange.reset   s     !23/$(!r*   c                 `    U R                   S::  a  gU R                  S-  U R                   -  nU$ )N   rZ   rS   )rG   r   )r%   ratio_of_suspicious_range_usages     r&   r3   SuspiciousRange.ratio   s<      B& 33a7!!2"' /.r*   )rG   r   r   r6   r^   r5   r*   r&   r   r      sM    8
'# '$ '.c .d ..)
 /u / /r*   r   c                   d    \ rS rSrSS jrS\S\4S jrS\SS4S jrSS jr	\
S\4S	 j5       rS
rg)SuperWeirdWordPlugin   r    Nc                     SU l         SU l        SU l        SU l        SU l        SU l        SU l        SU l        SU l        SU l	        g )Nr   F )
_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchrG   _bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr/   s    r&   rJ   SuperWeirdWordPlugin.__init__   sQ     !$%() */!). %&)*!)*!() r*   r   c                     gry   r5   r$   s     r&   r'   SuperWeirdWordPlugin.eligible	  r{   r*   c                    UR                  5       (       Ga  U =R                  U-  sl        [        U5      (       a  U =R                  S-  sl        U R                  SL ak  [        U5      SL d  [        U5      (       aM  [        U5      SL a?  [        U5      SL a1  [        U5      SL a#  [        U5      SL a  [        U5      SL a  SU l        [        U5      (       d@  [        U5      (       d0  [        U5      (       d   [        U5      (       d  [        U5      (       a  U =R                  S-  sl        g U R                  (       d  g UR                  5       (       d!  [        U5      (       d  [        U5      (       Ga,  U R                  (       Ga  U =R                  S-  sl        [!        U R                  5      nU =R"                  U-  sl        US:  a  U R                  U-  S:  a  SU l        O[        U R                  S   5      (       a^  U R                  S   R'                  5       (       a<  [)        S U R                   5       5      SL a  U =R*                  S-  sl        SU l        O,U R                  S:X  a  SU l        U =R*                  S-  sl        US:  a  U R                  (       a  [-        U R                  [/        S	U5      5       VVs/ sH  u  p4UR'                  5       (       d  M  UPM      nnnSnU(       a  [!        U5      U-  S
::  a  SnU(       d  U =R*                  S-  sl        SU l        U R$                  (       aD  U =R0                  S-  sl        U =R2                  [!        U R                  5      -  sl        SU l        SU l        SU l        S	U l        S	U l        g US;  aB  UR5                  5       SL a.  [7        U5      (       a  SU l        U =R                  U-  sl        g g g g s  snnf )Nr   FT   g      ?c              3   >   #    U H  oR                  5       v   M     g 7frM   )r   ).0_s     r&   	<genexpr>,SuperWeirdWordPlugin.feed.<locals>.<genexpr>6  s     >AIIKKs      r   r[   r   >   -<=>r   |~)rg   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   lenrG   r   r   allr   zipranger   r   rT   r   )r%   r   buffer_lengthcicamel_case_dstprobable_camel_caseds          r&   r,   SuperWeirdWordPlugin.feed  s:   LLI%Li(())Q.)((E1i(E1^I5N5N9%.i(E1	*e3	*e3I&%/+/(y!!Y''y))y))9%%((A-(||>)#<#<Y@W@Wlll!!$T\\!2M!!]2!!,,}<C04D- #4<<#344R(0022>>>%G,,1,04D---204D-,,1,"t'?'? !$DLL%=2I J" Jyy{  J  "
 .3$!s>':]'Jc'Q+/(+,,1,04D-(($$)$))S->>),1)',D$DL()D%'(D$@@!!#u,)$$(,D%LLI%L % - A1"s   O.*O.c                 t    SU l         SU l        SU l        SU l        SU l        SU l        SU l        SU l        g )Nr   Fr   )r   r   r   r   r   rG   r   r   r/   s    r&   r0   SuperWeirdWordPlugin.reset]  sA    $)!#(   !$%!#$ r*   c                 v    U R                   S::  a  U R                  S:X  a  gU R                  U R                  -  $ )N
   r   rZ   )r   r   r   rG   r/   s    r&   r3   SuperWeirdWordPlugin.ratiog  s7    r!d&>&>!&C((4+@+@@@r*   )
r   r   r   r   r   rG   r   r   r   r   r6   r^   r5   r*   r&   r   r      sT    *# $ O&c O&d O&b% Au A Ar*   r   c                   h    \ rS rSrSrSS jrS\S\4S jrS\SS4S jr	SS	 jr
\S\4S
 j5       rSrg)CjkInvalidStopPluginio  u   
GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
can be easily detected. Searching for the overuse of '丅' and '丄'.
r    Nc                      SU l         SU l        g rW   _wrong_stop_count_cjk_character_countr/   s    r&   rJ   CjkInvalidStopPlugin.__init__u  s    &')*!r*   r   c                     gry   r5   r$   s     r&   r'   CjkInvalidStopPlugin.eligibley  r{   r*   c                     US;   a  U =R                   S-  sl         g [        U5      (       a  U =R                  S-  sl        g g )N>      丄   丅r   )r   r   r   r$   s     r&   r,   CjkInvalidStopPlugin.feed|  s?    &""a'")%%*% r*   c                      SU l         SU l        g rW   r   r/   s    r&   r0   CjkInvalidStopPlugin.reset  s    !"$%!r*   c                 V    U R                   S:  a  gU R                  U R                   -  $ )N   rZ   r   r   r/   s    r&   r3   CjkInvalidStopPlugin.ratio  s*    $$r)%%(A(AAAr*   r   r6   )r7   r8   r9   r:   r;   rJ   r<   r=   r'   r,   r0   r>   r?   r3   r@   r5   r*   r&   r   r   o  sU    
+# $ +c +d +& Bu B Br*   r   c                   d    \ rS rSrSS jrS\S\4S jrS\SS4S jrSS jr	\
S\4S	 j5       rS
rg)ArchaicUpperLowerPlugini  r    Nc                 f    SU l         SU l        SU l        SU l        SU l        S U l        SU l        g )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalrG   _last_alpha_seen_current_ascii_onlyr/   s    r&   rJ    ArchaicUpperLowerPlugin.__init__  s9    	45,23*890%&/3)- r*   r   c                     gry   r5   r$   s     r&   r'    ArchaicUpperLowerPlugin.eligible  r{   r*   c                    UR                  5       =(       a    [        U5      nUSL nU(       a  U R                  S:  a  U R                  S::  aA  UR                  5       SL a.  U R                  SL a  U =R
                  U R                  -  sl        SU l        SU l        S U l        SU l        U =R                  S-  sl	        SU l        g U R                  SL a  UR                  5       SL a  SU l        U R                  b  UR                  5       (       a  U R                  R                  5       (       d4  UR                  5       (       aS  U R                  R                  5       (       a4  U R                  SL a  U =R                  S-  sl        SU l        OSU l        OSU l        U =R                  S-  sl	        U =R                  S-  sl        Xl        g )NFr   @   r   TrS   )rg   r   r   rT   r   r   r   r   r   rG   isasciir   islower)r%   r   is_concerned	chunk_seps       r&   r,   ArchaicUpperLowerPlugin.feed  s    ((*J/?	/J E)	==A44:%%'50,,588668 23D.34D0$(D!DI!!Q&!'+D$##t+	0A0A0Cu0L',D$  ,!!##(=(=(E(E(G(G!!##(=(=(E(E(G(G99$66!;6 %DI $DI!	",,1, )r*   c                 f    SU l         SU l        SU l        SU l        S U l        SU l        SU l        g )Nr   FT)rG   r   r   r   r   r   r   r/   s    r&   r0   ArchaicUpperLowerPlugin.reset  s9     !/0,-.*340 $	#' r*   c                 V    U R                   S:X  a  gU R                  U R                   -  $ )Nr   rZ   )rG   r   r/   s    r&   r3   ArchaicUpperLowerPlugin.ratio  s*      A%77$:O:OOOr*   )r   rG   r   r   r   r   r   r6   r^   r5   r*   r&   r   r     sQ    .# $ (*c (*d (*T( Pu P Pr*   r   c                   d    \ rS rSrSS jrSS jrS\S\4S jrS\SS4S jr	\
S\4S	 j5       rS
rg)ArabicIsolatedFormPlugini  r    Nc                      SU l         SU l        g rW   rG   _isolated_form_countr/   s    r&   rJ   !ArabicIsolatedFormPlugin.__init__  s    %&)*!r*   c                      SU l         SU l        g rW   r  r/   s    r&   r0   ArabicIsolatedFormPlugin.reset  s     !$%!r*   r   c                     [        U5      $ rM   )r   r$   s     r&   r'   !ArabicIsolatedFormPlugin.eligible  s    ##r*   c                 z    U =R                   S-  sl         [        U5      (       a  U =R                  S-  sl        g g rj   )rG   r   r  r$   s     r&   r,   ArabicIsolatedFormPlugin.feed  s4    ""9--%%*% .r*   c                 Z    U R                   S:  a  gU R                  U R                   -  nU$ )Nro   rZ   r  )r%   isolated_form_usages     r&   r3   ArabicIsolatedFormPlugin.ratio  s0      1$%)%>%>AVAV%V""r*   r  r6   )r7   r8   r9   r:   rJ   r0   r<   r=   r'   r,   r>   r?   r3   r@   r5   r*   r&   r  r    sM    +&$# $$ $+c +d + #u # #r*   r     )maxsizer   r   r    c                 .   U b  Uc  gX:X  a  gSU ;   a  SU;   a  gSU ;   d  SU;   a  gSU ;   d  SU;   a  SU ;   d  SU;   a  gU R                  S5      UR                  S5      p2U H  nU[        ;   a  M  XC;   d  M    g   U S;   US;   peU(       d  U(       a  SU ;   d  SU;   a  gU(       a  U(       a  gS	U ;   d  S	U;   a  SU ;   d  SU;   a  gU S
:X  d  US
:X  a  gSU ;   d  SU;   d  U S;   a-  US;   a'  SU ;   d  SU;   a  gSU ;   d  SU;   a  gU S
:X  d  US
:X  a  gg)zY
Determine if two Unicode range seen next to each other can be considered as suspicious.
TFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r  r  PunctuationForms)splitr
   )r   r   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss          r&   r   r     sx    /"9)/!g&@o%)G 	?"g&@&+*H)8)>)>*S! ' 00!	  	
	

 	33 ' 	, E_$<,?"h/&AO#u'?m+-/O 	 E_$<3377O+}/Oo%O)Cm+-/Or*   i   decoded_sequencemaximum_thresholddebugc           	      "   [         R                  5        Vs/ sH	  o3" 5       PM     nn[        U 5      S-   nSnUS:  a  SnOUS::  a  SnOSn[        U S-   [	        U5      5       Hh  u  pU H,  n
U
R                  U5      (       d  M  U
R                  U5        M.     U	S	:  a  X-  S	:X  d
  XS-
  :X  d  MO  [        S
 U 5       5      nXa:  d  Mh    O   U(       a  [        S5      nUR                  [        SU SU SU 35        [        U 5      S:  a8  UR                  [        SU SS  35        UR                  [        SU SS  35        U H2  nUR                  [        UR                   SUR                   35        M4     [        US5      $ s  snf )zo
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
r   rZ   i       r  r      
r   c              3   6   #    U H  oR                   v   M     g 7frM   )r3   )r   dts     r&   r   mess_ratio.<locals>.<genexpr>^  s     !?Yr((Ys   charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r'   r,   sumr   logr	   	__class__r3   round)r$  r%  r&  md_class	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr,  s                r&   
mess_ratior=  ?  s    $6#D#D#F+#Fx
#F  + &'!+F O|13)	4,.),/) 04 7vG	!H  ++i( "
 AI%CqHqj !!?Y!??O3 H /0

11R0SSdetdu v!!2 35	
  2%JJu0@"0E/FGHJJu.>su.E-FGHBJJub
;<  !$$[+s   FN)g?F),	functoolsr   loggingr   typingr   r   constantr   r	   r
   utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rB   r`   rs   r   r   r   r   r   r  r<   r=   r   r?   r=  r5   r*   r&   <module>rC     sC     ! 
    *" "D,L'9 ,L^O1 O6E* E0"D&8 "DJ./( ./bsA- sAlB- B>IP0 IPX#1 #8 4Ec]E5=c]E	E EP 4IN4%4%.34%BF4%
4% 4%r*   