o
    7aI                  
   @   s  d Z dZdgZddlmZ zddlmZ W n ey/ Z zG dd deZW Y dZ[ndZ[ww ddl	Z	ddl
Z
e	jdd	 \ZZZed	koMed
koMed	kZed	koUed	kZed	ko]edkZddlmZmZmZmZmZ ddlmZmZ ddlmZmZmZ dZG dd deZ G dd deZ!ed	kred
kresddl"Z"e"#dZ$e$e!_$e"#de"j%Z&e&e _&ddlm'Z'm(Z( dd Z)dd Z*e)e _)e*e _*dZdS dS dS dS )zCUse the HTMLParser library to parse HTML files that aren't too bad.ZMITHTMLParserTreeBuilder    )
HTMLParser)HTMLParseErrorc                   @   s   e Zd ZdS )r   N)__name__
__module____qualname__ r   r   9/usr/lib/python3/dist-packages/bs4/builder/_htmlparser.pyr      s    r   N         )CDataCommentDeclarationDoctypeProcessingInstruction)EntitySubstitutionUnicodeDammit)HTMLHTMLTreeBuilderSTRICTzhtml.parserc                   @   s|   e Zd ZdZdZdZdd Zdd Zdd	 ZdddZ	dddZ
dd Zdd Zdd Zdd Zdd Zdd Zdd ZdS )BeautifulSoupHTMLParserzA subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.
    ignorereplacec                 O   s2   | d| j| _tj| g|R i | g | _dS )a  Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.           
        on_duplicate_attributeN)popREPLACEr   r   __init__already_closed_empty_element)selfargskwargsr   r   r	   r   A   s
   
	z BeautifulSoupHTMLParser.__init__c                 C   s   t | dS )a  In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.
        N)warningswarn)r   msgr   r   r	   error[   s   zBeautifulSoupHTMLParser.errorc                 C   s   | j ||dd}| | dS )zHandle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        F)handle_empty_elementN)handle_starttaghandle_endtag)r   nameattrstagr   r   r	   handle_startendtagh   s   z*BeautifulSoupHTMLParser.handle_startendtagTc                 C   s   i }|D ]0\}}|du rd}||v r.| j }|| jkrn|d| jfv r'|||< n|||| n|||< d}q|  \}	}
| jj|dd||	|
d}|r]|jr_|ra| j|dd | j	| dS dS dS dS )a3  Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        N z"")
sourceline	sourceposF)check_already_closed)
r   IGNOREr   getpossoupr'   Zis_empty_elementr(   r   append)r   r)   r*   r&   Z	attr_dictkeyvalueZon_dupe	attrvaluer.   r/   r+   r   r   r	   r'   w   s,   




z'BeautifulSoupHTMLParser.handle_starttagc                 C   s.   |r|| j v r| j | dS | j| dS )zHandle a closing tag, e.g. '</tag>'
        
        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
           be the closing portion of an empty-element tag,
           e.g. '<tag></tag>'.
        N)r   remover3   r(   )r   r)   r0   r   r   r	   r(      s   	z%BeautifulSoupHTMLParser.handle_endtagc                 C   s   | j | dS )z4Handle some textual data that shows up between tags.N)r3   handle_datar   datar   r   r	   r9      s   z#BeautifulSoupHTMLParser.handle_datac                 C   s   | drt|dd}n| drt|dd}nt|}d}|dk rN| jjdfD ]!}|s1q,z
t|g|}W q, tyM } zW Y d}~q,d}~ww |skzt|}W n t	t
fyj } zW Y d}~nd}~ww |pnd}| | dS )zHandle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        x   XN   zwindows-1252u   �)
startswithintlstripr3   original_encoding	bytearraydecodeUnicodeDecodeErrorchr
ValueErrorOverflowErrorr9   )r   r)   Z	real_namer;   encodinger   r   r	   handle_charref   s2   


z&BeautifulSoupHTMLParser.handle_charrefc                 C   s0   t j|}|dur|}nd| }| | dS )zHandle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        Nz&%s)r   ZHTML_ENTITY_TO_CHARACTERgetr9   )r   r)   	characterr;   r   r   r	   handle_entityref   s
   z(BeautifulSoupHTMLParser.handle_entityrefc                 C   &   | j   | j | | j t dS )zOHandle an HTML comment.

        :param data: The text of the comment.
        N)r3   endDatar9   r   r:   r   r   r	   handle_comment      
z&BeautifulSoupHTMLParser.handle_commentc                 C   s6   | j   |tdd }| j | | j t dS )zYHandle a DOCTYPE declaration.

        :param data: The text of the declaration.
        zDOCTYPE N)r3   rQ   lenr9   r   r:   r   r   r	   handle_decl  s   
z#BeautifulSoupHTMLParser.handle_declc                 C   sN   |  drt}|tdd }nt}| j  | j| | j| dS )z{Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        zCDATA[N)upperr@   r   rT   r   r3   rQ   r9   )r   r;   clsr   r   r	   unknown_decl  s   
z$BeautifulSoupHTMLParser.unknown_declc                 C   rP   )z\Handle a processing instruction.

        :param data: The text of the instruction.
        N)r3   rQ   r9   r   r:   r   r   r	   	handle_pi  rS   z!BeautifulSoupHTMLParser.handle_piN)T)r   r   r   __doc__r1   r   r   r%   r,   r'   r(   r9   rL   rO   rR   rU   rX   rY   r   r   r   r	   r   7   s     

4'	
r   c                       sR   e Zd ZdZdZdZeZeee	gZ
dZd fdd	Z		dddZd	d
 Z  ZS )r   zpA Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    FTNc                    s   t  }dD ]}||v r||}|||< qtt| jdi | |p#g }|p'i }|| tr5ts5d|d< tr;d|d< ||f| _	dS )a  Constructor.

        :param parser_args: Positional arguments to pass into 
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into 
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        )r   FstrictZconvert_charrefsNr   )
dictr   superr   r   updateCONSTRUCTOR_TAKES_STRICT CONSTRUCTOR_STRICT_IS_DEPRECATED"CONSTRUCTOR_TAKES_CONVERT_CHARREFSparser_args)r   rb   Zparser_kwargsr!   Zextra_parser_kwargsargr6   	__class__r   r	   r   3  s   

zHTMLParserTreeBuilder.__init__c           	      c   s^    t |tr|dddfV  dS |g}|g}||g}t|||d|d}|j|j|j|jfV  dS )a  Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried 
         in turn.
        NFT)known_definite_encodingsuser_encodingsZis_htmlexclude_encodings)
isinstancestrr   markuprC   Zdeclared_html_encodingZcontains_replacement_characters)	r   rk   Zuser_specified_encodingZdocument_declared_encodingrh   rf   rg   Ztry_encodingsZdammitr   r   r	   prepare_markupO  s$   

z$HTMLParserTreeBuilder.prepare_markupc              
   C   sl   | j \}}t|i |}| j|_z|| |  W n ty0 } z	ttd |d}~ww g |_	dS )z{Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.
        a*  Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.N)
rb   r   r3   feedcloser   r"   r#   RuntimeWarningr   )r   rk   r    r!   parserrK   r   r   r	   rm     s   


zHTMLParserTreeBuilder.feed)NN)NNN)r   r   r   rZ   Zis_xmlZ	picklable
HTMLPARSERNAMEr   r   ZfeaturesZTRACKS_LINE_NUMBERSr   rl   rm   __classcell__r   r   rd   r	   r   &  s    

1zQ\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?a  
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
)tagfindattrfindc                 C   s*  d | _ | |}|dk r|S | j}||| | _ g }t||d }|s(J d| }||d |   | _}||k r| jrGt	||}nt
||}|sPnT|ddd\}	}
}|
s_d }n-|d d d  krq|dd  ksn |d d d  kr|dd  krn n|dd }|r| |}||	 |f | }||k s=|||  }|d	vr|  \}}d
| j v r|| j d
 }t| j | j d
 }n|t| j  }| jr| d||| d d f  | |||  |S |dr| || |S | || || jv r| | |S )Nr      z#unexpected call to parse_starttag()r   r
   '")>/>
z junk characters in start tag: %r   r{   )Z__starttag_textZcheck_for_whole_start_tagrawdatart   matchendlowerZlasttagr[   ru   attrfind_tolerantgroupZunescaper4   stripr2   countrT   rfindr%   r9   endswithr,   r'   ZCDATA_CONTENT_ELEMENTSset_cdata_mode)r   iendposr~   r*   r   kr+   mattrnamerestr7   r   linenooffsetr   r   r	   parse_starttag  sj   
&





r   c                 C   s$   |  | _td| j tj| _d S )Nz</\s*%s\s*>)r   Z
cdata_elemrecompileIZinteresting)r   elemr   r   r	   r     s   
r   T)+rZ   Z__license____all__Zhtml.parserr   r   ImportErrorrK   	Exceptionsysr"   version_infomajorminorreleaser_   r`   ra   Zbs4.elementr   r   r   r   r   Z
bs4.dammitr   r   Zbs4.builderr   r   r   rq   r   r   r   r   r   VERBOSEZlocatestarttagendrt   ru   r   r   r   r   r   r	   <module>   sR   	 po7