o
    _cc.                     @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ ed	ejZed
ejZedejZedejejB ZedejejB ZedejZdZ			d7dedee de dedef
ddZ!d8dedee de fddZ"d9dededee defddZ#ed ejZ$d8dedee defd!d"Z%			d:ded#ee dee dee def
d$d%Z&	d;ded#ee dee defd&d'Z'	(		d<ded#ee d)edee def
d*d+Z(			d=dedee de dee def
d,d-Z)	d>ded.ededefd/d0Z*			1d?ded.eded2ee de
ee+ ee f f
d3d4Z,dedefd5d6Z-dS )@z(
Functions for dealing with markup text
    N)name2codepoint)IterableMatchAnyStrOptionalPatternTupleUnion)urljoin)
to_unicode)safe_url_string)
StrOrByteszI&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)z<[a-zA-Z\/!].*?>z5<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']z}<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)z<meta\s[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)[^>]*?\shttp-equiv\s*=[^>]*refreshz<((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))z 	
 Tutf-8textkeepremove_illegalencodingreturnc                    s*   dt dtf fdd}t|t| |S )u  Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    'Price: \xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    mr   c                    s   |   }d }|drt|d d}n-|dr t|d d}n |dr@|d }|  v r4| dS t|p?t| }|d urhzd|  krOdkrZn n	t|fd	W S t|W S  t	yg   Y nw rq|d
rqdS | dS )Ndec
   hex   namedr         cp1252	semicolon )
	groupdictgetintlowergroupr   bytesdecodechr
ValueError)r   groupsnumberentity_namer   r   r   5/usr/local/lib/python3.10/dist-packages/w3lib/html.pyconvert_entityE   s,   




z(replace_entities.<locals>.convert_entity)r   str_ent_resubr   )r   r   r   r   r.   r   r,   r-   replace_entities$   s   !r2   c                 C   s   t tt| |S N)boolr0   searchr   )r   r   r   r   r-   has_entitiesf   s   r6   r   tokenc                 C   s   t |t| |S )ac  Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags('This text contains <a>some tag</a>')
    'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\xe7ais</b></p>', ' -- ', 'latin-1')
    ' -- Je ne parle pas  -- fran\xe7ais --  -- '
    >>>

    )_tag_rer1   r   )r   r7   r   r   r   r-   replace_tagsj   s   r9   z<!--.*?(?:-->|$)c                 C   s   t | |}td|S )zRemove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    'test  whatever'
    >>>

    r   )r   _REMOVECOMMENTS_REr1   )r   r   utextr   r   r-   remove_comments   s   

r<   
which_onesc                    s   r rt ddd D dd  D  dtdtf fddd	tdtffd
d}d}t|tjtjB }||t	| |S )a;  Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    '<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    '<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
        ...
    ValueError: Cannot use both which_ones and keep
    >>>

    z#Cannot use both which_ones and keepc                 S      h | ]}|  qS r   r#   .0tagr   r   r-   	<setcomp>       zremove_tags.<locals>.<setcomp>c                 S   r>   r   r?   r@   r   r   r-   rC      rD   rB   r   c                    s   |   } r
| v S |  vS r3   r?   )rB   )r   r=   r   r-   will_remove   s   z remove_tags.<locals>.will_remover   c                    s    |  d} |rdS |  dS )N   r   r   )r$   )r   rB   )rE   r   r-   
remove_tag   s   
zremove_tags.<locals>.remove_tagz</?([^ >/]+).*?>)
r(   r/   r4   r   recompileDOTALL
IGNORECASEr1   r   )r   r=   r   r   rG   regexretagsr   )r   r=   rE   r-   remove_tags   s   1rN   c                 C   sF   t | |}|r!ddd |D }t|tjtjB }|d|}|S )a  Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    '<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    |c              	   S   s$   g | ]}d | d| d| dqS )<z\b.*?</z>|<z\s*/>r   r@   r   r   r-   
<listcomp>   s   $ z,remove_tags_with_content.<locals>.<listcomp>r   )r   joinrH   rI   rJ   rK   r1   )r   r=   r   r;   tagsrM   r   r   r-   remove_tags_with_content   s   
rT   
	
replace_byc                 C   s*   t | |}|D ]}||t ||}q|S )a$  Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\n``, ``\t``, ``\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    )r   replace)r   r=   rY   r   r;   ecr   r   r-   replace_escape_chars   s   
r\   c              	   C   sp   dt dtdttt tf  fdd}t| |}d}||tD ]}t|t r.|t|||d7 }q||	d7 }q|S )	a`  
    This function receives markup as a text (always a unicode string or
    a UTF-8 encoded string) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
        that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    txtpatternr   c                 s   sN    d}| | D ]}|d\}}| || V  |V  |}q| |d  V  d S )Nr   rF   )finditerspan)r]   r^   offsetmatchZmatch_sZmatch_er   r   r-   _get_fragments  s   z&unquote_markup.<locals>._get_fragmentsr   r,   Zcdata_d)
r/   r   r   r	   r   r   	_cdata_re
isinstancer2   r$   )r   r   r   r   rc   r;   Zret_textfragmentr   r   r-   unquote_markup  s   "
	

rg   baseurlc                 C   s<   t | |}t|}|rtt|t|d|dS t|S )zReturn the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    rF   )r   )r   _baseurl_rer5   r
   r   r$   )r   rh   r   r;   r   r   r   r-   get_base_url0  s   


rj   scriptnoscriptignore_tagsc                 C   s   zt | |}W n ty   t|   w t||}tt|}t|p(t|}|rFt	|
d}t|
dd|}t||}||fS dS )aY  Return  the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    r"   urlz "')NN)r   UnicodeDecodeErrorprintrT   r<   r2   _meta_refresh_rer5   _meta_refresh_re2floatr$   r   stripr
   )r   rh   r   rn   r;   r   intervalro   r   r   r-   get_meta_refreshD  s   

rw   c                 C   s
   |  tS )a  
    Strip all leading and trailing space characters (as defined in
    https://www.w3.org/TR/html5/infrastructure.html#space-character).

    Such stripping is useful e.g. for processing HTML element attributes which
    contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
    defines them as "valid URL potentially surrounded by spaces"
    or "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    )ru   HTML5_WHITESPACE)r   r   r   r-   strip_html5_whitespaced  s   
ry   )r   Tr   r3   )r   N)r   r   N)r   N)rU   r   N)r   TN)r   r   )r   r   rk   ).__doc__rH   html.entitiesr   typingr   r   r   r   r   r   r	   urllib.parser
   Z
w3lib.utilr   Z	w3lib.urlr   Zw3lib._typesr   rI   rK   r0   rJ   r8   Iri   rr   rs   rd   rx   r/   r4   r2   r6   r9   r:   r<   rN   rT   r\   rg   rj   rt   rw   ry   r   r   r   r-   <module>   s    $


B 
I


)

 