
     `i+                         d Z ddlZddlmZmZ ddlmZmZmZ  e            r
ddl	Z	ddl	m
Z
  ej        e          Z G d de          ZdgZdS )	z'
Feature extractor class for MarkupLM.
    N   )BatchFeatureFeatureExtractionMixin)is_bs4_availableloggingrequires_backends)BeautifulSoupc                   @     e Zd ZdZ fdZd Zd Zd ZdefdZ	 xZ
S )MarkupLMFeatureExtractorao  
    Constructs a MarkupLM feature extractor. This can be used to get a list of nodes and corresponding xpaths from HTML
    strings.

    This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`] which contains most
    of the main methods. Users should refer to this superclass for more information regarding those methods.

    c                 \    t          | dg            t                      j        di | d S )Nbs4 )r   super__init__)selfkwargs	__class__s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/markuplm/feature_extraction_markuplm.pyr   z!MarkupLMFeatureExtractor.__init__+   s8    $(((""6"""""    c           
         g }g }|j         r|n|j        j        D ]}|                    j         d          }|                    j                    |                    dt          |          k    rdn(t          fdt          |d          D                                  ||                                 |                                 ||fS )NF)	recursive   r   c              3   *   K   | ]\  }}|u 	|V  d S )Nr   ).0ischilds      r   	<genexpr>z6MarkupLMFeatureExtractor.xpath_soup.<locals>.<genexpr>7   s3      1e1e1Z[_dZdZd!ZdZdZdZd1e1er   )	nameparentparentsfind_allappendlennext	enumeratereverse)r   element
xpath_tagsxpath_subscriptsr    siblingsr   s         @r   
xpath_soupz#MarkupLMFeatureExtractor.xpath_soup/   s    
"<;W^m 	 	FuzUCCHej)))###h--''T1e1e1e1e	(TU@V@V1e1e1e-e-e   EE  """+++r   c                    t          |d          }g }g }g }|j        D ]}t          |t          j        j                  rt          |j                  t          j        j        urGt          j
        |                                          }|sp|                    |           |                     |          \  }}	|                    |           |                    |	           t          |          t          |          k    rt          d          t          |          t          |          k    rt          d          |||fS )Nzhtml.parserz3Number of doc strings and xtags does not correspondz3Number of doc strings and xsubs does not correspond)r	   descendants
isinstancer   r(   NavigableStringtyper    Taghtmlunescapestripr#   r,   r$   
ValueError)
r   html_string	html_codeall_doc_stringsstring2xtag_seqstring2xsubs_seqr(   text_in_this_tagr)   r*   s
             r   get_three_from_singlez.MarkupLMFeatureExtractor.get_three_from_single>   sF   !+}==	 , 	: 	:G'3;#>?? :''s{>>#'=#9#9#?#?#A#A ' &&'7888/3w/G/G,
,&&z222 ''(89993#7#777RSSS3'7#8#888RSSS1AAAr   c                 d    d}t          ||          D ]\  }}|d| z  }|dk    r	|d| dz  }|S )N /r   [])zip)r   r)   r*   xpathtagnamesubss         r   construct_xpathz(MarkupLMFeatureExtractor.construct_xpath[   sV     -=>> 	% 	%MGT]]]"EqyyT$r   returnc                    d}t          |t                    rd}nLt          |t          t          f          r0t	          |          dk    st          |d         t                    rd}|s t          dt          |           d          t          |t          t          f          ot          |d         t                    }|s|g}g }g }|D ]}|                     |          \  }}}	|                    |           g }
t          |||	          D ]1\  }}}| 
                    ||          }|
                    |           2|                    |
           ||d}t          |d          }|S )	a\  
        Main method to prepare for the model one or several HTML strings.

        Args:
            html_strings (`str`, `list[str]`):
                The HTML string or batch of HTML strings from which to extract nodes and corresponding xpaths.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **nodes** -- Nodes.
            - **xpaths** -- Corresponding xpaths.

        Examples:

        ```python
        >>> from transformers import MarkupLMFeatureExtractor

        >>> page_name_1 = "page1.html"
        >>> page_name_2 = "page2.html"
        >>> page_name_3 = "page3.html"

        >>> with open(page_name_1) as f:
        ...     single_html_string = f.read()

        >>> feature_extractor = MarkupLMFeatureExtractor()

        >>> # single example
        >>> encoding = feature_extractor(single_html_string)
        >>> print(encoding.keys())
        >>> # dict_keys(['nodes', 'xpaths'])

        >>> # batched example

        >>> multi_html_strings = []

        >>> with open(page_name_2) as f:
        ...     multi_html_strings.append(f.read())
        >>> with open(page_name_3) as f:
        ...     multi_html_strings.append(f.read())

        >>> encoding = feature_extractor(multi_html_strings)
        >>> print(encoding.keys())
        >>> # dict_keys(['nodes', 'xpaths'])
        ```FTr   zQHTML strings must of type `str`, `list[str]` (batch of examples), but is of type .)nodesxpathsN)datatensor_type)r/   strlisttupler$   r6   r1   r=   r#   rC   rG   r   )r   html_stringsvalid_strings
is_batchedrK   rL   r7   r9   r:   r;   xpath_stringsnodetag_listsub_listxpath_stringrM   encoded_inputss                    r   __call__z!MarkupLMFeatureExtractor.__call__c   s   `  lC(( 	% MMtUm44 	%<  A%%LOS)I)I% $ 	8"&|"4"48 8 8  
  tUm<<c*\Z[_^aBbBb
 	*(>L ' 	) 	)KAEA[A[\gAhAh>O_.>LL)))M,/Rb,c,c 3 3(h#33HhGG$$\2222MM-(((( &11%4TBBBr   )__name__
__module____qualname____doc__r   r,   r=   rG   r   r[   __classcell__)r   s   @r   r   r   !   s         # # # # #, , ,B B B:  T T T T T T T T Tr   r   )r_   r3   feature_extraction_utilsr   r   utilsr   r   r   r   r	   
get_loggerr\   loggerr   __all__r   r   r   <module>rf      s      L L L L L L L L A A A A A A A A A A  "JJJ!!!!!! 
	H	%	%V V V V V5 V V Vr &
&r   