ZiF4dZddlmZddlmZddlmZddlmZm Z ddl m Z ddl m Z mZmZmZmZmZmZmZmZmZmZmZmZmZerdd lmZd ZGd d eZed d.dZed d/dZ ed d/dZ!ed d/dZ"ed d/dZ#Gdde Z$ed d0dZ%d1d"Z& d2d3d)Z'd4d+Z(d5d,Z) d2d3d-Z*d#S)6z Grapheme cluster segmentation following Unicode Standard Annex #29. This module provides pure-Python implementation of the grapheme cluster boundary algorithm as defined in UAX #29: Unicode Text Segmentation. https://www.unicode.org/reports/tr29/ ) annotations)IntEnum) lru_cache) TYPE_CHECKING NamedTuple)bisearch) GRAPHEME_L GRAPHEME_T GRAPHEME_V GRAPHEME_LV INCB_EXTEND INCB_LINKER GRAPHEME_LVTINCB_CONSONANTGRAPHEME_EXTENDGRAPHEME_CONTROLGRAPHEME_PREPENDGRAPHEME_SPACINGMARKEXTENDED_PICTOGRAPHICGRAPHEME_REGIONAL_INDICATOR)Iterator cJeZdZdZdZdZdZdZdZdZ dZ d Z d Z d Z d Zd ZdZdZdS)GCBz'Grapheme Cluster Break property values.rr N)__name__ __module__ __qualname____doc__OTHERCRLFCONTROLEXTENDZWJREGIONAL_INDICATORPREPEND SPACING_MARKLVTLVLVT`/home/jenkins/workspace/simtester-sanitize/venv/lib/python3.11/site-packages/wcwidth/grapheme.pyrr,s[11 E B BG F CGL A A A B CCCr;ri)maxsizeucsintreturnc|dkr tjS|dkr tjS|dkr tjSt |t r tjSt |tr tjSt |tr tj St |tr tj St |tr tjSt |tr tjSt |t"r tjSt |t&r tjSt |t*r tjSt |t.r tjStjS)z;Return the Grapheme_Cluster_Break property for a codepoint.r'r$i )rr-r.r1 _bisearchrr/rr0rr2rr3rr4r r5r r6r r7r r8rr9r,r>s r<_grapheme_cluster_breakrDBsA  f}}v  f}}v  f}}w&''{o&&z122&%%&''{*++ j!!u j!!u j!!u k""v l##w 9r;boolcFtt|tS)z6Check if codepoint has Extended_Pictographic property.)rErBrrCs r<_is_extended_pictographicrGes  #455 6 66r;cFtt|tS)z,Check if codepoint has InCB=Linker property.)rErBrrCs r<_is_incb_linkerrIk  #{++ , ,,r;cFtt|tS)z/Check if codepoint has InCB=Consonant property.)rErBrrCs r<_is_incb_consonantrLqs  #~.. / //r;cFtt|tS)z,Check if codepoint has InCB=Extend property.)rErBrrCs r<_is_incb_extendrNwrJr;c(eZdZUdZded<ded<dS) BreakResultz*Result of grapheme cluster break decision.rE should_breakr?ri_countN)r(r)r*r+__annotations__r:r;r<rPrP}s+44MMMMMr;rPprev_gcbcurr_gcbBreakResult | Nonec|tjkr!|tjkrtddS|tjtjtjfvrtddS|tjtjtjfvrtddS|tjkrA|tjtjtjtjfvrtddS|tjtjfvr+|tjtj fvrtddS|tjtj fvr!|tj krtddS|tj krtddS|tj krtddS|tj krtddSdS)z Check simple GCB-pair-based break rules (cacheable). Returns BreakResult for rules that can be determined from GCB properties alone, or None if complex lookback rules (GB9c, GB11) need to be checked. FrrQrRTN) rr-r.rPr/r5r6r8r9r7r0r4r3)rTrUs r<_simple_break_checkrYs36h#&00::::CK000q9999CK000q999935X#%)HHH::::CFCE?""xCE35>'A'A::::CGSU###CE(9(9::::3:::::3###::::3;:::: 4r;textstrcurr_idxrRct||}||S|tjkrtddSt ||}t |rxd}|dz }|dkrkt ||} t | rd}|dz}n9t| r|dz}n$t | r|rtddSnn|dkk|tjkr{t|rl|dz }|dkrat ||} t| } | tj kr|dz}n!t| rtddSn|dka|tj kr>|tj kr.|dzdkrtd|dzStddS|tj krdnd}td|S)z Determine if there should be a grapheme cluster break between prev and curr. Implements UAX #29 grapheme cluster boundary rules. NFrrXrTr) rYrr1rPordrLrIrNrGrDr0r2) rTrUrZr\rRresultcurr_ucs has_linkeriprev_ucs prev_props r< _should_breakres!8 4 4F  37:::: 4>""H(## qL1ff47||Hx(( ! Q ** Q#H-- G&EAFFFF1ff378BB qL1ff47||H/99ICJ&&Q*844 "BBBB1ff3)))h#:P.P.P a<1  EHqLIII Iq9999 666qqAH D8 < < <>> list(iter_graphemes('cafe\u0301')) ['c', 'a', 'f', 'e\u0301'] >>> list(iter_graphemes('\U0001F468\u200D\U0001F469\u200D\U0001F467')) ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467'] >>> list(iter_graphemes('\U0001F1FA\U0001F1F8')) ['o', 'k', '\U0001F1FA\U0001F1F8'] .. versionadded:: 0.3.0 Nrr) lenminrDr^rr2rangererRrQ) rfrgrhlength cluster_startrRrTidxrUr_s r<iter_graphemesrrs(8  [[F { ||u c6  CMH's6%='9'9::H3)))UQY$$  *3vc{+;+;<<x63II?   s*+ + + +M s" ######r;poscVt||dz }|dkr|dkr||dz dkr|dz S|dkr_|dkrT|dkrNt||dz }|dkr0t|tjkrt ||dz S|dz S|dz }|dkrk||z t kr]t||}d|cxkrdkrnnn7t|tjkrn|dz}|dkr||z t k]|}tt||}|tjkrdnd}t|dz|D]I} tt|| } t|| || |} | j }| j r| }| }J|S)a Find the start of the grapheme cluster containing the character before pos. Scans backwards from pos to find a safe starting point, then iterates forward using standard break rules to find the actual cluster boundary. :param text: The Unicode string. :param pos: Position to search before (exclusive). :returns: Start position of the grapheme cluster. rr$r rr) r^rDrr3_find_cluster_startMAX_GRAPHEME_SCANr/r2rnrerRrQ) rZrs target_cpprev_cp safe_startcprpleft_gcbrRrb right_gcbr_s r<rwrw<sDqM""IDSAXX$sQw-4*?*?Qw4 !88 T))$sQw-((G$#:7#C#Cs{#R#R*4q999QwqJ q..cJ.2CCC j! " " 2           "2 & &#+ 5 5 a q..cJ.2CCCM&s4 +;'<'<==H 666qqAH :>3 ' '+CQLL99 xD!XFF?   M r;c h|dkrdSt|t|t|S)a Find the grapheme cluster boundary immediately before a position. :param unistr: The Unicode string to search. :param pos: Position in the string (0 < pos <= len(unistr)). :returns: Start index of the grapheme cluster containing the character at pos-1. Example:: >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8) 6 >>> grapheme_boundary_before('a\r\nb', 3) 1 .. versionadded:: 0.3.6 r)rwrmrl)rfrss r<grapheme_boundary_beforerps2" axxq vs3F '<'< = ==r;c#K|sdSt|}||nt||}t|d}||ks||krdS|}||kr.t||}||krdS|||V|}||k,dSdS)a Iterate over grapheme clusters in reverse order (last to first). :param unistr: The Unicode string to segment. :param start: Starting index (default 0). :param end: Ending index (default len(unistr)). :yields: Grapheme cluster substrings in reverse order. Example:: >>> list(iter_graphemes_reverse('cafe\u0301')) ['e\u0301', 'f', 'a', 'c'] .. versionadded:: 0.3.6 Nr)rlrmmaxrw)rfrgrhrorsrps r<iter_graphemes_reversers(  [[FK&&Sf%5%5C qMME ||u C +++FC88 5 E]3&'''' ++++++r;)r>r?r@r)r>r?r@rE)rTrrUrr@rV) rTrrUrrZr[r\r?rRr?r@rP)rN)rfr[rgr?rhrir@rj)rZr[rsr?r@r?)rfr[rsr?r@r?)+r+ __future__renumr functoolsrtypingrrr rBtable_graphemer r r r rrrrrrrrrrcollections.abcrrxrrDrGrIrLrNrPrYrerrrwrrr:r;r<rs7#""""",,,,,,,,,+++++ : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : :)((((((     '   , 4D 47777  4----  40000  4---- * 4----`@=@=@=@=JA$A$A$A$A$H1111h>>>>0&&&&&&&r;