# encoding.py - character transcoding support for Mercurial
# (installed as /usr/lib/python3/dist-packages/mercurial/encoding.py)

from __future__ import absolute_import, print_function

import locale
import os
import re
import unicodedata

from .pycompat import getattr
from . import (
    error,
    policy,
    pycompat,
)

from .pure import charencode as charencodepure

if pycompat.TYPE_CHECKING:
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)


def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    if b"\xe2" in s or b"\xef" in s:
        for c in _ignore:
            s = s.replace(c, b'')
    return s


# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }

_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which is not supported on Python 2
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'

encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'


class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions of localstr objects


class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """


def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )


def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        return s.decode(_sysstr(encoding), _sysstr(encodingmode)).encode("utf-8")
    except UnicodeDecodeError as inst:
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
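
# Illustrative sketch, not part of the original module: the round-trip
# contract that tolocal()/fromlocal() are meant to provide.  The helper name
# _example_roundtrip and the sample bytes are hypothetical.
def _example_roundtrip():
    u = b'caf\xc3\xa9'   # internal UTF-8 bytes ("cafe" with an accented e)
    l = tolocal(u)       # local representation; may be lossy for display
    # localstr/safelocalstr cache the UTF-8 form, so converting back is
    # lossless even when the local encoding cannot represent the character.
    assert fromlocal(l) == u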

def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    return tolocal(u.encode('utf-8'))


def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    return fromlocal(s).decode('utf-8')


def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        return unifromlocal(bytesfunc(obj))

    return unifunc


# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:

    def strtolocal(s):
        return s

    def strfromlocal(s):
        return s

    strmethod = pycompat.identity


def lower(s):
    """best-effort encoding-aware case-folding of local string s"""
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )


def upper(s):
    """best-effort encoding-aware case-folding of local string s"""
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)


def upperfallback(s):
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )


if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows and pycompat.ispy3:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on
            windows"""

            def get(self, key, default=None):
                return super(WindowsEnviron, self).get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))

DRIVE_RE = re.compile(b'^[a-z]:')

if pycompat.ispy3:
    if pycompat.iswindows:
        # os.path.realpath() may change the case of the drive letter on
        # Windows, so normalize it back to upper case ourselves
        def getcwd():
            cwd = os.getcwd()  # re-exports
            cwd = os.path.realpath(cwd)
            cwd = strtolocal(cwd)
            if DRIVE_RE.match(cwd):
                cwd = cwd[0:1].upper() + cwd[1:]
            return cwd

    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)


def colwidth(s):
    """Find the column width of a string for display in the local encoding"""
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))


def ucolwidth(d):
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        return sum([eaw(c) in _wide and 2 or 1 for c in d])
    return len(d)


def getcols(s, start, c):
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    for x in pycompat.xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    raise ValueError('substring not found')


def trim(s, width, ellipsis=b'', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'あいうえお' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    あいうえお
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    あいうえお
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    あい+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++えお
    >>> bprint(trim(t, 5))
    あい
    >>> bprint(trim(t, 5, leftside=True))
    えお
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    "3DUfwˆ™ª
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    "3DUfwˆ™ª
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    "3DU+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++fwˆ™ª
    >>> bprint(trim(t, 8))
    "3DUfwˆ
    >>> bprint(trim(t, 8, leftside=True))
    3DUfwˆ™ª
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # not enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # not enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    chars = list(u)
    if leftside:
        chars.reverse()
    width_so_far = 0
    for i, c in enumerate(chars):
        width_so_far += ucolwidth(c)
        if width_so_far > width:
            break
    chars = chars[:i]
    if leftside:
        chars.reverse()
    u = u''.join(chars).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
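
# Illustrative sketch, not part of the original module: trim() counts display
# columns rather than bytes, so East Asian wide characters consume two
# columns each.  The helper name _example_trim and the sample strings are
# hypothetical.
def _example_trim():
    assert trim(b'1234567890', 8, ellipsis=b'...') == b'12345...'
    assert trim(b'1234567890', 8, ellipsis=b'...', leftside=True) == b'...67890'
    # u'\u3042' and u'\u3044' are 2 columns wide under east_asian_width()
    assert ucolwidth(u'\u3042\u304412') == 6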

class normcasespecs(object):
    """what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1
    upper = 1
    other = 0


def jsonescape(s, paranoid=False):
    r"""returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \0 \x0b \x7f')
    'escape characters: \\u0000 \\u000b \\u007f'
    >>> jsonescape(b'escape characters: \b \t \n \f \r \" \\')
    'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> jsonescape(b'a weird byte: \xdd')
    'a weird byte: \xed\xb3\x9d'
    >>> jsonescape(b'utf-8: caf\xc3\xa9')
    'utf-8: caf\xc3\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \0 \x0b \x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \b \t \n \f \r \" \\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \x7e \x7f \xc2\x80', paranoid=True)
    'escape boundary: ~ \\u007f \\u0080'
    >>> jsonescape(b'a weird byte: \xdd', paranoid=True)
    'a weird byte: \\udcdd'
    >>> jsonescape(b'utf-8: caf\xc3\xa9', paranoid=True)
    'utf-8: caf\\u00e9'
    >>> jsonescape(b'non-BMP: \xf0\x9d\x84\x9e', paranoid=True)
    'non-BMP: \\ud834\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\u003cfoo@example.org\\u003e'
    """

    u8chars = toutf8b(s)
    try:
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
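
# Illustrative sketch, not part of the original module: jsonescape() returns
# escaped UTF-8(b) bytes without surrounding quotes; callers add the quotes
# when emitting JSON.  The helper name _example_jsonescape is hypothetical.
def _example_jsonescape():
    assert jsonescape(b'tab:\there') == b'tab:\\there'
    # valid UTF-8 passes through unescaped unless paranoid=True
    assert jsonescape(b'caf\xc3\xa9') == b'caf\xc3\xa9'
    assert jsonescape(b'caf\xc3\xa9', paranoid=True) == b'caf\\u00e9'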

if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# number of bytes to attempt decoding, indexed by the high nibble of the
# first byte; 0 means a plain ASCII fast path
_utf8len = [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4]


def getutf8char(s, pos):
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not l:  # ascii
        return s[pos : pos + 1]

    c = s[pos : pos + l]
    # validate with attempted decode
    c.decode("utf-8", _utf8strict)
    return c


def toutf8b(s):
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know what
    encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestrings in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legal utf-8
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r


def fromutf8b(s):
    r"""Given a UTF-8b string, return a local, possibly-binary string;
    that is, return the original binary string. This is a round-trip
    process for strings like filenames, but metadata that was passed
    through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\xc3\xa9\x99abcd"
    >>> toutf8b(m)
    '\xc3\xa9\xed\xb2\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\xc2\xc2\x80")
    True
    >>> roundtrip(b"\xef\xbf\xbd")
    True
    >>> roundtrip(b"\xef\xef\xbf\xbd")
    True
    >>> roundtrip(b"\xf1\x80\x80\x80\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
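
# Illustrative sketch, not part of the original module: UTF-8b maps bytes
# that are not valid UTF-8 into the U+DCxx surrogate range so arbitrary
# filenames survive JSON/XML transport.  The helper name and sample bytes
# are hypothetical.
def _example_utf8b():
    name = b'caf\xe9'               # latin-1 bytes, not valid UTF-8
    wire = toutf8b(name)            # b'caf\xed\xb3\xa9': U+DCE9, surrogatepass-encoded
    assert fromutf8b(wire) == name  # lossless round-trip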