# common.py
from .core import *
from .helpers import DelimitedList, any_open_tag, any_close_tag
from datetime import datetime
import sys

# staticmethod objects only became directly callable in Python 3.10; inside the
# class body below they can be used as parse actions only on 3.10 or later.
PY_310_OR_LATER = sys.version_info >= (3, 10)


# some other useful expressions - using lower-case class name since we are really using this as a namespace
class pyparsing_common:
    """Here are some common low-level expressions that may be useful in
    jump-starting parser development:

    - numeric forms (:class:`integers<integer>`, :class:`reals<real>`,
      :class:`scientific notation<sci_real>`)
    - common :class:`programming identifiers<identifier>`
    - network addresses (:class:`MAC<mac_address>`,
      :class:`IPv4<ipv4_address>`, :class:`IPv6<ipv6_address>`)
    - ISO8601 :class:`dates<iso8601_date>` and
      :class:`datetime<iso8601_datetime>`
    - :class:`UUID<uuid>`
    - :class:`comma-separated list<comma_separated_list>`
    - :class:`url`

    Parse actions:

    - :class:`convert_to_integer`
    - :class:`convert_to_float`
    - :class:`convert_to_date`
    - :class:`convert_to_datetime`
    - :class:`strip_html_tags`
    - :class:`upcase_tokens`
    - :class:`downcase_tokens`

    Examples:

    .. testcode::

        pyparsing_common.number.run_tests('''
            # any int or real number, returned as the appropriate type
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE

        # any int or real number, returned as the appropriate type
        100
        [100]

        -100
        [-100]

        +100
        [100]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

    .. testcode::

        pyparsing_common.fnumber.run_tests('''
            # any int or real number, returned as float
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE

        # any int or real number, returned as float
        100
        [100.0]

        -100
        [-100.0]

        +100
        [100.0]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

    .. testcode::

        pyparsing_common.hex_integer.run_tests('''
            # hex numbers
            100
            FF
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE

        # hex numbers
        100
        [256]

        FF
        [255]

    .. testcode::

        pyparsing_common.fraction.run_tests('''
            # fractions
            1/2
            -3/4
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE

        # fractions
        1/2
        [0.5]

        -3/4
        [-0.75]

    .. testcode::

        pyparsing_common.mixed_integer.run_tests('''
            # mixed fractions
            1
            1/2
            -3/4
            1-3/4
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE

        # mixed fractions
        1
        [1]

        1/2
        [0.5]

        -3/4
        [-0.75]

        1-3/4
        [1.75]

    .. testcode::

        import uuid
        pyparsing_common.uuid.set_parse_action(token_map(uuid.UUID))
        pyparsing_common.uuid.run_tests('''
            # uuid
            12345678-1234-5678-1234-567812345678
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE

        # uuid
        12345678-1234-5678-1234-567812345678
        [UUID('12345678-1234-5678-1234-567812345678')]
    """

    @staticmethod
    def convert_to_integer(_, __, t):
        """
        Parse action for converting parsed integers to Python int
        """
        return [int(tt) for tt in t]

    @staticmethod
    def convert_to_float(_, __, t):
        """
        Parse action for converting parsed numbers to Python float
        """
        return [float(tt) for tt in t]

    # NOTE: throughout the numeric expressions below, the staticmethod parse
    # actions are used directly on Python 3.10+, and an equivalent lambda is
    # substituted on older interpreters (see PY_310_OR_LATER).
    integer = (
        Word(nums)
        .set_name("integer")
        .set_parse_action(
            convert_to_integer
            if PY_310_OR_LATER
            else lambda t: [int(tt) for tt in t]  # type: ignore[misc]
        )
    )
    """expression that parses an unsigned integer, converts to an int"""

    hex_integer = (
        Word(hexnums).set_name("hex integer").set_parse_action(token_map(int, 16))
    )
    """expression that parses a hexadecimal integer, converts to an int"""

    signed_integer = (
        Regex(r"[+-]?\d+")
        .set_name("signed integer")
        .set_parse_action(
            convert_to_integer
            if PY_310_OR_LATER
            else lambda t: [int(tt) for tt in t]  # type: ignore[misc]
        )
    )
    """expression that parses an integer with optional leading sign, converts to an int"""

    fraction = (
        signed_integer().set_parse_action(
            convert_to_float
            if PY_310_OR_LATER
            else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
        )
        + "/"
        + signed_integer().set_parse_action(
            convert_to_float
            if PY_310_OR_LATER
            else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
        )
    ).set_name("fraction")
    """fractional expression of an integer divided by an integer, converts to a float"""
    # tokens are [numerator, "/", denominator]; divide first by last
    fraction.add_parse_action(lambda tt: tt[0] / tt[-1])

    mixed_integer = (
        fraction | signed_integer + Opt(Opt("-").suppress() + fraction)
    ).set_name("fraction or mixed integer-fraction")
    """mixed integer of the form 'integer - fraction', with optional leading integer, converts to a float"""
    # the integer part and the (already evaluated) fraction part are added together
    mixed_integer.add_parse_action(sum)

    real = (
        Regex(r"[+-]?(?:\d+\.\d*|\.\d+)")
        .set_name("real number")
        .set_parse_action(
            convert_to_float
            if PY_310_OR_LATER
            else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
        )
    )
    """expression that parses a floating point number, converts to a float"""

    sci_real = (
        Regex(r"[+-]?(?:\d+(?:[eE][+-]?\d+)|(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?)")
        .set_name("real number with scientific notation")
        .set_parse_action(
            convert_to_float
            if PY_310_OR_LATER
            else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
        )
    )
    """expression that parses a floating point number with optional
    scientific notation, converts to a float"""

    # streamlining this expression makes the docs nicer-looking
    number = (sci_real | real | signed_integer).set_name("number").streamline()
    """any numeric expression, converts to the corresponding Python type"""

    fnumber = (
        Regex(r"[+-]?\d+\.?\d*(?:[eE][+-]?\d+)?")
        .set_name("fnumber")
        .set_parse_action(
            convert_to_float
            if PY_310_OR_LATER
            else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
        )
    )
    """any int or real number, always converts to a float"""

    ieee_float = (
        Regex(r"(?i:[+-]?(?:(?:\d+\.?\d*(?:e[+-]?\d+)?)|nan|inf(?:inity)?))")
        .set_name("ieee_float")
        .set_parse_action(
            convert_to_float
            if PY_310_OR_LATER
            else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
        )
    )
    """any floating-point literal (int, real number, infinity, or NaN), converts to a float"""

    identifier = Word(identchars, identbodychars).set_name("identifier")
    """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""

    ipv4_address = Regex(
        r"(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(?:\.(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}"
    ).set_name("IPv4 address")
    "IPv4 address (``0.0.0.0 - 255.255.255.255``)"

    _ipv6_part = Regex(r"[0-9a-fA-F]{1,4}").set_name("hex_integer")
    _full_ipv6_address = (_ipv6_part + (":" + _ipv6_part) * 7).set_name(
        "full IPv6 address"
    )
    _short_ipv6_address = (
        Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
        + "::"
        + Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
    ).set_name("short IPv6 address")
    # a "::"-abbreviated address must contain fewer than 8 hex groups in total
    _short_ipv6_address.add_condition(
        lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8
    )
    _mixed_ipv6_address = ("::ffff:" + ipv4_address).set_name("mixed IPv6 address")
    ipv6_address = Combine(
        (_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).set_name(
            "IPv6 address"
        )
    ).set_name("IPv6 address")
    "IPv6 address (long, short, or mixed form)"

    mac_address = Regex(
        r"[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}"
    ).set_name("MAC address")
    "MAC address xx:xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"

    @staticmethod
    def convert_to_date(fmt: str = "%Y-%m-%d"):
        """
        Helper to create a parse action for converting parsed date string to Python datetime.date

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%d"``)

        Example:

        .. testcode::

            date_expr = pyparsing_common.iso8601_date.copy()
            date_expr.set_parse_action(pyparsing_common.convert_to_date())
            print(date_expr.parse_string("1999-12-31"))

        prints:

        .. testoutput::

            [datetime.date(1999, 12, 31)]
        """

        def cvt_fn(ss, ll, tt):
            try:
                return datetime.strptime(tt[0], fmt).date()
            except ValueError as ve:
                # report an unparseable date as a parse failure at this location
                raise ParseException(ss, ll, str(ve))

        return cvt_fn

    @staticmethod
    def convert_to_datetime(fmt: str = "%Y-%m-%dT%H:%M:%S.%f"):
        """Helper to create a parse action for converting parsed
        datetime string to Python :class:`datetime.datetime`

        Params -
        - fmt - format to be passed to :class:`datetime.strptime` (default= ``"%Y-%m-%dT%H:%M:%S.%f"``)

        Example:

        .. testcode::

            dt_expr = pyparsing_common.iso8601_datetime.copy()
            dt_expr.set_parse_action(pyparsing_common.convert_to_datetime())
            print(dt_expr.parse_string("1999-12-31T23:59:59.999"))

        prints:

        .. testoutput::

            [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
        """

        def cvt_fn(s, l, t):
            try:
                return datetime.strptime(t[0], fmt)
            except ValueError as ve:
                # report an unparseable datetime as a parse failure at this location
                raise ParseException(s, l, str(ve))

        return cvt_fn

    # The named groups below supply the results names read by as_datetime().
    iso8601_date = Regex(
        r"(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?"
    ).set_name("ISO8601 date")
    "ISO8601 date (``yyyy-mm-dd``)"

    iso8601_datetime = Regex(
        r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?"
    ).set_name("ISO8601 datetime")
    "ISO8601 datetime (``yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)``) - trailing seconds, milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '``"

    @staticmethod
    def as_datetime(s, l, t):
        """Parse action to convert parsed dates or datetimes to a Python
        :class:`datetime.datetime`.

        This parse action will use the year, month, day, etc. results
        names defined in the ISO8601 date expressions, but it can be
        used with any expression that provides one or more of these fields.

        Omitted fields will default to fields from Jan 1, 00:00:00.
        Fractional seconds are carried into the ``microsecond`` field of
        the result.

        Invalid dates will raise a :class:`ParseException` with the
        error message indicating the invalid date fields.
        """
        year = int(t.year.lstrip("0") or 0)
        month = int(t.month or 1)
        day = int(t.day or 1)
        hour = int(t.hour or 0)
        minute = int(t.minute or 0)
        second = float(t.second or 0)
        # datetime's 7th positional argument is *microseconds*: scale the
        # fractional second by 1e6, round away float representation error
        # (e.g. 59.999 % 1 == 0.99899999...), and clamp so that rounding can
        # never produce the out-of-range value 1_000_000.
        microsecond = min(round((second % 1) * 1_000_000), 999_999)
        try:
            return datetime(year, month, day, hour, minute, int(second), microsecond)
        except ValueError as ve:
            # pass the input string (not the tokens) as the exception's pstr,
            # consistent with convert_to_date / convert_to_datetime
            raise ParseException(s, l, f"Invalid date/time: {ve}").with_traceback(
                ve.__traceback__
            ) from None

    if PY_310_OR_LATER:
        # as_datetime is a staticmethod object here, which is only callable
        # as a parse action on Python 3.10 and later
        iso8601_date_validated = iso8601_date().add_parse_action(as_datetime)
        "Validated ISO8601 date strings, raising :class:`ParseException` for invalid date values."

        iso8601_datetime_validated = iso8601_datetime().add_parse_action(as_datetime)
        "Validated ISO8601 date and time strings, raising :class:`ParseException` for invalid date/time values."

    uuid = Regex(r"[0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}").set_name(
        "UUID"
    )
    "UUID (``xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx``)"

    _html_stripper = any_open_tag.suppress() | any_close_tag.suppress()

    @staticmethod
    def strip_html_tags(s: str, l: int, tokens: ParseResults):
        """Parse action to remove HTML tags from web page HTML source

        Example:

        .. testcode::

            # strip HTML links from normal text
            text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
            td, td_end = make_html_tags("TD")
            table_text = td + SkipTo(td_end).set_parse_action(
                pyparsing_common.strip_html_tags)("body") + td_end
            print(table_text.parse_string(text).body)

        Prints:

        .. testoutput::

            More info at the pyparsing wiki page
        """
        return pyparsing_common._html_stripper.transform_string(tokens[0])

    _commasepitem = (
        Combine(
            OneOrMore(
                ~Literal(",")
                + ~LineEnd()
                + Word(printables, exclude_chars=",")
                + Opt(White(" \t") + ~FollowedBy(LineEnd() | ","))
            )
        )
        .streamline()
        .set_name("commaItem")
    )
    comma_separated_list = DelimitedList(
        Opt(quoted_string.copy() | _commasepitem, default="")
    ).set_name("comma separated list")
    """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""

    @staticmethod
    def upcase_tokens(s, l, t):
        """Parse action to convert tokens to upper case."""
        return [tt.upper() for tt in t]

    @staticmethod
    def downcase_tokens(s, l, t):
        """Parse action to convert tokens to lower case."""
        return [tt.lower() for tt in t]

    # fmt: off
    url = Regex(
        # https://mathiasbynens.be/demo/url-regex
        # https://gist.github.com/dperini/729294
        r"(?P<url>"
        # protocol identifier (optional)
        # short syntax // still required
        r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)"
        # user:pass BasicAuth (optional)
        r"(?:(?P<auth>\S+(?::\S*)?)@)?"
        r"(?P<host>"
        # IP address exclusion
        # private & local networks
        r"(?!(?:10|127)(?:\.\d{1,3}){3})"
        r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
        r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
        # IP address dotted notation octets
        # excludes loopback network 0.0.0.0
        # excludes reserved space >= 224.0.0.0
        # excludes network & broadcast addresses
        # (first & last IP address of each class)
        r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
        r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
        r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
        r"|"
        # host & domain names, may end with dot
        # can be replaced by a shortest alternative
        # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
        r"(?:"
        r"(?:"
        r"[a-z0-9\u00a1-\uffff]"
        r"[a-z0-9\u00a1-\uffff_-]{0,62}"
        r")?"
        r"[a-z0-9\u00a1-\uffff]\."
        r")+"
        # TLD identifier name, may end with dot
        r"(?:[a-z\u00a1-\uffff]{2,}\.?)"
        r")"
        # port number (optional)
        r"(:(?P<port>\d{2,5}))?"
        # resource path (optional)
        r"(?P<path>\/[^?# ]*)?"
        # query string (optional)
        r"(\?(?P<query>[^#]*))?"
        # fragment (optional)
        r"(#(?P<fragment>\S*))?"
        r")"
    ).set_name("url")
    """
    URL (http/https/ftp scheme)

    .. versionchanged:: 3.1.0
       ``url`` named group added
    """
    # fmt: on

    # pre-PEP8 compatibility names
    # fmt: off
    convertToInteger = staticmethod(replaced_by_pep8("convertToInteger", convert_to_integer))
    convertToFloat = staticmethod(replaced_by_pep8("convertToFloat", convert_to_float))
    convertToDate = staticmethod(replaced_by_pep8("convertToDate", convert_to_date))
    convertToDatetime = staticmethod(replaced_by_pep8("convertToDatetime", convert_to_datetime))
    stripHTMLTags = staticmethod(replaced_by_pep8("stripHTMLTags", strip_html_tags))
    upcaseTokens = staticmethod(replaced_by_pep8("upcaseTokens", upcase_tokens))
    downcaseTokens = staticmethod(replaced_by_pep8("downcaseTokens", downcase_tokens))
    # fmt: on


# every ParserElement defined directly on the namespace class above
_builtin_exprs = [
    v for v in vars(pyparsing_common).values() if isinstance(v, ParserElement)
]