Groups | Search | Server Info | Keyboard shortcuts | Login | Register [http] [https] [nntp] [nntps]


Groups > comp.lang.python > #197602

valid_identifiers

From Lawrence D’Oliveiro <ldo@nz.invalid>
Newsgroups comp.lang.python
Subject valid_identifiers
Date 2025-12-17 02:11 +0000
Organization A noiseless patient Spider
Message-ID <10ht3ge$36mjt$1@dont-email.me> (permalink)

Show all headers | View raw


#!/usr/bin/python3
#+
# Which characters are valid in identifiers?
# See details at <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
#-

import sys
import unicodedata as ud

class SeqRuns :
    """Collapses a sequence of integers into runs of consecutive values.

    A run is an inclusive (start, end) tuple covering elements that each
    exceed their predecessor by exactly one. The runs are stored, in the
    order encountered, in the `runs` attribute. Iterating over an instance
    yields those tuples and len() gives the number of runs.
    """

    def __init__(self, seq) :
        """Builds the list of runs from seq, an iterable of integers."""
        runs = []
        start = end = None
        for elt in seq :
            if start is not None and elt != end + 1 :
                # elt does not continue the current run, so close it off
                runs.append((start, end))
                start = None
            #end if
            if start is None :
                start = elt
            #end if
            end = elt
        #end for
        if start is not None :
            # flush the run that was still open when the input ran out
            runs.append((start, end))
        #end if
        self.runs = runs
    #end __init__

    def nrelts(self) :
        """Returns the total number of elements covered by all the runs."""
        # both bounds of a run are inclusive, hence the +1 per run
        return sum(end - start + 1 for start, end in self.runs)
    #end nrelts

    def __len__(self) :
        """Returns the number of runs (not the number of elements)."""
        return len(self.runs)
    #end __len__

    def __iter__(self) :
        """Iterates over the (start, end) run tuples."""
        return iter(self.runs)
    #end __iter__

#end SeqRuns

# every code point that chr() accepts
UNICODE_RANGE = range(sys.maxunicode + 1)
# special cases from <https://www.unicode.org/Public/13.0.0/ucd/PropList.txt>
OTHER_ID_START = {0x1885, 0x1886, 0x2118, 0x212E, 0x309B, 0x309C}
  # Other_ID_Start
OTHER_ID_CONTINUE = \
  ( # Other_ID_Continue
        {0x00B7, 0x0387}
    |
        set(range(0x1369, 0x1371 + 1))
    |
        {0x19DA}
  )
ID_START_EXTRA = {ord("_")}

# Computed once up front: building this union inside the generator
# condition below would redo it for every code point in UNICODE_RANGE.
_ID_START_SPECIALS = OTHER_ID_START | ID_START_EXTRA
_ID_START_CATEGORIES = frozenset({"Lu", "Ll", "Lt", "Lm", "Lo", "Nl"})
_ID_CONTINUE_CATEGORIES = frozenset({"Mn", "Mc", "Nd", "Pc"})

# code points selected by general category, or listed explicitly above,
# as valid first characters of an identifier
ID_START = SeqRuns \
  (
    c for c in UNICODE_RANGE
    if
            ud.category(chr(c)) in _ID_START_CATEGORIES
        or
            c in _ID_START_SPECIALS
  )
# Code points selected by the continuation-only categories plus
# Other_ID_Continue. Note this does not add the ID_START characters,
# which the language reference also allows in continuation position.
ID_CONTINUE = SeqRuns \
  (
    c for c in UNICODE_RANGE
    if ud.category(chr(c)) in _ID_CONTINUE_CATEGORIES or c in OTHER_ID_CONTINUE
  )
# identifiers are compared according to NFKC normalization

# Report each character class as "<label>[<element count>]: {<runs>}".
for label, runs in (("start", ID_START), ("continue", ID_CONTINUE)) :
    # a one-element run is shown as a single code point,
    # anything longer as "first..last"
    formatted = \
        [
            "%#04X" % first if first == last else "%#04X..%#04X" % (first, last)
            for first, last in runs
        ]
    sys.stdout.write \
      (
        "%s[%d]: {%s}\n" % (label, runs.nrelts(), ", ".join(formatted))
      )
#end for

Back to comp.lang.python | Previous | Next | Find similar


Thread

valid_identifiers Lawrence D’Oliveiro <ldo@nz.invalid> - 2025-12-17 02:11 +0000

csiph-web