pytokens.py - specific enhancements to tokenize

Not a standalone program; meant to be imported.

Interesting symbols:
        Parse(infile) returns a list of tuples of
            (TOKENNUMBER, STRING, STARTROW, STARTCOL, ENDROW, ENDCOL);
            see the usage sketch after this list
        gTokenNames[] strings for token numbers
        gNTokens len(gTokenNames)
        gReservedWords{} dictionary of reserved words : token number
        gReservedWordList[] simple list of reserved words
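
For example, a minimal usage sketch (the file name is hypothetical; note that
Parse() closes the file itself):

    import pytokens
    f = open('example.py', 'r')                  # hypothetical input file
    tokens = pytokens.Parse(f)                   # Parse() closes f when it is done
    for type, text, srow, scol, erow, ecol in tokens:
        print pytokens.gTokenNames[type], repr(text), srow, scol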

Uses the standard library modules token, tokenize, and keyword;
some of what is used isn't part of the 'published interface'.
    import tokenize
    from token import *
    from tokenize import tokenize, tok_name, NL, COMMENT
    from keyword import kwlist
A couple of additions to the token numbering from tokenize:
    gTokenNames = tok_name      # note: this aliases (doesn't copy) tokenize.tok_name

    WHITE    = NL + 1
    RESERVED = WHITE + 1
    gTokenNames[WHITE]    = 'WHITE'
    gTokenNames[RESERVED] = 'RESERVED'
    NEXT_TOKEN = RESERVED + 1
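
A quick interactive check of the extended numbering (the exact numbers vary
with the Python version, so only the names are shown):

    >>> import pytokens
    >>> pytokens.gTokenNames[pytokens.WHITE], pytokens.gTokenNames[pytokens.RESERVED]
    ('WHITE', 'RESERVED')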

These are globals because they might be handy from elsewhere.
gReservedWords is a dictionary for lookup speed; its values are token numbers
above RESERVED, which tags reserved words without explicitly breaking each one
out as a constant above.
'self' isn't actually reserved, but it's included anyway (debatable).
Both containers start out empty and are filled in by __fixupReserved() at import time.
    gReservedWordList = []

    gReservedWords = {}

    def __fixupReserved():
        """Add the reserved words to gTokenNames at import time - don't call twice!"""
        global NEXT_TOKEN, gTokenNames, gReservedWords, gReservedWordList
        gReservedWordList = kwlist[:]       # copy
        gReservedWordList.append('self')    # debatable...
        gReservedWordList.sort()
        g = globals()
        for r in gReservedWordList:
            gTokenNames[NEXT_TOKEN] = r     # add reserved word to token list
            gReservedWords[r] = NEXT_TOKEN  # words[word] = token number
            x = 'TOKEN_' + r + '=' + str(NEXT_TOKEN)
            exec x in g                     # define module-level TOKEN_<word> constants
            NEXT_TOKEN = NEXT_TOKEN + 1     # one more token

    __fixupReserved()
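
After __fixupReserved() runs at import time, every reserved word has a token
number and a TOKEN_* constant, e.g. (interactive sketch; the actual numbers
depend on the Python version):

    >>> import pytokens
    >>> pytokens.gReservedWords['def'] == pytokens.TOKEN_def
    True
    >>> pytokens.gTokenNames[pytokens.TOKEN_def]
    'def'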

    class parsefile:
        """Manage an input file.

        Synthesizes the WHITE tokens (which tokenize itself doesn't produce) by
        tracking the raw input, and provides a readline() equivalent for feeding
        that raw input to tokenize.
        """

        def __call__(self, infile, constructor=None):
            """Tokenize the raw input file.

            constructor, if given, builds each token from the six fields;
            otherwise plain tuples are produced.
            """
            self.lines = infile.readlines()
            infile.close()
            self.crow  = 0
            self.ccol  = 0
            self.cline = 0
            self.fResult = []
            self.constructor = constructor
            tokenize(self.readline, self.tokenizeCB)
            return self.fResult

        def readline(self):
            """readline-style callback handed to tokenize."""
            if self.cline < len(self.lines):
                l = self.lines[self.cline]
                self.cline = self.cline + 1
                return l
            return ""

        def skip(self, toRow, toCol):   # NIU?
            """In case we need to explicitly skip something."""
            self.crow = toRow - 1
            self.ccol = toCol

        def through(self, row, col):
            """Return the raw input from the current position through (row, col);
            the result may span multiple lines."""
            row = row - 1               # tokenize rows are 1-based, self.lines is 0-based
            ll = ""
            if row < self.crow:
                return ll
            if row == self.crow and col <= self.ccol:
                return ll
            r = self.crow
            while r <= row:
                if r >= len(self.lines):
                    return ll
                l = self.lines[r]
                if r == row:
                    l = l[:col]
                if r == self.crow:
                    l = l[self.ccol:]
                ll = ll + l
                r = r + 1
            self.crow = row
            self.ccol = col
            return ll
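
For example, a worked sketch of how tokenizeCB (below) uses through() on a
hypothetical one-line input:

    # p.lines == ['x = 1\n'], current position at row 1, col 0
    # p.through(1, 1)  ->  'x'    the NAME token's text; position moves to col 1
    # p.through(1, 2)  ->  ' '    the WHITE text sitting in front of the '=' token
    # p.through(1, 3)  ->  '='    the OP token's text; position moves to col 3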

        def tokenizeCB(self, type, token, (srow, scol), (erow, ecol), line):
            """tokenize calls us back on this method for every token."""
            cr = self.crow + 1
            cc = self.ccol
            white = self.through(srow, scol)
            k = self.constructor
            res = self.fResult

            if white:
                cr = srow   # funkiness in tokenize...
                if k:
                    res.append( k(WHITE, white, cr, cc, srow, scol) )
                else:
                    res.append( (WHITE, white, cr, cc, srow, scol) )

            self.through(erow, ecol)        # advance the raw-input position past this token
            if type == NAME:
                w = gReservedWords.get(token)
                if w:
                    type = w                # retag reserved words (and 'self')

            if k:
                res.append( k(type, token, srow, scol, erow, ecol) )
            else:
                res.append( (type, token, srow, scol, erow, ecol) )
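
Because the WHITE tokens carry the text that tokenize skips over, concatenating
every token string reproduces the original source; that is what the
"input == output" branch of the test driver below checks. A sketch, using
Parse() defined just below (the file name is hypothetical):

    tokens = Parse(open('example.py', 'r'))
    rebuilt = ''.join([t[1] for t in tokens])   # WHITE tokens fill the gaps between real tokens
    # rebuilt should match the original file's contents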

    def Parse(infile, k=None):
        """Return a list of token tuples from infile; k, if given, is used as the
        constructor for each token instead of building a tuple."""
        p = parsefile()
        return p(infile, k)
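
For example, a hypothetical Token class can be handed in as k so the result is
a list of objects rather than tuples (module and file names are assumptions):

    from pytokens import Parse, gTokenNames

    class Token:
        # hypothetical wrapper; Parse() calls it with the same six fields as the tuples
        def __init__(self, type, text, srow, scol, erow, ecol):
            self.type, self.text = type, text
            self.srow, self.scol, self.erow, self.ecol = srow, scol, erow, ecol
        def name(self):
            return gTokenNames[self.type]

    tokens = Parse(open('example.py', 'r'), Token)   # hypothetical input file
    for tok in tokens:
        print tok.name(), repr(tok.text)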

Command-line invocation; for testing only.
    if __name__ == '__main__':
        import sys
        if len(sys.argv) > 1:
            files = sys.argv[1:]
            for fileName in files:
                p = Parse(open(fileName, 'r'))
                for t in p:
                    if 0:   # standard test: input == output
                        sys.stdout.write(t[1])
                    if 1:   # alternative output
                        print (gTokenNames[t[0]], t[1], t[2], t[3], t[4], t[5])
        else:
            print __doc__
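
A sample invocation, assuming a hypothetical example.py whose first line is
'x = 1' (the exact coordinates and token names depend on the Python version):

    $ python pytokens.py example.py
    ('NAME', 'x', 1, 0, 1, 1)
    ('WHITE', ' ', 1, 1, 1, 2)
    ('OP', '=', 1, 2, 1, 3)
    ...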