"""pytokens.py - specific enhancements to tokenize

Not a standalone program; written specifically for import.

Interesting symbols:
    Parse(infile)        returns a list of tuples of
                         (TOKENNUMBER, STRING, STARTROW, STARTCOL, ENDROW, ENDCOL)
    gTokenNames[]        strings for token numbers
    gNTokens             len(gTokenNames)
    gReservedWords{}     dictionary of reserved word : token number
    gReservedWordList[]  simple list of reserved words
"""

# Specifically uses Python/Lib token, tokenize, and keyword;
# some of what is used isn't part of the 'published interface'.
# ... (imports and global initializers elided)

# this is a global because it might be handy from elsewhere
# dictionary for lookup speed
# token numbers represent 'RESERVED+VALUE' for tagging reserved words,
# instead of explicitly breaking out the reserved words above
# 'self' isn't reserved, but it's in here anyway (?)
# values in the initializer are bogus placeholders

def __fixupReserved():                       # def line reconstructed from the call below
    global NEXT_TOKEN, gTokenNames, gReservedWords, gReservedWordList
    gReservedWordList = kwlist[:]            # copy
    gReservedWordList.append('self')         # debatable...
    gReservedWordList.sort()
    g = globals()
    for r in gReservedWordList:
        gTokenNames[NEXT_TOKEN] = r          # add reserved word to token list
        gReservedWords[r] = NEXT_TOKEN       # words[word] = token number
        x = 'TOKEN_' + r + '=' + str(NEXT_TOKEN)
        exec x in g                          # Python 2 exec: defines TOKEN_<word>
        NEXT_TOKEN = NEXT_TOKEN + 1          # one more token

__fixupReserved()


class parsefile:
    """Handle missing WHITE tokens by tracking the raw input;
    also provide an equivalent of readlines for reading raw input."""

    def __call__(self, infile, constructor=None):
        """constructor: build instances of a class instead of tuples only"""
        # ... (body elided)

    def through(self, row, col):
        # (signature reconstructed from the self.through(...) calls below)
        # Return, and consume, raw input from the current position up to
        # but not including (row, col).
        row = row - 1                        # off by 1 (tokenize rows are 1-based)
        ll = ""
        if row < self.crow:
            return ll
        if row == self.crow and col <= self.ccol:
            return ll
        r = self.crow
        while r <= row:
            if r >= len(self.lines):
                return ll
            l = self.lines[r]
            if r == row:
                l = l[:col]
            if r == self.crow:
                l = l[self.ccol:]
            ll = ll + l
            r = r + 1
        self.crow = row
        self.ccol = col
        return ll

    def tokenizeCB(self, type, token, (srow, scol), (erow, ecol), line):
        # callback for tokenize; Python 2 tuple parameters
        cr = self.crow + 1                   # start of any skipped text, 1-based row
        cc = self.ccol
        white = self.through(srow, scol)     # raw input skipped before this token
        k = self.constructor
        res = self.fResult

        if white:
            cr = srow                        # funkiness in tokenize...
            if k:
                res.append(k(WHITE, white, cr, cc, srow, scol))
            else:
                res.append((WHITE, white, cr, cc, srow, scol))

        t = self.through(erow, ecol)         # advance past the token (value unused)
        if type == NAME:
            w = gReservedWords.get(token)
            if w:
                type = w                     # retag reserved words

        if k:
            res.append(k(type, token, srow, scol, erow, ecol))
        else:
            res.append((type, token, srow, scol, erow, ecol))

    # ... (rest of class elided)


def Parse(infile, k=None):
    # ... (body elided)


# command line invocation; testing only
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        files = sys.argv[1:]
        for fileName in files:
            p = Parse(open(fileName, 'r'))
            for t in p:
                if 0:                        # standard test: input == output
                    sys.stdout.write(t[1])
                if 1:                        # alternative output
                    print (gTokenNames[t[0]], t[1], t[2], t[3], t[4], t[5])
    else:
        print __doc__
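
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the reserved-word
# tagging that __fixupReserved performs, written without 'exec' by assigning
# into globals().  The starting token number and the '<unused>' placeholder
# are assumptions; the module's own initializers are elided above.

import token as _token
from keyword import kwlist as _kwlist

_gTokenNames = [_token.tok_name.get(_i, '<unused>')   # names indexed by token number
                for _i in range(_token.N_TOKENS)]
_gReservedWords = {}                     # word -> token number, for lookup speed
_next = len(_gTokenNames)                # assumed first free token number

for _w in sorted(_kwlist + ['self']):    # 'self' is not reserved, but tag it anyway
    _gTokenNames.append(_w)              # token number _next names this word
    _gReservedWords[_w] = _next
    globals()['TOKEN_' + _w] = _next     # e.g. TOKEN_def, TOKEN_while
    _next = _next + 1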
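
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the bookkeeping that
# parsefile.through does by hand - returning the raw input skipped between the
# previous token's end and the next token's start - written against the
# modern tokenize module (Python 3).  _WHITE is this sketch's own pseudo-token
# number, standing in for the module's WHITE.

import io
import tokenize as _tokenize

_WHITE = -1                              # pseudo-token number for skipped text

def _tokens_with_white(source):
    """Yield (type, string, start, end), inserting _WHITE entries for gaps."""
    lines = source.splitlines(True)
    starts = [0]                         # absolute offset where each row begins
    for _ln in lines:
        starts.append(starts[-1] + len(_ln))
    prev = (1, 0)                        # tokenize rows are 1-based
    for tok in _tokenize.generate_tokens(io.StringIO(source).readline):
        gap = source[starts[prev[0] - 1] + prev[1]:
                     starts[tok.start[0] - 1] + tok.start[1]]
        if gap:                          # raw input was skipped: emit it first
            yield (_WHITE, gap, prev, tok.start)
        yield (tok.type, tok.string, tok.start, tok.end)
        prev = tok.end

for _t in _tokens_with_white("x = 1  # comment\n"):
    print(_t)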
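
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): any callable taking
# the six fields tokenizeCB passes can serve as the optional constructor, so
# callers can get objects instead of bare tuples.  This Token class is
# hypothetical, and it assumes Parse forwards its 'k' argument as the
# constructor.

class Token:
    def __init__(self, type, string, srow, scol, erow, ecol):
        self.type = type                 # token number (possibly a TOKEN_<word>)
        self.string = string             # raw text of the token
        self.start = (srow, scol)
        self.end = (erow, ecol)
    def __repr__(self):
        return 'Token(%d, %r, %r-%r)' % (self.type, self.string,
                                         self.start, self.end)

# usage, under the assumption above:
#     toks = Parse(open('some.py', 'r'), Token)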