I have a regex-based lexical analyzer with a Token class:
# -*- coding: utf-8 -*-
import re
import sys

#--------------------------------------------------
# Lexer Start
#--------------------------------------------------


class Token(object):
    def __init__(self, type, value):
        self.type = type
        self.value = value

    def __str__(self):
        return 'Token({type},{value})'.format(
            type=self.type,
            value=repr(self.value)
        )

    def __repr__(self):
        return self.__str__()

#------------------------------------------------------
token_exprs = [
    (r'[ \n]+', 'None'),   # whitespace (note: the string 'None', not the None object)
    (r'#[^\n]*', 'None'),  # comments
    (r'\:=', 'RESERVED'),
    (r'\(', 'LPAREN'),
    (r'\)', 'RPAREN'),
    (r';', 'RESERVED'),
    (r'\+', 'PLUS'),
    (r'\.', 'DOT'),
    (r'SUM', 'FUNC'),
    (r',', 'DELIMARGS'),
    (r'-', 'MINUS'),
    (r'\*', 'OP'),
    (r'\^', 'OP'),
    (r'/', 'OP'),
    (r'<=', 'RESERVED'),
    (r'<', 'RESERVED'),
    (r'>=', 'RESERVED'),
    (r'>', 'RESERVED'),
    (r'!=', 'RESERVED'),
    (r'=', 'RESERVED'),
    (r'and', 'RESERVED'),
    (r'or', 'RESERVED'),
    (r'not', 'RESERVED'),
    (r'if', 'RESERVED'),
    (r'then', 'RESERVED'),
    (r'else', 'RESERVED'),
    (r'while', 'RESERVED'),
    (r'do', 'RESERVED'),
    (r'end', 'RESERVED'),
    (r'\d+(\.\d*)?', 'NUM'),
    (r'[a-z]+', 'ID'),
]
#--------------------------------------------------


class Lexer(object):
    def __init__(self, characters, token_exprs):
        self.pos = 0
        self.token_exprs = token_exprs
        self.characters = characters

    def get_next_token(self):
        while self.pos < len(self.characters):  # pos == 0 on the first call
            match = None
            for token_expr in self.token_exprs:
                pattern, tag = token_expr  # e.g. pattern = r'do', tag = 'RESERVED'
                regex = re.compile(pattern)
                match = regex.match(self.characters, self.pos)
                if match:  # e.g. <_sre.SRE_Match object; span=(0, 2), match='do'>
                    text = match.group(0)  # e.g. text = 'do'
                    if text is not None and text == ' ':  # my attempt to skip spaces
                        self.pos += 1
                    if tag:
                        self.pos = match.end()
                    break
            if not match:
                sys.stderr.write('Illegal character: %s\n' % self.characters[self.pos])
                sys.exit(1)
            # else:
            #     self.pos = match.end()
            return Token(tag, text)
#--------------------------------------------------
# Lexer End
#--------------------------------------------------

lexer = Lexer('1 11', token_exprs)
print(lexer.get_next_token())
print(lexer.get_next_token())
print(lexer.get_next_token())
print(lexer.get_next_token())

The output is:
Token(NUM,'1')
Token(None,' ')
Token(NUM,'11')
None

I need to exclude Token(None,' '). I tried to handle it with the check I added above:
if text is not None and text == ' ':
    self.pos += 1

but it does not work. What do I need to do?
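A minimal sketch of one possible fix, assuming the intent is that whitespace and comment matches should be consumed silently rather than returned: always advance self.pos past the match, and only return a Token when the tag marks a real token; otherwise keep scanning. Note that token_exprs tags the ignorable patterns with the string 'None', not the None object, so a bare "if tag:" test is always truthy; either store the actual None object in the table or compare against the string, as this sketch does. It is a drop-in replacement for get_next_token in the Lexer class above:

    def get_next_token(self):
        while self.pos < len(self.characters):
            match = None
            for pattern, tag in self.token_exprs:
                match = re.compile(pattern).match(self.characters, self.pos)
                if match:
                    text = match.group(0)
                    self.pos = match.end()     # always consume the matched text
                    if tag and tag != 'None':  # a real token: return it
                        return Token(tag, text)
                    break  # whitespace/comment: restart the scan at the new pos
            if not match:
                sys.stderr.write('Illegal character: %s\n' % self.characters[self.pos])
                sys.exit(1)
        return None  # end of input

With this version the four print calls above should produce Token(NUM,'1'), Token(NUM,'11'), None, None: the space is consumed on the way to '11' instead of being returned as a token.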