This is a lexical analyzer for a very reduced form of a language that I designed myself, with all its lexical rules and components. One fact worth mentioning is that I made my life easier by not performing look-ahead for operators such as ==, <=, >=, etc.; the language also has no functions, classes, etc. It parses a text file in which a basic program is written.
The following methods are language-related; you can figure out what my language looks like from them.
//Adds operators to list
/**
 * Appends the language's operators to {@code operators}: the symbols
 * {@code <}, {@code >}, {@code =} and the word forms
 * {@code lt}, {@code gt}, {@code equals}, {@code plus}, {@code minus}.
 */
private void getOperators(){
    final String[] operatorTokens = {
        "<", ">",
        "lt", "gt",
        "=", "equals",
        "plus", "minus"
    };
    for (String operatorToken : operatorTokens) {
        operators.add(operatorToken);
    }
}
//Adds reserved words to list
/**
 * Appends the language's reserved words to {@code reservedWords}.
 * The I/O words are stored together with their operator, i.e. as
 * {@code cin>>} and {@code cout<<}, not as {@code cin} / {@code cout}.
 */
private void getReservedWords(){
    final String[] words = {
        "if", "else", "while",
        "int", "char", "string",
        "main", "cin>>", "cout<<"
    };
    for (String word : words) {
        reservedWords.add(word);
    }
}
//Adds separators to list
/**
 * Appends the language's separators to {@code sepList}:
 * parentheses, comma, semicolon and square brackets.
 */
private void getSeparators(){
    final String[] separators = {"(", ")", ",", ";", "[", "]"};
    for (String separator : separators) {
        sepList.add(separator);
    }
}
//Adds codification to list
/**
 * Appends the codification table to {@code codification}.
 *
 * <p>The order matters: {@code classify} uses the index of an entry in this
 * list as the code of the corresponding token, with index 0 standing for
 * identifiers and index 1 for constants.
 */
private void getCodification(){
    final String[] codes = {
        // token classes
        "identifier", "constant",
        // reserved words
        "main", "int", "char", "string", "cout<<", "cin>>", "while", "if", "else",
        // separators
        ";", ",", "[", "]", "(", ")",
        // operators
        "plus", "minus", "<", ">", "=", "lt", "gt", "equals"
    };
    for (String code : codes) {
        codification.add(code);
    }
}
And now for the scanner part: it reads the file line by line, breaks each line into tokens, and then decides whether each token is a reserved word, operator, separator, identifier or constant, adding it to the PIF/ST accordingly, or reports an error. The regexes should tell you about the language constraints.
//Bubble sort for SymbolTable type elements in a list, alphabetically after token
//Yes bubble sort.
/* Note this was a particular requirement, it could have been done way more elegantly, but not when one is out of time. */
/**
 * Sorts symbol-table entries in place, ascending by token as ordered by
 * {@code compareTo} on the tokens, using bubble sort.
 *
 * <p>Entries are exchanged by reference, so after the call the list holds
 * the very same objects, only reordered. Sorting stops as soon as a full
 * pass performs no exchange.
 *
 * @param list the entries to sort; modified in place
 */
public void sortST(List<SymbolTable> list){
    // Everything at index >= unsortedEnd is already in its final place.
    int unsortedEnd = list.size();
    boolean swapped = true;
    while (swapped && unsortedEnd > 1) {
        swapped = false;
        for (int j = 1; j < unsortedEnd; j++) {
            SymbolTable left = list.get(j - 1);
            SymbolTable right = list.get(j);
            if (left.getToken().compareTo(right.getToken()) > 0) {
                list.set(j - 1, right);
                list.set(j, left);
                swapped = true;
            }
        }
        // The largest remaining token has bubbled to the end of the range.
        unsortedEnd--;
    }
}
//Adds token in PIF or SymbolTable
/**
 * Records one token in the PIF ({@code pif}), first registering it in the
 * matching symbol table when it is a constant or identifier not seen before.
 *
 * <ul>
 *   <li>{@code nr == 0}: the token is looked up in {@code codification};
 *       if found, an entry (index in {@code codification}, 0) is added to the
 *       PIF. A token that is not in the table is ignored.</li>
 *   <li>{@code nr == 1}: constant, stored in {@code symbolTableC}; the PIF
 *       entry is (1, position in the table).</li>
 *   <li>{@code nr == 2}: identifier, stored in {@code symbolTableI}; the PIF
 *       entry is (0, position in the table).</li>
 * </ul>
 * Any other value of {@code nr} does nothing.
 *
 * @param token the token text
 * @param nr    the token kind as listed above
 */
public void classify(String token, Integer nr){
    if (nr == 0) {
        for (int code = 0; code < codification.size(); code++) {
            if (codification.get(code).equals(token)) {
                pif.add(new Classification(code, nr));
                break;
            }
        }
        return;
    }
    if (nr == 1) {
        recordSymbol(symbolTableC, token, 1);
        return;
    }
    if (nr == 2) {
        recordSymbol(symbolTableI, token, 0);
    }
}

/**
 * Adds a PIF entry (code, position) for a symbol, appending the symbol to
 * the given table when it is not there yet.
 *
 * <p>A new symbol gets the highest position currently in the table plus one
 * (1 for an empty table). Using the highest position rather than the
 * position of the last element keeps positions unique even if the table has
 * been reordered, e.g. by {@code sortST}.
 *
 * @param table the symbol table to search and, if needed, extend
 * @param token the symbol text
 * @param code  the code stored in the PIF entry
 */
private void recordSymbol(List<SymbolTable> table, String token, int code){
    int highestPosition = 0;
    for (SymbolTable entry : table) {
        if (entry.getToken().equals(token)) {
            pif.add(new Classification(code, entry.getPosition()));
            return;
        }
        int entryPosition = entry.getPosition();
        if (entryPosition > highestPosition) {
            highestPosition = entryPosition;
        }
    }
    int newPosition = highestPosition + 1;
    table.add(new SymbolTable(token, newPosition));
    pif.add(new Classification(code, newPosition));
}
//Algorithm that determines if token is op/resW/sep/constant/identifier or error
/**
 * Scans {@code this.file} line by line, splits every line into tokens and
 * passes each token to {@link #classify(String, Integer)} as a reserved
 * word/separator/operator (kind 0), a constant (kind 1) or an identifier
 * (kind 2).
 *
 * <p>A stand-alone {@code -} or {@code +} token is kept as a pending sign
 * and prepended to the next token if that token is an unsigned number
 * without a leading zero. The pending sign is cleared after every other
 * token, so it can never attach to a number further down the input; a sign
 * followed by anything else is discarded.
 *
 * <p>I/O problems are reported on {@code System.err} and end the scan; the
 * reader is closed in every case.
 *
 * @throws Exception if a token is neither a known word/symbol, a constant,
 *         nor an identifier shorter than 250 characters; the message names
 *         the token and its 1-based line number
 */
public void scanAlg() throws Exception{
    Pattern constants = Pattern.compile("^-\\d+?|\\d+?|\\+\\d+?|'\\w'|\"\\w+?\"|\"\\s+?\"|''|' '$");
    Pattern identifiers = Pattern.compile("^[a-zA-Z]+?|[a-zA-Z]\\d+?$");
    // One non-zero digit followed by any number of digits. The former
    // "\\d+?" demanded a second digit, so a single digit never took its sign.
    Pattern costSpecial = Pattern.compile("^[1-9]\\d*$");

    BufferedReader br = null;
    try{
        br = new BufferedReader(new FileReader(this.file));
        int counter = 0;
        String pendingSign = "";
        String line;
        while((line = br.readLine()) != null){
            counter++;
            for(String token : splitLine(line)){
                if(token.equals(" ")) continue;
                if(reservedWords.contains(token) || sepList.contains(token) || operators.contains(token)){
                    pendingSign = "";
                    classify(token, 0);
                    continue;
                }
                if(token.equals("-") || token.equals("+")){
                    pendingSign = token;
                    continue;
                }
                // Consume the pending sign now so it cannot leak to a later token.
                String sign = pendingSign;
                pendingSign = "";
                if(costSpecial.matcher(token).matches()){
                    classify(sign.concat(token), 1);
                    continue;
                }
                if(constants.matcher(token).matches()){
                    classify(token, 1);
                    continue;
                }
                if(identifiers.matcher(token).matches() && token.length() < 250){
                    classify(token, 2);
                    continue;
                }
                throw new Exception("Error token: " + token + " on line " + counter);
            }
        }
    } catch (IOException e){
        System.err.println(e.getMessage());
    } catch (NumberFormatException e) {
        System.err.println(e.getMessage());
    } catch (NoSuchElementException e){
        System.err.println(e.getMessage());
    } finally {
        // Also reached when the error-token exception above propagates.
        if(br != null){
            try{
                br.close();
            } catch (IOException e){
                System.err.println(e.getMessage());
            }
        }
    }
}

/**
 * Splits one source line on the characters {@code (),;=><[]} and space,
 * keeping each delimiter as a token of its own.
 *
 * <p>Because {@code >} and {@code <} are delimiters, a reserved word such as
 * {@code cin>>} or {@code cout<<} comes out of the tokenizer in three
 * pieces. Whenever the last two tokens plus the current piece spell an entry
 * of {@code reservedWords}, the three are merged back into that one token.
 *
 * @param line the source line to split
 * @return the tokens of the line, in order, single spaces included
 */
private List<String> splitLine(String line){
    List<String> tokens = new java.util.ArrayList<String>();
    StringTokenizer pieces = new StringTokenizer(line, "(),;=><[] ", true);
    while(pieces.hasMoreTokens()){
        String piece = pieces.nextToken();
        int count = tokens.size();
        if(count >= 2){
            String joined = tokens.get(count - 2) + tokens.get(count - 1) + piece;
            if(reservedWords.contains(joined)){
                tokens.remove(count - 1);
                tokens.set(count - 2, joined);
                continue;
            }
        }
        tokens.add(piece);
    }
    return tokens;
}
Any comments, suggestions, or ways to improve the code are welcome. (Well, except for the bubble sort part.)