View Javadoc

1   /**
2    * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3    * @author Zev Blut zb@ubit.com
4    * @author Romain PELISSE belaran@gmail.com
5    */
6   package net.sourceforge.pmd.cpd;
7   
8   import java.util.List;
9   
10  public abstract class AbstractTokenizer implements Tokenizer {
11  
12  	//FIXME depending on subclasses to assign local vars is rather fragile - better to make private and setup via explicit hook methods
13  	
14  	protected List<String> stringToken;		    // List<String>, should be set by sub classes
15  	protected List<String> ignorableCharacter;  // List<String>, should be set by sub classes
16  												// FIXME:Maybe an array of 'char' would be better for performance ?
17  	protected List<String> ignorableStmt; 		// List<String>, should be set by sub classes
18  	protected char oneLineCommentChar = '#'; // Most script languages ( shell, ruby, python,...) use this symbol for comment line
19  
20  	private List<String> code;
21  	private int lineNumber = 0;
22  	private String currentLine;
23  
24  	protected boolean spanMultipleLinesString = true;	// Most languages do, so default is true
25  
26  	private boolean downcaseString = true;
27  
28      public void tokenize(SourceCode tokens, Tokens tokenEntries) {
29          code = tokens.getCode();
30  
31          for ( lineNumber = 0; lineNumber < code.size(); lineNumber++ ) {
32          	currentLine = code.get(lineNumber);
33              int loc = 0;
34              while ( loc < currentLine.length() ) {
35                  StringBuilder token = new StringBuilder();
36                  loc = getTokenFromLine(token,loc);
37                  if (token.length() > 0 && !isIgnorableString(token.toString())) {
38                      if (downcaseString) {
39                          token = new StringBuilder(token.toString().toLowerCase());
40                      }
41  // need to re-think how to link this                    
42  //                    if ( CPD.debugEnable ) {
43  //                    	System.out.println("Token added:" + token.toString());
44  //                    }
45                      tokenEntries.add(new TokenEntry(token.toString(),
46                              tokens.getFileName(),
47                              lineNumber)
48                      		);
49  
50                  }
51              }
52          }
53          tokenEntries.add(TokenEntry.getEOF());
54      }
55  
56      private int getTokenFromLine(StringBuilder token, int loc) {
57          for (int j = loc; j < currentLine.length(); j++) {
58              char tok = currentLine.charAt(j);
59              if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
60                  if (isComment(tok)) {
61                      if (token.length() > 0) {
62                          return j;
63                      } else {
64                          return getCommentToken(token, loc);
65                      }
66                  } else if (isString(tok)) {
67                      if (token.length() > 0) {
68                          return j; // we need to now parse the string as a separate token.
69                      } else {
70                          // we are at the start of a string
71                          return parseString(token, j, tok);
72                      }
73                  } else {
74                      token.append(tok);
75                  }
76              } else {
77                  if (token.length() > 0) {
78                      return j;
79                  }
80              }
81              loc = j;
82          }
83          return loc + 1;
84      }
85  
86      private int parseString(StringBuilder token, int loc, char stringDelimiter) {
87          boolean escaped = false;
88          boolean done = false;
89          char tok = ' '; // this will be replaced.
90          while ((loc < currentLine.length()) && ! done) {
91              tok = currentLine.charAt(loc);
92              if (escaped && tok == stringDelimiter) { // Found an escaped string
93                  escaped = false;
94              } else if (tok == stringDelimiter && (token.length() > 0)) { // We are done, we found the end of the string...
95                  done = true;
96              } else if (tok == '\\') { // Found an escaped char
97                  escaped = true;
98              } else {	// Adding char...
99                  escaped = false;
100             }
101             //Adding char to String:" + token.toString());
102             token.append(tok);
103             loc++;
104         }
105         // Handling multiple lines string
106         if ( 	! done &&	// ... we didn't find the end of the string
107         		loc >= currentLine.length() && // ... we have reach the end of the line ( the String is incomplete, for the moment at least)
108         		spanMultipleLinesString && // ... the language allow multiple line span Strings
109         		lineNumber < code.size() - 1 // ... there is still more lines to parse
110         	) {
111         	// parsing new line
112         	currentLine = code.get(++lineNumber);
113         	// Warning : recursive call !
114         	loc = parseString(token, loc, stringDelimiter);
115         }
116         return loc + 1;
117     }
118 
119     private boolean ignoreCharacter(char tok)
120     {
121     	return ignorableCharacter.contains(String.valueOf(tok));
122     }
123 
124     private boolean isString(char tok)
125     {
126     	return stringToken.contains(String.valueOf(tok));
127     }
128 
129     private boolean isComment(char tok)
130     {
131         return tok == oneLineCommentChar;
132     }
133 
134     private int getCommentToken(StringBuilder token, int loc)
135     {
136         while (loc < currentLine.length())
137         {
138             token.append(currentLine.charAt(loc++));
139         }
140         return loc;
141     }
142 
143     private boolean isIgnorableString(String token)
144     {
145     	return ignorableStmt.contains(token);
146     }
147 }