001    package aima.basic;
002    
003    import java.util.Enumeration;
004    import java.util.NoSuchElementException;
005    
006    /**
007     * Replacement for StringTokenizer in java.util, because of a bug in Sun's
008     * implementation.
009     * 
010     * @author <A HREF="mailto:moravek@pobox.sk">Peter Moravek </A>
011     */
public class Tokenizer implements Enumeration {

    /**
     * Default delimiters " \t\n\r\f": the space character, the tab character,
     * the newline character, the carriage-return character, and the form-feed
     * character.
     */
    public static final String DEFAULT_DELIMITERS = " \t\n\r\f";

    /**
     * String to tokenize.
     */
    private final String str;

    /**
     * Delimiters. Not final because {@link #nextToken(String)} replaces them.
     */
    private String delim;

    /**
     * Flag indicating whether to return the delimiters as tokens.
     */
    private final boolean returnTokens;

    /**
     * Position at which the previous call to nextToken started, or -1 if no
     * token has been read from a non-empty string yet.
     */
    private int previous = -1;

    /**
     * Current position in str string.
     */
    private int current = 0;

    /**
     * Maximal position in str string (its length).
     */
    private final int max;

    /**
     * Constructs a string tokenizer for the specified string. All characters in
     * the delim argument are the delimiters for separating tokens. If the
     * returnTokens flag is true, then the delimiter characters are also
     * returned as tokens. Each delimiter is returned as a string of length one.
     * If the flag is false, the delimiter characters are skipped and only serve
     * as separators between tokens.
     * <p>
     * Every delimiter separates two tokens, so adjacent, leading and trailing
     * delimiters produce empty tokens.
     *
     * @param str
     *            a string to be parsed
     * @param delim
     *            the delimiters
     * @param returnTokens
     *            flag indicating whether to return the delimiters as tokens
     */
    public Tokenizer(String str, String delim, boolean returnTokens) {
        this.str = str;
        this.delim = delim;
        this.returnTokens = returnTokens;
        this.max = str.length();
    }

    /**
     * Constructs a string tokenizer for the specified string. The characters in
     * the delim argument are the delimiters for separating tokens. Delimiter
     * characters themselves will not be treated as tokens.
     *
     * @param str
     *            a string to be parsed
     * @param delim
     *            the delimiters
     */
    public Tokenizer(String str, String delim) {
        this(str, delim, false);
    }

    /**
     * Constructs a string tokenizer for the specified string. The character in
     * the delim argument is the delimiter for separating tokens. The delimiter
     * character itself will not be treated as a token.
     *
     * @param str
     *            a string to be parsed
     * @param delim
     *            the delimiter
     */
    public Tokenizer(String str, char delim) {
        this(str, String.valueOf(delim), false);
    }

    /**
     * Constructs a string tokenizer for the specified string. The tokenizer
     * uses the default delimiter set, which is " \t\n\r\f": the space
     * character, the tab character, the newline character, the carriage-return
     * character, and the form-feed character. Delimiter characters themselves
     * will not be treated as tokens.
     *
     * @param str
     *            a string to be parsed
     */
    public Tokenizer(String str) {
        this(str, Tokenizer.DEFAULT_DELIMITERS, false);
    }

    /**
     * Tests if there are more tokens available from this tokenizer's string. If
     * this method returns true, then a subsequent call to nextToken with no
     * argument will successfully return a token.
     *
     * @return true if and only if there is at least one token in the string
     *         after the current position; false otherwise.
     */
    public boolean hasMoreTokens() {
        return tokenAvailable();
    }

    /**
     * Returns the next token from this string tokenizer.
     *
     * @return the next token from this string tokenizer
     *
     * @exception NoSuchElementException
     *                if there are no more tokens in this tokenizer's string
     */
    public String nextToken() throws NoSuchElementException {
        return readToken();
    }

    /**
     * Returns the next token in this string tokenizer's string. First, the set
     * of characters considered to be delimiters by this Tokenizer object is
     * changed to be the characters in the string delim. Then the next token in
     * the string after the current position is returned. The new delimiter set
     * stays in effect for all later calls.
     *
     * @param delim
     *            the new delimiters
     *
     * @return the next token, after switching to the new delimiter set
     *
     * @exception NoSuchElementException
     *                if there are no more tokens in this tokenizer's string.
     */
    public String nextToken(String delim) throws NoSuchElementException {
        this.delim = delim;
        return nextToken();
    }

    /**
     * Returns the same value as the hasMoreTokens method. It exists so that
     * this class can implement the Enumeration interface.
     *
     * @return true if there are more tokens; false otherwise.
     */
    public boolean hasMoreElements() {
        return hasMoreTokens();
    }

    /**
     * Returns the same value as the nextToken method, except that its declared
     * return value is Object rather than String. It exists so that this class
     * can implement the Enumeration interface.
     *
     * @return the next token in the string
     *
     * @exception NoSuchElementException
     *                if there are no more tokens in this tokenizer's string
     */
    public Object nextElement() {
        return nextToken();
    }

    /**
     * Calculates the number of times that this tokenizer's nextToken method can
     * be called before it generates an exception. The current position is not
     * advanced.
     * <p>
     * The count is obtained by a dry run over the remaining input, so it is
     * correct for a partially or fully consumed tokenizer as well as for a
     * fresh one.
     *
     * @return the number of tokens remaining in the string using the current
     *         delimiter set
     */
    public int countTokens() {
        int savedPrevious = previous;
        int savedCurrent = current;
        int count = 0;

        while (tokenAvailable()) {
            readToken();
            count++;
        }

        // Undo the dry run so the caller observes no change of position.
        previous = savedPrevious;
        current = savedCurrent;
        return count;
    }

    /**
     * Resets this tokenizer's state so that tokenizing starts from the
     * beginning of the string again. A delimiter set installed through
     * {@link #nextToken(String)} is kept.
     */
    public void reset() {
        previous = -1;
        current = 0;
    }

    /**
     * Tokenizes str in one step. All characters in the delim argument are the
     * delimiters for separating tokens. If the returnTokens flag is true, then
     * the delimiter characters are also returned as tokens. Each delimiter is
     * returned as a string of length one. If the flag is false, the delimiter
     * characters are skipped and only serve as separators between tokens.
     *
     * @param str
     *            a string to be parsed
     * @param delim
     *            the delimiters
     * @param returnTokens
     *            flag indicating whether to return the delimiters as tokens
     *
     * @return array with tokens
     */
    public static String[] tokenize(String str, String delim,
            boolean returnTokens) {

        Tokenizer tokenizer = new Tokenizer(str, delim, returnTokens);
        String[] tokens = new String[tokenizer.countTokens()];

        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokenizer.readToken();
        }

        return tokens;
    }

    /**
     * Tells whether c belongs to the current delimiter set.
     */
    private boolean isDelimiter(char c) {
        return delim.indexOf(c) >= 0;
    }

    /**
     * Tells whether the cursor sits exactly at the end of the string and one
     * final empty token is still owed: either the whole string is empty, or
     * delimiters are returned as tokens and the last thing read was a
     * delimiter.
     */
    private boolean atTrailingEmptyToken() {
        return current == max
                && (max == 0 || (returnTokens && isDelimiter(str.charAt(previous))));
    }

    /**
     * Tells whether readToken would succeed at the current position.
     */
    private boolean tokenAvailable() {
        return current < max || atTrailingEmptyToken();
    }

    /**
     * Reads the next token and advances the cursor; throws
     * NoSuchElementException when the input is exhausted.
     */
    private String readToken() {
        if (atTrailingEmptyToken()) {
            // Step past max so the final empty token is produced only once.
            current++;
            return "";
        }

        if (current >= max) {
            throw new NoSuchElementException();
        }

        int start = current;
        String result = null;

        if (isDelimiter(str.charAt(start))) {
            if (previous == -1) {
                // Leading delimiter: the empty token in front of it comes
                // first. The cursor stays on the delimiter so that the next
                // call handles it like any other separator; stepping over it
                // here would lose the token that follows it whenever that
                // token is empty as well.
                result = "";
            } else if (returnTokens) {
                if (previous != current && isDelimiter(str.charAt(previous))) {
                    // Two adjacent delimiters: empty token between them. The
                    // cursor stays put; the delimiter itself is returned by
                    // the next call (previous == current then).
                    result = "";
                } else {
                    result = str.substring(start, ++current);
                }
            } else {
                // Skip the separator that ended the previous token.
                current++;
            }
        }

        previous = start;

        if (result != null) {
            return result;
        }

        // Ordinary token: everything up to the next delimiter or the end.
        int tokenStart = current;
        while (current < max && !isDelimiter(str.charAt(current))) {
            current++;
        }
        return str.substring(tokenStart, current);
    }
}