Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
IndexTokenizer |
|
| 5.523809523809524;5.524 |
1 | package org.paneris.bibliomania.fti; | |
2 | ||
3 | import java.io.BufferedInputStream; | |
4 | import java.io.EOFException; | |
5 | import java.io.FileInputStream; | |
6 | import java.io.IOException; | |
7 | import java.io.InputStream; | |
8 | import java.util.Enumeration; | |
9 | import java.util.NoSuchElementException; | |
10 | ||
11 | import org.melati.util.UnexpectedExceptionException; | |
12 | ||
13 | public class IndexTokenizer implements Enumeration { | |
14 | ||
15 | private InputStream reader; | |
16 | 2 | private int pushedBack = -1; |
17 | 2 | private int offset = -1, wordIndex = -1, wordOffset; |
18 | 2 | private StringBuffer buffer = new StringBuffer(); |
19 | 2 | private String nextWord = null; |
20 | 2 | private boolean hadBreak = false; |
21 | 2 | private boolean lastWasAnchor = false; |
22 | 2 | private boolean inStrong = false; |
23 | 2 | private boolean inScript = false; |
24 | private boolean wantEverything; | |
25 | ||
26 | 2 | public IndexTokenizer(InputStream reader, boolean wantEverything) { |
27 | 2 | this.reader = reader; |
28 | 2 | this.wantEverything = wantEverything; |
29 | 2 | } |
30 | ||
31 | public IndexTokenizer(InputStream reader) { | |
32 | 2 | this(reader, false); |
33 | 2 | } |
34 | ||
35 | private int nextChar() throws IOException { | |
36 | int c; | |
37 | 3463 | if (pushedBack == -1) { |
38 | 3232 | c = reader.read(); |
39 | 3232 | if (c == -1) |
40 | 2 | throw new EOFException(); |
41 | } else { | |
42 | 231 | c = pushedBack; |
43 | 231 | pushedBack = -1; |
44 | } | |
45 | ||
46 | 3461 | ++offset; |
47 | 3461 | return c; |
48 | } | |
49 | ||
50 | private void pushback(int c) { | |
51 | 231 | if (pushedBack != -1) |
52 | 0 | throw new IllegalArgumentException("pushed back already"); |
53 | 231 | pushedBack = c; |
54 | 231 | --offset; |
55 | 231 | } |
56 | ||
57 | private void string(int term) throws IOException { | |
58 | int c; | |
59 | do { | |
60 | 0 | if ((c = nextChar()) == '\\') { |
61 | 0 | nextChar(); |
62 | 0 | c = nextChar(); |
63 | } | |
64 | 0 | } while (c != term); |
65 | 0 | } |
66 | ||
67 | private void comment() throws IOException { | |
68 | for (;;) { | |
69 | int c; | |
70 | 0 | if ((c = nextChar()) == '-') |
71 | 0 | if ((c = nextChar()) == '-' && (c = nextChar()) == '>') |
72 | 0 | break; |
73 | else | |
74 | 0 | pushback(c); |
75 | 0 | } |
76 | 0 | } |
77 | ||
78 | private String tag() throws IOException { | |
79 | 300 | int tagOffset = offset; |
80 | 300 | String anchorName = null; |
81 | int c; | |
82 | 300 | if ((c = nextChar()) == '!' |
83 | && (c = nextChar()) == '-' | |
84 | && (c = nextChar()) == '-') | |
85 | 0 | comment(); |
86 | else { | |
87 | boolean sense; | |
88 | 300 | if (c == '/') { |
89 | 124 | sense = false; |
90 | 124 | c = nextChar(); |
91 | } else | |
92 | 176 | sense = true; |
93 | ||
94 | 300 | boolean isA = false; |
95 | ||
96 | 300 | while (Character.isWhitespace((char)c)) |
97 | 0 | c = nextChar(); |
98 | ||
99 | 300 | boolean lastWhite = false; |
100 | ||
101 | 300 | inStrong = false; |
102 | ||
103 | 300 | if (c == 'a' || c == 'A') { |
104 | 2 | if (Character.isWhitespace((char) (c = nextChar()))) { |
105 | 1 | isA = sense; |
106 | 1 | lastWhite = true; |
107 | } | |
108 | 298 | } else if (c == 's' || c == 'S') { |
109 | 0 | c = nextChar(); |
110 | 0 | if ((c == 't' || c == 'T') |
111 | && ((c = nextChar()) == 'r' || c == 'R') | |
112 | && ((c = nextChar()) == 'o' || c == 'O') | |
113 | && ((c = nextChar()) == 'n' || c == 'N') | |
114 | && ((c = nextChar()) == 'g' || c == 'G')) | |
115 | 0 | inStrong = sense; |
116 | 0 | else if ( |
117 | (c == 'c' || c == 'C') | |
118 | && ((c = nextChar()) == 'r' || c == 'R') | |
119 | && ((c = nextChar()) == 'i' || c == 'I') | |
120 | && ((c = nextChar()) == 'p' || c == 'P') | |
121 | && ((c = nextChar()) == 't' || c == 'T')) | |
122 | 0 | inScript = sense; |
123 | } | |
124 | ||
125 | 1218 | for (; c != '>'; c = nextChar()) |
126 | 459 | if (c == '"' || c == '\'') |
127 | 0 | string(c); |
128 | 459 | else if (isA) { |
129 | 2 | if (lastWhite |
130 | && (c == 'n' || c == 'N') | |
131 | && ((c = nextChar()) == 'a' || c == 'A') | |
132 | && ((c = nextChar()) == 'm' || c == 'M') | |
133 | && ((c = nextChar()) == 'e' || c == 'E') | |
134 | && ((c = nextChar()) == '=')) { | |
135 | 1 | StringBuffer anchorNameBuf = new StringBuffer(); |
136 | 1 | if ((c = nextChar()) == '"' || c == '\'') { |
137 | 0 | int term = c; |
138 | 0 | while ((c = nextChar()) != term) |
139 | 0 | anchorNameBuf.append((char)c); |
140 | 0 | } else { |
141 | for (; | |
142 | 10 | Character.isLetterOrDigit((char)c) || c == '_'; |
143 | 9 | c = nextChar()) |
144 | 9 | anchorNameBuf.append((char)c); |
145 | 1 | pushback(c); |
146 | } | |
147 | ||
148 | 1 | anchorName = anchorNameBuf.toString(); |
149 | 1 | } else |
150 | 1 | lastWhite = Character.isWhitespace((char)c); |
151 | } | |
152 | } | |
153 | ||
154 | 300 | if (anchorName != null) |
155 | 1 | wordOffset = tagOffset; |
156 | 300 | return anchorName; |
157 | } | |
158 | ||
159 | private void element() throws IOException { | |
160 | int c; | |
161 | 0 | while (Character.isLetterOrDigit((char) (c = nextChar()))); |
162 | 0 | if (c != ';') |
163 | 0 | pushback(c); |
164 | 0 | } |
165 | ||
166 | private static boolean isPrint(char c) { | |
167 | 0 | return !Character.isWhitespace(c) |
168 | && !Character.isISOControl(c) | |
169 | && c != '<' | |
170 | && c != '&'; | |
171 | } | |
172 | ||
173 | private boolean isInteresting(char c) { | |
174 | 1248 | return wantEverything ? isPrint(c) : Character.isLetter(c); |
175 | } | |
176 | ||
177 | private boolean isInterestingNonLetter(char c) { | |
178 | 0 | return isPrint(c) && !Character.isLetter(c); |
179 | } | |
180 | ||
181 | private String _nextWord() throws IOException { | |
182 | try { | |
183 | int c; | |
184 | ||
185 | 233 | hadBreak = lastWasAnchor; |
186 | 1250 | while (!isInteresting((char) (c = nextChar())) || inScript) { |
187 | 1018 | hadBreak = true; |
188 | 1018 | if (c == '<') { |
189 | 300 | String anchorName = tag(); |
190 | 300 | if (anchorName != null) { |
191 | // wordOffset has been set above | |
192 | 1 | lastWasAnchor = true; |
193 | 1 | return "#" + anchorName; |
194 | } | |
195 | 299 | } else if (c == '&') |
196 | 0 | element(); |
197 | } | |
198 | ||
199 | 230 | lastWasAnchor = false; |
200 | 230 | wordOffset = offset; |
201 | ||
202 | 230 | buffer.setLength(1); |
203 | 230 | buffer.setCharAt(0, (char)c); |
204 | ||
205 | 230 | if (Character.isLetter((char)c)) { |
206 | 1314 | while (Character.isLetter((char) (c = nextChar()))) |
207 | 1084 | buffer.append((char)c); |
208 | 230 | ++wordIndex; |
209 | } else | |
210 | 0 | while (isInterestingNonLetter((char) (c = nextChar()))) |
211 | 0 | buffer.append((char)c); |
212 | ||
213 | 230 | pushback(c); |
214 | ||
215 | 230 | String word = buffer.toString(); |
216 | ||
217 | 230 | if (buffer.capacity() > 1000) |
218 | 0 | buffer = new StringBuffer(); |
219 | ||
220 | 230 | return inStrong && !wantEverything ? "$" + word : word; |
221 | 2 | } catch (EOFException e) { |
222 | 2 | return null; |
223 | } | |
224 | } | |
225 | ||
226 | public boolean hadBreak() { | |
227 | 0 | return hadBreak; |
228 | } | |
229 | ||
230 | public int wordOffset() { | |
231 | 231 | return wordOffset; |
232 | } | |
233 | ||
234 | public int bytesReadFromUnderlyingStream() { | |
235 | 0 | return offset + (pushedBack == -1 ? 1 : 2); |
236 | } | |
237 | ||
238 | public int wordIndex() { | |
239 | 232 | return wordIndex; |
240 | } | |
241 | ||
242 | public synchronized String nextWord() { | |
243 | 231 | if (!hasMoreElements()) |
244 | 0 | throw new NoSuchElementException(); |
245 | ||
246 | try { | |
247 | 231 | return nextWord; |
248 | } finally { | |
249 | 231 | nextWord = null; |
250 | } | |
251 | } | |
252 | ||
253 | public final Object nextElement() { | |
254 | 0 | return nextWord(); |
255 | } | |
256 | ||
257 | public synchronized boolean hasMoreWords() throws IOException { | |
258 | 464 | return nextWord != null || (nextWord = _nextWord()) != null; |
259 | } | |
260 | ||
261 | public final boolean hasMoreElements() { | |
262 | try { | |
263 | 231 | return hasMoreWords(); |
264 | 0 | } catch (IOException e) { |
265 | 0 | throw new UnexpectedExceptionException(e); |
266 | } | |
267 | } | |
268 | ||
269 | public static void main(String[] args) throws Exception { | |
270 | 0 | if (args[0].equals("-context")) { |
271 | 0 | for (IndexTokenizer words = |
272 | new IndexTokenizer( | |
273 | new BufferedInputStream(new FileInputStream(args[1])), | |
274 | true); | |
275 | 0 | words.hasMoreWords(); |
276 | ) { | |
277 | 0 | String word = words.nextWord(); |
278 | 0 | if (!word.startsWith("#")) { |
279 | 0 | if (words.hadBreak()) |
280 | 0 | System.out.print(" "); |
281 | 0 | System.out.print(word); |
282 | } | |
283 | 0 | } |
284 | 0 | System.out.println(); |
285 | } else { | |
286 | 0 | for (IndexTokenizer words = |
287 | new IndexTokenizer( | |
288 | new BufferedInputStream(new FileInputStream(args[0])), | |
289 | args[1].equals("all")); | |
290 | 0 | words.hasMoreWords(); |
291 | ) | |
292 | 0 | System.out.print( |
293 | words.nextWord() | |
294 | + "(" | |
295 | + words.wordOffset() | |
296 | + ":" | |
297 | + words.wordIndex() | |
298 | + ") "); | |
299 | 0 | System.out.println(); |
300 | } | |
301 | 0 | } |
302 | } |