View Javadoc

1   package org.paneris.bibliomania.fti;
2   
3   import java.io.BufferedInputStream;
4   import java.io.EOFException;
5   import java.io.FileInputStream;
6   import java.io.IOException;
7   import java.io.InputStream;
8   import java.util.Enumeration;
9   import java.util.NoSuchElementException;
10  
11  import org.melati.util.UnexpectedExceptionException;
12  
13  public class IndexTokenizer implements Enumeration {
14  
15    private InputStream reader;
16    private int pushedBack = -1;
17    private int offset = -1, wordIndex = -1, wordOffset;
18    private StringBuffer buffer = new StringBuffer();
19    private String nextWord = null;
20    private boolean hadBreak = false;
21    private boolean lastWasAnchor = false;
22    private boolean inStrong = false;
23    private boolean inScript = false;
24    private boolean wantEverything;
25  
26    public IndexTokenizer(InputStream reader, boolean wantEverything) {
27      this.reader = reader;
28      this.wantEverything = wantEverything;
29    }
30  
31    public IndexTokenizer(InputStream reader) {
32      this(reader, false);
33    }
34  
35    private int nextChar() throws IOException {
36      int c;
37      if (pushedBack == -1) {
38        c = reader.read();
39        if (c == -1)
40          throw new EOFException();
41      } else {
42        c = pushedBack;
43        pushedBack = -1;
44      }
45  
46      ++offset;
47      return c;
48    }
49  
50    private void pushback(int c) {
51      if (pushedBack != -1)
52        throw new IllegalArgumentException("pushed back already");
53      pushedBack = c;
54      --offset;
55    }
56  
57    private void string(int term) throws IOException {
58      int c;
59      do {
60        if ((c = nextChar()) == '\\') {
61          nextChar();
62          c = nextChar();
63        }
64      } while (c != term);
65    }
66  
67    private void comment() throws IOException {
68      for (;;) {
69        int c;
70        if ((c = nextChar()) == '-')
71          if ((c = nextChar()) == '-' && (c = nextChar()) == '>')
72            break;
73          else
74            pushback(c);
75      }
76    }
77  
78    private String tag() throws IOException {
79      int tagOffset = offset;
80      String anchorName = null;
81      int c;
82      if ((c = nextChar()) == '!'
83        && (c = nextChar()) == '-'
84        && (c = nextChar()) == '-')
85        comment();
86      else {
87        boolean sense;
88        if (c == '/') {
89          sense = false;
90          c = nextChar();
91        } else
92          sense = true;
93  
94        boolean isA = false;
95  
96        while (Character.isWhitespace((char)c))
97          c = nextChar();
98  
99        boolean lastWhite = false;
100 
101       inStrong = false;
102 
103       if (c == 'a' || c == 'A') {
104         if (Character.isWhitespace((char) (c = nextChar()))) {
105           isA = sense;
106           lastWhite = true;
107         }
108       } else if (c == 's' || c == 'S') {
109         c = nextChar();
110         if ((c == 't' || c == 'T')
111           && ((c = nextChar()) == 'r' || c == 'R')
112           && ((c = nextChar()) == 'o' || c == 'O')
113           && ((c = nextChar()) == 'n' || c == 'N')
114           && ((c = nextChar()) == 'g' || c == 'G'))
115           inStrong = sense;
116         else if (
117           (c == 'c' || c == 'C')
118             && ((c = nextChar()) == 'r' || c == 'R')
119             && ((c = nextChar()) == 'i' || c == 'I')
120             && ((c = nextChar()) == 'p' || c == 'P')
121             && ((c = nextChar()) == 't' || c == 'T'))
122           inScript = sense;
123       }
124 
125       for (; c != '>'; c = nextChar())
126         if (c == '"' || c == '\'')
127           string(c);
128         else if (isA) {
129           if (lastWhite
130             && (c == 'n' || c == 'N')
131             && ((c = nextChar()) == 'a' || c == 'A')
132             && ((c = nextChar()) == 'm' || c == 'M')
133             && ((c = nextChar()) == 'e' || c == 'E')
134             && ((c = nextChar()) == '=')) {
135             StringBuffer anchorNameBuf = new StringBuffer();
136             if ((c = nextChar()) == '"' || c == '\'') {
137               int term = c;
138               while ((c = nextChar()) != term)
139                 anchorNameBuf.append((char)c);
140             } else {
141               for (;
142                 Character.isLetterOrDigit((char)c) || c == '_';
143                 c = nextChar())
144                 anchorNameBuf.append((char)c);
145               pushback(c);
146             }
147 
148             anchorName = anchorNameBuf.toString();
149           } else
150             lastWhite = Character.isWhitespace((char)c);
151         }
152     }
153 
154     if (anchorName != null)
155       wordOffset = tagOffset;
156     return anchorName;
157   }
158 
159   private void element() throws IOException {
160     int c;
161     while (Character.isLetterOrDigit((char) (c = nextChar())));
162     if (c != ';')
163       pushback(c);
164   }
165 
166   private static boolean isPrint(char c) {
167     return !Character.isWhitespace(c)
168       && !Character.isISOControl(c)
169       && c != '<'
170       && c != '&';
171   }
172 
173   private boolean isInteresting(char c) {
174     return wantEverything ? isPrint(c) : Character.isLetter(c);
175   }
176 
177   private boolean isInterestingNonLetter(char c) {
178     return isPrint(c) && !Character.isLetter(c);
179   }
180 
181   private String _nextWord() throws IOException {
182     try {
183       int c;
184 
185       hadBreak = lastWasAnchor;
186       while (!isInteresting((char) (c = nextChar())) || inScript) {
187         hadBreak = true;
188         if (c == '<') {
189           String anchorName = tag();
190           if (anchorName != null) {
191             // wordOffset has been set above
192             lastWasAnchor = true;
193             return "#" + anchorName;
194           }
195         } else if (c == '&')
196           element();
197       }
198 
199       lastWasAnchor = false;
200       wordOffset = offset;
201 
202       buffer.setLength(1);
203       buffer.setCharAt(0, (char)c);
204 
205       if (Character.isLetter((char)c)) {
206         while (Character.isLetter((char) (c = nextChar())))
207           buffer.append((char)c);
208         ++wordIndex;
209       } else
210         while (isInterestingNonLetter((char) (c = nextChar())))
211           buffer.append((char)c);
212 
213       pushback(c);
214 
215       String word = buffer.toString();
216 
217       if (buffer.capacity() > 1000)
218         buffer = new StringBuffer();
219 
220       return inStrong && !wantEverything ? "$" + word : word;
221     } catch (EOFException e) {
222       return null;
223     }
224   }
225 
226   public boolean hadBreak() {
227     return hadBreak;
228   }
229 
230   public int wordOffset() {
231     return wordOffset;
232   }
233 
234   public int bytesReadFromUnderlyingStream() {
235     return offset + (pushedBack == -1 ? 1 : 2);
236   }
237 
238   public int wordIndex() {
239     return wordIndex;
240   }
241 
242   public synchronized String nextWord() {
243     if (!hasMoreElements())
244       throw new NoSuchElementException();
245 
246     try {
247       return nextWord;
248     } finally {
249       nextWord = null;
250     }
251   }
252 
253   public final Object nextElement() {
254     return nextWord();
255   }
256 
257   public synchronized boolean hasMoreWords() throws IOException {
258     return nextWord != null || (nextWord = _nextWord()) != null;
259   }
260 
261   public final boolean hasMoreElements() {
262     try {
263       return hasMoreWords();
264     } catch (IOException e) {
265       throw new UnexpectedExceptionException(e);
266     }
267   }
268 
269   public static void main(String[] args) throws Exception {
270     if (args[0].equals("-context")) {
271       for (IndexTokenizer words =
272         new IndexTokenizer(
273           new BufferedInputStream(new FileInputStream(args[1])),
274           true);
275         words.hasMoreWords();
276         ) {
277         String word = words.nextWord();
278         if (!word.startsWith("#")) {
279           if (words.hadBreak())
280             System.out.print(" ");
281           System.out.print(word);
282         }
283       }
284       System.out.println();
285     } else {
286       for (IndexTokenizer words =
287         new IndexTokenizer(
288           new BufferedInputStream(new FileInputStream(args[0])),
289           args[1].equals("all"));
290         words.hasMoreWords();
291         )
292         System.out.print(
293           words.nextWord()
294             + "("
295             + words.wordOffset()
296             + ":"
297             + words.wordIndex()
298             + ") ");
299       System.out.println();
300     }
301   }
302 }