View Javadoc

1   package org.paneris.bibliomania.fti;
2   
3   import java.io.BufferedInputStream;
4   import java.io.File;
5   import java.io.FileInputStream;
6   import java.io.IOException;
7   import java.io.InputStream;
8   
9   import com.sleepycat.db.DbException;
10  
11  public class ContextSearchResults implements SearchResults {
12  
13    private static byte[] skipBuffer = new byte[5000];
14  
15    public static final String contextUnavailable = "(context not available)";
16  
17    public static final int contextWordsAfterHit = 5;
18  
19    public static final int wordsGapBetweenAreas = 6;
20  
21    private Library library;
22  
23    private Text currentText = null;
24  
25    private SearchResults results;
26  
27    private AnchorFinder blockmarks;
28  
29    // we don't want to use a Reader for this because then skip is implemented
30    // using repeated reads ...
31  
32    private InputStream body = null;
33  
34    private long bodyPosition;
35  
36    public ContextSearchResults(Library library, SearchResults results,
37        IndexOther fti) {
38      this.library = library;
39      this.results = results;
40      blockmarks = new AnchorFinder(fti, true);
41    }
42  
43    public int frequency() {
44      return results.frequency();
45    }
46  
47    public int hitWordsCount() {
48      return results.hitWordsCount();
49    }
50  
51    public void init() {
52      closeBody();
53      results.init();
54    }
55  
56    public void skipToNextHit() {
57      results.skipToNextHit();
58    }
59  
60    public void skipToWordIndex(int wordIndex) {
61      results.skipToWordIndex(wordIndex);
62    }
63  
64    public int currentWordIndex() {
65      return results.currentWordIndex();
66    }
67  
68    public int currentOffset() {
69      return results.currentOffset();
70    }
71  
72    public void gotoText(long textID) throws DbException {
73      closeBody();
74      results.gotoText(textID);
75      long it = results.currentTextID();
76      currentText = it == -1 ? null : library.text(it);
77    }
78  
79    public void gotoPosition(long position) throws DbException {
80      gotoText(position >> 32);
81      if (currentTextID() != -1)
82        skipToWordIndex((int) position);
83    }
84  
85    public long getPosition() {
86      int wi = currentWordIndex();
87      return wi == -1 ? -1 : (currentTextID() << 32 | wi);
88    }
89  
90    public long currentTextID() {
91      return results.currentTextID();
92    }
93  
94    public Text currentText() {
95      return currentText;
96    }
97  
98    public String currentAnchor() throws DbException {
99      // FIXME not necessarily very optimal
100     // could exploit forward-moving definition of these cursors
101     int index = currentWordIndex();
102     return index == -1 ? null : blockmarks
103         .anchorOfIndex(currentTextID(), index);
104   }
105 
106   private synchronized void closeBody() {
107     if (body != null) {
108       try {
109         body.close();
110       } catch (IOException e) {
111       }
112       body = null;
113     }
114   }
115 
116   public synchronized String nextArea() {
117     try {
118       long target = results.currentOffset();
119       if (target == -1) {
120         closeBody();
121         return null;
122       }
123 
124       if (body == null) {
125         if (currentText == null)
126           return null;
127 
128         body = new BufferedInputStream(currentText.bodyForFragment());
129         bodyPosition = 0;
130       }
131 
132       if (bodyPosition > target)
133         throw new IOException("tried to skip backwards");
134 
135       // FIXME BufferedInputStream seems genuinely not to work well
136       // wrt to skipping ... so we use skip for the first move and dummy
137       // reads thereafter
138 
139       if (bodyPosition == 0)
140         bodyPosition += body.skip(target - bodyPosition);
141       else {
142         long read;
143         do {
144           bodyPosition += (read = body.read(skipBuffer, 0, (int) Math.min(
145               target - bodyPosition, skipBuffer.length)));
146         } while (read > 0);
147       }
148 
149       if (bodyPosition < target)
150         throw new IOException("skipped only to " + bodyPosition + " not "
151             + target);
152 
153       StringBuffer buf = new StringBuffer();
154       IndexTokenizer words = new IndexTokenizer(body, true);
155       int baseIndex = Math.max(results.currentWordIndex()
156           - IndexOther.contextWordsBeforeHit, 0);
157       // int lastIndex = -1;
158       do {
159         int limitIndex = results.currentWordIndex() + results.hitWordsCount()
160             + contextWordsAfterHit - baseIndex;
161 
162         while (words.hasMoreWords() && words.wordIndex() < limitIndex) {
163           String word = words.nextWord();
164           if (!word.startsWith("#")) {
165             if (words.hadBreak() && buf.length() > 0)
166               buf.append(' ');
167             buf.append(word);
168           }
169         }
170 
171         // lastIndex = results.currentWordIndex() + results.hitWordsCount();
172 
173         results.skipToNextHit();
174       } while (results.currentWordIndex() != -1
175           && (results.currentWordIndex() - baseIndex) - words.wordIndex() < wordsGapBetweenAreas);
176 
177       bodyPosition += words.bytesReadFromUnderlyingStream();
178 
179       return buf.toString();
180     } catch (IOException e) {
181       System.err.println(e);
182 
183       try {
184         body.close();
185       } catch (Exception ee) {
186       }
187 
188       return null;
189     }
190   }
191 
192   public static void main(String[] args) throws Exception {
193     IndexOther fti = new IndexOther(new File("/tmp"));
194 
195     SearchResults rawResults;
196     if (args[0].charAt(0) == '_') {
197       args[0] = args[0].substring(1);
198       rawResults = fti.groupSearchResults(args);
199       System.out.println("phrase");
200     } else if (args[0].equals("-query")) {
201       rawResults = fti.querySearchResults(args[1]);
202     } else {
203       rawResults = fti.andSearchResults(args);
204       System.out.println("and");
205     }
206 
207     ContextSearchResults results = new ContextSearchResults(new Library() {
208       public Text text(final long textID) {
209         return new Text() {
210           public InputStream body() throws IOException {
211             return new FileInputStream(
212                 "/usr/doc/HOWTO/other-formats/html/CDROM-HOWTO-3.html");
213           }
214 
215           public InputStream bodyForFragment() throws IOException {
216             return body();
217           }
218 
219           public long ftiTextID() {
220             return textID;
221           }
222         };
223       }
224     }, rawResults, fti);
225 
226     for (results.gotoText(0); results.currentTextID() != -1;
227     // FIXME this isn't very clever! (maybe?)
228     results.gotoText(results.currentTextID() + 1)) {
229       System.out.println("== " + results.currentTextID());
230       for (;;) {
231         String anchor = results.currentAnchor();
232         String area = results.nextArea();
233         if (area == null)
234           break;
235         System.out.println("-- A HREF=http://doc#" + anchor);
236         System.out.println(area);
237       }
238     }
239   }
240 
241   public void close() {
242     results.close();
243     blockmarks.close();
244     closeBody();
245   }
246 }