View Javadoc

1   package org.paneris.bibliomania.fti;
2   
3   import java.io.BufferedInputStream;
4   import java.io.File;
5   import java.io.FileInputStream;
6   import java.io.IOException;
7   import java.io.InputStream;
8   
9   import org.melati.poem.NoSuchRowPoemException;
10  
11  import com.sleepycat.db.DatabaseException;
12  
13  public class ContextSearchResults implements SearchResults {
14  
15    private static byte[] skipBuffer = new byte[5000];
16  
17    public static final String contextUnavailable = "(context not available)";
18  
19    public static final int contextWordsAfterHit = 5;
20  
21    public static final int wordsGapBetweenAreas = 6;
22  
23    private Library library;
24  
25    private Text currentText = null;
26  
27    private SearchResults results;
28  
29    private AnchorFinder blockmarks;
30  
31    // we don't want to use a Reader for this because then skip is implemented
32    // using repeated reads ...
33  
34    private InputStream body = null;
35  
36    private long bodyPosition;
37  
38    public ContextSearchResults(Library library, SearchResults results,
39        IndexOther fti) {
40      this.library = library;
41      this.results = results;
42      blockmarks = new AnchorFinder(fti, true);
43    }
44  
45    public int frequency() {
46      return results.frequency();
47    }
48  
49    public int hitWordsCount() {
50      return results.hitWordsCount();
51    }
52  
53    public void init() {
54      closeBody();
55      results.init();
56    }
57  
58    public void skipToNextHit() {
59      results.skipToNextHit();
60    }
61  
62    public void skipToWordIndex(int wordIndex) {
63      results.skipToWordIndex(wordIndex);
64    }
65  
66    public int currentWordIndex() {
67      return results.currentWordIndex();
68    }
69  
70    public int currentOffset() {
71      return results.currentOffset();
72    }
73  
74    public void gotoText(long textID) throws DatabaseException {
75      closeBody();
76      results.gotoText(textID);
77      long it = results.currentTextID();
78      currentText = it == -1 ? null : library.text(it);
79    }
80  
81    public void gotoPosition(long position) throws DatabaseException {
82      gotoText(position >> 32);
83      if (currentTextID() != -1)
84        skipToWordIndex((int) position);
85    }
86  
87    public long getPosition() {
88      int wi = currentWordIndex();
89      return wi == -1 ? -1 : (currentTextID() << 32 | wi);
90    }
91  
92    public long currentTextID() {
93      return results.currentTextID();
94    }
95  
96    public Text currentText() {
97      return currentText;
98    }
99  
100   public String currentAnchor() throws DatabaseException {
101     // FIXME not necessarily very optimal
102     // could exploit forward-moving definition of these cursors
103     int index = currentWordIndex();
104     return index == -1 ? null : blockmarks
105         .anchorOfIndex(currentTextID(), index);
106   }
107 
108   private synchronized void closeBody() {
109     if (body != null) {
110       try {
111         body.close();
112       } catch (IOException e) {
113       }
114       body = null;
115     }
116   }
117 
118   public synchronized String nextArea() {
119     try {
120       long target = results.currentOffset();
121       if (target == -1) {
122         closeBody();
123         return null;
124       }
125 
126       if (body == null) {
127         if (currentText == null)
128           return null;
129         try { 
130           body = new BufferedInputStream(currentText.bodyForFragment());
131           bodyPosition = 0;
132         } catch (NoSuchRowPoemException e) { 
133           // This should not happen in a coherent database, but 
134           // the one on my machine isn't coherent
135           return null;
136         }
137       }
138 
139       if (bodyPosition > target)
140         throw new IOException("tried to skip backwards");
141 
142       // FIXME BufferedInputStream seems genuinely not to work well
143       // wrt to skipping ... so we use skip for the first move and dummy
144       // reads thereafter
145 
146       if (bodyPosition == 0)
147         bodyPosition += body.skip(target - bodyPosition);
148       else {
149         long read;
150         do {
151           bodyPosition += (read = body.read(skipBuffer, 0, (int) Math.min(
152               target - bodyPosition, skipBuffer.length)));
153         } while (read > 0);
154       }
155 
156       if (bodyPosition < target)
157         throw new IOException("skipped only to " + bodyPosition + " not "
158             + target);
159 
160       StringBuffer buf = new StringBuffer();
161       IndexTokenizer words = new IndexTokenizer(body, true);
162       int baseIndex = Math.max(results.currentWordIndex()
163           - IndexOther.contextWordsBeforeHit, 0);
164       // int lastIndex = -1;
165       do {
166         int limitIndex = results.currentWordIndex() + results.hitWordsCount()
167             + contextWordsAfterHit - baseIndex;
168 
169         while (words.hasMoreWords() && words.wordIndex() < limitIndex) {
170           String word = words.nextWord();
171           if (!word.startsWith("#")) {
172             if (words.hadBreak() && buf.length() > 0)
173               buf.append(' ');
174             buf.append(word);
175           }
176         }
177 
178         // lastIndex = results.currentWordIndex() + results.hitWordsCount();
179 
180         results.skipToNextHit();
181       } while (results.currentWordIndex() != -1
182           && (results.currentWordIndex() - baseIndex) - words.wordIndex() < wordsGapBetweenAreas);
183 
184       bodyPosition += words.bytesReadFromUnderlyingStream();
185 
186       return buf.toString();
187     } catch (IOException e) {
188       System.err.println(e);
189 
190       try {
191         body.close();
192       } catch (Exception ee) {
193       }
194 
195       return null;
196     }
197   }
198 
199   public static void main(String[] args) throws Exception {
200     IndexOther fti = new IndexOther(new File("/tmp"));
201 
202     SearchResults rawResults;
203     if (args[0].charAt(0) == '_') {
204       args[0] = args[0].substring(1);
205       rawResults = fti.groupSearchResults(args);
206       System.out.println("phrase");
207     } else if (args[0].equals("-query")) {
208       rawResults = fti.querySearchResults(args[1]);
209     } else {
210       rawResults = fti.andSearchResults(args);
211       System.out.println("and");
212     }
213 
214     ContextSearchResults results = new ContextSearchResults(new Library() {
215       public Text text(final long textID) {
216         return new Text() {
217           public InputStream body() throws IOException {
218             return new FileInputStream(
219                 "/usr/doc/HOWTO/other-formats/html/CDROM-HOWTO-3.html");
220           }
221 
222           public InputStream bodyForFragment() throws IOException {
223             return body();
224           }
225 
226           public long ftiTextID() {
227             return textID;
228           }
229         };
230       }
231     }, rawResults, fti);
232 
233     for (results.gotoText(0); results.currentTextID() != -1;
234     // FIXME this isn't very clever! (maybe?)
235     results.gotoText(results.currentTextID() + 1)) {
236       System.out.println("== " + results.currentTextID());
237       for (;;) {
238         String anchor = results.currentAnchor();
239         String area = results.nextArea();
240         if (area == null)
241           break;
242         System.out.println("-- A HREF=http://doc#" + anchor);
243         System.out.println(area);
244       }
245     }
246   }
247 
248   public void close() {
249     results.close();
250     blockmarks.close();
251     closeBody();
252   }
253 }