View Javadoc

1   package org.paneris.bibliomania.fti;
2   
3   import java.io.BufferedInputStream;
4   import java.io.ByteArrayOutputStream;
5   import java.io.File;
6   import java.io.FileInputStream;
7   import java.io.IOException;
8   import java.io.InputStream;
9   import java.util.Enumeration;
10  import java.util.Hashtable;
11  import java.util.Properties;
12  import java.util.StringTokenizer;
13  import java.util.Vector;
14  
15  import org.melati.util.MelatiRuntimeException;
16  import org.melati.util.PropertiesUtils;
17  import org.melati.util.PropertyException;
18  import org.melati.util.UnexpectedExceptionException;
19  
20  import com.sleepycat.db.Db;
21  import com.sleepycat.db.DbBtreeStat;
22  import com.sleepycat.db.DbEnv;
23  import com.sleepycat.db.DbException;
24  import com.sleepycat.db.Dbc;
25  import com.sleepycat.db.Dbt;
26  
27  public class IndexOther {
28  
29    public static final int wordTruncationLength = 100;
30    public static final int contextWordsBeforeHit = 5;
31  
32    private static final String propertiesName =
33        "org.paneris.bibliomania.fti.IndexOther.properties";
34  
35    File dbHome;
36    Db idOfWord;
37    Dbt idOfWordKey = DbUtils.userMemDbt(wordTruncationLength);
38    Dbt idOfWordData = DbUtils.userMemDbt(wcBytesLength);
39    int nextWordID = 0;
40    Db occurrencesOfWordInText;
41    Db anchorOfIndex;
42    Db blockmarkOfIndex;          // i.e. page of index
43    Db wordsInText;
44    private AnchorFinder blockmarkFinder;
45  
46    public static class CorruptionException extends FTIException {
47  
48      /**
49       * 
50       */
51      private static final long serialVersionUID = 1L;
52      public String corruptionProblem;
53  
54      public CorruptionException(String corruptionProblem) {
55        this.corruptionProblem = corruptionProblem;
56      }
57  
58      public String getMessage() {
59        return "The text index is corrupt, and the " +
60               "index must be rebuilt (problem: " + corruptionProblem + ")";
61      }
62    }
63  
64    public IndexOther(File dbHome)
65        throws IOException, DbException, PropertyException {
66      this(dbHome, null);
67    }
68  
69    public IndexOther(File dbHome, Properties config)
70        throws IOException, DbException, PropertyException {
71      this.dbHome = dbHome;
72      if (!dbHome.isDirectory())
73        throw new IllegalArgumentException("dbHome `" + dbHome +
74                       "' is not a directory");
75  
76      if (config == null)
77        config = PropertiesUtils.fromFile(new File(dbHome, propertiesName));
78  
79      DbEnv env = null;
80  
81      idOfWord = new Db(env, 0);
82      idOfWord.setCacheSize(
83          PropertiesUtils.getOrDie_int(config, "idOfWord.cacheSize"), 1);
84      System.err.println("Attempting to open: " + new File(dbHome, "idOfWord.db").getPath());
85      idOfWord.open(null, new File(dbHome, "idOfWord.db").getPath(),
86            null, Db.DB_BTREE, Db.DB_CREATE | Db.DB_THREAD, 0644);
87  
88      occurrencesOfWordInText = new Db(env, 0);
89      occurrencesOfWordInText.setCacheSize(
90          PropertiesUtils.getOrDie_int(config, "occurrences.cacheSize"), 1);
91      occurrencesOfWordInText.open(null, new File(dbHome, "occurrences.db").getPath(),
92                   null, Db.DB_BTREE,
93                                   Db.DB_CREATE | Db.DB_THREAD, 0644);
94  
95      nextWordID = ((DbBtreeStat)idOfWord.stat(0)).bt_nkeys;
96  
97  //    System.err.println("nextWordID = " + nextWordID);
98  
99      anchorOfIndex = new Db(env, 0);
100     anchorOfIndex.setCacheSize(
101         PropertiesUtils.getOrDie_int(config, "anchorOfIndex.cacheSize"), 1);
102     anchorOfIndex.open(null, new File(dbHome, "anchorOfIndex.db").getPath(),
103                null, Db.DB_BTREE, Db.DB_CREATE | Db.DB_THREAD, 0644);
104 
105     blockmarkOfIndex = new Db(env, 0);
106     blockmarkOfIndex.setCacheSize(
107         PropertiesUtils.getOrDie_int(config, "anchorOfIndex.cacheSize"), 1);
108     blockmarkOfIndex.open(null, new File(dbHome, "blockmarkOfIndex.db").getPath(),
109                           null, Db.DB_BTREE, Db.DB_CREATE | Db.DB_THREAD, 0644);
110 
111     wordsInText = new Db(env, 0);
112     // make it a btree because of possible locality of reference when
113     // processing many books
114     wordsInText.open(null, new File(dbHome, "wordsInText.db").getPath(),
115              null, Db.DB_BTREE, Db.DB_CREATE | Db.DB_THREAD, 0644);
116 
117     blockmarkFinder = new AnchorFinder(this, true);
118   }
119 
120   private static final class Buffer extends ByteArrayOutputStream {
121     public byte[] buffer() {
122       return buf;
123     }
124 
125     public int count() {
126       return count;
127     }
128   }
129 
130   private class WordRecord {
131     public String word;
132     public int wordID;
133     public Buffer occurrenceData = new Buffer();
134     public int count;
135 
136     public WordRecord(String word) throws FTIException, DbException {
137       this.word = word;
138       wordID = idOfWord(word);
139     }
140 
141     // FIXME this really goes with WordTextSearchResults
142 
143     private Packer wordIndexPacker = OnePacker.it;
144     private Packer offsetPacker = OnePacker.it;
145 
146     public void noteOccurrence(int wordIndex, int offset) {
147       try {
148     while (wordIndex >= wordIndexPacker.numberMax()) {
149       wordIndexPacker.write(occurrenceData, wordIndexPacker.numberMax());
150       wordIndexPacker = wordIndexPacker.bigger();
151     }
152 
153     wordIndexPacker.write(occurrenceData, wordIndex);
154 
155     while (offset >= offsetPacker.numberMax()) {
156       offsetPacker.write(occurrenceData, offsetPacker.numberMax());
157       offsetPacker = offsetPacker.bigger();
158     }
159 
160     offsetPacker.write(occurrenceData, offset);
161       }
162       catch (IOException e) {
163     throw new UnexpectedExceptionException(e);
164       }
165 
166       ++count;
167     }
168   }
169 
170   static final int wtBytesLength = 8;
171 
172   static int getWT_wordID(byte[] bytes) {
173     return ThreePacker.number_(bytes, 0);
174   }
175 
176   static long getWT_textID(byte[] bytes) {
177     // Old version used 4-byte textid keys, we use 5 now to accomodate more bits
178     // for the `Section' part
179 
180     if (bytes.length == 3 + 5)
181       return FivePacker.number_(bytes, 3);
182     else if (bytes.length == 3 + 4)
183       throw new CorruptionException(
184           "unexpected occurrence key length 7; are you running against an " +
185           "index built with an old version of the bibliomania software, from " +
186           "before the number of allowed `sections' was increased?");
187     else
188       throw new CorruptionException(
189           "unexpected occurrence key length " + bytes.length);
190   }
191 
192   static void setWT(byte[] bytes, int wordID, long textID) {
193     ThreePacker.set_(bytes, 0, wordID);
194     FivePacker.set_(bytes, 3, textID);
195   }
196 
197   static final int wcBytesLength = 6;
198 
199   static int getWC_wordID(byte[] bytes) {
200     return ThreePacker.number_(bytes, 0);
201   } 
202 
203   static void setWC_count(byte[] bytes, int count) {
204     ThreePacker.set_(bytes, 3, count);
205   }
206 
207   static void setWC(byte[] bytes, int wordID, int count) {
208     ThreePacker.set_(bytes, 0, wordID);
209     setWC_count(bytes, count);
210   }
211 
212   static int getWC_count(byte[] bytes) {
213     return ThreePacker.number_(bytes, 3);
214   } 
215 
216   static void setWord(Dbt key, String word) {
217     byte[] data = key.getData();
218     int length = Math.min(word.length(), wordTruncationLength);
219     if (data.length < length) {
220       key.setData(data = new byte[length]);
221       key.setUserBufferLength(length);
222     }
223     key.setSize(length);
224     for (int i = 0; i < length; ++i)
225       data[i] = (byte)word.charAt(i);
226   }
227 
228   public class WordIDExceededMaxException extends MelatiRuntimeException {
229     /**
230      * 
231      */
232     private static final long serialVersionUID = 1L;
233 
234     public String getMessage() {
235       return "Word dictionary in FTI subsystem " + dbHome +
236              " exceeded maximum size";
237     }
238   }
239 
240   public int idOfWord(String word) throws FTIException, DbException {
241     word = word.toLowerCase();
242     // FIXME synchronized???? is that a good idea??? what about WordFinder?
243     synchronized (idOfWord) {
244       setWord(idOfWordKey, word);
245       if (idOfWord.get(null, idOfWordKey, idOfWordData, 0) == 0)
246         return getWC_wordID(idOfWordData.getData());
247       else {
248         if (nextWordID > 0xFFFFFF)
249           throw new WordIDExceededMaxException();
250         setWC(idOfWordData.getData(), nextWordID, 0);
251         idOfWord.put(null, idOfWordKey, idOfWordData, 0);
252         return nextWordID++;
253       }
254     }
255   }
256 
257   private void addWordCount(String word, int count) throws DbException {
258     synchronized (idOfWord) {
259       setWord(idOfWordKey, word.toLowerCase());
260       if (idOfWord.get(null, idOfWordKey, idOfWordData, 0) == 0) {
261     byte[] data = idOfWordData.getData();
262     setWC_count(data, getWC_count(data) + count);
263         idOfWord.put(null, idOfWordKey, idOfWordData, 0);
264       }
265     }
266   }
267 
268   public static final int tiBytesLength = 8;
269 
270   public static void setTI(byte[] bytes, long textID, int index) {
271     FivePacker.set_(bytes, 0, textID);
272     ThreePacker.set_(bytes, 5, index);
273   }
274 
275   public static long getTI_textID(byte[] bytes) {
276     return FivePacker.number_(bytes, 0);
277   }
278 
279   public void unIndex(long textID)
280       throws FTIException, DbException {
281     Dbt text = DbUtils.userMemDbt(5);
282     FivePacker.set_(text.getData(), 0, textID);
283     Dbt words = DbUtils.userMemDbt(256);
284 
285     if (DbUtils.get(wordsInText, text, words, 256) == 0) {
286       Dbt wt = DbUtils.userMemDbt(wtBytesLength);
287       for (int i = 0; i < words.getSize(); i += 3) {
288     setWT(wt.getData(),
289           ThreePacker.number_(words.getData(), i),
290           textID);
291     occurrencesOfWordInText.delete(null, wt, 0);
292       }
293     }
294 
295     wordsInText.delete(null, text, 0);
296   }
297 
298   private void noteOccurrence(Hashtable wordRecords, String word,
299                               int index, int offset) throws DbException {
300 
301     WordRecord wordRecord = (WordRecord)wordRecords.get(word);
302     if (wordRecord == null)
303       wordRecords.put(word, wordRecord = new WordRecord(word));
304 
305     wordRecord.noteOccurrence(index, offset);
306   }
307 
308   public void index(Text text)
309       throws FTIException, IOException, DbException {
310 
311     long textID = text.ftiTextID();
312 
313     unIndex(textID);
314 
315     Dbt anchorOfIndexKey = DbUtils.userMemDbt(tiBytesLength);
316     Dbt anchorOfIndexData = DbUtils.userMemDbt(256);
317 
318     Hashtable wordRecords = new Hashtable();
319     int[] offsetHistory = new int[contextWordsBeforeHit];
320     int wordCount = 0;
321 
322     InputStream body = new BufferedInputStream(text.body());
323     try {
324       for (IndexTokenizer words = new IndexTokenizer(body);
325            words.hasMoreWords();) {
326         String word = words.nextWord();
327         if (word.startsWith("#")) {
328           boolean blockmark = word.startsWith("#__");
329           setWord(anchorOfIndexData, word.substring(1));
330           setTI(anchorOfIndexKey.getData(), textID, words.wordIndex());
331           (blockmark ? blockmarkOfIndex : anchorOfIndex).put(
332               null, anchorOfIndexKey, anchorOfIndexData, 0);
333         }
334         else
335           word = word.toLowerCase();
336 
337         offsetHistory[wordCount % contextWordsBeforeHit] = words.wordOffset();
338 
339         int index = words.wordIndex();
340         int offset = wordCount < contextWordsBeforeHit ?
341                        offsetHistory[0] :
342                        offsetHistory[(wordCount + 1) % contextWordsBeforeHit];
343 
344         noteOccurrence(wordRecords, word, index, offset);
345         if (word.startsWith("$"))
346           noteOccurrence(wordRecords, word.substring(1), index, offset);
347 
348         ++wordCount;
349       }
350     }
351     finally {
352       try { body.close(); } catch (IOException e) {}
353     }
354 
355     byte[] allWordIDs = new byte[wordRecords.size() * 3];
356     int allWordIDs_p = 0;
357 
358     Dbt occurrences = DbUtils.userMemDbt(1);
359     Dbt wt = DbUtils.userMemDbt(wtBytesLength);
360 
361     for (Enumeration w = wordRecords.elements(); w.hasMoreElements();) {
362       WordRecord wordRecord = (WordRecord)w.nextElement();
363       setWT(wt.getData(), wordRecord.wordID, textID);
364       occurrences.setData(wordRecord.occurrenceData.buffer());
365       occurrences.setSize(wordRecord.occurrenceData.count());
366       occurrences.setUserBufferLength(occurrences.getData().length);
367       occurrencesOfWordInText.put(null, wt, occurrences, 0);
368       addWordCount(wordRecord.word, wordRecord.count);
369 
370       ThreePacker.set_(allWordIDs, allWordIDs_p, wordRecord.wordID);
371       allWordIDs_p += 3;
372     }
373 
374     Dbt textIDDbt = DbUtils.userMemDbt(5);
375     FivePacker.set_(textIDDbt.getData(), 0, textID);
376     wordsInText.put(null, textIDDbt, DbUtils.userMemDbt(allWordIDs), 0);
377   }
378 
379   public SearchResults andSearchResults(String[] args)
380       throws FTIException, DbException {
381     return new AndSearchResults(this, args, false);
382   }
383 
384   public SearchResults groupSearchResults(String[] args)
385       throws FTIException, DbException {
386     return new AndSearchResults(this, args, true);
387   }
388 
389   // i.e., which page is the #anchor marking this poem on?
390 
391   public String blockmarkerBeforeFirstOccurrence(long textID, String word)
392       throws DbException {
393     TextStream stream = new TextStream(this, word);
394     stream.gotoText(textID);
395     stream.init();
396     int index = stream.currentWordIndex();
397     return index == -1 ? null : blockmarkFinder.anchorOfIndex(textID, index);
398   }
399 
400   public void stat() throws DbException {
401     Dbc words = idOfWord.cursor(null, 0);
402     try {
403       int total = 0;
404       while (words.get(idOfWordKey, idOfWordData, Db.DB_NEXT) == 0) {
405         System.out.write(idOfWordKey.getData(), 0, idOfWordKey.getSize());
406         System.out.print(": ");
407         int count = getWC_count(idOfWordData.getData());
408         System.out.println(count);
409         total += count;
410       }
411       System.out.println("--- total " + total);
412     }
413     finally {
414       try { words.close(); } catch (DbException e) {}
415     }
416   }
417 
418   public void flush() throws DbException {
419     idOfWord.sync(0);
420     occurrencesOfWordInText.sync(0);
421     anchorOfIndex.sync(0);
422     blockmarkOfIndex.sync(0);
423     wordsInText.sync(0);
424   }
425 
426   public void close() throws DbException {
427     idOfWord.close(0);
428     occurrencesOfWordInText.close(0);
429     anchorOfIndex.close(0);
430     blockmarkOfIndex.close(0);
431     wordsInText.close(0);
432   }
433 
434   protected void finalize() throws Throwable {
435     close();
436   }
437 
438   public void appendTerms(Vector terms, String query, boolean keywords)
439       throws DbException {
440     Vector group = null;
441 
442     StringTokenizer tokens = new StringTokenizer(query, " \t\n\r\f\"", true);
443 
444     while (tokens.hasMoreTokens()) {
445       String token = tokens.nextToken();
446       switch (token.charAt(0)) {
447         case ' ': case '\t' : case '\n': case '\r': case '\f':
448       break;
449         case '"':
450       if (group == null)
451         group = new Vector();
452       else {
453         String[] g = new String[group.size()];
454         group.copyInto(g);
455         terms.addElement(new AndSearchResults(this, g, true));
456         group = null;
457       }
458       break;
459         default:
460           if (keywords)
461             token = '$' + token;
462 
463       if (group != null)
464         group.addElement(token);
465       else
466         terms.addElement(new TextStream(this, token));
467       }
468     }
469 
470     if (group != null) {
471       String[] g = new String[group.size()];
472       group.copyInto(g);
473       terms.addElement(new AndSearchResults(this, g, true));
474     }
475   }
476 
477   public SearchResults querySearchResults(
478       String textQuery, String keywordsQuery) throws DbException, FTIException {
479     Vector terms = new Vector();
480     if (textQuery != null) appendTerms(terms, textQuery, false);
481     if (keywordsQuery != null) appendTerms(terms, keywordsQuery, true);
482     SearchResults[] t = new SearchResults[terms.size()];
483     terms.copyInto(t);
484     return new AndSearchResults(t);
485   }
486 
487   public SearchResults querySearchResults(String query)
488       throws DbException, FTIException {
489     return querySearchResults(query, null);
490   }
491 
492   public IndexCursor allEntries() throws DbException {
493     return new IndexCursor(this);
494   }
495 
496   public static boolean debug = false;
497 
498   public static void main(final String[] args) throws Exception {
499     IndexOther index = new IndexOther(
500         new File("/usr/local/share/bibliomania/workspace/infoFTI"));
501 
502     if (args[0].equals("-index"))
503       for (int i = 1; i < args.length; i += 2) {
504     try {
505       System.err.println("indexing " + args[i + 1]);
506       final InputStream body =
507           new BufferedInputStream(new FileInputStream(args[i + 1]));
508       final long textID = Long.parseLong(args[i]);
509       index.index(new Text() {
510                     public InputStream body() {
511               return body;
512             }
513                     public InputStream bodyForFragment() {
514               return body;
515             }
516             public long ftiTextID() {
517               return textID;
518             }
519                   });
520 
521     }
522     catch (Exception e) {
523       e.printStackTrace();
524     }
525       }
526     else if (args[0].equals("-stat"))
527       index.stat();
528     else if (args[0].equals("-anchorpage"))
529       System.out.println(index.blockmarkerBeforeFirstOccurrence(Integer.parseInt(args[1]), args[2]));
530     else {
531       SearchResults results;
532       if (args[0].charAt(0) == '_') {
533     args[0] = args[0].substring(1);
534     results = index.groupSearchResults(args);
535     System.out.println("phrase");
536       }
537       else if (args[0].equals("-query")) {
538     results = index.querySearchResults(args[1]);
539       }
540       else {
541     results = index.andSearchResults(args);
542     System.out.println("and");
543       }
544 
545       for (results.gotoText(0L); results.currentTextID() != -1L;
546        // FIXME this isn't very clever! (maybe?)
547        results.gotoText(results.currentTextID() + 1L)) {
548     System.out.print("-- " + results.currentTextID() + "\n ");
549     for (; results.currentOffset() != -1; results.skipToNextHit())
550       System.out.print(" " + // results.currentAnchor() + ":" +
551                results.currentOffset());
552     System.out.println();
553       }
554     }
555 
556     index.close();
557   }
558 }