View Javadoc

1   package org.paneris.bibliomania.fti;
2   
3   import java.io.BufferedInputStream;
4   import java.io.ByteArrayOutputStream;
5   import java.io.File;
6   import java.io.FileInputStream;
7   import java.io.IOException;
8   import java.io.InputStream;
9   import java.util.Enumeration;
10  import java.util.Hashtable;
11  import java.util.Properties;
12  import java.util.StringTokenizer;
13  import java.util.Vector;
14  
15  import org.melati.util.MelatiRuntimeException;
16  import org.melati.util.PropertiesUtils;
17  import org.melati.util.PropertyException;
18  import org.melati.util.UnexpectedExceptionException;
19  
20  import com.sleepycat.db.Database;
21  import com.sleepycat.db.BtreeStats;
22  import com.sleepycat.db.DatabaseConfig;
23  import com.sleepycat.db.DatabaseException;
24  import com.sleepycat.db.Cursor;
25  import com.sleepycat.db.DatabaseEntry;
26  import com.sleepycat.db.LockMode;
27  import com.sleepycat.db.OperationStatus;
28  
public class IndexOther {

  /** Default on-disk FTI workspace used by main(). */
  private static final String USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI = "/usr/local/share/bibliomania/workspace/infoFTI";
  /** Words are truncated to this many characters before being keyed. */
  public static final int wordTruncationLength = 100;
  /** Size of the context window kept before each hit (see index()). */
  public static final int contextWordsBeforeHit = 5;


  // Directory holding all the Berkeley DB databases below.
  File dbHome;
  // word -> (wordID, count), packed per setWC/getWC_* below.
  Database idOfWord;
  // NOTE(review): appears unused in this file — confirm before removing.
  DatabaseConfig idOfWordConfig;
  // Reusable key/data buffers for idOfWord lookups; guarded by
  // synchronized (idOfWord) in idOfWord(String) and addWordCount().
  DatabaseEntry idOfWordKey = DbUtils.userMemDatabaseEntry(wordTruncationLength);
  DatabaseEntry idOfWordData = DbUtils.userMemDatabaseEntry(wcBytesLength);
  // Next word ID to allocate; initialised from idOfWord's key count.
  int nextWordID = 0;
  // (wordID, textID) -> packed occurrence list (built by WordRecord).
  Database occurrencesOfWordInText;
  // (textID, wordIndex) -> anchor name (words starting '#').
  Database anchorOfIndex;
  Database blockmarkOfIndex;          // i.e. page of index
  // textID -> concatenated 3-byte wordIDs occurring in that text.
  Database wordsInText;  

  private AnchorFinder blockmarkFinder;
48  
  /** Thrown when stored index data has an unexpected format. */
  public static class CorruptionException extends FTIException {

    private static final long serialVersionUID = 6251343503765495522L;
    /** Description of the specific inconsistency encountered. */
    public String corruptionProblem;

    public CorruptionException(String corruptionProblem) {
      this.corruptionProblem = corruptionProblem;
    }

    public String getMessage() {
      return "The text index is corrupt, and the " +
             "index must be rebuilt (problem: " + corruptionProblem + ")";
    }
  }
63  
  /**
   * Open (or create) the index databases under <code>dbHome</code>,
   * taking configuration from this class's properties resource.
   */
  public IndexOther(File dbHome)
      throws IOException, DatabaseException, PropertyException {
    this(dbHome, null);
  }
68  
69    public IndexOther(File dbHome, Properties config)
70        throws IOException, DatabaseException, PropertyException {
71      this.dbHome = dbHome;
72      if (!dbHome.exists())
73        throw new IllegalArgumentException("dbHome `" + dbHome +
74                       "' does nto exist");
75      if (!dbHome.isDirectory())
76        throw new IllegalArgumentException("dbHome `" + dbHome +
77                       "' is not a directory");
78  
79      if (config == null)
80  //      config = PropertiesUtils.fromFile(new File(dbHome, propertiesName));
81        config = PropertiesUtils.fromResource(this.getClass());
82  
83      idOfWord = DbUtils.openOrCreateBTreeDb(dbHome, "idOfWord", 
84          PropertiesUtils.getOrDie_int(config, "idOfWord.cacheSize"));
85  
86      occurrencesOfWordInText = DbUtils.openOrCreateBTreeDb(dbHome, "occurrencesOfWordInText", 
87          PropertiesUtils.getOrDie_int(config, "occurrencesOfWordInText.cacheSize"));
88      
89      nextWordID = ((BtreeStats)idOfWord.getStats(null,null)).getNumKeys();
90  
91  //    System.err.println("nextWordID = " + nextWordID);
92  
93      anchorOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "anchorOfIndex", 
94          PropertiesUtils.getOrDie_int(config, "anchorOfIndex.cacheSize"));
95      
96      blockmarkOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "blockmarkOfIndex", 
97          PropertiesUtils.getOrDie_int(config, "blockmarkOfIndex.cacheSize"));
98      
99      // make it a btree because of possible locality of reference when
100     // processing many books
101     wordsInText = DbUtils.openOrCreateBTreeDb(dbHome, "wordsInText", 0);
102     
103     blockmarkFinder = new AnchorFinder(this, true);
104   }
105   
  /**
   * ByteArrayOutputStream that exposes its internal buffer and count
   * directly, avoiding the defensive copy made by toByteArray().
   */
  private static final class Buffer extends ByteArrayOutputStream {
    /** Raw internal buffer; only the first count() bytes are valid. */
    public byte[] buffer() {
      return buf;
    }

    /** Number of valid bytes at the start of buffer(). */
    public int count() {
      return count;
    }
  }
115 
  /**
   * Accumulates the occurrence data of one word within one text:
   * (wordIndex, offset) pairs packed into occurrenceData with
   * variable-width packers that widen as values grow.
   */
  public class WordRecord {
    public String word;
    public int wordID;
    public Buffer occurrenceData = new Buffer();
    // Total number of occurrences noted so far.
    public int count;

    public WordRecord(String word) throws FTIException, DatabaseException {
      this.word = word;
      wordID = idOfWord(word);  // allocates a new ID if word is unseen
    }

    // FIXME this really goes with WordTextSearchResults

    // Current packers; start at one byte and escalate as needed.
    private Packer wordIndexPacker = OnePacker.it;
    private Packer offsetPacker = OnePacker.it;

    /**
     * Append one occurrence.  Escalation protocol: whenever a value no
     * longer fits the current packer, the packer's numberMax() is
     * written as an escape marker and a wider packer is adopted.  The
     * reader must follow the same protocol, so the exact write order
     * here must not be changed.
     */
    public void noteOccurrence(int wordIndex, int offset) {
      try {
        while (wordIndex >= wordIndexPacker.numberMax()) {
          wordIndexPacker.write(occurrenceData, wordIndexPacker.numberMax());
          wordIndexPacker = wordIndexPacker.bigger();
        }

        wordIndexPacker.write(occurrenceData, wordIndex);

        while (offset >= offsetPacker.numberMax()) {
          offsetPacker.write(occurrenceData, offsetPacker.numberMax());
          offsetPacker = offsetPacker.bigger();
        }

        offsetPacker.write(occurrenceData, offset);
      }
      catch (IOException e) {
        // Buffer is in-memory, so IOException is not expected here.
        throw new UnexpectedExceptionException(e);
      }

      ++count;
    }
  }
155 
  // ---- occurrencesOfWordInText key layout ("WT"):
  //      bytes 0-2  wordID  (ThreePacker)
  //      bytes 3-7  textID  (FivePacker)
  static final int wtBytesLength = 8;

  static int getWT_wordID(byte[] bytes) {
    return ThreePacker.number_(bytes, 0);
  }

  static long getWT_textID(byte[] bytes) {
    // Old version used 4-byte textid keys, 
    // we use 5 now to accommodate more bits
    // for the `Section' part

    if (bytes.length == 3 + 5)
      return FivePacker.number_(bytes, 3);
    else if (bytes.length == 3 + 4)
      throw new CorruptionException(
          "unexpected occurrence key length 7; are you running against an " +
          "index built with an old version of the bibliomania software, from " +
          "before the number of allowed `sections' was increased?");
    else
      throw new CorruptionException(
          "unexpected occurrence key length " + bytes.length);
  }

  static void setWT(byte[] bytes, int wordID, long textID) {
    ThreePacker.set_(bytes, 0, wordID);
    FivePacker.set_(bytes, 3, textID);
  }

  // ---- idOfWord data layout ("WC"):
  //      bytes 0-2  wordID  (ThreePacker)
  //      bytes 3-5  count   (ThreePacker)
  static final int wcBytesLength = 6;

  static int getWC_wordID(byte[] bytes) {
    return ThreePacker.number_(bytes, 0);
  } 

  static void setWC_count(byte[] bytes, int count) {
    ThreePacker.set_(bytes, 3, count);
  }

  static void setWC(byte[] bytes, int wordID, int count) {
    ThreePacker.set_(bytes, 0, wordID);
    setWC_count(bytes, count);
  }

  static int getWC_count(byte[] bytes) {
    return ThreePacker.number_(bytes, 3);
  } 
202 
203   static void setWord(DatabaseEntry key, String word) {
204     byte[] data = key.getData();
205     int length = Math.min(word.length(), wordTruncationLength);
206     if (data.length < length) {
207       key.setData(data = new byte[length]);
208       key.setUserBuffer(length, true);
209     }
210     key.setSize(length);
211     for (int i = 0; i < length; ++i)
212       data[i] = (byte)word.charAt(i);
213   }
214 
  /** Thrown when more than 0xFFFFFF distinct words would be allocated. */
  public class WordIDExceededMaxException extends MelatiRuntimeException {

    private static final long serialVersionUID = 1L;

    public String getMessage() {
      return "Word dictionary in FTI subsystem " + dbHome +
             " exceeded maximum size";
    }
  }
226 
  /**
   * Return the numeric ID of <code>word</code> (case-insensitive),
   * allocating and persisting a fresh ID with a zero count if the word
   * is new.  IDs are packed into 3 bytes, hence the 0xFFFFFF cap.
   *
   * @throws WordIDExceededMaxException when the dictionary is full
   */
  public int idOfWord(String word) throws FTIException, DatabaseException {
    word = word.toLowerCase();
    // FIXME synchronized???? is that a good idea??? what about WordFinder?
    // (The lock also guards the shared idOfWordKey/idOfWordData buffers.)
    synchronized (idOfWord) {
      setWord(idOfWordKey, word);
      if (idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS)
        return getWC_wordID(idOfWordData.getData());
      else {
        if (nextWordID > 0xFFFFFF)
          throw new WordIDExceededMaxException();
        setWC(idOfWordData.getData(), nextWordID, 0);
        idOfWord.put(null, idOfWordKey, idOfWordData);
        return nextWordID++;
      }
    }
  }
243 
  /**
   * Add <code>count</code> to a word's stored global occurrence count.
   * Words absent from the dictionary are silently ignored (no record is
   * created).
   */
  private void addWordCount(String word, int count) throws DatabaseException {
    synchronized (idOfWord) {
      setWord(idOfWordKey, word.toLowerCase());
      if (idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
        byte[] data = idOfWordData.getData();
        setWC_count(data, getWC_count(data) + count);
        idOfWord.put(null, idOfWordKey, idOfWordData);
      }
    }
  }
254 
  // ---- anchorOfIndex / blockmarkOfIndex key layout ("TI"):
  //      bytes 0-4  textID     (FivePacker)
  //      bytes 5-7  wordIndex  (ThreePacker)
  public static final int textIdBytesLength = 8;

  public static void setTI(byte[] bytes, long textID, int index) {
    FivePacker.set_(bytes, 0, textID);
    ThreePacker.set_(bytes, 5, index);
  }

  public static long getTI_textID(byte[] bytes) {
    return FivePacker.number_(bytes, 0);
  }
265 
  /**
   * Remove a text from the index: for each 3-byte word ID recorded in
   * wordsInText, delete the (wordID, textID) occurrence record, then
   * delete the wordsInText entry itself.
   * NOTE(review): per-word totals in idOfWord are not decremented here,
   * so stat() counts will drift after un-indexing — confirm intended.
   */
  public void unIndex(long textID)
      throws FTIException, DatabaseException {
    DatabaseEntry text = DbUtils.userMemDatabaseEntry(5);
    FivePacker.set_(text.getData(), 0, textID);
    DatabaseEntry words = DbUtils.userMemDatabaseEntry(256);

    if (DbUtils.get(wordsInText, text, words, 256) == OperationStatus.SUCCESS) {
      DatabaseEntry wt = DbUtils.userMemDatabaseEntry(wtBytesLength);
      for (int i = 0; i < words.getSize(); i += 3) {
        setWT(wt.getData(),
            ThreePacker.number_(words.getData(), i),
            textID);
        occurrencesOfWordInText.delete(null, wt);
      }
    }

    wordsInText.delete(null, text);
  }
284 
285   private void noteOccurrence(Hashtable<String, WordRecord> wordRecords, String word,
286                               int index, int offset) throws DatabaseException {
287 
288     WordRecord wordRecord = (WordRecord)wordRecords.get(word);
289     if (wordRecord == null)
290       wordRecords.put(word, wordRecord = new WordRecord(word));
291 
292     wordRecord.noteOccurrence(index, offset);
293   }
294 
  /**
   * (Re-)index a text: tokenizes the body, records anchors/blockmarks,
   * then writes one packed occurrence list per distinct word plus the
   * text's word list and updated per-word global counts.
   */
  public void index(Text text)
      throws FTIException, IOException, DatabaseException {

    long textID = text.ftiTextID();

    // Drop any previous index entries for this text first.
    unIndex(textID);

    DatabaseEntry anchorOfIndexKey = DbUtils.userMemDatabaseEntry(textIdBytesLength);
    DatabaseEntry anchorOfIndexData = DbUtils.userMemDatabaseEntry(256);

    Hashtable<String, WordRecord> wordRecords = new Hashtable<String, WordRecord>();
    // Ring buffer of the last contextWordsBeforeHit word offsets, used
    // to report an offset slightly before each hit (context snippets).
    int[] offsetHistory = new int[contextWordsBeforeHit];
    int wordCount = 0;

    InputStream body = new BufferedInputStream(text.body());
    try {
      for (IndexTokenizer words = new IndexTokenizer(body);
           words.hasMoreWords();) {
        String word = words.nextWord();
        if (word.startsWith("#")) {
          // '#name' marks an anchor; '#__name' a blockmark (page).
          // The marker word itself also falls through and is indexed.
          boolean blockmark = word.startsWith("#__");
          setWord(anchorOfIndexData, word.substring(1));
          setTI(anchorOfIndexKey.getData(), textID, words.wordIndex());
          (blockmark ? blockmarkOfIndex : anchorOfIndex).put(
              null, anchorOfIndexKey, anchorOfIndexData);
        }
        else
          word = word.toLowerCase();

        offsetHistory[wordCount % contextWordsBeforeHit] = words.wordOffset();

        int index = words.wordIndex();
        // Offset of the oldest word still in the ring buffer, i.e. up
        // to contextWordsBeforeHit - 1 words before this one.
        int offset = wordCount < contextWordsBeforeHit ?
                       offsetHistory[0] :
                       offsetHistory[(wordCount + 1) % contextWordsBeforeHit];

        noteOccurrence(wordRecords, word, index, offset);
        // '$word' keywords are additionally indexed as the bare word.
        if (word.startsWith("$"))
          noteOccurrence(wordRecords, word.substring(1), index, offset);

        ++wordCount;
      }
    }
    finally {
      try { body.close(); } catch (IOException e) {}
    }

    // All word IDs seen in this text, 3 bytes each — consumed by
    // unIndex() to find the occurrence records to delete.
    byte[] allWordIDs = new byte[wordRecords.size() * 3];
    int allWordIDs_p = 0;

    DatabaseEntry occurrences = DbUtils.userMemDatabaseEntry(1);
    DatabaseEntry wt = DbUtils.userMemDatabaseEntry(wtBytesLength);

    for (Enumeration<WordRecord> w = wordRecords.elements(); w.hasMoreElements();) {
      WordRecord wordRecord = (WordRecord)w.nextElement();
      setWT(wt.getData(), wordRecord.wordID, textID);
      // Point the entry straight at the Buffer's internal array — no copy.
      occurrences.setData(wordRecord.occurrenceData.buffer());
      occurrences.setSize(wordRecord.occurrenceData.count());
      occurrences.setUserBuffer(occurrences.getData().length,true);
      occurrencesOfWordInText.put(null, wt, occurrences);
      addWordCount(wordRecord.word, wordRecord.count);

      ThreePacker.set_(allWordIDs, allWordIDs_p, wordRecord.wordID);
      allWordIDs_p += 3;
    }

    DatabaseEntry textIDDatabaseEntry = DbUtils.userMemDatabaseEntry(5);
    FivePacker.set_(textIDDatabaseEntry.getData(), 0, textID);
    wordsInText.put(null, textIDDatabaseEntry, DbUtils.userMemDatabaseEntry(allWordIDs));
  }
365 
  /** Results for texts containing ALL the given words (any positions). */
  public SearchResults andSearchResults(String[] args)
      throws FTIException, DatabaseException {
    return new AndSearchResults(this, args, false);
  }

  /** Results for the given words as a group (phrase search). */
  public SearchResults groupSearchResults(String[] args)
      throws FTIException, DatabaseException {
    return new AndSearchResults(this, args, true);
  }
375 
  // i.e., which page is the #anchor marking this poem on?

  /**
   * Find the blockmark associated with the first occurrence of
   * <code>word</code> in the given text (presumably the page it falls
   * on — see AnchorFinder), or null if the word does not occur there.
   */
  public String blockmarkerBeforeFirstOccurrence(long textID, String word)
      throws DatabaseException {
    TextStream stream = new TextStream(this, word);
    stream.gotoText(textID);
    stream.init();
    int index = stream.currentWordIndex();
    return index == -1 ? null : blockmarkFinder.anchorOfIndex(textID, index);
  }
386 
387   public void stat() throws DatabaseException {
388     Cursor words = idOfWord.openCursor(null, null);
389     try {
390       int total = 0;
391       while (words.getNext(idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
392         System.out.write(idOfWordKey.getData(), 0, idOfWordKey.getSize());
393         System.out.print(": ");
394         int count = getWC_count(idOfWordData.getData());
395         System.out.println(count);
396         total += count;
397       }
398       System.out.println("--- total " + total);
399     }
400     finally {
401       try { words.close(); } catch (DatabaseException e) {}
402     }
403   }
404 
  /** Sync all five databases to stable storage. */
  public void flush() throws DatabaseException {
    idOfWord.sync();
    occurrencesOfWordInText.sync();
    anchorOfIndex.sync();
    blockmarkOfIndex.sync();
    wordsInText.sync();
  }
412 
  /**
   * Close all five databases; the index must not be used afterwards.
   * (The boolean is passed straight to Database.close.)
   */
  public void close() throws DatabaseException {
    idOfWord.close(false);
    occurrencesOfWordInText.close(false);
    anchorOfIndex.close(false);
    blockmarkOfIndex.close(false);
    wordsInText.close(false);
  }
420 
421   protected void finalize() throws Throwable {
422     close();
423   }
424 
  /**
   * Parse a query string and append one SearchResults per term to
   * <code>terms</code>.  Double quotes delimit phrase groups, which
   * become grouped AndSearchResults; other tokens become single-word
   * TextStreams.  When <code>keywords</code> is true each word is
   * prefixed with '$' (the keyword namespace used by index()).  An
   * unterminated quoted group is flushed at the end.
   */
  public void appendTerms(Vector<SearchResults> terms, String query, boolean keywords)
      throws DatabaseException {
    // Non-null while inside a quoted phrase; collects its words.
    Vector<String> group = null;

    // Whitespace and '"' are returned as tokens so quotes can be seen.
    StringTokenizer tokens = new StringTokenizer(query, " \t\n\r\f\"", true);

    while (tokens.hasMoreTokens()) {
      String token = tokens.nextToken();
      switch (token.charAt(0)) {
        case ' ': case '\t' : case '\n': case '\r': case '\f':
          break;
        case '"':
          // Opening quote starts a group; closing quote emits it.
          if (group == null)
            group = new Vector<String>();
          else {
            String[] g = new String[group.size()];
            group.copyInto(g);
            terms.addElement(new AndSearchResults(this, g, true));
            group = null;
          }
          break;
        default:
          if (keywords)
            token = '$' + token;

          if (group != null)
            group.addElement(token);
          else
            terms.addElement(new TextStream(this, token));
      }
    }

    if (group != null) {
      String[] g = new String[group.size()];
      group.copyInto(g);
      terms.addElement(new AndSearchResults(this, g, true));
    }
  }
463 
464   public SearchResults querySearchResults(
465       String textQuery, String keywordsQuery) throws DatabaseException, FTIException {
466     Vector<SearchResults> terms = new Vector<SearchResults>();
467     if (textQuery != null) appendTerms(terms, textQuery, false);
468     if (keywordsQuery != null) appendTerms(terms, keywordsQuery, true);
469     SearchResults[] t = new SearchResults[terms.size()];
470     terms.copyInto(t);
471     return new AndSearchResults(t);
472   }
473 
  /** Search with a text query only (no keyword query). */
  public SearchResults querySearchResults(String query)
      throws DatabaseException, FTIException {
    return querySearchResults(query, null);
  }
478 
  /** Cursor over every entry in the index. */
  public IndexCursor allEntries() throws DatabaseException {
    return new IndexCursor(this);
  }

  // Global debug flag; unused in this file — presumably consulted
  // elsewhere in the FTI package.
  public static boolean debug = false;
484 
485   public static void main(final String[] args) throws Exception {
486     IndexOther index = new IndexOther(
487         new File(USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI));
488 
489     if (args[0].equals("-index"))
490       for (int i = 1; i < args.length; i += 2) {
491         System.err.println("indexing " + args[i + 1]);
492         final String filename = args[i + 1];
493         final long textID = Long.parseLong(args[i]);
494         final InputStream body = new BufferedInputStream(new FileInputStream(
495             filename));
496         index.index(new Text() {
497           public InputStream body() {
498             return body;
499           }
500 
501           public InputStream bodyForFragment() {
502             return body;
503           }
504 
505           public long ftiTextID() {
506             return textID;
507           }
508         });
509       }
510     else if (args[0].equals("-stat"))
511       index.stat();
512     else if (args[0].equals("-anchorpage"))
513       System.out.println(index.blockmarkerBeforeFirstOccurrence(
514           Integer.parseInt(args[1]), args[2]));
515     else {
516       SearchResults results;
517       if (args[0].charAt(0) == '_') {
518         args[0] = args[0].substring(1);
519         results = index.groupSearchResults(args);
520         System.out.println("phrase");
521       } else if (args[0].equals("-query")) {
522         results = index.querySearchResults(args[1]);
523       } else {
524         results = index.andSearchResults(args);
525         System.out.println("and");
526       }
527 
528       for (results.gotoText(0L); results.currentTextID() != -1L;
529            // FIXME this isn't very clever! (maybe?)
530            results.gotoText(results.currentTextID() + 1L)) {
531         System.out.print("-- " + results.currentTextID() + "\n ");
532         for (; results.currentOffset() != -1; results.skipToNextHit())
533           System.out.print(" " + // results.currentAnchor() + ":" +
534               results.currentOffset());
535         System.out.println();
536       }
537     }
538     index.close();
539   }
540 }