Coverage Report - org.paneris.bibliomania.fti.IndexOther
 
Classes in this File Line Coverage Branch Coverage Complexity
IndexOther
58%
132/226
32%
26/79
2.41
IndexOther$1
0%
0/4
N/A
2.41
IndexOther$Buffer
100%
3/3
N/A
2.41
IndexOther$WordRecord
80%
16/20
75%
3/4
2.41
 
 1  
 package org.paneris.bibliomania.fti;
 2  
 
 3  
 import java.io.BufferedInputStream;
 4  
 import java.io.ByteArrayOutputStream;
 5  
 import java.io.File;
 6  
 import java.io.FileInputStream;
 7  
 import java.io.IOException;
 8  
 import java.io.InputStream;
 9  
 import java.util.Enumeration;
 10  
 import java.util.Hashtable;
 11  
 import java.util.Properties;
 12  
 import java.util.StringTokenizer;
 13  
 import java.util.Vector;
 14  
 
 15  
 import org.melati.util.MelatiRuntimeException;
 16  
 import org.melati.util.PropertiesUtils;
 17  
 import org.melati.util.PropertyException;
 18  
 import org.melati.util.UnexpectedExceptionException;
 19  
 
 20  
 import com.sleepycat.db.Database;
 21  
 import com.sleepycat.db.BtreeStats;
 22  
 import com.sleepycat.db.DatabaseConfig;
 23  
 import com.sleepycat.db.DatabaseException;
 24  
 import com.sleepycat.db.Cursor;
 25  
 import com.sleepycat.db.DatabaseEntry;
 26  
 import com.sleepycat.db.LockMode;
 27  
 import com.sleepycat.db.OperationStatus;
 28  
 
 29  
 public class IndexOther {
 30  
 
 31  
   /** Index directory opened by {@code main}. */
   private static final String USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI = "/usr/local/share/bibliomania/workspace/infoFTI";
 32  
   /** Maximum number of characters of a word that {@code setWord} copies into a key. */
   public static final int wordTruncationLength = 100;
 33  
   /** Length of the ring of recent word offsets kept by {@code index}. */
   public static final int contextWordsBeforeHit = 5;
 34  
 
 35  
 
 36  
   /** Directory holding the index databases; checked by the constructor. */
   File dbHome;
 37  
   /** Dictionary: (truncated, lower-cased) word -> packed word ID and occurrence count. */
   Database idOfWord;
 38  
   /** Not assigned anywhere in the code visible here. */
   DatabaseConfig idOfWordConfig;
 39  17
   /** Reusable key buffer for {@code idOfWord}; only used while holding the lock on {@code idOfWord}. */
   DatabaseEntry idOfWordKey = DbUtils.userMemDatabaseEntry(wordTruncationLength);
 40  17
   /** Reusable data buffer for {@code idOfWord}, sized {@code wcBytesLength}. */
   DatabaseEntry idOfWordData = DbUtils.userMemDatabaseEntry(wcBytesLength);
 41  17
   /** ID handed to the next previously unseen word; seeded from the dictionary's key count. */
   int nextWordID = 0;
 42  
   /** (word ID, text ID) key as written by {@code setWT} -> packed occurrence data. */
   Database occurrencesOfWordInText;
 43  
   /** (text ID, word index) key as written by {@code setTI} -> anchor name, for "#" words not starting "#__". */
   Database anchorOfIndex;
 44  
   /** Same layout as {@code anchorOfIndex}, for words starting "#__". */
   Database blockmarkOfIndex;          // i.e. page of index
 45  
   /** Text ID -> the packed IDs of every word recorded for that text. */
   Database wordsInText;  
 46  
 
 47  
   /** Created in the constructor; used by {@code blockmarkerBeforeFirstOccurrence}. */
   private AnchorFinder blockmarkFinder;
 48  
 
 49  
   public static class CorruptionException extends FTIException {
 50  
 
 51  
     private static final long serialVersionUID = 6251343503765495522L;
 52  
     public String corruptionProblem;
 53  
 
 54  
     public CorruptionException(String corruptionProblem) {
 55  
       this.corruptionProblem = corruptionProblem;
 56  
     }
 57  
 
 58  
     public String getMessage() {
 59  
       return "The text index is corrupt, and the " +
 60  
              "index must be rebuilt (problem: " + corruptionProblem + ")";
 61  
     }
 62  
   }
 63  
 
 64  
   public IndexOther(File dbHome)
 65  
       throws IOException, DatabaseException, PropertyException {
 66  16
     this(dbHome, null);
 67  16
   }
 68  
 
 69  
   public IndexOther(File dbHome, Properties config)
 70  17
       throws IOException, DatabaseException, PropertyException {
 71  17
     this.dbHome = dbHome;
 72  17
     if (!dbHome.exists())
 73  0
       throw new IllegalArgumentException("dbHome `" + dbHome +
 74  
                      "' does nto exist");
 75  17
     if (!dbHome.isDirectory())
 76  0
       throw new IllegalArgumentException("dbHome `" + dbHome +
 77  
                      "' is not a directory");
 78  
 
 79  17
     if (config == null)
 80  
 //      config = PropertiesUtils.fromFile(new File(dbHome, propertiesName));
 81  17
       config = PropertiesUtils.fromResource(this.getClass());
 82  
 
 83  17
     idOfWord = DbUtils.openOrCreateBTreeDb(dbHome, "idOfWord", 
 84  
         PropertiesUtils.getOrDie_int(config, "idOfWord.cacheSize"));
 85  
 
 86  17
     occurrencesOfWordInText = DbUtils.openOrCreateBTreeDb(dbHome, "occurrencesOfWordInText", 
 87  
         PropertiesUtils.getOrDie_int(config, "occurrencesOfWordInText.cacheSize"));
 88  
     
 89  17
     nextWordID = ((BtreeStats)idOfWord.getStats(null,null)).getNumKeys();
 90  
 
 91  
 //    System.err.println("nextWordID = " + nextWordID);
 92  
 
 93  17
     anchorOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "anchorOfIndex", 
 94  
         PropertiesUtils.getOrDie_int(config, "anchorOfIndex.cacheSize"));
 95  
     
 96  17
     blockmarkOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "blockmarkOfIndex", 
 97  
         PropertiesUtils.getOrDie_int(config, "blockmarkOfIndex.cacheSize"));
 98  
     
 99  
     // make it a btree because of possible locality of reference when
 100  
     // processing many books
 101  17
     wordsInText = DbUtils.openOrCreateBTreeDb(dbHome, "wordsInText", 0);
 102  
     
 103  17
     blockmarkFinder = new AnchorFinder(this, true);
 104  17
   }
 105  
   
 106  282
   private static final class Buffer extends ByteArrayOutputStream {
 107  
     public byte[] buffer() {
 108  141
       return buf;
 109  
     }
 110  
 
 111  
     public int count() {
 112  141
       return count;
 113  
     }
 114  
   }
 115  
 
 116  
   public class WordRecord {
 117  
     public String word;
 118  
     public int wordID;
 119  141
     public Buffer occurrenceData = new Buffer();
 120  
     public int count;
 121  
 
 122  141
     public WordRecord(String word) throws FTIException, DatabaseException {
 123  141
       this.word = word;
 124  141
       wordID = idOfWord(word);
 125  141
     }
 126  
 
 127  
     // FIXME this really goes with WordTextSearchResults
 128  
 
 129  141
     private Packer wordIndexPacker = OnePacker.it;
 130  141
     private Packer offsetPacker = OnePacker.it;
 131  
 
 132  
     public void noteOccurrence(int wordIndex, int offset) {
 133  
       try {
 134  231
     while (wordIndex >= wordIndexPacker.numberMax()) {
 135  0
       wordIndexPacker.write(occurrenceData, wordIndexPacker.numberMax());
 136  0
       wordIndexPacker = wordIndexPacker.bigger();
 137  
     }
 138  
 
 139  231
     wordIndexPacker.write(occurrenceData, wordIndex);
 140  
 
 141  342
     while (offset >= offsetPacker.numberMax()) {
 142  111
       offsetPacker.write(occurrenceData, offsetPacker.numberMax());
 143  111
       offsetPacker = offsetPacker.bigger();
 144  
     }
 145  
 
 146  231
     offsetPacker.write(occurrenceData, offset);
 147  
       }
 148  0
       catch (IOException e) {
 149  0
     throw new UnexpectedExceptionException(e);
 150  231
       }
 151  
 
 152  231
       ++count;
 153  231
     }
 154  
   }
 155  
 
 156  
   /** Size in bytes of a (word ID, text ID) key: word ID at offset 0, text ID at offset 3. */
   static final int wtBytesLength = 8;
 157  
 
 158  
   static int getWT_wordID(byte[] bytes) {
 159  0
     return ThreePacker.number_(bytes, 0);
 160  
   }
 161  
 
 162  
   static long getWT_textID(byte[] bytes) {
 163  
     // Old version used 4-byte textid keys, 
 164  
     // we use 5 now to accommodate more bits
 165  
     // for the `Section' part
 166  
 
 167  0
     if (bytes.length == 3 + 5)
 168  0
       return FivePacker.number_(bytes, 3);
 169  0
     else if (bytes.length == 3 + 4)
 170  0
       throw new CorruptionException(
 171  
           "unexpected occurrence key length 7; are you running against an " +
 172  
           "index built with an old version of the bibliomania software, from " +
 173  
           "before the number of allowed `sections' was increased?");
 174  
     else
 175  0
       throw new CorruptionException(
 176  
           "unexpected occurrence key length " + bytes.length);
 177  
   }
 178  
 
 179  
   static void setWT(byte[] bytes, int wordID, long textID) {
 180  212
     ThreePacker.set_(bytes, 0, wordID);
 181  212
     FivePacker.set_(bytes, 3, textID);
 182  212
   }
 183  
 
 184  
   /** Size in bytes of a dictionary record: word ID at offset 0, occurrence count at offset 3. */
   static final int wcBytesLength = 6;
 185  
 
 186  
   static int getWC_wordID(byte[] bytes) {
 187  99
     return ThreePacker.number_(bytes, 0);
 188  
   } 
 189  
 
 190  
   static void setWC_count(byte[] bytes, int count) {
 191  183
     ThreePacker.set_(bytes, 3, count);
 192  183
   }
 193  
 
 194  
   static void setWC(byte[] bytes, int wordID, int count) {
 195  42
     ThreePacker.set_(bytes, 0, wordID);
 196  42
     setWC_count(bytes, count);
 197  42
   }
 198  
 
 199  
   static int getWC_count(byte[] bytes) {
 200  141
     return ThreePacker.number_(bytes, 3);
 201  
   } 
 202  
 
 203  
   static void setWord(DatabaseEntry key, String word) {
 204  283
     byte[] data = key.getData();
 205  283
     int length = Math.min(word.length(), wordTruncationLength);
 206  283
     if (data.length < length) {
 207  0
       key.setData(data = new byte[length]);
 208  0
       key.setUserBuffer(length, true);
 209  
     }
 210  283
     key.setSize(length);
 211  2016
     for (int i = 0; i < length; ++i)
 212  1733
       data[i] = (byte)word.charAt(i);
 213  283
   }
 214  
 
 215  
   public class WordIDExceededMaxException extends MelatiRuntimeException {
 216  
     /**
 217  
      * 
 218  
      */
 219  
     private static final long serialVersionUID = 1L;
 220  
 
 221  
     public String getMessage() {
 222  
       return "Word dictionary in FTI subsystem " + dbHome +
 223  
              " exceeded maximum size";
 224  
     }
 225  
   }
 226  
 
 227  
   public int idOfWord(String word) throws FTIException, DatabaseException {
 228  141
     word = word.toLowerCase();
 229  
     // FIXME synchronized???? is that a good idea??? what about WordFinder?
 230  141
     synchronized (idOfWord) {
 231  141
       setWord(idOfWordKey, word);
 232  141
       if (idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS)
 233  99
         return getWC_wordID(idOfWordData.getData());
 234  
       else {
 235  42
         if (nextWordID > 0xFFFFFF)
 236  0
           throw new WordIDExceededMaxException();
 237  42
         setWC(idOfWordData.getData(), nextWordID, 0);
 238  42
         idOfWord.put(null, idOfWordKey, idOfWordData);
 239  42
         return nextWordID++;
 240  
       }
 241  0
     }
 242  
   }
 243  
 
 244  
   private void addWordCount(String word, int count) throws DatabaseException {
 245  141
     synchronized (idOfWord) {
 246  141
       setWord(idOfWordKey, word.toLowerCase());
 247  141
       if (idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
 248  141
         byte[] data = idOfWordData.getData();
 249  141
         setWC_count(data, getWC_count(data) + count);
 250  141
         idOfWord.put(null, idOfWordKey, idOfWordData);
 251  
       }
 252  141
     }
 253  141
   }
 254  
 
 255  
   /** Size in bytes of a (text ID, word index) key: text ID at offset 0, index at offset 5. */
   public static final int textIdBytesLength = 8;
 256  
 
 257  
   public static void setTI(byte[] bytes, long textID, int index) {
 258  1
     FivePacker.set_(bytes, 0, textID);
 259  1
     ThreePacker.set_(bytes, 5, index);
 260  1
   }
 261  
 
 262  
   public static long getTI_textID(byte[] bytes) {
 263  0
     return FivePacker.number_(bytes, 0);
 264  
   }
 265  
 
 266  
   public void unIndex(long textID)
 267  
       throws FTIException, DatabaseException {
 268  2
     DatabaseEntry text = DbUtils.userMemDatabaseEntry(5);
 269  2
     FivePacker.set_(text.getData(), 0, textID);
 270  2
     DatabaseEntry words = DbUtils.userMemDatabaseEntry(256);
 271  
 
 272  2
     if (DbUtils.get(wordsInText, text, words, 256) == OperationStatus.SUCCESS) {
 273  1
       DatabaseEntry wt = DbUtils.userMemDatabaseEntry(wtBytesLength);
 274  72
       for (int i = 0; i < words.getSize(); i += 3) {
 275  71
         setWT(wt.getData(),
 276  
             ThreePacker.number_(words.getData(), i),
 277  
             textID);
 278  71
         occurrencesOfWordInText.delete(null, wt);
 279  
       }
 280  
     }
 281  
 
 282  2
     wordsInText.delete(null, text);
 283  2
   }
 284  
 
 285  
   private void noteOccurrence(Hashtable<String, WordRecord> wordRecords, String word,
 286  
                               int index, int offset) throws DatabaseException {
 287  
 
 288  231
     WordRecord wordRecord = (WordRecord)wordRecords.get(word);
 289  231
     if (wordRecord == null)
 290  141
       wordRecords.put(word, wordRecord = new WordRecord(word));
 291  
 
 292  231
     wordRecord.noteOccurrence(index, offset);
 293  231
   }
 294  
 
 295  
   public void index(Text text)
 296  
       throws FTIException, IOException, DatabaseException {
 297  
 
 298  2
     long textID = text.ftiTextID();
 299  
 
 300  2
     unIndex(textID);
 301  
 
 302  2
     DatabaseEntry anchorOfIndexKey = DbUtils.userMemDatabaseEntry(textIdBytesLength);
 303  2
     DatabaseEntry anchorOfIndexData = DbUtils.userMemDatabaseEntry(256);
 304  
 
 305  2
     Hashtable<String, WordRecord> wordRecords = new Hashtable<String, WordRecord>();
 306  2
     int[] offsetHistory = new int[contextWordsBeforeHit];
 307  2
     int wordCount = 0;
 308  
 
 309  2
     InputStream body = new BufferedInputStream(text.body());
 310  
     try {
 311  2
       for (IndexTokenizer words = new IndexTokenizer(body);
 312  233
            words.hasMoreWords();) {
 313  231
         String word = words.nextWord();
 314  231
         if (word.startsWith("#")) {
 315  1
           boolean blockmark = word.startsWith("#__");
 316  1
           setWord(anchorOfIndexData, word.substring(1));
 317  1
           setTI(anchorOfIndexKey.getData(), textID, words.wordIndex());
 318  1
           (blockmark ? blockmarkOfIndex : anchorOfIndex).put(
 319  
               null, anchorOfIndexKey, anchorOfIndexData);
 320  1
         }
 321  
         else
 322  230
           word = word.toLowerCase();
 323  
 
 324  231
         offsetHistory[wordCount % contextWordsBeforeHit] = words.wordOffset();
 325  
 
 326  231
         int index = words.wordIndex();
 327  231
         int offset = wordCount < contextWordsBeforeHit ?
 328  
                        offsetHistory[0] :
 329  
                        offsetHistory[(wordCount + 1) % contextWordsBeforeHit];
 330  
 
 331  231
         noteOccurrence(wordRecords, word, index, offset);
 332  231
         if (word.startsWith("$"))
 333  0
           noteOccurrence(wordRecords, word.substring(1), index, offset);
 334  
 
 335  231
         ++wordCount;
 336  231
       }
 337  
     }
 338  
     finally {
 339  2
       try { body.close(); } catch (IOException e) {}
 340  0
     }
 341  
 
 342  2
     byte[] allWordIDs = new byte[wordRecords.size() * 3];
 343  2
     int allWordIDs_p = 0;
 344  
 
 345  2
     DatabaseEntry occurrences = DbUtils.userMemDatabaseEntry(1);
 346  2
     DatabaseEntry wt = DbUtils.userMemDatabaseEntry(wtBytesLength);
 347  
 
 348  2
     for (Enumeration<WordRecord> w = wordRecords.elements(); w.hasMoreElements();) {
 349  141
       WordRecord wordRecord = (WordRecord)w.nextElement();
 350  141
       setWT(wt.getData(), wordRecord.wordID, textID);
 351  141
       occurrences.setData(wordRecord.occurrenceData.buffer());
 352  141
       occurrences.setSize(wordRecord.occurrenceData.count());
 353  141
       occurrences.setUserBuffer(occurrences.getData().length,true);
 354  141
       occurrencesOfWordInText.put(null, wt, occurrences);
 355  141
       addWordCount(wordRecord.word, wordRecord.count);
 356  
 
 357  141
       ThreePacker.set_(allWordIDs, allWordIDs_p, wordRecord.wordID);
 358  141
       allWordIDs_p += 3;
 359  141
     }
 360  
 
 361  2
     DatabaseEntry textIDDatabaseEntry = DbUtils.userMemDatabaseEntry(5);
 362  2
     FivePacker.set_(textIDDatabaseEntry.getData(), 0, textID);
 363  2
     wordsInText.put(null, textIDDatabaseEntry, DbUtils.userMemDatabaseEntry(allWordIDs));
 364  2
   }
 365  
 
 366  
   public SearchResults andSearchResults(String[] args)
 367  
       throws FTIException, DatabaseException {
 368  0
     return new AndSearchResults(this, args, false);
 369  
   }
 370  
 
 371  
   public SearchResults groupSearchResults(String[] args)
 372  
       throws FTIException, DatabaseException {
 373  0
     return new AndSearchResults(this, args, true);
 374  
   }
 375  
 
 376  
   // i.e., which page is the #anchor marking this poem on?
 377  
 
 378  
   public String blockmarkerBeforeFirstOccurrence(long textID, String word)
 379  
       throws DatabaseException {
 380  0
     TextStream stream = new TextStream(this, word);
 381  0
     stream.gotoText(textID);
 382  0
     stream.init();
 383  0
     int index = stream.currentWordIndex();
 384  0
     return index == -1 ? null : blockmarkFinder.anchorOfIndex(textID, index);
 385  
   }
 386  
 
 387  
   public void stat() throws DatabaseException {
 388  0
     Cursor words = idOfWord.openCursor(null, null);
 389  
     try {
 390  0
       int total = 0;
 391  0
       while (words.getNext(idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
 392  0
         System.out.write(idOfWordKey.getData(), 0, idOfWordKey.getSize());
 393  0
         System.out.print(": ");
 394  0
         int count = getWC_count(idOfWordData.getData());
 395  0
         System.out.println(count);
 396  0
         total += count;
 397  0
       }
 398  0
       System.out.println("--- total " + total);
 399  
     }
 400  
     finally {
 401  0
       try { words.close(); } catch (DatabaseException e) {}
 402  0
     }
 403  0
   }
 404  
 
 405  
   public void flush() throws DatabaseException {
 406  2
     idOfWord.sync();
 407  2
     occurrencesOfWordInText.sync();
 408  2
     anchorOfIndex.sync();
 409  2
     blockmarkOfIndex.sync();
 410  2
     wordsInText.sync();
 411  2
   }
 412  
 
 413  
   public void close() throws DatabaseException {
 414  2
     idOfWord.close(false);
 415  2
     occurrencesOfWordInText.close(false);
 416  2
     anchorOfIndex.close(false);
 417  2
     blockmarkOfIndex.close(false);
 418  2
     wordsInText.close(false);
 419  2
   }
 420  
 
 421  
   protected void finalize() throws Throwable {
 422  2
     close();
 423  2
   }
 424  
 
 425  
   public void appendTerms(Vector<SearchResults> terms, String query, boolean keywords)
 426  
       throws DatabaseException {
 427  0
     Vector<String> group = null;
 428  
 
 429  0
     StringTokenizer tokens = new StringTokenizer(query, " \t\n\r\f\"", true);
 430  
 
 431  0
     while (tokens.hasMoreTokens()) {
 432  0
       String token = tokens.nextToken();
 433  0
       switch (token.charAt(0)) {
 434  
         case ' ': case '\t' : case '\n': case '\r': case '\f':
 435  0
       break;
 436  
         case '"':
 437  0
       if (group == null)
 438  0
         group = new Vector<String>();
 439  
       else {
 440  0
         String[] g = new String[group.size()];
 441  0
         group.copyInto(g);
 442  0
         terms.addElement(new AndSearchResults(this, g, true));
 443  0
         group = null;
 444  
       }
 445  0
       break;
 446  
         default:
 447  0
           if (keywords)
 448  0
             token = '$' + token;
 449  
 
 450  0
       if (group != null)
 451  0
         group.addElement(token);
 452  
       else
 453  0
         terms.addElement(new TextStream(this, token));
 454  
       }
 455  0
     }
 456  
 
 457  0
     if (group != null) {
 458  0
       String[] g = new String[group.size()];
 459  0
       group.copyInto(g);
 460  0
       terms.addElement(new AndSearchResults(this, g, true));
 461  
     }
 462  0
   }
 463  
 
 464  
   public SearchResults querySearchResults(
 465  
       String textQuery, String keywordsQuery) throws DatabaseException, FTIException {
 466  0
     Vector<SearchResults> terms = new Vector<SearchResults>();
 467  0
     if (textQuery != null) appendTerms(terms, textQuery, false);
 468  0
     if (keywordsQuery != null) appendTerms(terms, keywordsQuery, true);
 469  0
     SearchResults[] t = new SearchResults[terms.size()];
 470  0
     terms.copyInto(t);
 471  0
     return new AndSearchResults(t);
 472  
   }
 473  
 
 474  
   public SearchResults querySearchResults(String query)
 475  
       throws DatabaseException, FTIException {
 476  0
     return querySearchResults(query, null);
 477  
   }
 478  
 
 479  
   public IndexCursor allEntries() throws DatabaseException {
 480  1
     return new IndexCursor(this);
 481  
   }
 482  
 
 483  1
   /** Debug switch; not read anywhere in this class -- presumably consulted by other classes (TODO confirm). */
   public static boolean debug = false;
 484  
 
 485  
   public static void main(final String[] args) throws Exception {
 486  0
     IndexOther index = new IndexOther(
 487  
         new File(USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI));
 488  
 
 489  0
     if (args[0].equals("-index"))
 490  0
       for (int i = 1; i < args.length; i += 2) {
 491  0
         System.err.println("indexing " + args[i + 1]);
 492  0
         final String filename = args[i + 1];
 493  0
         final long textID = Long.parseLong(args[i]);
 494  0
         final InputStream body = new BufferedInputStream(new FileInputStream(
 495  
             filename));
 496  0
         index.index(new Text() {
 497  
           public InputStream body() {
 498  0
             return body;
 499  
           }
 500  
 
 501  
           public InputStream bodyForFragment() {
 502  0
             return body;
 503  
           }
 504  
 
 505  
           public long ftiTextID() {
 506  0
             return textID;
 507  
           }
 508  
         });
 509  
       }
 510  0
     else if (args[0].equals("-stat"))
 511  0
       index.stat();
 512  0
     else if (args[0].equals("-anchorpage"))
 513  0
       System.out.println(index.blockmarkerBeforeFirstOccurrence(
 514  
           Integer.parseInt(args[1]), args[2]));
 515  
     else {
 516  
       SearchResults results;
 517  0
       if (args[0].charAt(0) == '_') {
 518  0
         args[0] = args[0].substring(1);
 519  0
         results = index.groupSearchResults(args);
 520  0
         System.out.println("phrase");
 521  0
       } else if (args[0].equals("-query")) {
 522  0
         results = index.querySearchResults(args[1]);
 523  
       } else {
 524  0
         results = index.andSearchResults(args);
 525  0
         System.out.println("and");
 526  
       }
 527  
 
 528  0
       for (results.gotoText(0L); results.currentTextID() != -1L;
 529  
            // FIXME this isn't very clever! (maybe?)
 530  0
            results.gotoText(results.currentTextID() + 1L)) {
 531  0
         System.out.print("-- " + results.currentTextID() + "\n ");
 532  0
         for (; results.currentOffset() != -1; results.skipToNextHit())
 533  0
           System.out.print(" " + // results.currentAnchor() + ":" +
 534  
               results.currentOffset());
 535  0
         System.out.println();
 536  
       }
 537  
     }
 538  0
     index.close();
 539  0
   }
 540  
 }