Coverage Report

 package org.paneris.bibliomania.fti;
 
 import java.io.BufferedInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Enumeration;
 import java.util.Hashtable;
 import java.util.Properties;
 import java.util.StringTokenizer;
 import java.util.Vector;
 
 import org.melati.util.MelatiRuntimeException;
 import org.melati.util.PropertiesUtils;
 import org.melati.util.PropertyException;
 import org.melati.util.UnexpectedExceptionException;
 
 import com.sleepycat.db.Database;
 import com.sleepycat.db.BtreeStats;
 import com.sleepycat.db.DatabaseConfig;
 import com.sleepycat.db.DatabaseException;
 import com.sleepycat.db.Cursor;
 import com.sleepycat.db.DatabaseEntry;
 import com.sleepycat.db.LockMode;
 import com.sleepycat.db.OperationStatus;
 
 public class IndexOther {
 
   /** Index directory opened by {@link #main(String[])}. */
   private static final String USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI = "/usr/local/share/bibliomania/workspace/infoFTI";
   /** Maximum number of characters of a word that setWord() copies into a key; also the size of the idOfWordKey buffer. */
   public static final int wordTruncationLength = 100;
   /** Length of the ring of recent word offsets kept by index() when choosing the offset recorded for a hit. */
   public static final int contextWordsBeforeHit = 5;


   /** Directory handed to the constructor; every table is opened relative to it. */
   File dbHome;
   /** Table keyed by (truncated, lower-cased) word; the value is a wcBytesLength-byte record holding word ID and count. */
   Database idOfWord;
   /** NOTE(review): never assigned or read in this class -- possibly a leftover; confirm before removing. */
   DatabaseConfig idOfWordConfig;
   /** Scratch key entry reused for idOfWord lookups; idOfWord() and addWordCount() touch it under synchronized (idOfWord). */
   DatabaseEntry idOfWordKey = DbUtils.userMemDatabaseEntry(wordTruncationLength);
   /** Scratch data entry paired with idOfWordKey, used under the same lock. */
   DatabaseEntry idOfWordData = DbUtils.userMemDatabaseEntry(wcBytesLength);
   /** ID that idOfWord() hands to the next previously unseen word; the constructor seeds it from the key count of the idOfWord table. */
   int nextWordID = 0;
   /** Table keyed by (word ID, text ID) as laid out by setWT(); the value is the packed occurrence data built by WordRecord. */
   Database occurrencesOfWordInText;
   /** Table keyed by (text ID, word index) as laid out by setTI(); receives the '#' words that do not start with "#__". */
   Database anchorOfIndex;
   /** Same key layout as anchorOfIndex; receives the words that start with "#__". */
   Database blockmarkOfIndex;          // i.e. page of index
   /** Table keyed by text ID; the value is the concatenated 3-byte IDs of every distinct word of that text. */
   Database wordsInText;  

   /** AnchorFinder created with (this, true) in the constructor; used by blockmarkerBeforeFirstOccurrence(). */
   private AnchorFinder blockmarkFinder;
 
   public static class CorruptionException extends FTIException {
 
     private static final long serialVersionUID = 6251343503765495522L;
     public String corruptionProblem;
 
     public CorruptionException(String corruptionProblem) {
       this.corruptionProblem = corruptionProblem;
     }
 
     public String getMessage() {
       return "The text index is corrupt, and the " +
              "index must be rebuilt (problem: " + corruptionProblem + ")";
     }
   }
 
   /**
    * Opens the index stored in <code>dbHome</code> using the default
    * configuration: delegates to the two-argument constructor with a
    * <code>null</code> config, which makes it load the properties through
    * <code>PropertiesUtils.fromResource</code>.
    *
    * @param dbHome existing directory holding (or to hold) the index tables
    * @throws IllegalArgumentException if dbHome does not exist or is not a directory
    */
   public IndexOther(File dbHome)
       throws IOException, DatabaseException, PropertyException {
     this(dbHome, null);
   }
 
   public IndexOther(File dbHome, Properties config)
       throws IOException, DatabaseException, PropertyException {
     this.dbHome = dbHome;
     if (!dbHome.exists())
       throw new IllegalArgumentException("dbHome `" + dbHome +
                      "' does nto exist");
     if (!dbHome.isDirectory())
       throw new IllegalArgumentException("dbHome `" + dbHome +
                      "' is not a directory");
 
     if (config == null)
 //      config = PropertiesUtils.fromFile(new File(dbHome, propertiesName));
       config = PropertiesUtils.fromResource(this.getClass());
 
     idOfWord = DbUtils.openOrCreateBTreeDb(dbHome, "idOfWord", 
         PropertiesUtils.getOrDie_int(config, "idOfWord.cacheSize"));
 
     occurrencesOfWordInText = DbUtils.openOrCreateBTreeDb(dbHome, "occurrencesOfWordInText", 
         PropertiesUtils.getOrDie_int(config, "occurrencesOfWordInText.cacheSize"));
     
     nextWordID = ((BtreeStats)idOfWord.getStats(null,null)).getNumKeys();
 
 //    System.err.println("nextWordID = " + nextWordID);
 
     anchorOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "anchorOfIndex", 
         PropertiesUtils.getOrDie_int(config, "anchorOfIndex.cacheSize"));
     
     blockmarkOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "blockmarkOfIndex", 
         PropertiesUtils.getOrDie_int(config, "blockmarkOfIndex.cacheSize"));
     
     // make it a btree because of possible locality of reference when
     // processing many books
     wordsInText = DbUtils.openOrCreateBTreeDb(dbHome, "wordsInText", 0);
     
     blockmarkFinder = new AnchorFinder(this, true);
   }
   
   private static final class Buffer extends ByteArrayOutputStream {
     public byte[] buffer() {
       return buf;
     }
 
     public int count() {
       return count;
     }
   }
 
   public class WordRecord {
     public String word;
     public int wordID;
     public Buffer occurrenceData = new Buffer();
     public int count;
 
     public WordRecord(String word) throws FTIException, DatabaseException {
       this.word = word;
       wordID = idOfWord(word);
     }
 
     // FIXME this really goes with WordTextSearchResults
 
     private Packer wordIndexPacker = OnePacker.it;
     private Packer offsetPacker = OnePacker.it;
 
     public void noteOccurrence(int wordIndex, int offset) {
       try {
     while (wordIndex >= wordIndexPacker.numberMax()) {
       wordIndexPacker.write(occurrenceData, wordIndexPacker.numberMax());
       wordIndexPacker = wordIndexPacker.bigger();
     }
 
     wordIndexPacker.write(occurrenceData, wordIndex);
 
     while (offset >= offsetPacker.numberMax()) {
       offsetPacker.write(occurrenceData, offsetPacker.numberMax());
       offsetPacker = offsetPacker.bigger();
     }
 
     offsetPacker.write(occurrenceData, offset);
       }
       catch (IOException e) {
     throw new UnexpectedExceptionException(e);
       }
 
       ++count;
     }
   }
 
   /** Length in bytes of a (word ID, text ID) key as written by setWT(): 3 bytes of word ID followed by 5 bytes of text ID. */
   static final int wtBytesLength = 8;
 
   static int getWT_wordID(byte[] bytes) {
     return ThreePacker.number_(bytes, 0);
   }
 
   static long getWT_textID(byte[] bytes) {
     // Old version used 4-byte textid keys, 
     // we use 5 now to accommodate more bits
     // for the `Section' part
 
     if (bytes.length == 3 + 5)
       return FivePacker.number_(bytes, 3);
     else if (bytes.length == 3 + 4)
       throw new CorruptionException(
           "unexpected occurrence key length 7; are you running against an " +
           "index built with an old version of the bibliomania software, from " +
           "before the number of allowed `sections' was increased?");
     else
       throw new CorruptionException(
           "unexpected occurrence key length " + bytes.length);
   }
 
   static void setWT(byte[] bytes, int wordID, long textID) {
     ThreePacker.set_(bytes, 0, wordID);
     FivePacker.set_(bytes, 3, textID);
   }
 
   /** Length in bytes of a (word ID, count) record as written by setWC(): 3 bytes of word ID followed by 3 bytes of count. */
   static final int wcBytesLength = 6;
 
   static int getWC_wordID(byte[] bytes) {
     return ThreePacker.number_(bytes, 0);
   } 
 
   static void setWC_count(byte[] bytes, int count) {
     ThreePacker.set_(bytes, 3, count);
   }
 
   static void setWC(byte[] bytes, int wordID, int count) {
     ThreePacker.set_(bytes, 0, wordID);
     setWC_count(bytes, count);
   }
 
   static int getWC_count(byte[] bytes) {
     return ThreePacker.number_(bytes, 3);
   } 
 
   static void setWord(DatabaseEntry key, String word) {
     byte[] data = key.getData();
     int length = Math.min(word.length(), wordTruncationLength);
     if (data.length < length) {
       key.setData(data = new byte[length]);
       key.setUserBuffer(length, true);
     }
     key.setSize(length);
     for (int i = 0; i < length; ++i)
       data[i] = (byte)word.charAt(i);
   }
 
   public class WordIDExceededMaxException extends MelatiRuntimeException {
     /**
      * 
      */
     private static final long serialVersionUID = 1L;
 
     public String getMessage() {
       return "Word dictionary in FTI subsystem " + dbHome +
              " exceeded maximum size";
     }
   }
 
   public int idOfWord(String word) throws FTIException, DatabaseException {
     word = word.toLowerCase();
     // FIXME synchronized???? is that a good idea??? what about WordFinder?
     synchronized (idOfWord) {
       setWord(idOfWordKey, word);
       if (idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS)
         return getWC_wordID(idOfWordData.getData());
       else {
         if (nextWordID > 0xFFFFFF)
           throw new WordIDExceededMaxException();
         setWC(idOfWordData.getData(), nextWordID, 0);
         idOfWord.put(null, idOfWordKey, idOfWordData);
         return nextWordID++;
       }
     }
   }
 
   private void addWordCount(String word, int count) throws DatabaseException {
     synchronized (idOfWord) {
       setWord(idOfWordKey, word.toLowerCase());
       if (idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
         byte[] data = idOfWordData.getData();
         setWC_count(data, getWC_count(data) + count);
         idOfWord.put(null, idOfWordKey, idOfWordData);
       }
     }
   }
 
   /** Length in bytes of a (text ID, word index) key as written by setTI(): 5 bytes of text ID followed by 3 bytes of index. */
   public static final int textIdBytesLength = 8;
 
   public static void setTI(byte[] bytes, long textID, int index) {
     FivePacker.set_(bytes, 0, textID);
     ThreePacker.set_(bytes, 5, index);
   }
 
   public static long getTI_textID(byte[] bytes) {
     return FivePacker.number_(bytes, 0);
   }
 
   public void unIndex(long textID)
       throws FTIException, DatabaseException {
     DatabaseEntry text = DbUtils.userMemDatabaseEntry(5);
     FivePacker.set_(text.getData(), 0, textID);
     DatabaseEntry words = DbUtils.userMemDatabaseEntry(256);
 
     if (DbUtils.get(wordsInText, text, words, 256) == OperationStatus.SUCCESS) {
       DatabaseEntry wt = DbUtils.userMemDatabaseEntry(wtBytesLength);
       for (int i = 0; i < words.getSize(); i += 3) {
         setWT(wt.getData(),
             ThreePacker.number_(words.getData(), i),
             textID);
         occurrencesOfWordInText.delete(null, wt);
       }
     }
 
     wordsInText.delete(null, text);
   }
 
   private void noteOccurrence(Hashtable<String, WordRecord> wordRecords, String word,
                               int index, int offset) throws DatabaseException {
 
     WordRecord wordRecord = (WordRecord)wordRecords.get(word);
     if (wordRecord == null)
       wordRecords.put(word, wordRecord = new WordRecord(word));
 
     wordRecord.noteOccurrence(index, offset);
   }
 
   public void index(Text text)
       throws FTIException, IOException, DatabaseException {
 
     long textID = text.ftiTextID();
 
     unIndex(textID);
 
     DatabaseEntry anchorOfIndexKey = DbUtils.userMemDatabaseEntry(textIdBytesLength);
     DatabaseEntry anchorOfIndexData = DbUtils.userMemDatabaseEntry(256);
 
     Hashtable<String, WordRecord> wordRecords = new Hashtable<String, WordRecord>();
     int[] offsetHistory = new int[contextWordsBeforeHit];
     int wordCount = 0;
 
     InputStream body = new BufferedInputStream(text.body());
     try {
       for (IndexTokenizer words = new IndexTokenizer(body);
            words.hasMoreWords();) {
         String word = words.nextWord();
         if (word.startsWith("#")) {
           boolean blockmark = word.startsWith("#__");
           setWord(anchorOfIndexData, word.substring(1));
           setTI(anchorOfIndexKey.getData(), textID, words.wordIndex());
           (blockmark ? blockmarkOfIndex : anchorOfIndex).put(
               null, anchorOfIndexKey, anchorOfIndexData);
         }
         else
           word = word.toLowerCase();
 
         offsetHistory[wordCount % contextWordsBeforeHit] = words.wordOffset();
 
         int index = words.wordIndex();
         int offset = wordCount < contextWordsBeforeHit ?
                        offsetHistory[0] :
                        offsetHistory[(wordCount + 1) % contextWordsBeforeHit];
 
         noteOccurrence(wordRecords, word, index, offset);
         if (word.startsWith("$"))
           noteOccurrence(wordRecords, word.substring(1), index, offset);
 
         ++wordCount;
       }
     }
     finally {
       try { body.close(); } catch (IOException e) {}
     }
 
     byte[] allWordIDs = new byte[wordRecords.size() * 3];
     int allWordIDs_p = 0;
 
     DatabaseEntry occurrences = DbUtils.userMemDatabaseEntry(1);
     DatabaseEntry wt = DbUtils.userMemDatabaseEntry(wtBytesLength);
 
     for (Enumeration<WordRecord> w = wordRecords.elements(); w.hasMoreElements();) {
       WordRecord wordRecord = (WordRecord)w.nextElement();
       setWT(wt.getData(), wordRecord.wordID, textID);
       occurrences.setData(wordRecord.occurrenceData.buffer());
       occurrences.setSize(wordRecord.occurrenceData.count());
       occurrences.setUserBuffer(occurrences.getData().length,true);
       occurrencesOfWordInText.put(null, wt, occurrences);
       addWordCount(wordRecord.word, wordRecord.count);
 
       ThreePacker.set_(allWordIDs, allWordIDs_p, wordRecord.wordID);
       allWordIDs_p += 3;
     }
 
     DatabaseEntry textIDDatabaseEntry = DbUtils.userMemDatabaseEntry(5);
     FivePacker.set_(textIDDatabaseEntry.getData(), 0, textID);
     wordsInText.put(null, textIDDatabaseEntry, DbUtils.userMemDatabaseEntry(allWordIDs));
   }
 
   public SearchResults andSearchResults(String[] args)
       throws FTIException, DatabaseException {
     return new AndSearchResults(this, args, false);
   }
 
   public SearchResults groupSearchResults(String[] args)
       throws FTIException, DatabaseException {
     return new AndSearchResults(this, args, true);
   }
 
   // i.e., which page is the #anchor marking this poem on?
 
   public String blockmarkerBeforeFirstOccurrence(long textID, String word)
       throws DatabaseException {
     TextStream stream = new TextStream(this, word);
     stream.gotoText(textID);
     stream.init();
     int index = stream.currentWordIndex();
     return index == -1 ? null : blockmarkFinder.anchorOfIndex(textID, index);
   }
 
   public void stat() throws DatabaseException {
     Cursor words = idOfWord.openCursor(null, null);
     try {
       int total = 0;
       while (words.getNext(idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
         System.out.write(idOfWordKey.getData(), 0, idOfWordKey.getSize());
         System.out.print(": ");
         int count = getWC_count(idOfWordData.getData());
         System.out.println(count);
         total += count;
       }
       System.out.println("--- total " + total);
     }
     finally {
       try { words.close(); } catch (DatabaseException e) {}
     }
   }
 
   public void flush() throws DatabaseException {
     idOfWord.sync();
     occurrencesOfWordInText.sync();
     anchorOfIndex.sync();
     blockmarkOfIndex.sync();
     wordsInText.sync();
   }
 
   public void close() throws DatabaseException {
     idOfWord.close(false);
     occurrencesOfWordInText.close(false);
     anchorOfIndex.close(false);
     blockmarkOfIndex.close(false);
     wordsInText.close(false);
   }
 
   protected void finalize() throws Throwable {
     close();
   }
 
   public void appendTerms(Vector<SearchResults> terms, String query, boolean keywords)
       throws DatabaseException {
     Vector<String> group = null;
 
     StringTokenizer tokens = new StringTokenizer(query, " \t\n\r\f\"", true);
 
     while (tokens.hasMoreTokens()) {
       String token = tokens.nextToken();
       switch (token.charAt(0)) {
         case ' ': case '\t' : case '\n': case '\r': case '\f':
       break;
         case '"':
       if (group == null)
         group = new Vector<String>();
       else {
         String[] g = new String[group.size()];
         group.copyInto(g);
         terms.addElement(new AndSearchResults(this, g, true));
         group = null;
       }
       break;
         default:
           if (keywords)
             token = '$' + token;
 
       if (group != null)
         group.addElement(token);
       else
         terms.addElement(new TextStream(this, token));
       }
     }
 
     if (group != null) {
       String[] g = new String[group.size()];
       group.copyInto(g);
       terms.addElement(new AndSearchResults(this, g, true));
     }
   }
 
   public SearchResults querySearchResults(
       String textQuery, String keywordsQuery) throws DatabaseException, FTIException {
     Vector<SearchResults> terms = new Vector<SearchResults>();
     if (textQuery != null) appendTerms(terms, textQuery, false);
     if (keywordsQuery != null) appendTerms(terms, keywordsQuery, true);
     SearchResults[] t = new SearchResults[terms.size()];
     terms.copyInto(t);
     return new AndSearchResults(t);
   }
 
   public SearchResults querySearchResults(String query)
       throws DatabaseException, FTIException {
     return querySearchResults(query, null);
   }
 
   public IndexCursor allEntries() throws DatabaseException {
     return new IndexCursor(this);
   }
 
   /** Debug switch; not read anywhere in this class -- presumably consulted by other classes in the package (confirm). */
   public static boolean debug = false;
 
   public static void main(final String[] args) throws Exception {
     IndexOther index = new IndexOther(
         new File(USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI));
 
     if (args[0].equals("-index"))
       for (int i = 1; i < args.length; i += 2) {
         System.err.println("indexing " + args[i + 1]);
         final String filename = args[i + 1];
         final long textID = Long.parseLong(args[i]);
         final InputStream body = new BufferedInputStream(new FileInputStream(
             filename));
         index.index(new Text() {
           public InputStream body() {
             return body;
           }
 
           public InputStream bodyForFragment() {
             return body;
           }
 
           public long ftiTextID() {
             return textID;
           }
         });
       }
     else if (args[0].equals("-stat"))
       index.stat();
     else if (args[0].equals("-anchorpage"))
       System.out.println(index.blockmarkerBeforeFirstOccurrence(
           Integer.parseInt(args[1]), args[2]));
     else {
       SearchResults results;
       if (args[0].charAt(0) == '_') {
         args[0] = args[0].substring(1);
         results = index.groupSearchResults(args);
         System.out.println("phrase");
       } else if (args[0].equals("-query")) {
         results = index.querySearchResults(args[1]);
       } else {
         results = index.andSearchResults(args);
         System.out.println("and");
       }
 
       for (results.gotoText(0L); results.currentTextID() != -1L;
            // FIXME this isn't very clever! (maybe?)
            results.gotoText(results.currentTextID() + 1L)) {
         System.out.print("-- " + results.currentTextID() + "\n ");
         for (; results.currentOffset() != -1; results.skipToNextHit())
           System.out.print(" " + // results.currentAnchor() + ":" +
               results.currentOffset());
         System.out.println();
       }
     }
     index.close();
   }
 }

1		package org.paneris.bibliomania.fti;
2
3		import java.io.BufferedInputStream;
4		import java.io.ByteArrayOutputStream;
5		import java.io.File;
6		import java.io.FileInputStream;
7		import java.io.IOException;
8		import java.io.InputStream;
9		import java.util.Enumeration;
10		import java.util.Hashtable;
11		import java.util.Properties;
12		import java.util.StringTokenizer;
13		import java.util.Vector;
14
15		import org.melati.util.MelatiRuntimeException;
16		import org.melati.util.PropertiesUtils;
17		import org.melati.util.PropertyException;
18		import org.melati.util.UnexpectedExceptionException;
19
20		import com.sleepycat.db.Database;
21		import com.sleepycat.db.BtreeStats;
22		import com.sleepycat.db.DatabaseConfig;
23		import com.sleepycat.db.DatabaseException;
24		import com.sleepycat.db.Cursor;
25		import com.sleepycat.db.DatabaseEntry;
26		import com.sleepycat.db.LockMode;
27		import com.sleepycat.db.OperationStatus;
28
29		public class IndexOther {
30
31		private static final String USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI = "/usr/local/share/bibliomania/workspace/infoFTI";
32		public static final int wordTruncationLength = 100;
33		public static final int contextWordsBeforeHit = 5;
34
35
36		File dbHome;
37		Database idOfWord;
38		DatabaseConfig idOfWordConfig;
39	17	DatabaseEntry idOfWordKey = DbUtils.userMemDatabaseEntry(wordTruncationLength);
40	17	DatabaseEntry idOfWordData = DbUtils.userMemDatabaseEntry(wcBytesLength);
41	17	int nextWordID = 0;
42		Database occurrencesOfWordInText;
43		Database anchorOfIndex;
44		Database blockmarkOfIndex; // i.e. page of index
45		Database wordsInText;
46
47		private AnchorFinder blockmarkFinder;
48
49		public static class CorruptionException extends FTIException {
50
51		private static final long serialVersionUID = 6251343503765495522L;
52		public String corruptionProblem;
53
54		public CorruptionException(String corruptionProblem) {
55		this.corruptionProblem = corruptionProblem;
56		}
57
58		public String getMessage() {
59		return "The text index is corrupt, and the " +
60		"index must be rebuilt (problem: " + corruptionProblem + ")";
61		}
62		}
63
64		public IndexOther(File dbHome)
65		throws IOException, DatabaseException, PropertyException {
66	16	this(dbHome, null);
67	16	}
68
69		public IndexOther(File dbHome, Properties config)
70	17	throws IOException, DatabaseException, PropertyException {
71	17	this.dbHome = dbHome;
72	17	if (!dbHome.exists())
73	0	throw new IllegalArgumentException("dbHome `" + dbHome +
74		"' does nto exist");
75	17	if (!dbHome.isDirectory())
76	0	throw new IllegalArgumentException("dbHome `" + dbHome +
77		"' is not a directory");
78
79	17	if (config == null)
80		// config = PropertiesUtils.fromFile(new File(dbHome, propertiesName));
81	17	config = PropertiesUtils.fromResource(this.getClass());
82
83	17	idOfWord = DbUtils.openOrCreateBTreeDb(dbHome, "idOfWord",
84		PropertiesUtils.getOrDie_int(config, "idOfWord.cacheSize"));
85
86	17	occurrencesOfWordInText = DbUtils.openOrCreateBTreeDb(dbHome, "occurrencesOfWordInText",
87		PropertiesUtils.getOrDie_int(config, "occurrencesOfWordInText.cacheSize"));
88
89	17	nextWordID = ((BtreeStats)idOfWord.getStats(null,null)).getNumKeys();
90
91		// System.err.println("nextWordID = " + nextWordID);
92
93	17	anchorOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "anchorOfIndex",
94		PropertiesUtils.getOrDie_int(config, "anchorOfIndex.cacheSize"));
95
96	17	blockmarkOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "blockmarkOfIndex",
97		PropertiesUtils.getOrDie_int(config, "blockmarkOfIndex.cacheSize"));
98
99		// make it a btree because of possible locality of reference when
100		// processing many books
101	17	wordsInText = DbUtils.openOrCreateBTreeDb(dbHome, "wordsInText", 0);
102
103	17	blockmarkFinder = new AnchorFinder(this, true);
104	17	}
105
106	282	private static final class Buffer extends ByteArrayOutputStream {
107		public byte[] buffer() {
108	141	return buf;
109		}
110
111		public int count() {
112	141	return count;
113		}
114		}
115
116		public class WordRecord {
117		public String word;
118		public int wordID;
119	141	public Buffer occurrenceData = new Buffer();
120		public int count;
121
122	141	public WordRecord(String word) throws FTIException, DatabaseException {
123	141	this.word = word;
124	141	wordID = idOfWord(word);
125	141	}
126
127		// FIXME this really goes with WordTextSearchResults
128
129	141	private Packer wordIndexPacker = OnePacker.it;
130	141	private Packer offsetPacker = OnePacker.it;
131
132		public void noteOccurrence(int wordIndex, int offset) {
133		try {
134	231	while (wordIndex >= wordIndexPacker.numberMax()) {
135	0	wordIndexPacker.write(occurrenceData, wordIndexPacker.numberMax());
136	0	wordIndexPacker = wordIndexPacker.bigger();
137		}
138
139	231	wordIndexPacker.write(occurrenceData, wordIndex);
140
141	342	while (offset >= offsetPacker.numberMax()) {
142	111	offsetPacker.write(occurrenceData, offsetPacker.numberMax());
143	111	offsetPacker = offsetPacker.bigger();
144		}
145
146	231	offsetPacker.write(occurrenceData, offset);
147		}
148	0	catch (IOException e) {
149	0	throw new UnexpectedExceptionException(e);
150	231	}
151
152	231	++count;
153	231	}
154		}
155
156		static final int wtBytesLength = 8;
157
158		static int getWT_wordID(byte[] bytes) {
159	0	return ThreePacker.number_(bytes, 0);
160		}
161
162		static long getWT_textID(byte[] bytes) {
163		// Old version used 4-byte textid keys,
164		// we use 5 now to accommodate more bits
165		// for the `Section' part
166
167	0	if (bytes.length == 3 + 5)
168	0	return FivePacker.number_(bytes, 3);
169	0	else if (bytes.length == 3 + 4)
170	0	throw new CorruptionException(
171		"unexpected occurrence key length 7; are you running against an " +
172		"index built with an old version of the bibliomania software, from " +
173		"before the number of allowed `sections' was increased?");
174		else
175	0	throw new CorruptionException(
176		"unexpected occurrence key length " + bytes.length);
177		}
178
179		static void setWT(byte[] bytes, int wordID, long textID) {
180	212	ThreePacker.set_(bytes, 0, wordID);
181	212	FivePacker.set_(bytes, 3, textID);
182	212	}
183
184		static final int wcBytesLength = 6;
185
186		static int getWC_wordID(byte[] bytes) {
187	99	return ThreePacker.number_(bytes, 0);
188		}
189
190		static void setWC_count(byte[] bytes, int count) {
191	183	ThreePacker.set_(bytes, 3, count);
192	183	}
193
194		static void setWC(byte[] bytes, int wordID, int count) {
195	42	ThreePacker.set_(bytes, 0, wordID);
196	42	setWC_count(bytes, count);
197	42	}
198
199		static int getWC_count(byte[] bytes) {
200	141	return ThreePacker.number_(bytes, 3);
201		}
202
203		static void setWord(DatabaseEntry key, String word) {
204	283	byte[] data = key.getData();
205	283	int length = Math.min(word.length(), wordTruncationLength);
206	283	if (data.length < length) {
207	0	key.setData(data = new byte[length]);
208	0	key.setUserBuffer(length, true);
209		}
210	283	key.setSize(length);
211	2016	for (int i = 0; i < length; ++i)
212	1733	data[i] = (byte)word.charAt(i);
213	283	}
214
215		public class WordIDExceededMaxException extends MelatiRuntimeException {
216		/**
217		*
218		*/
219		private static final long serialVersionUID = 1L;
220
221		public String getMessage() {
222		return "Word dictionary in FTI subsystem " + dbHome +
223		" exceeded maximum size";
224		}
225		}
226
227		public int idOfWord(String word) throws FTIException, DatabaseException {
228	141	word = word.toLowerCase();
229		// FIXME synchronized???? is that a good idea??? what about WordFinder?
230	141	synchronized (idOfWord) {
231	141	setWord(idOfWordKey, word);
232	141	if (idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS)
233	99	return getWC_wordID(idOfWordData.getData());
234		else {
235	42	if (nextWordID > 0xFFFFFF)
236	0	throw new WordIDExceededMaxException();
237	42	setWC(idOfWordData.getData(), nextWordID, 0);
238	42	idOfWord.put(null, idOfWordKey, idOfWordData);
239	42	return nextWordID++;
240		}
241	0	}
242		}
243
244		private void addWordCount(String word, int count) throws DatabaseException {
245	141	synchronized (idOfWord) {
246	141	setWord(idOfWordKey, word.toLowerCase());
247	141	if (idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
248	141	byte[] data = idOfWordData.getData();
249	141	setWC_count(data, getWC_count(data) + count);
250	141	idOfWord.put(null, idOfWordKey, idOfWordData);
251		}
252	141	}
253	141	}
254
255		public static final int textIdBytesLength = 8;
256
257		public static void setTI(byte[] bytes, long textID, int index) {
258	1	FivePacker.set_(bytes, 0, textID);
259	1	ThreePacker.set_(bytes, 5, index);
260	1	}
261
262		public static long getTI_textID(byte[] bytes) {
263	0	return FivePacker.number_(bytes, 0);
264		}
265
266		public void unIndex(long textID)
267		throws FTIException, DatabaseException {
268	2	DatabaseEntry text = DbUtils.userMemDatabaseEntry(5);
269	2	FivePacker.set_(text.getData(), 0, textID);
270	2	DatabaseEntry words = DbUtils.userMemDatabaseEntry(256);
271
272	2	if (DbUtils.get(wordsInText, text, words, 256) == OperationStatus.SUCCESS) {
273	1	DatabaseEntry wt = DbUtils.userMemDatabaseEntry(wtBytesLength);
274	72	for (int i = 0; i < words.getSize(); i += 3) {
275	71	setWT(wt.getData(),
276		ThreePacker.number_(words.getData(), i),
277		textID);
278	71	occurrencesOfWordInText.delete(null, wt);
279		}
280		}
281
282	2	wordsInText.delete(null, text);
283	2	}
284
285		private void noteOccurrence(Hashtable<String, WordRecord> wordRecords, String word,
286		int index, int offset) throws DatabaseException {
287
288	231	WordRecord wordRecord = (WordRecord)wordRecords.get(word);
289	231	if (wordRecord == null)
290	141	wordRecords.put(word, wordRecord = new WordRecord(word));
291
292	231	wordRecord.noteOccurrence(index, offset);
293	231	}
294
295		public void index(Text text)
296		throws FTIException, IOException, DatabaseException {
297
298	2	long textID = text.ftiTextID();
299
300	2	unIndex(textID);
301
302	2	DatabaseEntry anchorOfIndexKey = DbUtils.userMemDatabaseEntry(textIdBytesLength);
303	2	DatabaseEntry anchorOfIndexData = DbUtils.userMemDatabaseEntry(256);
304
305	2	Hashtable<String, WordRecord> wordRecords = new Hashtable<String, WordRecord>();
306	2	int[] offsetHistory = new int[contextWordsBeforeHit];
307	2	int wordCount = 0;
308
309	2	InputStream body = new BufferedInputStream(text.body());
310		try {
311	2	for (IndexTokenizer words = new IndexTokenizer(body);
312	233	words.hasMoreWords();) {
313	231	String word = words.nextWord();
314	231	if (word.startsWith("#")) {
315	1	boolean blockmark = word.startsWith("#__");
316	1	setWord(anchorOfIndexData, word.substring(1));
317	1	setTI(anchorOfIndexKey.getData(), textID, words.wordIndex());
318	1	(blockmark ? blockmarkOfIndex : anchorOfIndex).put(
319		null, anchorOfIndexKey, anchorOfIndexData);
320	1	}
321		else
322	230	word = word.toLowerCase();
323
324	231	offsetHistory[wordCount % contextWordsBeforeHit] = words.wordOffset();
325
326	231	int index = words.wordIndex();
327	231	int offset = wordCount < contextWordsBeforeHit ?
328		offsetHistory[0] :
329		offsetHistory[(wordCount + 1) % contextWordsBeforeHit];
330
331	231	noteOccurrence(wordRecords, word, index, offset);
332	231	if (word.startsWith("$"))
333	0	noteOccurrence(wordRecords, word.substring(1), index, offset);
334
335	231	++wordCount;
336	231	}
337		}
338		finally {
339	2	try { body.close(); } catch (IOException e) {}
340	0	}
341
342	2	byte[] allWordIDs = new byte[wordRecords.size() * 3];
343	2	int allWordIDs_p = 0;
344
345	2	DatabaseEntry occurrences = DbUtils.userMemDatabaseEntry(1);
346	2	DatabaseEntry wt = DbUtils.userMemDatabaseEntry(wtBytesLength);
347
348	2	for (Enumeration<WordRecord> w = wordRecords.elements(); w.hasMoreElements();) {
349	141	WordRecord wordRecord = (WordRecord)w.nextElement();
350	141	setWT(wt.getData(), wordRecord.wordID, textID);
351	141	occurrences.setData(wordRecord.occurrenceData.buffer());
352	141	occurrences.setSize(wordRecord.occurrenceData.count());
353	141	occurrences.setUserBuffer(occurrences.getData().length,true);
354	141	occurrencesOfWordInText.put(null, wt, occurrences);
355	141	addWordCount(wordRecord.word, wordRecord.count);
356
357	141	ThreePacker.set_(allWordIDs, allWordIDs_p, wordRecord.wordID);
358	141	allWordIDs_p += 3;
359	141	}
360
361	2	DatabaseEntry textIDDatabaseEntry = DbUtils.userMemDatabaseEntry(5);
362	2	FivePacker.set_(textIDDatabaseEntry.getData(), 0, textID);
363	2	wordsInText.put(null, textIDDatabaseEntry, DbUtils.userMemDatabaseEntry(allWordIDs));
364	2	}
365
366		public SearchResults andSearchResults(String[] args)
367		throws FTIException, DatabaseException {
368	0	return new AndSearchResults(this, args, false);
369		}
370
371		public SearchResults groupSearchResults(String[] args)
372		throws FTIException, DatabaseException {
373	0	return new AndSearchResults(this, args, true);
374		}
375
376		// i.e., which page is the #anchor marking this poem on?
377
378		public String blockmarkerBeforeFirstOccurrence(long textID, String word)
379		throws DatabaseException {
380	0	TextStream stream = new TextStream(this, word);
381	0	stream.gotoText(textID);
382	0	stream.init();
383	0	int index = stream.currentWordIndex();
384	0	return index == -1 ? null : blockmarkFinder.anchorOfIndex(textID, index);
385		}
386
387		public void stat() throws DatabaseException {
388	0	Cursor words = idOfWord.openCursor(null, null);
389		try {
390	0	int total = 0;
391	0	while (words.getNext(idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
392	0	System.out.write(idOfWordKey.getData(), 0, idOfWordKey.getSize());
393	0	System.out.print(": ");
394	0	int count = getWC_count(idOfWordData.getData());
395	0	System.out.println(count);
396	0	total += count;
397	0	}
398	0	System.out.println("--- total " + total);
399		}
400		finally {
401	0	try { words.close(); } catch (DatabaseException e) {}
402	0	}
403	0	}
404
405		public void flush() throws DatabaseException {
406	2	idOfWord.sync();
407	2	occurrencesOfWordInText.sync();
408	2	anchorOfIndex.sync();
409	2	blockmarkOfIndex.sync();
410	2	wordsInText.sync();
411	2	}
412
413		public void close() throws DatabaseException {
414	2	idOfWord.close(false);
415	2	occurrencesOfWordInText.close(false);
416	2	anchorOfIndex.close(false);
417	2	blockmarkOfIndex.close(false);
418	2	wordsInText.close(false);
419	2	}
420
421		protected void finalize() throws Throwable {
422	2	close();
423	2	}
424
425		public void appendTerms(Vector<SearchResults> terms, String query, boolean keywords)
426		throws DatabaseException {
427	0	Vector<String> group = null;
428
429	0	StringTokenizer tokens = new StringTokenizer(query, " \t\n\r\f\"", true);
430
431	0	while (tokens.hasMoreTokens()) {
432	0	String token = tokens.nextToken();
433	0	switch (token.charAt(0)) {
434		case ' ': case '\t' : case '\n': case '\r': case '\f':
435	0	break;
436		case '"':
437	0	if (group == null)
438	0	group = new Vector<String>();
439		else {
440	0	String[] g = new String[group.size()];
441	0	group.copyInto(g);
442	0	terms.addElement(new AndSearchResults(this, g, true));
443	0	group = null;
444		}
445	0	break;
446		default:
447	0	if (keywords)
448	0	token = '$' + token;
449
450	0	if (group != null)
451	0	group.addElement(token);
452		else
453	0	terms.addElement(new TextStream(this, token));
454		}
455	0	}
456
457	0	if (group != null) {
458	0	String[] g = new String[group.size()];
459	0	group.copyInto(g);
460	0	terms.addElement(new AndSearchResults(this, g, true));
461		}
462	0	}
463
464		public SearchResults querySearchResults(
465		String textQuery, String keywordsQuery) throws DatabaseException, FTIException {
466	0	Vector<SearchResults> terms = new Vector<SearchResults>();
467	0	if (textQuery != null) appendTerms(terms, textQuery, false);
468	0	if (keywordsQuery != null) appendTerms(terms, keywordsQuery, true);
469	0	SearchResults[] t = new SearchResults[terms.size()];
470	0	terms.copyInto(t);
471	0	return new AndSearchResults(t);
472		}
473
474		public SearchResults querySearchResults(String query)
475		throws DatabaseException, FTIException {
476	0	return querySearchResults(query, null);
477		}
478
479		public IndexCursor allEntries() throws DatabaseException {
480	1	return new IndexCursor(this);
481		}
482
483	1	public static boolean debug = false;
484
485		public static void main(final String[] args) throws Exception {
486	0	IndexOther index = new IndexOther(
487		new File(USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI));
488
489	0	if (args[0].equals("-index"))
490	0	for (int i = 1; i < args.length; i += 2) {
491	0	System.err.println("indexing " + args[i + 1]);
492	0	final String filename = args[i + 1];
493	0	final long textID = Long.parseLong(args[i]);
494	0	final InputStream body = new BufferedInputStream(new FileInputStream(
495		filename));
496	0	index.index(new Text() {
497		public InputStream body() {
498	0	return body;
499		}
500
501		public InputStream bodyForFragment() {
502	0	return body;
503		}
504
505		public long ftiTextID() {
506	0	return textID;
507		}
508		});
509		}
510	0	else if (args[0].equals("-stat"))
511	0	index.stat();
512	0	else if (args[0].equals("-anchorpage"))
513	0	System.out.println(index.blockmarkerBeforeFirstOccurrence(
514		Integer.parseInt(args[1]), args[2]));
515		else {
516		SearchResults results;
517	0	if (args[0].charAt(0) == '_') {
518	0	args[0] = args[0].substring(1);
519	0	results = index.groupSearchResults(args);
520	0	System.out.println("phrase");
521	0	} else if (args[0].equals("-query")) {
522	0	results = index.querySearchResults(args[1]);
523		} else {
524	0	results = index.andSearchResults(args);
525	0	System.out.println("and");
526		}
527
528	0	for (results.gotoText(0L); results.currentTextID() != -1L;
529		// FIXME this isn't very clever! (maybe?)
530	0	results.gotoText(results.currentTextID() + 1L)) {
531	0	System.out.print("-- " + results.currentTextID() + "\n ");
532	0	for (; results.currentOffset() != -1; results.skipToNextHit())
533	0	System.out.print(" " + // results.currentAnchor() + ":" +
534		results.currentOffset());
535	0	System.out.println();
536		}
537		}
538	0	index.close();
539	0	}
540		}