1 package org.paneris.bibliomania.fti;
2
3 import java.io.BufferedInputStream;
4 import java.io.ByteArrayOutputStream;
5 import java.io.File;
6 import java.io.FileInputStream;
7 import java.io.IOException;
8 import java.io.InputStream;
9 import java.util.Enumeration;
10 import java.util.Hashtable;
11 import java.util.Properties;
12 import java.util.StringTokenizer;
13 import java.util.Vector;
14
15 import org.melati.util.MelatiRuntimeException;
16 import org.melati.util.PropertiesUtils;
17 import org.melati.util.PropertyException;
18 import org.melati.util.UnexpectedExceptionException;
19
20 import com.sleepycat.db.Db;
21 import com.sleepycat.db.DbBtreeStat;
22 import com.sleepycat.db.DbEnv;
23 import com.sleepycat.db.DbException;
24 import com.sleepycat.db.Dbc;
25 import com.sleepycat.db.Dbt;
26
27 public class IndexOther {
28
29 public static final int wordTruncationLength = 100;
30 public static final int contextWordsBeforeHit = 5;
31
32 private static final String propertiesName =
33 "org.paneris.bibliomania.fti.IndexOther.properties";
34
35 File dbHome;
36 Db idOfWord;
37 Dbt idOfWordKey = DbUtils.userMemDbt(wordTruncationLength);
38 Dbt idOfWordData = DbUtils.userMemDbt(wcBytesLength);
39 int nextWordID = 0;
40 Db occurrencesOfWordInText;
41 Db anchorOfIndex;
42 Db blockmarkOfIndex;
43 Db wordsInText;
44 private AnchorFinder blockmarkFinder;
45
46 public static class CorruptionException extends FTIException {
47
48
49
50
51 private static final long serialVersionUID = 1L;
52 public String corruptionProblem;
53
54 public CorruptionException(String corruptionProblem) {
55 this.corruptionProblem = corruptionProblem;
56 }
57
58 public String getMessage() {
59 return "The text index is corrupt, and the " +
60 "index must be rebuilt (problem: " + corruptionProblem + ")";
61 }
62 }
63
64 public IndexOther(File dbHome)
65 throws IOException, DbException, PropertyException {
66 this(dbHome, null);
67 }
68
69 public IndexOther(File dbHome, Properties config)
70 throws IOException, DbException, PropertyException {
71 this.dbHome = dbHome;
72 if (!dbHome.isDirectory())
73 throw new IllegalArgumentException("dbHome `" + dbHome +
74 "' is not a directory");
75
76 if (config == null)
77 config = PropertiesUtils.fromFile(new File(dbHome, propertiesName));
78
79 DbEnv env = null;
80
81 idOfWord = new Db(env, 0);
82 idOfWord.setCacheSize(
83 PropertiesUtils.getOrDie_int(config, "idOfWord.cacheSize"), 1);
84 System.err.println("Attempting to open: " + new File(dbHome, "idOfWord.db").getPath());
85 idOfWord.open(null, new File(dbHome, "idOfWord.db").getPath(),
86 null, Db.DB_BTREE, Db.DB_CREATE | Db.DB_THREAD, 0644);
87
88 occurrencesOfWordInText = new Db(env, 0);
89 occurrencesOfWordInText.setCacheSize(
90 PropertiesUtils.getOrDie_int(config, "occurrences.cacheSize"), 1);
91 occurrencesOfWordInText.open(null, new File(dbHome, "occurrences.db").getPath(),
92 null, Db.DB_BTREE,
93 Db.DB_CREATE | Db.DB_THREAD, 0644);
94
95 nextWordID = ((DbBtreeStat)idOfWord.stat(0)).bt_nkeys;
96
97
98
99 anchorOfIndex = new Db(env, 0);
100 anchorOfIndex.setCacheSize(
101 PropertiesUtils.getOrDie_int(config, "anchorOfIndex.cacheSize"), 1);
102 anchorOfIndex.open(null, new File(dbHome, "anchorOfIndex.db").getPath(),
103 null, Db.DB_BTREE, Db.DB_CREATE | Db.DB_THREAD, 0644);
104
105 blockmarkOfIndex = new Db(env, 0);
106 blockmarkOfIndex.setCacheSize(
107 PropertiesUtils.getOrDie_int(config, "anchorOfIndex.cacheSize"), 1);
108 blockmarkOfIndex.open(null, new File(dbHome, "blockmarkOfIndex.db").getPath(),
109 null, Db.DB_BTREE, Db.DB_CREATE | Db.DB_THREAD, 0644);
110
111 wordsInText = new Db(env, 0);
112
113
114 wordsInText.open(null, new File(dbHome, "wordsInText.db").getPath(),
115 null, Db.DB_BTREE, Db.DB_CREATE | Db.DB_THREAD, 0644);
116
117 blockmarkFinder = new AnchorFinder(this, true);
118 }
119
120 private static final class Buffer extends ByteArrayOutputStream {
121 public byte[] buffer() {
122 return buf;
123 }
124
125 public int count() {
126 return count;
127 }
128 }
129
130 private class WordRecord {
131 public String word;
132 public int wordID;
133 public Buffer occurrenceData = new Buffer();
134 public int count;
135
136 public WordRecord(String word) throws FTIException, DbException {
137 this.word = word;
138 wordID = idOfWord(word);
139 }
140
141
142
143 private Packer wordIndexPacker = OnePacker.it;
144 private Packer offsetPacker = OnePacker.it;
145
146 public void noteOccurrence(int wordIndex, int offset) {
147 try {
148 while (wordIndex >= wordIndexPacker.numberMax()) {
149 wordIndexPacker.write(occurrenceData, wordIndexPacker.numberMax());
150 wordIndexPacker = wordIndexPacker.bigger();
151 }
152
153 wordIndexPacker.write(occurrenceData, wordIndex);
154
155 while (offset >= offsetPacker.numberMax()) {
156 offsetPacker.write(occurrenceData, offsetPacker.numberMax());
157 offsetPacker = offsetPacker.bigger();
158 }
159
160 offsetPacker.write(occurrenceData, offset);
161 }
162 catch (IOException e) {
163 throw new UnexpectedExceptionException(e);
164 }
165
166 ++count;
167 }
168 }
169
170 static final int wtBytesLength = 8;
171
172 static int getWT_wordID(byte[] bytes) {
173 return ThreePacker.number_(bytes, 0);
174 }
175
176 static long getWT_textID(byte[] bytes) {
177
178
179
180 if (bytes.length == 3 + 5)
181 return FivePacker.number_(bytes, 3);
182 else if (bytes.length == 3 + 4)
183 throw new CorruptionException(
184 "unexpected occurrence key length 7; are you running against an " +
185 "index built with an old version of the bibliomania software, from " +
186 "before the number of allowed `sections' was increased?");
187 else
188 throw new CorruptionException(
189 "unexpected occurrence key length " + bytes.length);
190 }
191
192 static void setWT(byte[] bytes, int wordID, long textID) {
193 ThreePacker.set_(bytes, 0, wordID);
194 FivePacker.set_(bytes, 3, textID);
195 }
196
197 static final int wcBytesLength = 6;
198
199 static int getWC_wordID(byte[] bytes) {
200 return ThreePacker.number_(bytes, 0);
201 }
202
203 static void setWC_count(byte[] bytes, int count) {
204 ThreePacker.set_(bytes, 3, count);
205 }
206
207 static void setWC(byte[] bytes, int wordID, int count) {
208 ThreePacker.set_(bytes, 0, wordID);
209 setWC_count(bytes, count);
210 }
211
212 static int getWC_count(byte[] bytes) {
213 return ThreePacker.number_(bytes, 3);
214 }
215
216 static void setWord(Dbt key, String word) {
217 byte[] data = key.getData();
218 int length = Math.min(word.length(), wordTruncationLength);
219 if (data.length < length) {
220 key.setData(data = new byte[length]);
221 key.setUserBufferLength(length);
222 }
223 key.setSize(length);
224 for (int i = 0; i < length; ++i)
225 data[i] = (byte)word.charAt(i);
226 }
227
228 public class WordIDExceededMaxException extends MelatiRuntimeException {
229
230
231
232 private static final long serialVersionUID = 1L;
233
234 public String getMessage() {
235 return "Word dictionary in FTI subsystem " + dbHome +
236 " exceeded maximum size";
237 }
238 }
239
240 public int idOfWord(String word) throws FTIException, DbException {
241 word = word.toLowerCase();
242
243 synchronized (idOfWord) {
244 setWord(idOfWordKey, word);
245 if (idOfWord.get(null, idOfWordKey, idOfWordData, 0) == 0)
246 return getWC_wordID(idOfWordData.getData());
247 else {
248 if (nextWordID > 0xFFFFFF)
249 throw new WordIDExceededMaxException();
250 setWC(idOfWordData.getData(), nextWordID, 0);
251 idOfWord.put(null, idOfWordKey, idOfWordData, 0);
252 return nextWordID++;
253 }
254 }
255 }
256
257 private void addWordCount(String word, int count) throws DbException {
258 synchronized (idOfWord) {
259 setWord(idOfWordKey, word.toLowerCase());
260 if (idOfWord.get(null, idOfWordKey, idOfWordData, 0) == 0) {
261 byte[] data = idOfWordData.getData();
262 setWC_count(data, getWC_count(data) + count);
263 idOfWord.put(null, idOfWordKey, idOfWordData, 0);
264 }
265 }
266 }
267
268 public static final int tiBytesLength = 8;
269
270 public static void setTI(byte[] bytes, long textID, int index) {
271 FivePacker.set_(bytes, 0, textID);
272 ThreePacker.set_(bytes, 5, index);
273 }
274
275 public static long getTI_textID(byte[] bytes) {
276 return FivePacker.number_(bytes, 0);
277 }
278
279 public void unIndex(long textID)
280 throws FTIException, DbException {
281 Dbt text = DbUtils.userMemDbt(5);
282 FivePacker.set_(text.getData(), 0, textID);
283 Dbt words = DbUtils.userMemDbt(256);
284
285 if (DbUtils.get(wordsInText, text, words, 256) == 0) {
286 Dbt wt = DbUtils.userMemDbt(wtBytesLength);
287 for (int i = 0; i < words.getSize(); i += 3) {
288 setWT(wt.getData(),
289 ThreePacker.number_(words.getData(), i),
290 textID);
291 occurrencesOfWordInText.delete(null, wt, 0);
292 }
293 }
294
295 wordsInText.delete(null, text, 0);
296 }
297
298 private void noteOccurrence(Hashtable wordRecords, String word,
299 int index, int offset) throws DbException {
300
301 WordRecord wordRecord = (WordRecord)wordRecords.get(word);
302 if (wordRecord == null)
303 wordRecords.put(word, wordRecord = new WordRecord(word));
304
305 wordRecord.noteOccurrence(index, offset);
306 }
307
308 public void index(Text text)
309 throws FTIException, IOException, DbException {
310
311 long textID = text.ftiTextID();
312
313 unIndex(textID);
314
315 Dbt anchorOfIndexKey = DbUtils.userMemDbt(tiBytesLength);
316 Dbt anchorOfIndexData = DbUtils.userMemDbt(256);
317
318 Hashtable wordRecords = new Hashtable();
319 int[] offsetHistory = new int[contextWordsBeforeHit];
320 int wordCount = 0;
321
322 InputStream body = new BufferedInputStream(text.body());
323 try {
324 for (IndexTokenizer words = new IndexTokenizer(body);
325 words.hasMoreWords();) {
326 String word = words.nextWord();
327 if (word.startsWith("#")) {
328 boolean blockmark = word.startsWith("#__");
329 setWord(anchorOfIndexData, word.substring(1));
330 setTI(anchorOfIndexKey.getData(), textID, words.wordIndex());
331 (blockmark ? blockmarkOfIndex : anchorOfIndex).put(
332 null, anchorOfIndexKey, anchorOfIndexData, 0);
333 }
334 else
335 word = word.toLowerCase();
336
337 offsetHistory[wordCount % contextWordsBeforeHit] = words.wordOffset();
338
339 int index = words.wordIndex();
340 int offset = wordCount < contextWordsBeforeHit ?
341 offsetHistory[0] :
342 offsetHistory[(wordCount + 1) % contextWordsBeforeHit];
343
344 noteOccurrence(wordRecords, word, index, offset);
345 if (word.startsWith("$"))
346 noteOccurrence(wordRecords, word.substring(1), index, offset);
347
348 ++wordCount;
349 }
350 }
351 finally {
352 try { body.close(); } catch (IOException e) {}
353 }
354
355 byte[] allWordIDs = new byte[wordRecords.size() * 3];
356 int allWordIDs_p = 0;
357
358 Dbt occurrences = DbUtils.userMemDbt(1);
359 Dbt wt = DbUtils.userMemDbt(wtBytesLength);
360
361 for (Enumeration w = wordRecords.elements(); w.hasMoreElements();) {
362 WordRecord wordRecord = (WordRecord)w.nextElement();
363 setWT(wt.getData(), wordRecord.wordID, textID);
364 occurrences.setData(wordRecord.occurrenceData.buffer());
365 occurrences.setSize(wordRecord.occurrenceData.count());
366 occurrences.setUserBufferLength(occurrences.getData().length);
367 occurrencesOfWordInText.put(null, wt, occurrences, 0);
368 addWordCount(wordRecord.word, wordRecord.count);
369
370 ThreePacker.set_(allWordIDs, allWordIDs_p, wordRecord.wordID);
371 allWordIDs_p += 3;
372 }
373
374 Dbt textIDDbt = DbUtils.userMemDbt(5);
375 FivePacker.set_(textIDDbt.getData(), 0, textID);
376 wordsInText.put(null, textIDDbt, DbUtils.userMemDbt(allWordIDs), 0);
377 }
378
379 public SearchResults andSearchResults(String[] args)
380 throws FTIException, DbException {
381 return new AndSearchResults(this, args, false);
382 }
383
384 public SearchResults groupSearchResults(String[] args)
385 throws FTIException, DbException {
386 return new AndSearchResults(this, args, true);
387 }
388
389
390
391 public String blockmarkerBeforeFirstOccurrence(long textID, String word)
392 throws DbException {
393 TextStream stream = new TextStream(this, word);
394 stream.gotoText(textID);
395 stream.init();
396 int index = stream.currentWordIndex();
397 return index == -1 ? null : blockmarkFinder.anchorOfIndex(textID, index);
398 }
399
400 public void stat() throws DbException {
401 Dbc words = idOfWord.cursor(null, 0);
402 try {
403 int total = 0;
404 while (words.get(idOfWordKey, idOfWordData, Db.DB_NEXT) == 0) {
405 System.out.write(idOfWordKey.getData(), 0, idOfWordKey.getSize());
406 System.out.print(": ");
407 int count = getWC_count(idOfWordData.getData());
408 System.out.println(count);
409 total += count;
410 }
411 System.out.println("--- total " + total);
412 }
413 finally {
414 try { words.close(); } catch (DbException e) {}
415 }
416 }
417
418 public void flush() throws DbException {
419 idOfWord.sync(0);
420 occurrencesOfWordInText.sync(0);
421 anchorOfIndex.sync(0);
422 blockmarkOfIndex.sync(0);
423 wordsInText.sync(0);
424 }
425
426 public void close() throws DbException {
427 idOfWord.close(0);
428 occurrencesOfWordInText.close(0);
429 anchorOfIndex.close(0);
430 blockmarkOfIndex.close(0);
431 wordsInText.close(0);
432 }
433
434 protected void finalize() throws Throwable {
435 close();
436 }
437
438 public void appendTerms(Vector terms, String query, boolean keywords)
439 throws DbException {
440 Vector group = null;
441
442 StringTokenizer tokens = new StringTokenizer(query, " \t\n\r\f\"", true);
443
444 while (tokens.hasMoreTokens()) {
445 String token = tokens.nextToken();
446 switch (token.charAt(0)) {
447 case ' ': case '\t' : case '\n': case '\r': case '\f':
448 break;
449 case '"':
450 if (group == null)
451 group = new Vector();
452 else {
453 String[] g = new String[group.size()];
454 group.copyInto(g);
455 terms.addElement(new AndSearchResults(this, g, true));
456 group = null;
457 }
458 break;
459 default:
460 if (keywords)
461 token = '$' + token;
462
463 if (group != null)
464 group.addElement(token);
465 else
466 terms.addElement(new TextStream(this, token));
467 }
468 }
469
470 if (group != null) {
471 String[] g = new String[group.size()];
472 group.copyInto(g);
473 terms.addElement(new AndSearchResults(this, g, true));
474 }
475 }
476
477 public SearchResults querySearchResults(
478 String textQuery, String keywordsQuery) throws DbException, FTIException {
479 Vector terms = new Vector();
480 if (textQuery != null) appendTerms(terms, textQuery, false);
481 if (keywordsQuery != null) appendTerms(terms, keywordsQuery, true);
482 SearchResults[] t = new SearchResults[terms.size()];
483 terms.copyInto(t);
484 return new AndSearchResults(t);
485 }
486
487 public SearchResults querySearchResults(String query)
488 throws DbException, FTIException {
489 return querySearchResults(query, null);
490 }
491
492 public IndexCursor allEntries() throws DbException {
493 return new IndexCursor(this);
494 }
495
496 public static boolean debug = false;
497
498 public static void main(final String[] args) throws Exception {
499 IndexOther index = new IndexOther(
500 new File("/usr/local/share/bibliomania/workspace/infoFTI"));
501
502 if (args[0].equals("-index"))
503 for (int i = 1; i < args.length; i += 2) {
504 try {
505 System.err.println("indexing " + args[i + 1]);
506 final InputStream body =
507 new BufferedInputStream(new FileInputStream(args[i + 1]));
508 final long textID = Long.parseLong(args[i]);
509 index.index(new Text() {
510 public InputStream body() {
511 return body;
512 }
513 public InputStream bodyForFragment() {
514 return body;
515 }
516 public long ftiTextID() {
517 return textID;
518 }
519 });
520
521 }
522 catch (Exception e) {
523 e.printStackTrace();
524 }
525 }
526 else if (args[0].equals("-stat"))
527 index.stat();
528 else if (args[0].equals("-anchorpage"))
529 System.out.println(index.blockmarkerBeforeFirstOccurrence(Integer.parseInt(args[1]), args[2]));
530 else {
531 SearchResults results;
532 if (args[0].charAt(0) == '_') {
533 args[0] = args[0].substring(1);
534 results = index.groupSearchResults(args);
535 System.out.println("phrase");
536 }
537 else if (args[0].equals("-query")) {
538 results = index.querySearchResults(args[1]);
539 }
540 else {
541 results = index.andSearchResults(args);
542 System.out.println("and");
543 }
544
545 for (results.gotoText(0L); results.currentTextID() != -1L;
546
547 results.gotoText(results.currentTextID() + 1L)) {
548 System.out.print("-- " + results.currentTextID() + "\n ");
549 for (; results.currentOffset() != -1; results.skipToNextHit())
550 System.out.print(" " +
551 results.currentOffset());
552 System.out.println();
553 }
554 }
555
556 index.close();
557 }
558 }