// IndexOther

package org.paneris.bibliomania.fti;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;

import org.melati.util.MelatiRuntimeException;
import org.melati.util.PropertiesUtils;
import org.melati.util.PropertyException;
import org.melati.util.UnexpectedExceptionException;

import com.sleepycat.db.Database;
import com.sleepycat.db.BtreeStats;
import com.sleepycat.db.DatabaseConfig;
import com.sleepycat.db.DatabaseException;
import com.sleepycat.db.Cursor;
import com.sleepycat.db.DatabaseEntry;
import com.sleepycat.db.LockMode;
import com.sleepycat.db.OperationStatus;

public class IndexOther {

  /** Index directory that {@link #main} opens. */
  private static final String USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI = "/usr/local/share/bibliomania/workspace/infoFTI";
  /**
   * Maximum number of characters of a word that {@link #setWord} copies into
   * a database entry; also the size of the shared {@link #idOfWordKey} buffer.
   */
  public static final int wordTruncationLength = 100;
  /**
   * Size of the ring of recent word offsets kept by {@link #index}.  The
   * offset stored for an occurrence is the oldest one still in that ring
   * (for the very first word, its own offset), not the offset of the hit.
   */
  public static final int contextWordsBeforeHit = 5;


  /** Directory holding the database files; validated by the constructor. */
  File dbHome;
  /**
   * Maps a (lower-cased, truncated) word to a {@link #wcBytesLength}-byte
   * record holding its word ID and a running occurrence count.  Also serves
   * as the monitor guarding {@link #idOfWordKey} and {@link #idOfWordData}.
   */
  Database idOfWord;
  /** Not referenced anywhere in the code of this class. */
  DatabaseConfig idOfWordConfig;
  /** Shared scratch key for {@link #idOfWord} lookups. */
  DatabaseEntry idOfWordKey = DbUtils.userMemDatabaseEntry(wordTruncationLength);
  /** Shared scratch value (word ID + count) for {@link #idOfWord} lookups. */
  DatabaseEntry idOfWordData = DbUtils.userMemDatabaseEntry(wcBytesLength);
  /**
   * ID handed to the next previously unseen word; seeded by the constructor
   * from the number of keys already in {@link #idOfWord}.
   */
  int nextWordID = 0;
  /**
   * Keyed by (word ID, text ID) as laid out by {@link #setWT}; the value is
   * the packed occurrence data accumulated in a {@link WordRecord}.
   */
  Database occurrencesOfWordInText;
  /**
   * Keyed by (text ID, word index) as laid out by {@link #setTI}; the value
   * is the anchor token without its leading '#'.
   */
  Database anchorOfIndex;
  /** Same layout as {@link #anchorOfIndex}, but for tokens starting "#__". */
  Database blockmarkOfIndex;          // i.e. page of index
  /**
   * Keyed by the 5-byte text ID; the value is the concatenation of the
   * 3-byte IDs of every distinct word recorded for that text.
   */
  Database wordsInText;

  /** Created with {@code new AnchorFinder(this, true)}; used by {@link #blockmarkerBeforeFirstOccurrence}. */
  private AnchorFinder blockmarkFinder;

  public static class CorruptionException extends FTIException {

    private static final long serialVersionUID = 6251343503765495522L;
    public String corruptionProblem;

    public CorruptionException(String corruptionProblem) {
      this.corruptionProblem = corruptionProblem;
    }

    public String getMessage() {
      return "The text index is corrupt, and the " +
             "index must be rebuilt (problem: " + corruptionProblem + ")";
    }
  }

  /**
   * Opens the index in {@code dbHome} by delegating to
   * {@link #IndexOther(File, Properties)} with a null configuration, which
   * makes that constructor load its settings through
   * {@code PropertiesUtils.fromResource}.
   *
   * @param dbHome directory holding the index databases
   */
  public IndexOther(File dbHome)
      throws IOException, DatabaseException, PropertyException {
    this(dbHome, null);
  }

  /**
   * Opens the five index databases that live in {@code dbHome} and seeds
   * {@link #nextWordID} from the number of words already known.
   *
   * @param dbHome existing directory holding the index databases
   * @param config source of the cache sizes {@code idOfWord.cacheSize},
   *               {@code occurrencesOfWordInText.cacheSize},
   *               {@code anchorOfIndex.cacheSize} and
   *               {@code blockmarkOfIndex.cacheSize}; when null the settings
   *               are loaded with {@code PropertiesUtils.fromResource} for
   *               this class
   * @throws IllegalArgumentException if {@code dbHome} does not exist or is
   *                                  not a directory
   */
  public IndexOther(File dbHome, Properties config)
      throws IOException, DatabaseException, PropertyException {
    this.dbHome = dbHome;
    if (!dbHome.exists()) {
      throw new IllegalArgumentException("dbHome `" + dbHome +
                     "' does not exist");
    }
    if (!dbHome.isDirectory()) {
      throw new IllegalArgumentException("dbHome `" + dbHome +
                     "' is not a directory");
    }

    if (config == null) {
      // Historical alternative: read the properties file from dbHome, i.e.
      // PropertiesUtils.fromFile(new File(dbHome, propertiesName)).
      config = PropertiesUtils.fromResource(this.getClass());
    }

    idOfWord = DbUtils.openOrCreateBTreeDb(dbHome, "idOfWord",
        PropertiesUtils.getOrDie_int(config, "idOfWord.cacheSize"));

    occurrencesOfWordInText = DbUtils.openOrCreateBTreeDb(dbHome, "occurrencesOfWordInText",
        PropertiesUtils.getOrDie_int(config, "occurrencesOfWordInText.cacheSize"));

    // Word IDs are handed out sequentially from 0, so the number of keys
    // already stored is the first unused ID.
    nextWordID = ((BtreeStats)idOfWord.getStats(null,null)).getNumKeys();

    anchorOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "anchorOfIndex",
        PropertiesUtils.getOrDie_int(config, "anchorOfIndex.cacheSize"));

    blockmarkOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "blockmarkOfIndex",
        PropertiesUtils.getOrDie_int(config, "blockmarkOfIndex.cacheSize"));

    // make it a btree because of possible locality of reference when
    // processing many books
    wordsInText = DbUtils.openOrCreateBTreeDb(dbHome, "wordsInText", 0);

    blockmarkFinder = new AnchorFinder(this, true);
  }

  /**
   * A {@link ByteArrayOutputStream} that hands out its backing array and
   * fill level directly instead of a copy.
   */
  private static final class Buffer extends ByteArrayOutputStream {
    /**
     * @return the live backing array; only the first {@link #count()} bytes
     *         have been written
     */
    public byte[] buffer() {
      final byte[] backing = this.buf;
      return backing;
    }

    /** @return the number of bytes written so far */
    public int count() {
      final int written = this.count;
      return written;
    }
  }

  /**
   * Accumulates, for one word within the text currently being indexed, the
   * packed list of its occurrences and how many there were.
   */
  public class WordRecord {
    public String word;
    public int wordID;
    /** Packed (word index, offset) pairs, in the order they were noted. */
    public Buffer occurrenceData = new Buffer();
    /** Number of occurrences noted so far. */
    public int count;

    // FIXME this really goes with WordTextSearchResults

    // Both packers start at the narrowest width and are only ever replaced
    // by a wider one; they are never reset for later occurrences.
    private Packer wordIndexPacker = OnePacker.it;
    private Packer offsetPacker = OnePacker.it;

    /**
     * @param word the word to record; its ID is obtained (and allocated if
     *             new) through the enclosing index's {@code idOfWord}
     */
    public WordRecord(String word) throws FTIException, DatabaseException {
      this.word = word;
      this.wordID = idOfWord(word);
    }

    /**
     * Appends one occurrence to {@link #occurrenceData}.  Whenever a number
     * does not fit below the current packer's {@code numberMax()}, that
     * maximum is written first and the packer is swapped for its
     * {@code bigger()} successor, repeatedly, before the number itself is
     * written.
     *
     * @param wordIndex index of the word within the text
     * @param offset    offset to store alongside it
     */
    public void noteOccurrence(int wordIndex, int offset) {
      try {
        for (; wordIndex >= wordIndexPacker.numberMax();
             wordIndexPacker = wordIndexPacker.bigger()) {
          wordIndexPacker.write(occurrenceData, wordIndexPacker.numberMax());
        }
        wordIndexPacker.write(occurrenceData, wordIndex);

        for (; offset >= offsetPacker.numberMax();
             offsetPacker = offsetPacker.bigger()) {
          offsetPacker.write(occurrenceData, offsetPacker.numberMax());
        }
        offsetPacker.write(occurrenceData, offset);
      }
      catch (IOException e) {
        throw new UnexpectedExceptionException(e);
      }

      count++;
    }
  }

  /**
   * Length in bytes of a (word ID, text ID) key: the word ID is stored at
   * offset 0 and the text ID at offset 3 (see {@link #setWT}).
   */
  static final int wtBytesLength = 8;

  /**
   * @param bytes a (word ID, text ID) key
   * @return the word ID read from the start of the key
   */
  static int getWT_wordID(byte[] bytes) {
    final int wordID = ThreePacker.number_(bytes, 0);
    return wordID;
  }

  /**
   * Extracts the text ID from a (word ID, text ID) key.
   *
   * Old versions used 4-byte text ID keys; 5 bytes are used now to
   * accommodate more bits for the `Section' part, so a 7-byte array is
   * reported as coming from an old index.
   *
   * @param bytes the key; its array length (not an entry size) is checked
   * @return the text ID stored at offset 3
   * @throws CorruptionException if the array is not exactly 8 bytes long
   */
  static long getWT_textID(byte[] bytes) {
    switch (bytes.length) {
      case 3 + 5:
        return FivePacker.number_(bytes, 3);

      case 3 + 4:
        throw new CorruptionException(
            "unexpected occurrence key length 7; are you running against an " +
            "index built with an old version of the bibliomania software, from " +
            "before the number of allowed `sections' was increased?");

      default:
        throw new CorruptionException(
            "unexpected occurrence key length " + bytes.length);
    }
  }

  /**
   * Fills in a (word ID, text ID) key.
   *
   * @param bytes  destination, written from offset 0
   * @param wordID stored at offset 0
   * @param textID stored at offset 3
   */
  static void setWT(byte[] bytes, int wordID, long textID) {
    final int wordIDOffset = 0;
    final int textIDOffset = 3;
    ThreePacker.set_(bytes, wordIDOffset, wordID);
    FivePacker.set_(bytes, textIDOffset, textID);
  }

  /**
   * Length in bytes of a (word ID, count) record: the word ID is stored at
   * offset 0 and the count at offset 3 (see {@link #setWC}).
   */
  static final int wcBytesLength = 6;

  /**
   * @param bytes a (word ID, count) record
   * @return the word ID read from the start of the record
   */
  static int getWC_wordID(byte[] bytes) {
    final int wordID = ThreePacker.number_(bytes, 0);
    return wordID;
  }

  /**
   * Overwrites only the count part of a (word ID, count) record.
   *
   * @param bytes the record to modify
   * @param count value stored at offset 3
   */
  static void setWC_count(byte[] bytes, int count) {
    final int countOffset = 3;
    ThreePacker.set_(bytes, countOffset, count);
  }

  /**
   * Fills in a complete (word ID, count) record.
   *
   * @param bytes  destination, written from offset 0
   * @param wordID stored at offset 0
   * @param count  stored at offset 3 via {@link #setWC_count}
   */
  static void setWC(byte[] bytes, int wordID, int count) {
    final int wordIDOffset = 0;
    ThreePacker.set_(bytes, wordIDOffset, wordID);
    setWC_count(bytes, count);
  }

  /**
   * @param bytes a (word ID, count) record
   * @return the count read from offset 3
   */
  static int getWC_count(byte[] bytes) {
    final int countOffset = 3;
    final int count = ThreePacker.number_(bytes, countOffset);
    return count;
  }

  /**
   * Stores {@code word} in {@code key}, one byte per character, keeping at
   * most {@link #wordTruncationLength} characters.  Each character is
   * narrowed with a plain {@code (byte)} cast, so only its low eight bits
   * are kept.
   *
   * @param key  entry to fill; its buffer is replaced by a new user buffer
   *             only when it is too short for the (truncated) word
   * @param word text to store
   */
  static void setWord(DatabaseEntry key, String word) {
    byte[] target = key.getData();
    final int length = word.length() < wordTruncationLength
                         ? word.length()
                         : wordTruncationLength;

    if (length > target.length) {
      target = new byte[length];
      key.setData(target);
      key.setUserBuffer(length, true);
    }
    key.setSize(length);

    int i = 0;
    while (i < length) {
      target[i] = (byte)word.charAt(i);
      i++;
    }
  }

  public class WordIDExceededMaxException extends MelatiRuntimeException {
    /**
     *
     */
    private static final long serialVersionUID = 1L;

    public String getMessage() {
      return "Word dictionary in FTI subsystem " + dbHome +
             " exceeded maximum size";
    }
  }

  /**
   * Returns the ID of {@code word}, allocating the next free ID (with an
   * occurrence count of zero) when the word is not yet in the dictionary.
   * The word is lower-cased, and truncated by {@link #setWord}, before the
   * lookup.
   *
   * @param word the word to look up
   * @return the existing or newly assigned word ID
   * @throws WordIDExceededMaxException if a new ID is needed and
   *         {@code nextWordID} is already above 0xFFFFFF
   */
  public int idOfWord(String word) throws FTIException, DatabaseException {
    final String lowered = word.toLowerCase();
    // FIXME synchronized???? is that a good idea??? what about WordFinder?
    synchronized (idOfWord) {
      setWord(idOfWordKey, lowered);
      OperationStatus found =
          idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT);
      if (found == OperationStatus.SUCCESS) {
        return getWC_wordID(idOfWordData.getData());
      }

      if (nextWordID > 0xFFFFFF) {
        throw new WordIDExceededMaxException();
      }

      setWC(idOfWordData.getData(), nextWordID, 0);
      idOfWord.put(null, idOfWordKey, idOfWordData);

      // Only advance the counter once the record has been stored.
      final int assigned = nextWordID;
      nextWordID = assigned + 1;
      return assigned;
    }
  }

  /**
   * Adds {@code count} to the occurrence count stored for {@code word} in
   * the dictionary.  Does nothing when the word has no dictionary record.
   *
   * @param word  the word (lower-cased before lookup)
   * @param count amount to add to the stored count
   */
  private void addWordCount(String word, int count) throws DatabaseException {
    synchronized (idOfWord) {
      setWord(idOfWordKey, word.toLowerCase());
      OperationStatus found =
          idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT);
      if (found != OperationStatus.SUCCESS) {
        return;
      }

      byte[] record = idOfWordData.getData();
      int updated = getWC_count(record) + count;
      setWC_count(record, updated);
      idOfWord.put(null, idOfWordKey, idOfWordData);
    }
  }

  /**
   * Length in bytes of a (text ID, word index) key: the text ID is stored at
   * offset 0 and the index at offset 5 (see {@link #setTI}).
   */
  public static final int textIdBytesLength = 8;

  /**
   * Fills in a (text ID, word index) key.
   *
   * @param bytes  destination, written from offset 0
   * @param textID stored at offset 0
   * @param index  stored at offset 5
   */
  public static void setTI(byte[] bytes, long textID, int index) {
    final int textIDOffset = 0;
    final int indexOffset = 5;
    FivePacker.set_(bytes, textIDOffset, textID);
    ThreePacker.set_(bytes, indexOffset, index);
  }

  /**
   * @param bytes a (text ID, word index) key
   * @return the text ID read from the start of the key
   */
  public static long getTI_textID(byte[] bytes) {
    final long textID = FivePacker.number_(bytes, 0);
    return textID;
  }

  /**
   * Removes the occurrence records of a text: for every word ID listed for
   * it in {@code wordsInText}, the matching (word ID, text ID) entry is
   * deleted from {@code occurrencesOfWordInText}, and finally the
   * {@code wordsInText} entry itself is deleted.  This method does not touch
   * the counts in {@code idOfWord} nor the anchor databases.
   *
   * @param textID ID of the text to remove
   */
  public void unIndex(long textID)
      throws FTIException, DatabaseException {
    DatabaseEntry textKey = DbUtils.userMemDatabaseEntry(5);
    FivePacker.set_(textKey.getData(), 0, textID);
    DatabaseEntry wordIDs = DbUtils.userMemDatabaseEntry(256);

    OperationStatus found = DbUtils.get(wordsInText, textKey, wordIDs, 256);
    if (found == OperationStatus.SUCCESS) {
      DatabaseEntry occurrenceKey = DbUtils.userMemDatabaseEntry(wtBytesLength);
      final int size = wordIDs.getSize();
      final byte[] packed = wordIDs.getData();

      // The value is a run of 3-byte word IDs.
      int at = 0;
      while (at < size) {
        int wordID = ThreePacker.number_(packed, at);
        setWT(occurrenceKey.getData(), wordID, textID);
        occurrencesOfWordInText.delete(null, occurrenceKey);
        at += 3;
      }
    }

    wordsInText.delete(null, textKey);
  }

  /**
   * Records one occurrence of {@code word}, creating its {@link WordRecord}
   * in {@code wordRecords} on first sight.
   *
   * @param wordRecords per-text table of records, keyed by word
   * @param word        the word that occurred
   * @param index       word index passed on to the record
   * @param offset      offset passed on to the record
   */
  private void noteOccurrence(Hashtable<String, WordRecord> wordRecords, String word,
                              int index, int offset) throws DatabaseException {

    WordRecord record = wordRecords.get(word);
    if (record == null) {
      record = new WordRecord(word);
      wordRecords.put(word, record);
    }

    record.noteOccurrence(index, offset);
  }

  /**
   * (Re)indexes a text.  Any occurrence records previously stored for the
   * same text ID are removed first via {@link #unIndex}.
   *
   * While tokenising the body:
   * tokens starting with '#' are stored (minus the '#') under
   * (text ID, word index) in {@code blockmarkOfIndex} when they start with
   * "#__" and in {@code anchorOfIndex} otherwise, and are then recorded as
   * occurrences as-is; every other token is lower-cased.  A token starting
   * with '$' is recorded twice, with and without the '$'.
   *
   * Afterwards each word's packed occurrences are written to
   * {@code occurrencesOfWordInText}, its dictionary count is increased, and
   * the list of all word IDs is stored in {@code wordsInText}.
   *
   * @param text supplies the text ID and the body stream, which is closed
   *             before this method returns
   */
  public void index(Text text)
      throws FTIException, IOException, DatabaseException {

    final long textID = text.ftiTextID();

    unIndex(textID);

    DatabaseEntry anchorKey = DbUtils.userMemDatabaseEntry(textIdBytesLength);
    DatabaseEntry anchorName = DbUtils.userMemDatabaseEntry(256);

    Hashtable<String, WordRecord> recordsByWord = new Hashtable<String, WordRecord>();
    int[] recentOffsets = new int[contextWordsBeforeHit];
    int wordsSeen = 0;

    InputStream body = new BufferedInputStream(text.body());
    try {
      IndexTokenizer tokenizer = new IndexTokenizer(body);
      while (tokenizer.hasMoreWords()) {
        String word = tokenizer.nextWord();

        if (!word.startsWith("#")) {
          word = word.toLowerCase();
        }
        else {
          setWord(anchorName, word.substring(1));
          setTI(anchorKey.getData(), textID, tokenizer.wordIndex());
          Database target =
              word.startsWith("#__") ? blockmarkOfIndex : anchorOfIndex;
          target.put(null, anchorKey, anchorName);
        }

        // Ring buffer of the most recent word offsets.
        recentOffsets[wordsSeen % contextWordsBeforeHit] = tokenizer.wordOffset();

        int index = tokenizer.wordIndex();

        // Use the oldest offset in the ring; until the ring has wrapped
        // that is slot 0.
        int offset;
        if (wordsSeen < contextWordsBeforeHit)
          offset = recentOffsets[0];
        else
          offset = recentOffsets[(wordsSeen + 1) % contextWordsBeforeHit];

        noteOccurrence(recordsByWord, word, index, offset);
        if (word.startsWith("$"))
          noteOccurrence(recordsByWord, word.substring(1), index, offset);

        wordsSeen++;
      }
    }
    finally {
      try { body.close(); } catch (IOException e) {}
    }

    byte[] packedWordIDs = new byte[recordsByWord.size() * 3];
    int packedAt = 0;

    DatabaseEntry occurrences = DbUtils.userMemDatabaseEntry(1);
    DatabaseEntry occurrenceKey = DbUtils.userMemDatabaseEntry(wtBytesLength);

    Enumeration<WordRecord> records = recordsByWord.elements();
    while (records.hasMoreElements()) {
      WordRecord record = records.nextElement();

      setWT(occurrenceKey.getData(), record.wordID, textID);

      // Point the entry at the record's backing array; only the first
      // count() bytes of it are valid.
      occurrences.setData(record.occurrenceData.buffer());
      occurrences.setSize(record.occurrenceData.count());
      occurrences.setUserBuffer(occurrences.getData().length,true);
      occurrencesOfWordInText.put(null, occurrenceKey, occurrences);

      addWordCount(record.word, record.count);

      ThreePacker.set_(packedWordIDs, packedAt, record.wordID);
      packedAt += 3;
    }

    DatabaseEntry textKey = DbUtils.userMemDatabaseEntry(5);
    FivePacker.set_(textKey.getData(), 0, textID);
    wordsInText.put(null, textKey, DbUtils.userMemDatabaseEntry(packedWordIDs));
  }

  /**
   * Builds an {@code AndSearchResults} over {@code args} with its boolean
   * flag set to false ({@link #main} labels this mode "and").
   *
   * @param args the search words
   */
  public SearchResults andSearchResults(String[] args)
      throws FTIException, DatabaseException {
    final boolean grouped = false;
    SearchResults results = new AndSearchResults(this, args, grouped);
    return results;
  }

  /**
   * Builds an {@code AndSearchResults} over {@code args} with its boolean
   * flag set to true ({@link #main} labels this mode "phrase").
   *
   * @param args the search words
   */
  public SearchResults groupSearchResults(String[] args)
      throws FTIException, DatabaseException {
    final boolean grouped = true;
    SearchResults results = new AndSearchResults(this, args, grouped);
    return results;
  }

  /**
   * I.e., which page is the #anchor marking this poem on?
   *
   * Positions a {@code TextStream} for {@code word} at {@code textID} and
   * asks {@code blockmarkFinder} for the anchor belonging to the stream's
   * current word index.
   *
   * @param textID the text to look in
   * @param word   the word to locate
   * @return the finder's answer, or null when the stream reports a current
   *         word index of -1
   */
  public String blockmarkerBeforeFirstOccurrence(long textID, String word)
      throws DatabaseException {
    TextStream occurrences = new TextStream(this, word);
    occurrences.gotoText(textID);
    occurrences.init();

    int firstIndex = occurrences.currentWordIndex();
    if (firstIndex == -1) {
      return null;
    }
    return blockmarkFinder.anchorOfIndex(textID, firstIndex);
  }

  /**
   * Prints every dictionary word with its stored occurrence count to
   * {@code System.out}, followed by a "--- total" line with the sum.
   *
   * Uses its own scratch entries rather than the shared
   * {@code idOfWordKey}/{@code idOfWordData}: those are only safe to touch
   * while holding the {@code idOfWord} monitor, and a concurrent
   * {@link #idOfWord(String)} call would otherwise corrupt the listing (or
   * be corrupted by it).
   */
  public void stat() throws DatabaseException {
    DatabaseEntry key = DbUtils.userMemDatabaseEntry(wordTruncationLength);
    DatabaseEntry data = DbUtils.userMemDatabaseEntry(wcBytesLength);

    Cursor words = idOfWord.openCursor(null, null);
    try {
      // long: the sum of many per-word counts can exceed the int range.
      long total = 0;
      while (words.getNext(key, data, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
        System.out.write(key.getData(), 0, key.getSize());
        System.out.print(": ");
        int count = getWC_count(data.getData());
        System.out.println(count);
        total += count;
      }
      System.out.println("--- total " + total);
    }
    finally {
      // Best effort: a failure to close must not mask an earlier exception.
      try { words.close(); } catch (DatabaseException e) {}
    }
  }

  /**
   * Calls {@code sync()} on each of the five databases, in the order
   * idOfWord, occurrencesOfWordInText, anchorOfIndex, blockmarkOfIndex,
   * wordsInText.
   */
  public void flush() throws DatabaseException {
    final Database[] inOrder = {
      idOfWord,
      occurrencesOfWordInText,
      anchorOfIndex,
      blockmarkOfIndex,
      wordsInText
    };
    for (int i = 0; i < inOrder.length; ++i) {
      inOrder[i].sync();
    }
  }

  /**
   * Closes all five databases with {@code close(false)}.
   *
   * Every database is attempted even if an earlier one fails to close, so
   * one failure does not leave the remaining handles open; the first
   * failure is rethrown once all have been tried.  Databases that were
   * never opened (the constructor failed part-way and this is reached from
   * {@link #finalize()}) are skipped.
   *
   * @throws DatabaseException the first failure encountered, if any
   */
  public void close() throws DatabaseException {
    final Database[] databases = {
      idOfWord,
      occurrencesOfWordInText,
      anchorOfIndex,
      blockmarkOfIndex,
      wordsInText
    };

    DatabaseException firstFailure = null;
    for (int i = 0; i < databases.length; ++i) {
      if (databases[i] == null) {
        continue;
      }
      try {
        databases[i].close(false);
      }
      catch (DatabaseException e) {
        if (firstFailure == null) {
          firstFailure = e;
        }
      }
    }

    if (firstFailure != null) {
      throw firstFailure;
    }
  }

  /**
   * Last-resort cleanup: closes the databases when the index is garbage
   * collected.  The superclass finalizer is always chained, even when
   * {@link #close()} throws.
   */
  @Override
  protected void finalize() throws Throwable {
    try {
      close();
    }
    finally {
      super.finalize();
    }
  }

  /**
   * Parses {@code query} and appends one search term per element to
   * {@code terms}.  Whitespace separates words; words between a pair of
   * double quotes are collected and added as a single
   * {@code AndSearchResults} built with its flag set to true, while words
   * outside quotes are each added as a {@code TextStream}.  A quote left
   * open at the end of the query is treated as closed there.
   *
   * @param terms    receives the parsed terms
   * @param query    the query text
   * @param keywords when true, every word is prefixed with '$' before use
   */
  public void appendTerms(Vector<SearchResults> terms, String query, boolean keywords)
      throws DatabaseException {
    Vector<String> phrase = null;

    // Delimiters are returned as tokens so that quotes can be seen.
    StringTokenizer tokenizer = new StringTokenizer(query, " \t\n\r\f\"", true);

    while (tokenizer.hasMoreTokens()) {
      String token = tokenizer.nextToken();
      char first = token.charAt(0);

      if (first == ' ' || first == '\t' || first == '\n'
          || first == '\r' || first == '\f') {
        continue;
      }

      if (first == '"') {
        if (phrase == null) {
          // Opening quote: start collecting.
          phrase = new Vector<String>();
        }
        else {
          // Closing quote: emit what was collected.
          String[] words = phrase.toArray(new String[phrase.size()]);
          terms.addElement(new AndSearchResults(this, words, true));
          phrase = null;
        }
        continue;
      }

      if (keywords) {
        token = '$' + token;
      }

      if (phrase != null) {
        phrase.addElement(token);
      }
      else {
        terms.addElement(new TextStream(this, token));
      }
    }

    if (phrase != null) {
      String[] words = phrase.toArray(new String[phrase.size()]);
      terms.addElement(new AndSearchResults(this, words, true));
    }
  }

  /**
   * Combines the terms parsed from a text query and a keywords query (see
   * {@link #appendTerms}) into one {@code AndSearchResults}.
   *
   * @param textQuery     plain query, skipped when null
   * @param keywordsQuery query whose words are '$'-prefixed, skipped when null
   */
  public SearchResults querySearchResults(
      String textQuery, String keywordsQuery) throws DatabaseException, FTIException {
    Vector<SearchResults> collected = new Vector<SearchResults>();

    if (textQuery != null) {
      appendTerms(collected, textQuery, false);
    }
    if (keywordsQuery != null) {
      appendTerms(collected, keywordsQuery, true);
    }

    SearchResults[] all = collected.toArray(new SearchResults[collected.size()]);
    return new AndSearchResults(all);
  }

  /**
   * Shorthand for {@link #querySearchResults(String, String)} with no
   * keywords query.
   *
   * @param query the text query
   */
  public SearchResults querySearchResults(String query)
      throws DatabaseException, FTIException {
    final String noKeywords = null;
    return querySearchResults(query, noKeywords);
  }

  /**
   * @return a new {@code IndexCursor} constructed over this index
   */
  public IndexCursor allEntries() throws DatabaseException {
    IndexCursor cursor = new IndexCursor(this);
    return cursor;
  }

  /** Global debug switch; not read anywhere in the code of this class. */
  public static boolean debug = false;

  /**
   * Command-line driver working on the index in the fixed workspace
   * directory.
   *
   * <pre>
   *   -index (&lt;textID&gt; &lt;file&gt;)...    index each file under the given ID
   *   -stat                         dump the word dictionary
   *   -anchorpage &lt;textID&gt; &lt;word&gt;   print the blockmark for the word's first hit
   *   -query &lt;query&gt;                run a parsed query
   *   _&lt;word&gt; &lt;word&gt;...             "phrase" search (leading '_' is stripped)
   *   &lt;word&gt; &lt;word&gt;...              "and" search
   * </pre>
   *
   * Arguments are validated before the index is opened, and the index is
   * closed even when a command fails.
   *
   * @throws IllegalArgumentException if the arguments do not match one of
   *                                  the forms above
   */
  public static void main(final String[] args) throws Exception {
    final String usage =
        "usage: IndexOther -index (<textID> <file>)... | -stat | " +
        "-anchorpage <textID> <word> | -query <query> | " +
        "_<word> <word>... | <word> <word>...";

    if (args.length == 0)
      throw new IllegalArgumentException(usage);
    // -index takes (textID, file) pairs after the command word.
    if (args[0].equals("-index") && args.length % 2 != 1)
      throw new IllegalArgumentException(usage);
    if (args[0].equals("-anchorpage") && args.length < 3)
      throw new IllegalArgumentException(usage);
    if (args[0].equals("-query") && args.length < 2)
      throw new IllegalArgumentException(usage);

    IndexOther index = new IndexOther(
        new File(USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI));

    try {
      if (args[0].equals("-index")) {
        for (int i = 1; i < args.length; i += 2) {
          System.err.println("indexing " + args[i + 1]);
          final String filename = args[i + 1];
          final long textID = Long.parseLong(args[i]);
          final InputStream body = new BufferedInputStream(new FileInputStream(
              filename));
          try {
            index.index(new Text() {
              public InputStream body() {
                return body;
              }

              public InputStream bodyForFragment() {
                return body;
              }

              public long ftiTextID() {
                return textID;
              }
            });
          }
          finally {
            // index() closes the body once it has read it; this covers the
            // case where it fails before getting that far.
            try { body.close(); } catch (IOException e) {}
          }
        }
      }
      else if (args[0].equals("-stat"))
        index.stat();
      else if (args[0].equals("-anchorpage"))
        // Text IDs are longs everywhere else, so parse them as such.
        System.out.println(index.blockmarkerBeforeFirstOccurrence(
            Long.parseLong(args[1]), args[2]));
      else {
        SearchResults results;
        if (args[0].startsWith("_")) {
          args[0] = args[0].substring(1);
          results = index.groupSearchResults(args);
          System.out.println("phrase");
        } else if (args[0].equals("-query")) {
          results = index.querySearchResults(args[1]);
        } else {
          results = index.andSearchResults(args);
          System.out.println("and");
        }

        for (results.gotoText(0L); results.currentTextID() != -1L;
             // FIXME this isn't very clever! (maybe?)
             results.gotoText(results.currentTextID() + 1L)) {
          System.out.print("-- " + results.currentTextID() + "\n ");
          for (; results.currentOffset() != -1; results.skipToNextHit())
            System.out.print(" " + // results.currentAnchor() + ":" +
                results.currentOffset());
          System.out.println();
        }
      }
    }
    finally {
      index.close();
    }
  }
}