| 1 | |
package org.paneris.bibliomania.fti; |
| 2 | |
|
| 3 | |
import java.io.BufferedInputStream; |
| 4 | |
import java.io.File; |
| 5 | |
import java.io.FileInputStream; |
| 6 | |
import java.io.IOException; |
| 7 | |
import java.io.InputStream; |
| 8 | |
|
| 9 | |
import org.melati.poem.NoSuchRowPoemException; |
| 10 | |
|
| 11 | |
import com.sleepycat.db.DatabaseException; |
| 12 | |
|
| 13 | |
public class ContextSearchResults implements SearchResults { |
| 14 | |
|
| 15 | 0 | private static byte[] skipBuffer = new byte[5000]; |
| 16 | |
|
| 17 | |
public static final String contextUnavailable = "(context not available)"; |
| 18 | |
|
| 19 | |
public static final int contextWordsAfterHit = 5; |
| 20 | |
|
| 21 | |
public static final int wordsGapBetweenAreas = 6; |
| 22 | |
|
| 23 | |
private Library library; |
| 24 | |
|
| 25 | 0 | private Text currentText = null; |
| 26 | |
|
| 27 | |
private SearchResults results; |
| 28 | |
|
| 29 | |
private AnchorFinder blockmarks; |
| 30 | |
|
| 31 | |
|
| 32 | |
|
| 33 | |
|
| 34 | 0 | private InputStream body = null; |
| 35 | |
|
| 36 | |
private long bodyPosition; |
| 37 | |
|
| 38 | |
public ContextSearchResults(Library library, SearchResults results, |
| 39 | 0 | IndexOther fti) { |
| 40 | 0 | this.library = library; |
| 41 | 0 | this.results = results; |
| 42 | 0 | blockmarks = new AnchorFinder(fti, true); |
| 43 | 0 | } |
| 44 | |
|
| 45 | |
public int frequency() { |
| 46 | 0 | return results.frequency(); |
| 47 | |
} |
| 48 | |
|
| 49 | |
public int hitWordsCount() { |
| 50 | 0 | return results.hitWordsCount(); |
| 51 | |
} |
| 52 | |
|
| 53 | |
public void init() { |
| 54 | 0 | closeBody(); |
| 55 | 0 | results.init(); |
| 56 | 0 | } |
| 57 | |
|
| 58 | |
public void skipToNextHit() { |
| 59 | 0 | results.skipToNextHit(); |
| 60 | 0 | } |
| 61 | |
|
| 62 | |
public void skipToWordIndex(int wordIndex) { |
| 63 | 0 | results.skipToWordIndex(wordIndex); |
| 64 | 0 | } |
| 65 | |
|
| 66 | |
public int currentWordIndex() { |
| 67 | 0 | return results.currentWordIndex(); |
| 68 | |
} |
| 69 | |
|
| 70 | |
public int currentOffset() { |
| 71 | 0 | return results.currentOffset(); |
| 72 | |
} |
| 73 | |
|
| 74 | |
public void gotoText(long textID) throws DatabaseException { |
| 75 | 0 | closeBody(); |
| 76 | 0 | results.gotoText(textID); |
| 77 | 0 | long it = results.currentTextID(); |
| 78 | 0 | currentText = it == -1 ? null : library.text(it); |
| 79 | 0 | } |
| 80 | |
|
| 81 | |
public void gotoPosition(long position) throws DatabaseException { |
| 82 | 0 | gotoText(position >> 32); |
| 83 | 0 | if (currentTextID() != -1) |
| 84 | 0 | skipToWordIndex((int) position); |
| 85 | 0 | } |
| 86 | |
|
| 87 | |
public long getPosition() { |
| 88 | 0 | int wi = currentWordIndex(); |
| 89 | 0 | return wi == -1 ? -1 : (currentTextID() << 32 | wi); |
| 90 | |
} |
| 91 | |
|
| 92 | |
public long currentTextID() { |
| 93 | 0 | return results.currentTextID(); |
| 94 | |
} |
| 95 | |
|
| 96 | |
public Text currentText() { |
| 97 | 0 | return currentText; |
| 98 | |
} |
| 99 | |
|
| 100 | |
public String currentAnchor() throws DatabaseException { |
| 101 | |
|
| 102 | |
|
| 103 | 0 | int index = currentWordIndex(); |
| 104 | 0 | return index == -1 ? null : blockmarks |
| 105 | |
.anchorOfIndex(currentTextID(), index); |
| 106 | |
} |
| 107 | |
|
| 108 | |
private synchronized void closeBody() { |
| 109 | 0 | if (body != null) { |
| 110 | |
try { |
| 111 | 0 | body.close(); |
| 112 | 0 | } catch (IOException e) { |
| 113 | 0 | } |
| 114 | 0 | body = null; |
| 115 | |
} |
| 116 | 0 | } |
| 117 | |
|
| 118 | |
public synchronized String nextArea() { |
| 119 | |
try { |
| 120 | 0 | long target = results.currentOffset(); |
| 121 | 0 | if (target == -1) { |
| 122 | 0 | closeBody(); |
| 123 | 0 | return null; |
| 124 | |
} |
| 125 | |
|
| 126 | 0 | if (body == null) { |
| 127 | 0 | if (currentText == null) |
| 128 | 0 | return null; |
| 129 | |
try { |
| 130 | 0 | body = new BufferedInputStream(currentText.bodyForFragment()); |
| 131 | 0 | bodyPosition = 0; |
| 132 | 0 | } catch (NoSuchRowPoemException e) { |
| 133 | |
|
| 134 | |
|
| 135 | 0 | return null; |
| 136 | 0 | } |
| 137 | |
} |
| 138 | |
|
| 139 | 0 | if (bodyPosition > target) |
| 140 | 0 | throw new IOException("tried to skip backwards"); |
| 141 | |
|
| 142 | |
|
| 143 | |
|
| 144 | |
|
| 145 | |
|
| 146 | 0 | if (bodyPosition == 0) |
| 147 | 0 | bodyPosition += body.skip(target - bodyPosition); |
| 148 | |
else { |
| 149 | |
long read; |
| 150 | |
do { |
| 151 | 0 | bodyPosition += (read = body.read(skipBuffer, 0, (int) Math.min( |
| 152 | |
target - bodyPosition, skipBuffer.length))); |
| 153 | 0 | } while (read > 0); |
| 154 | |
} |
| 155 | |
|
| 156 | 0 | if (bodyPosition < target) |
| 157 | 0 | throw new IOException("skipped only to " + bodyPosition + " not " |
| 158 | |
+ target); |
| 159 | |
|
| 160 | 0 | StringBuffer buf = new StringBuffer(); |
| 161 | 0 | IndexTokenizer words = new IndexTokenizer(body, true); |
| 162 | 0 | int baseIndex = Math.max(results.currentWordIndex() |
| 163 | |
- IndexOther.contextWordsBeforeHit, 0); |
| 164 | |
|
| 165 | |
do { |
| 166 | 0 | int limitIndex = results.currentWordIndex() + results.hitWordsCount() |
| 167 | |
+ contextWordsAfterHit - baseIndex; |
| 168 | |
|
| 169 | 0 | while (words.hasMoreWords() && words.wordIndex() < limitIndex) { |
| 170 | 0 | String word = words.nextWord(); |
| 171 | 0 | if (!word.startsWith("#")) { |
| 172 | 0 | if (words.hadBreak() && buf.length() > 0) |
| 173 | 0 | buf.append(' '); |
| 174 | 0 | buf.append(word); |
| 175 | |
} |
| 176 | 0 | } |
| 177 | |
|
| 178 | |
|
| 179 | |
|
| 180 | 0 | results.skipToNextHit(); |
| 181 | |
} while (results.currentWordIndex() != -1 |
| 182 | 0 | && (results.currentWordIndex() - baseIndex) - words.wordIndex() < wordsGapBetweenAreas); |
| 183 | |
|
| 184 | 0 | bodyPosition += words.bytesReadFromUnderlyingStream(); |
| 185 | |
|
| 186 | 0 | return buf.toString(); |
| 187 | 0 | } catch (IOException e) { |
| 188 | 0 | System.err.println(e); |
| 189 | |
|
| 190 | |
try { |
| 191 | 0 | body.close(); |
| 192 | 0 | } catch (Exception ee) { |
| 193 | 0 | } |
| 194 | |
|
| 195 | 0 | return null; |
| 196 | |
} |
| 197 | |
} |
| 198 | |
|
| 199 | |
public static void main(String[] args) throws Exception { |
| 200 | 0 | IndexOther fti = new IndexOther(new File("/tmp")); |
| 201 | |
|
| 202 | |
SearchResults rawResults; |
| 203 | 0 | if (args[0].charAt(0) == '_') { |
| 204 | 0 | args[0] = args[0].substring(1); |
| 205 | 0 | rawResults = fti.groupSearchResults(args); |
| 206 | 0 | System.out.println("phrase"); |
| 207 | 0 | } else if (args[0].equals("-query")) { |
| 208 | 0 | rawResults = fti.querySearchResults(args[1]); |
| 209 | |
} else { |
| 210 | 0 | rawResults = fti.andSearchResults(args); |
| 211 | 0 | System.out.println("and"); |
| 212 | |
} |
| 213 | |
|
| 214 | 0 | ContextSearchResults results = new ContextSearchResults(new Library() { |
| 215 | |
public Text text(final long textID) { |
| 216 | 0 | return new Text() { |
| 217 | |
public InputStream body() throws IOException { |
| 218 | 0 | return new FileInputStream( |
| 219 | |
"/usr/doc/HOWTO/other-formats/html/CDROM-HOWTO-3.html"); |
| 220 | |
} |
| 221 | |
|
| 222 | |
public InputStream bodyForFragment() throws IOException { |
| 223 | 0 | return body(); |
| 224 | |
} |
| 225 | |
|
| 226 | |
public long ftiTextID() { |
| 227 | 0 | return textID; |
| 228 | |
} |
| 229 | |
}; |
| 230 | |
} |
| 231 | |
}, rawResults, fti); |
| 232 | |
|
| 233 | 0 | for (results.gotoText(0); results.currentTextID() != -1; |
| 234 | |
|
| 235 | 0 | results.gotoText(results.currentTextID() + 1)) { |
| 236 | 0 | System.out.println("== " + results.currentTextID()); |
| 237 | |
for (;;) { |
| 238 | 0 | String anchor = results.currentAnchor(); |
| 239 | 0 | String area = results.nextArea(); |
| 240 | 0 | if (area == null) |
| 241 | 0 | break; |
| 242 | 0 | System.out.println("-- A HREF=http://doc#" + anchor); |
| 243 | 0 | System.out.println(area); |
| 244 | 0 | } |
| 245 | |
} |
| 246 | 0 | } |
| 247 | |
|
| 248 | |
public void close() { |
| 249 | 0 | results.close(); |
| 250 | 0 | blockmarks.close(); |
| 251 | 0 | closeBody(); |
| 252 | 0 | } |
| 253 | |
} |