1 | |
package org.paneris.bibliomania.fti; |
2 | |
|
3 | |
import java.io.BufferedInputStream; |
4 | |
import java.io.File; |
5 | |
import java.io.FileInputStream; |
6 | |
import java.io.IOException; |
7 | |
import java.io.InputStream; |
8 | |
|
9 | |
import org.melati.poem.NoSuchRowPoemException; |
10 | |
|
11 | |
import com.sleepycat.db.DatabaseException; |
12 | |
|
13 | |
public class ContextSearchResults implements SearchResults { |
14 | |
|
15 | 0 | private static byte[] skipBuffer = new byte[5000]; |
16 | |
|
17 | |
public static final String contextUnavailable = "(context not available)"; |
18 | |
|
19 | |
public static final int contextWordsAfterHit = 5; |
20 | |
|
21 | |
public static final int wordsGapBetweenAreas = 6; |
22 | |
|
23 | |
private Library library; |
24 | |
|
25 | 0 | private Text currentText = null; |
26 | |
|
27 | |
private SearchResults results; |
28 | |
|
29 | |
private AnchorFinder blockmarks; |
30 | |
|
31 | |
|
32 | |
|
33 | |
|
34 | 0 | private InputStream body = null; |
35 | |
|
36 | |
private long bodyPosition; |
37 | |
|
38 | |
public ContextSearchResults(Library library, SearchResults results, |
39 | 0 | IndexOther fti) { |
40 | 0 | this.library = library; |
41 | 0 | this.results = results; |
42 | 0 | blockmarks = new AnchorFinder(fti, true); |
43 | 0 | } |
44 | |
|
45 | |
public int frequency() { |
46 | 0 | return results.frequency(); |
47 | |
} |
48 | |
|
49 | |
public int hitWordsCount() { |
50 | 0 | return results.hitWordsCount(); |
51 | |
} |
52 | |
|
53 | |
public void init() { |
54 | 0 | closeBody(); |
55 | 0 | results.init(); |
56 | 0 | } |
57 | |
|
58 | |
public void skipToNextHit() { |
59 | 0 | results.skipToNextHit(); |
60 | 0 | } |
61 | |
|
62 | |
public void skipToWordIndex(int wordIndex) { |
63 | 0 | results.skipToWordIndex(wordIndex); |
64 | 0 | } |
65 | |
|
66 | |
public int currentWordIndex() { |
67 | 0 | return results.currentWordIndex(); |
68 | |
} |
69 | |
|
70 | |
public int currentOffset() { |
71 | 0 | return results.currentOffset(); |
72 | |
} |
73 | |
|
74 | |
public void gotoText(long textID) throws DatabaseException { |
75 | 0 | closeBody(); |
76 | 0 | results.gotoText(textID); |
77 | 0 | long it = results.currentTextID(); |
78 | 0 | currentText = it == -1 ? null : library.text(it); |
79 | 0 | } |
80 | |
|
81 | |
public void gotoPosition(long position) throws DatabaseException { |
82 | 0 | gotoText(position >> 32); |
83 | 0 | if (currentTextID() != -1) |
84 | 0 | skipToWordIndex((int) position); |
85 | 0 | } |
86 | |
|
87 | |
public long getPosition() { |
88 | 0 | int wi = currentWordIndex(); |
89 | 0 | return wi == -1 ? -1 : (currentTextID() << 32 | wi); |
90 | |
} |
91 | |
|
92 | |
public long currentTextID() { |
93 | 0 | return results.currentTextID(); |
94 | |
} |
95 | |
|
96 | |
public Text currentText() { |
97 | 0 | return currentText; |
98 | |
} |
99 | |
|
100 | |
public String currentAnchor() throws DatabaseException { |
101 | |
|
102 | |
|
103 | 0 | int index = currentWordIndex(); |
104 | 0 | return index == -1 ? null : blockmarks |
105 | |
.anchorOfIndex(currentTextID(), index); |
106 | |
} |
107 | |
|
108 | |
private synchronized void closeBody() { |
109 | 0 | if (body != null) { |
110 | |
try { |
111 | 0 | body.close(); |
112 | 0 | } catch (IOException e) { |
113 | 0 | } |
114 | 0 | body = null; |
115 | |
} |
116 | 0 | } |
117 | |
|
118 | |
public synchronized String nextArea() { |
119 | |
try { |
120 | 0 | long target = results.currentOffset(); |
121 | 0 | if (target == -1) { |
122 | 0 | closeBody(); |
123 | 0 | return null; |
124 | |
} |
125 | |
|
126 | 0 | if (body == null) { |
127 | 0 | if (currentText == null) |
128 | 0 | return null; |
129 | |
try { |
130 | 0 | body = new BufferedInputStream(currentText.bodyForFragment()); |
131 | 0 | bodyPosition = 0; |
132 | 0 | } catch (NoSuchRowPoemException e) { |
133 | |
|
134 | |
|
135 | 0 | return null; |
136 | 0 | } |
137 | |
} |
138 | |
|
139 | 0 | if (bodyPosition > target) |
140 | 0 | throw new IOException("tried to skip backwards"); |
141 | |
|
142 | |
|
143 | |
|
144 | |
|
145 | |
|
146 | 0 | if (bodyPosition == 0) |
147 | 0 | bodyPosition += body.skip(target - bodyPosition); |
148 | |
else { |
149 | |
long read; |
150 | |
do { |
151 | 0 | bodyPosition += (read = body.read(skipBuffer, 0, (int) Math.min( |
152 | |
target - bodyPosition, skipBuffer.length))); |
153 | 0 | } while (read > 0); |
154 | |
} |
155 | |
|
156 | 0 | if (bodyPosition < target) |
157 | 0 | throw new IOException("skipped only to " + bodyPosition + " not " |
158 | |
+ target); |
159 | |
|
160 | 0 | StringBuffer buf = new StringBuffer(); |
161 | 0 | IndexTokenizer words = new IndexTokenizer(body, true); |
162 | 0 | int baseIndex = Math.max(results.currentWordIndex() |
163 | |
- IndexOther.contextWordsBeforeHit, 0); |
164 | |
|
165 | |
do { |
166 | 0 | int limitIndex = results.currentWordIndex() + results.hitWordsCount() |
167 | |
+ contextWordsAfterHit - baseIndex; |
168 | |
|
169 | 0 | while (words.hasMoreWords() && words.wordIndex() < limitIndex) { |
170 | 0 | String word = words.nextWord(); |
171 | 0 | if (!word.startsWith("#")) { |
172 | 0 | if (words.hadBreak() && buf.length() > 0) |
173 | 0 | buf.append(' '); |
174 | 0 | buf.append(word); |
175 | |
} |
176 | 0 | } |
177 | |
|
178 | |
|
179 | |
|
180 | 0 | results.skipToNextHit(); |
181 | |
} while (results.currentWordIndex() != -1 |
182 | 0 | && (results.currentWordIndex() - baseIndex) - words.wordIndex() < wordsGapBetweenAreas); |
183 | |
|
184 | 0 | bodyPosition += words.bytesReadFromUnderlyingStream(); |
185 | |
|
186 | 0 | return buf.toString(); |
187 | 0 | } catch (IOException e) { |
188 | 0 | System.err.println(e); |
189 | |
|
190 | |
try { |
191 | 0 | body.close(); |
192 | 0 | } catch (Exception ee) { |
193 | 0 | } |
194 | |
|
195 | 0 | return null; |
196 | |
} |
197 | |
} |
198 | |
|
199 | |
public static void main(String[] args) throws Exception { |
200 | 0 | IndexOther fti = new IndexOther(new File("/tmp")); |
201 | |
|
202 | |
SearchResults rawResults; |
203 | 0 | if (args[0].charAt(0) == '_') { |
204 | 0 | args[0] = args[0].substring(1); |
205 | 0 | rawResults = fti.groupSearchResults(args); |
206 | 0 | System.out.println("phrase"); |
207 | 0 | } else if (args[0].equals("-query")) { |
208 | 0 | rawResults = fti.querySearchResults(args[1]); |
209 | |
} else { |
210 | 0 | rawResults = fti.andSearchResults(args); |
211 | 0 | System.out.println("and"); |
212 | |
} |
213 | |
|
214 | 0 | ContextSearchResults results = new ContextSearchResults(new Library() { |
215 | |
public Text text(final long textID) { |
216 | 0 | return new Text() { |
217 | |
public InputStream body() throws IOException { |
218 | 0 | return new FileInputStream( |
219 | |
"/usr/doc/HOWTO/other-formats/html/CDROM-HOWTO-3.html"); |
220 | |
} |
221 | |
|
222 | |
public InputStream bodyForFragment() throws IOException { |
223 | 0 | return body(); |
224 | |
} |
225 | |
|
226 | |
public long ftiTextID() { |
227 | 0 | return textID; |
228 | |
} |
229 | |
}; |
230 | |
} |
231 | |
}, rawResults, fti); |
232 | |
|
233 | 0 | for (results.gotoText(0); results.currentTextID() != -1; |
234 | |
|
235 | 0 | results.gotoText(results.currentTextID() + 1)) { |
236 | 0 | System.out.println("== " + results.currentTextID()); |
237 | |
for (;;) { |
238 | 0 | String anchor = results.currentAnchor(); |
239 | 0 | String area = results.nextArea(); |
240 | 0 | if (area == null) |
241 | 0 | break; |
242 | 0 | System.out.println("-- A HREF=http://doc#" + anchor); |
243 | 0 | System.out.println(area); |
244 | 0 | } |
245 | |
} |
246 | 0 | } |
247 | |
|
248 | |
public void close() { |
249 | 0 | results.close(); |
250 | 0 | blockmarks.close(); |
251 | 0 | closeBody(); |
252 | 0 | } |
253 | |
} |