| Id | Title | Timestamp | Comments | Document Link |
|---|---|---|---|---|
io.file.buffer.size specified in the given Configuration.
67 | *
68 | * @param in
69 | * input stream
70 | * @param conf
71 | * configuration
72 | */
73 | public LfLineReader(InputStream in, Configuration conf) throws IOException {
74 | this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE)); // delegates to the (stream, int) constructor; DEFAULT_BUFFER_SIZE is passed as getInt's default value
75 | }
76 |
77 | /**
78 | * Close the underlying stream.
79 | */
80 | public void close() throws IOException {
81 | in.close(); // only the wrapped stream is closed; the buffer fields are not touched here
82 | }
83 |
84 | /**
85 | * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF).
86 | * EOF also terminates an otherwise unterminated line.
87 | *
88 | * @param str
89 | * the object to store the given line (without newline)
90 | * @param maxLineLength
91 | * the maximum number of bytes to store into str; the rest of the line is silently
92 | * discarded.
93 | * @param maxBytesToConsume
94 | * the maximum number of bytes to consume in this call. This is only a hint, because if
95 | * the line crosses this threshold, we allow it to happen. It can overshoot potentially by
96 | * as much as one buffer length.
97 | *
98 | * @return the number of bytes read including the (longest) newline found.
99 | *
100 | * @throws IOException
101 | * if the underlying stream throws
102 | */
103 | public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
104 | /*
105 | * We're reading data from in, but the head of the stream may be already buffered in buffer, so
106 | * we have several cases: 1. No newline characters are in the buffer, so we need to copy
107 | * everything and read another buffer from the stream. 2. An unambiguously terminated line is in
108 | * buffer, so we just copy to str.
109 | */
110 | str.clear();
111 | int txtLength = 0; // tracks str.getLength(), as an optimization
112 | int newlineLength = 0; // length of terminating newline
113 | long bytesConsumed = 0; // kept as a long so the total can be range-checked against Integer.MAX_VALUE after the loop
114 | do {
115 | int startPosn = bufferPosn; // starting from where we left off the last time
116 | if (bufferPosn >= bufferLength) { // everything buffered so far has been scanned: refill from the stream
117 | startPosn = bufferPosn = 0;
118 | bufferLength = in.read(buffer);
119 | if (bufferLength <= 0) {
120 | break; // EOF
121 | }
122 | }
123 | for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
124 | if (buffer[bufferPosn] == LF) {
125 | newlineLength = 1;
126 | ++bufferPosn; // at next invocation proceed from following byte
127 | break;
128 | }
129 | }
130 | int readLength = bufferPosn - startPosn; // bytes scanned in this pass, including the LF if one was found
131 | bytesConsumed += readLength;
132 | int appendLength = readLength - newlineLength; // payload only: the LF itself is never copied into str
133 | if (appendLength > maxLineLength - txtLength) { // cap what is stored at maxLineLength; the excess is still counted as consumed
134 | appendLength = maxLineLength - txtLength;
135 | }
136 | if (appendLength > 0) {
137 | str.append(buffer, startPosn, appendLength);
138 | txtLength += appendLength;
139 | }
140 | } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); // stop at a newline, or once the consume hint has been reached
141 |
142 | if (bytesConsumed > Integer.MAX_VALUE) { // the total would not fit the int return type
143 | throw new IOException("Too many bytes before newline: " + bytesConsumed);
144 | }
145 | return (int) bytesConsumed;
146 | }
147 |
148 | /**
149 | * Read from the InputStream into the given Text.
150 | *
151 | * @param str
152 | * the object to store the given line
153 | * @param maxLineLength
154 | * the maximum number of bytes to store into str.
155 | * @return the number of bytes read including the newline
156 | * @throws IOException
157 | * if the underlying stream throws
158 | */
159 | public int readLine(Text str, int maxLineLength) throws IOException {
160 | return readLine(str, maxLineLength, Integer.MAX_VALUE); // delegates with Integer.MAX_VALUE as maxBytesToConsume
161 | }
162 |
163 | /**
164 | * Read from the InputStream into the given Text.
165 | *
166 | * @param str
167 | * the object to store the given line
168 | * @return the number of bytes read including the newline
169 | * @throws IOException
170 | * if the underlying stream throws
171 | */
172 | public int readLine(Text str) throws IOException {
173 | return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE); // delegates with Integer.MAX_VALUE for both maxLineLength and maxBytesToConsume
174 | }
175 |
176 | }
177 |
--------------------------------------------------------------------------------
/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/iterator/TextIndexTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.accumulo.examples.wikisearch.iterator;
18 |
19 | import java.util.ArrayList;
20 | import java.util.Collections;
21 | import java.util.List;
22 |
23 | import org.apache.accumulo.core.data.Key;
24 | import org.apache.accumulo.core.data.Value;
25 | import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight;
26 | import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.Builder;
27 | import org.junit.After;
28 | import org.junit.Assert;
29 | import org.junit.Before;
30 | import org.junit.Test;
31 |
32 | import com.google.protobuf.InvalidProtocolBufferException;
33 |
34 | public class TextIndexTest {
35 | private TextIndexCombiner combiner;
36 | private List
Wiki Search using Apache Accumulo
50 |This sample application demonstrates the ability to search documents using Apache Accumulo. The associated ingest software 51 | extracts the id, title, timestamp, and comments from each wikipedia article. In addition, the wikipedia text has been tokenized 52 | and is available for searching. You can enter a boolean expression into the search box below and select the particular set of 53 | wikipedia languages you want to search.
54 |Fields available for searching: 55 |
56 |- TEXT
57 | - ID
58 | - TITLE
59 | - TIMESTAMP
60 | - COMMENTS
61 |
62 |The search syntax is boolean logic, for example: TEXT == 'boy' and TITLE =~ 'Autism'. The supported operators are: 63 | ==, !=, <, >, ≤, ≥, =~, and !~. Likewise grouping can be performed using parentheses and predicates can be 64 | joined using and, or, and not. 65 |
To highlight the cell-level access control of Apache Accumulo, the "authorization" required for a particular cell is the language 66 | of the associated wikipedia article. 67 |