├── query ├── src │ ├── main │ │ ├── resources │ │ │ └── META-INF │ │ │ │ ├── MANIFEST.MF │ │ │ │ ├── ejb-jar.xml.uno │ │ │ │ └── ejb-jar.xml.example │ │ └── java │ │ │ └── org │ │ │ └── apache │ │ │ └── accumulo │ │ │ └── examples │ │ │ └── wikisearch │ │ │ ├── sample │ │ │ ├── Results.java │ │ │ ├── Field.java │ │ │ └── Document.java │ │ │ ├── query │ │ │ └── IQuery.java │ │ │ ├── util │ │ │ ├── FieldIndexKeyParser.java │ │ │ ├── KeyParser.java │ │ │ └── BaseKeyParser.java │ │ │ ├── function │ │ │ └── QueryFunctions.java │ │ │ ├── iterator │ │ │ ├── DefaultIteratorEnvironment.java │ │ │ ├── EvaluatingIterator.java │ │ │ └── OptimizedQueryIterator.java │ │ │ ├── parser │ │ │ ├── JexlOperatorConstants.java │ │ │ ├── TreeNode.java │ │ │ └── EventFields.java │ │ │ ├── logic │ │ │ └── ContentLogic.java │ │ │ └── jexl │ │ │ └── Arithmetic.java │ ├── assembly │ │ └── dist.xml │ └── test │ │ ├── java │ │ └── org │ │ │ └── apache │ │ │ └── accumulo │ │ │ └── examples │ │ │ └── wikisearch │ │ │ └── logic │ │ │ └── StandaloneStatusReporter.java │ │ └── hadoop1 │ │ └── org │ │ └── apache │ │ └── accumulo │ │ └── examples │ │ └── wikisearch │ │ └── logic │ │ └── TestQueryLogic.java └── pom.xml ├── .gitignore ├── NOTICE ├── ingest ├── src │ ├── main │ │ ├── protobuf │ │ │ ├── compile_protos.sh │ │ │ ├── TermWeight.proto │ │ │ └── Uid.proto │ │ └── java │ │ │ └── org │ │ │ └── apache │ │ │ └── accumulo │ │ │ └── examples │ │ │ └── wikisearch │ │ │ ├── normalizer │ │ │ ├── NoOpNormalizer.java │ │ │ ├── Normalizer.java │ │ │ └── LcNoDiacriticsNormalizer.java │ │ │ ├── ingest │ │ │ ├── LRUOutputCombiner.java │ │ │ ├── WikipediaPartitioner.java │ │ │ ├── WikipediaInputFormat.java │ │ │ ├── ArticleExtractor.java │ │ │ └── WikipediaConfiguration.java │ │ │ ├── iterator │ │ │ ├── GlobalIndexUidCombiner.java │ │ │ └── TextIndexCombiner.java │ │ │ ├── util │ │ │ └── TextUtil.java │ │ │ ├── output │ │ │ ├── SortingRFileOutputFormat.java │ │ │ └── BufferingRFileRecordWriter.java │ │ │ └── reader │ │ │ ├── LongLineRecordReader.java │ │ │ ├── LfLineReader.java │ │ │ └── AggregatingRecordReader.java │ ├── assembly │ │ └── dist.xml │ └── test │ │ └── java │ │ └── org │ │ └── apache │ │ └── accumulo │ │ └── examples │ │ └── wikisearch │ │ ├── ingest │ │ └── WikipediaInputSplitTest.java │ │ └── iterator │ │ ├── TextIndexTest.java │ │ └── GlobalIndexUidTest.java ├── conf │ ├── wikipedia.xml.uno │ ├── wikipedia.xml.example │ └── wikipedia_parallel.xml.example ├── bin │ ├── ingest.sh │ └── ingest_parallel.sh └── pom.xml ├── query-war ├── src │ └── main │ │ └── webapp │ │ ├── WEB-INF │ │ ├── jboss-web.xml │ │ └── web.xml │ │ ├── style.xsl │ │ └── ui.html └── pom.xml ├── .asf.yaml ├── CONTRIBUTING.md ├── .github └── workflows │ └── maven.yaml └── INSTALL.md /query/src/main/resources/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/target 2 | .idea 3 | **/*.iml 4 | **/lib 5 | .project 6 | .settings/ 7 | .classpath 8 | wikipedia.xml 9 | ejb-jar.xml 10 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Apache Accumulo Wikisearch 2 | Copyright 2011-2019 The Apache Software Foundation 3 | 4 | This product includes software developed at 5 | 
The Apache Software Foundation (http://www.apache.org/). 6 | -------------------------------------------------------------------------------- /ingest/src/main/protobuf/compile_protos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | for PROTO in `ls -1 *proto`; do protoc --java_out ../java $PROTO; done 20 | -------------------------------------------------------------------------------- /query-war/src/main/webapp/WEB-INF/jboss-web.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | /accumulo-wikisearch 20 | 21 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.normalizer; 18 | 19 | public class NoOpNormalizer implements Normalizer { 20 | public String normalizeFieldValue(String field, Object value) { 21 | return value.toString(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /.asf.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | # https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features 21 | 22 | github: 23 | description: "Apache Accumulo Wikisearch" 24 | homepage: https://accumulo.apache.org 25 | labels: 26 | - accumulo 27 | - big-data 28 | - hacktoberfest 29 | features: 30 | wiki: false 31 | issues: true 32 | projects: true 33 | 34 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 17 | 18 | # Contributing to the Accumulo Wikisearch application 19 | 20 | Contributions to the Accumulo Wikisearch application can be made by creating a pull 21 | request to this repo on GitHub. 22 | 23 | Before creating a pull request, run `mvn clean verify`. 24 | 25 | For general information on contributing to Accumulo projects, check out the 26 | [Accumulo Contributor guide][contribute]. 27 | 28 | [contribute]: https://accumulo.apache.org/contributor/ 29 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.normalizer; 18 | 19 | public interface Normalizer { 20 | 21 | /** 22 | * Creates normalized content for ingest based upon implemented logic. 
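 * 
 * A minimal sketch of a conforming implementation (the class name here is
 * hypothetical; see NoOpNormalizer and LcNoDiacriticsNormalizer in this
 * package for the real ones):
 * 
 *   public class TrimNormalizer implements Normalizer {
 *     public String normalizeFieldValue(String field, Object value) {
 *       // ignore the field name; normalize every value the same way
 *       return value.toString().trim();
 *     }
 *   }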
23 | * 24 | * @param field 25 | * The field being normalized 26 | * @param value 27 | * The value to normalize 28 | * @return a normalized value 29 | */ 30 | public String normalizeFieldValue(String field, Object value); 31 | 32 | } 33 | -------------------------------------------------------------------------------- /ingest/src/assembly/dist.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | dist 20 | 21 | tar.gz 22 | 23 | 24 | 25 | 26 | lib 27 | 0644 28 | 29 | 30 | bin 31 | 0744 32 | 33 | 34 | conf 35 | 0644 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /ingest/src/main/protobuf/TermWeight.proto: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one or more 2 | // contributor license agreements. See the NOTICE file distributed with 3 | // this work for additional information regarding copyright ownership. 4 | // The ASF licenses this file to You under the Apache License, Version 2.0 5 | // (the "License"); you may not use this file except in compliance with 6 | // the License. You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | // compile with protoc --java_out ../java 17 | // compile extra builder util with java accumulo.data.protobuf.builder.ProtoBufBuilder -d ../java accumulo.data.protobuf.UidList 18 | // classpath for compile command should include ../../../target/classes and protobuf-java-2.2.0.jar 19 | 20 | package org.apache.accumulo.examples.wikisearch.protobuf; 21 | 22 | option java_package = "org.apache.accumulo.examples.wikisearch.protobuf"; 23 | option optimize_for = SPEED; 24 | 25 | message Info { 26 | required float normalizedTermFrequency = 1; 27 | repeated uint32 wordOffset = 2; 28 | } 29 | -------------------------------------------------------------------------------- /ingest/src/main/protobuf/Uid.proto: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one or more 2 | // contributor license agreements. See the NOTICE file distributed with 3 | // this work for additional information regarding copyright ownership. 4 | // The ASF licenses this file to You under the Apache License, Version 2.0 5 | // (the "License"); you may not use this file except in compliance with 6 | // the License. You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
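// A sketch of how ingest code can build the generated message from Java; the
// builder methods follow from the field names declared below, and the UID
// string here is hypothetical (see GlobalIndexUidCombiner for real usage):
//
//   Uid.List.Builder b = Uid.List.newBuilder();
//   b.setIGNORE(false);
//   b.setCOUNT(1);
//   b.addUID("someDataType\u0000someUid");
//   byte[] bytes = b.build().toByteArray(); // stored as an Accumulo Value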
15 | 16 | // compile with protoc --java_out ../java 17 | // compile extra builder util with java accumulo.data.protobuf.builder.ProtoBufBuilder -d ../java accumulo.data.protobuf.UidList 18 | // classpath for compile command should include ../../../target/classes and protobuf-java-2.2.0.jar 19 | 20 | package org.apache.accumulo.examples.wikisearch.protobuf; 21 | 22 | option java_package = "org.apache.accumulo.examples.wikisearch.protobuf"; 23 | option optimize_for = SPEED; 24 | 25 | message List { 26 | required bool IGNORE = 1; 27 | required uint64 COUNT = 2; 28 | repeated string UID = 3; 29 | } 30 | -------------------------------------------------------------------------------- /query/src/assembly/dist.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | dist 20 | 21 | tar.gz 22 | 23 | 24 | 25 | 26 | lib 27 | lib 28 | 29 | ${project.name}-${project.version}.jar 30 | 31 | 0644 32 | 33 | 34 | 35 | 36 | target/${project.name}-${project.version}.jar 37 | deploy 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /ingest/conf/wikipedia.xml.uno: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | wikipedia.accumulo.zookeepers 21 | localhost:2181 22 | 23 | 24 | wikipedia.accumulo.instance_name 25 | uno 26 | 27 | 28 | wikipedia.accumulo.user 29 | root 30 | 31 | 32 | wikipedia.accumulo.password 33 | secret 34 | 35 | 36 | wikipedia.accumulo.table 37 | wikipedia 38 | 39 | 40 | wikipedia.ingest.partitions 41 | 1 42 | 43 | 44 | -------------------------------------------------------------------------------- /ingest/conf/wikipedia.xml.example: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | wikipedia.accumulo.zookeepers 21 | 22 | 23 | 24 | wikipedia.accumulo.instance_name 25 | 26 | 27 | 28 | wikipedia.accumulo.user 29 | 30 | 31 | 32 | wikipedia.accumulo.password 33 | 34 | 35 | 36 | wikipedia.accumulo.table 37 | 38 | 39 | 40 | wikipedia.ingest.partitions 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /ingest/bin/ingest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
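# Usage (a sketch): ./ingest.sh <hdfs-input-dir>
# where <hdfs-input-dir> holds the Wikipedia XML dump in HDFS; it is passed to
# the M/R job below as -Dwikipedia.input. The loop and sed pipeline below keep
# two views of the same jar list: a colon-separated CLASSPATH for the local JVM
# and a comma-separated LIBJARS for Hadoop's -libjars flag, e.g.
#   CLASSPATH=":lib/a.jar:lib/b.jar"  ->  LIBJARS="lib/a.jar,lib/b.jar"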
17 | 18 | 19 | 20 | THIS_SCRIPT="$0" 21 | SCRIPT_DIR="${THIS_SCRIPT%/*}" 22 | SCRIPT_DIR=`cd $SCRIPT_DIR ; pwd` 23 | echo $SCRIPT_DIR 24 | 25 | # 26 | # Add our jars 27 | # 28 | for f in $SCRIPT_DIR/../lib/*.jar; do 29 | CLASSPATH=${CLASSPATH}:$f 30 | done 31 | 32 | # 33 | # Transform the classpath into a comma-separated list also 34 | # 35 | LIBJARS=`echo $CLASSPATH | sed 's/^://' | sed 's/:/,/g'` 36 | 37 | 38 | # 39 | # Map/Reduce job 40 | # 41 | JAR=$SCRIPT_DIR/../lib/wikisearch-ingest-*.jar 42 | CONF=$SCRIPT_DIR/../conf/wikipedia.xml 43 | HDFS_DATA_DIR=$1 44 | export HADOOP_CLASSPATH=$CLASSPATH 45 | echo "hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}" 46 | hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR} 47 | -------------------------------------------------------------------------------- /.github/workflows/maven.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | # 19 | 20 | # This workflow will build a Java project with Maven 21 | # See also: 22 | # https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 23 | 24 | name: mvn 25 | 26 | on: 27 | push: 28 | branches: [ '*' ] 29 | pull_request: 30 | branches: [ '*' ] 31 | 32 | jobs: 33 | # fast build to populate the local maven repository cache 34 | verify: 35 | runs-on: ubuntu-latest 36 | steps: 37 | - uses: actions/checkout@v4 38 | - name: Set up JDK 17 39 | uses: actions/setup-java@v4 40 | with: 41 | distribution: adopt 42 | java-version: 17 43 | cache: 'maven' 44 | - name: Show the first log message 45 | run: git log -n1 46 | - name: Build with Maven 47 | timeout-minutes: 5 48 | run: mvn -B -V -e -ntp "-Dstyle.color=always" clean verify 49 | env: 50 | MAVEN_OPTS: -Djansi.force=true 51 | 52 | -------------------------------------------------------------------------------- /query-war/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | org.apache.accumulo 22 | accumulo-wikisearch 23 | 2.0.0-SNAPSHOT 24 | 25 | 26 | wikisearch-query-war 27 | war 28 | wikisearch-query-war 29 | 30 | 31 | 32 | org.apache.maven.plugins 33 | maven-war-plugin 34 | 35 | true 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /ingest/bin/ingest_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
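# Usage (a sketch): ./ingest_parallel.sh <hdfs-input-dir>
# Same conventions as ingest.sh, but runs WikipediaPartitionedIngester against
# conf/wikipedia_parallel.xml. Note that the jar path below is pinned to a
# specific version, unlike the glob used in ingest.sh; it must match the
# artifact actually produced by the build.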
17 | 18 | 19 | 20 | THIS_SCRIPT="$0" 21 | SCRIPT_DIR="${THIS_SCRIPT%/*}" 22 | SCRIPT_DIR=`cd $SCRIPT_DIR ; pwd` 23 | echo $SCRIPT_DIR 24 | 25 | # 26 | # Add our jars 27 | # 28 | for f in $SCRIPT_DIR/../lib/*.jar; do 29 | CLASSPATH=${CLASSPATH}:$f 30 | done 31 | 32 | # 33 | # Transform the classpath into a comma-separated list also 34 | # 35 | LIBJARS=`echo $CLASSPATH | sed 's/^://' | sed 's/:/,/g'` 36 | 37 | 38 | # 39 | # Map/Reduce job 40 | # 41 | JAR=$SCRIPT_DIR/../lib/wikisearch-ingest-1.5.0.jar 42 | CONF=$SCRIPT_DIR/../conf/wikipedia_parallel.xml 43 | HDFS_DATA_DIR=$1 44 | export HADOOP_CLASSPATH=$CLASSPATH 45 | echo "hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}" 46 | hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR} 47 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/sample/Results.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.sample; 18 | 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | import javax.xml.bind.annotation.XmlAccessType; 23 | import javax.xml.bind.annotation.XmlAccessorType; 24 | import javax.xml.bind.annotation.XmlElement; 25 | import javax.xml.bind.annotation.XmlRootElement; 26 | 27 | @XmlRootElement 28 | @XmlAccessorType(XmlAccessType.FIELD) 29 | public class Results { 30 | 31 | @XmlElement 32 | private List document = new ArrayList(); 33 | 34 | public Results() { 35 | super(); 36 | } 37 | 38 | public List getResults() { 39 | return document; 40 | } 41 | 42 | public void setResults(List results) { 43 | this.document = results; 44 | } 45 | 46 | public int size() { 47 | if (null == document) 48 | return 0; 49 | else 50 | return document.size(); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/sample/Field.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.sample; 18 | 19 | import javax.xml.bind.annotation.XmlAccessType; 20 | import javax.xml.bind.annotation.XmlAccessorType; 21 | import javax.xml.bind.annotation.XmlAttribute; 22 | import javax.xml.bind.annotation.XmlValue; 23 | 24 | @XmlAccessorType(XmlAccessType.FIELD) 25 | public class Field { 26 | 27 | @XmlAttribute 28 | private String name = null; 29 | @XmlValue 30 | private String value = null; 31 | 32 | public Field() { 33 | super(); 34 | } 35 | 36 | public Field(String fieldName, String fieldValue) { 37 | super(); 38 | this.name = fieldName; 39 | this.value = fieldValue; 40 | } 41 | 42 | public String getFieldName() { 43 | return name; 44 | } 45 | 46 | public String getFieldValue() { 47 | return value; 48 | } 49 | 50 | public void setFieldName(String fieldName) { 51 | this.name = fieldName; 52 | } 53 | 54 | public void setFieldValue(String fieldValue) { 55 | this.value = fieldValue; 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/sample/Document.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.sample; 18 | 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | import javax.xml.bind.annotation.XmlAccessType; 23 | import javax.xml.bind.annotation.XmlAccessorType; 24 | import javax.xml.bind.annotation.XmlElement; 25 | 26 | @XmlAccessorType(XmlAccessType.FIELD) 27 | public class Document { 28 | 29 | @XmlElement 30 | private String id = null; 31 | 32 | @XmlElement 33 | private List field = new ArrayList(); 34 | 35 | public Document() { 36 | super(); 37 | } 38 | 39 | public Document(String id, List fields) { 40 | super(); 41 | this.id = id; 42 | this.field = fields; 43 | } 44 | 45 | public String getId() { 46 | return id; 47 | } 48 | 49 | public List getFields() { 50 | return field; 51 | } 52 | 53 | public void setId(String id) { 54 | this.id = id; 55 | } 56 | 57 | public void setFields(List fields) { 58 | this.field = fields; 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /query-war/src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 22 | 23 | 24 | resteasy.jndi.resources 25 | Query/local 26 | 27 | 28 | 29 | org.jboss.resteasy.plugins.server.servlet.ResteasyBootstrap 30 | 31 | 32 | 33 | Resteasy 34 | org.jboss.resteasy.plugins.server.servlet.HttpServletDispatcher 35 | 36 | 37 | 38 | Resteasy 39 | /rest/* 40 | 41 | 42 | 43 | resteasy.servlet.mapping.prefix 44 | /rest 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.normalizer; 18 | 19 | import java.text.Normalizer; 20 | import java.text.Normalizer.Form; 21 | import java.util.Locale; 22 | import java.util.regex.Matcher; 23 | import java.util.regex.Pattern; 24 | 25 | /** 26 | * An {@link Normalizer} which performs the following steps: 27 | *
    28 | *   1. Unicode canonical decomposition ({@link Form#NFD})
 29 | *   2. Removal of diacritical marks
 30 | *   3. Unicode canonical composition ({@link Form#NFC})
 31 | *   4. Lower-casing in the {@link Locale#ENGLISH English locale}
 32 | * 
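 * 
 * For example (a sketch): under these steps the input "Über Äpfel" becomes
 * "uber apfel".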
33 | */ 34 | public class LcNoDiacriticsNormalizer implements org.apache.accumulo.examples.wikisearch.normalizer.Normalizer { 35 | private static final Pattern diacriticals = Pattern.compile("\\p{InCombiningDiacriticalMarks}"); 36 | 37 | public String normalizeFieldValue(String fieldName, Object fieldValue) { 38 | String decomposed = Normalizer.normalize(fieldValue.toString(), Form.NFD); 39 | String noDiacriticals = removeDiacriticalMarks(decomposed); 40 | String recomposed = Normalizer.normalize(noDiacriticals, Form.NFC); 41 | return recomposed.toLowerCase(Locale.ENGLISH); 42 | } 43 | 44 | private String removeDiacriticalMarks(String str) { 45 | Matcher matcher = diacriticals.matcher(str); 46 | return matcher.replaceAll(""); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/query/IQuery.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.query; 18 | 19 | import javax.ws.rs.Consumes; 20 | import javax.ws.rs.GET; 21 | import javax.ws.rs.POST; 22 | import javax.ws.rs.Path; 23 | import javax.ws.rs.Produces; 24 | import javax.ws.rs.QueryParam; 25 | 26 | import org.apache.accumulo.examples.wikisearch.sample.Results; 27 | 28 | 29 | @Path("/Query") 30 | public interface IQuery { 31 | 32 | @GET 33 | @POST 34 | @Path("/html") 35 | @Consumes("*/*") 36 | public String html(@QueryParam("query") String query, @QueryParam("auths") String auths); 37 | 38 | @GET 39 | @POST 40 | @Path("/xml") 41 | @Consumes("*/*") 42 | @Produces("application/xml") 43 | public Results xml(@QueryParam("query") String query, @QueryParam("auths") String auths); 44 | 45 | @GET 46 | @POST 47 | @Path("/json") 48 | @Consumes("*/*") 49 | @Produces("application/json") 50 | public Results json(@QueryParam("query") String query, @QueryParam("auths") String auths); 51 | 52 | @GET 53 | @POST 54 | @Path("/yaml") 55 | @Consumes("*/*") 56 | @Produces("text/x-yaml") 57 | public Results yaml(@QueryParam("query") String query, @QueryParam("auths") String auths); 58 | 59 | @GET 60 | @POST 61 | @Path("/content") 62 | @Consumes("*/*") 63 | @Produces("application/xml") 64 | public Results content(@QueryParam("query") String query, @QueryParam("auths") String auths); 65 | 66 | } 67 | -------------------------------------------------------------------------------- /query-war/src/main/webapp/style.xsl: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | /accumulo-wikisearch/rest/Query/content?query=&auths=all 33 | 34 | 35 | 36 | 37 | _blank 38 | View Document 39 | 40 | 41 | 42 | 43 | 44 |
IdTitleTimestampCommentsDocument Link
45 | 46 |
47 |
48 | -------------------------------------------------------------------------------- /query/src/test/java/org/apache/accumulo/examples/wikisearch/logic/StandaloneStatusReporter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.logic; 18 | 19 | import org.apache.hadoop.mapreduce.Counter; 20 | import org.apache.hadoop.mapreduce.Counters; 21 | import org.apache.hadoop.mapreduce.StatusReporter; 22 | 23 | public class StandaloneStatusReporter extends StatusReporter { 24 | 25 | private Counters c = new Counters(); 26 | 27 | private long filesProcessed = 0; 28 | private long recordsProcessed = 0; 29 | 30 | public Counters getCounters() { 31 | return c; 32 | } 33 | 34 | @Override 35 | public Counter getCounter(Enum name) { 36 | return c.findCounter(name); 37 | } 38 | 39 | @Override 40 | public Counter getCounter(String group, String name) { 41 | return c.findCounter(group, name); 42 | } 43 | 44 | @Override 45 | public void progress() { 46 | // do nothing 47 | } 48 | 49 | @Override 50 | public void setStatus(String status) { 51 | // do nothing 52 | } 53 | 54 | public long getFilesProcessed() { 55 | return filesProcessed; 56 | } 57 | 58 | public long getRecordsProcessed() { 59 | return recordsProcessed; 60 | } 61 | 62 | public void incrementFilesProcessed() { 63 | filesProcessed++; 64 | recordsProcessed = 0; 65 | } 66 | 67 | public void incrementRecordsProcessed() { 68 | recordsProcessed++; 69 | } 70 | 71 | public float getProgress() { 72 | return 0; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/LRUOutputCombiner.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.ingest; 18 | 19 | import java.util.LinkedHashMap; 20 | import java.util.Map; 21 | 22 | public class LRUOutputCombiner extends LinkedHashMap { 23 | 24 | private static final long serialVersionUID = 1L; 25 | 26 | public static abstract class Fold { 27 | public abstract Value fold(Value oldValue, Value newValue); 28 | } 29 | 30 | public static abstract class Output { 31 | public abstract void output(Key key, Value value); 32 | } 33 | 34 | private final int capacity; 35 | private final Fold fold; 36 | private final Output output; 37 | 38 | private long cacheHits = 0; 39 | private long cacheMisses = 0; 40 | 41 | public LRUOutputCombiner(int capacity, Fold fold, Output output) { 42 | super(capacity + 1, 1.1f, true); 43 | this.capacity = capacity; 44 | this.fold = fold; 45 | this.output = output; 46 | } 47 | 48 | protected boolean removeEldestEntry(Map.Entry eldest) { 49 | if (size() > capacity) { 50 | output.output(eldest.getKey(), eldest.getValue()); 51 | return true; 52 | } 53 | return false; 54 | } 55 | 56 | @Override 57 | public Value put(Key key, Value value) { 58 | Value val = get(key); 59 | if (val != null) { 60 | value = fold.fold(val, value); 61 | cacheHits++; 62 | } else { 63 | cacheMisses++; 64 | } 65 | super.put(key, value); 66 | return null; 67 | } 68 | 69 | public void flush() { 70 | for (Map.Entry e : entrySet()) { 71 | output.output(e.getKey(), e.getValue()); 72 | } 73 | clear(); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/util/FieldIndexKeyParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.util; 18 | 19 | import org.apache.accumulo.core.data.Key; 20 | 21 | public class FieldIndexKeyParser extends KeyParser { 22 | 23 | public static final String DELIMITER = "\0"; 24 | 25 | @Override 26 | public void parse(Key key) { 27 | super.parse(key); 28 | 29 | String[] colFamParts = this.keyFields.get(BaseKeyParser.COLUMN_FAMILY_FIELD).split(DELIMITER); 30 | this.keyFields.put(FIELDNAME_FIELD, colFamParts.length >= 2 ? colFamParts[1] : ""); 31 | 32 | String[] colQualParts = this.keyFields.get(BaseKeyParser.COLUMN_QUALIFIER_FIELD).split(DELIMITER); 33 | this.keyFields.put(SELECTOR_FIELD, colQualParts.length >= 1 ? colQualParts[0] : ""); 34 | this.keyFields.put(DATATYPE_FIELD, colQualParts.length >= 2 ? colQualParts[1] : ""); 35 | this.keyFields.put(UID_FIELD, colQualParts.length >= 3 ? 
colQualParts[2] : ""); 36 | } 37 | 38 | @Override 39 | public BaseKeyParser duplicate() { 40 | return new FieldIndexKeyParser(); 41 | } 42 | 43 | @Override 44 | public String getSelector() { 45 | return keyFields.get(SELECTOR_FIELD); 46 | } 47 | 48 | @Override 49 | public String getDataType() { 50 | return keyFields.get(DATATYPE_FIELD); 51 | } 52 | 53 | @Override 54 | public String getFieldName() { 55 | return keyFields.get(FIELDNAME_FIELD); 56 | } 57 | 58 | @Override 59 | public String getUid() { 60 | return keyFields.get(UID_FIELD); 61 | } 62 | 63 | public String getDataTypeUid() { 64 | return getDataType() + DELIMITER + getUid(); 65 | } 66 | 67 | // An alias for getSelector 68 | public String getFieldValue() { 69 | return getSelector(); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/function/QueryFunctions.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.function; 18 | 19 | import org.apache.commons.lang.math.NumberUtils; 20 | import org.apache.log4j.Logger; 21 | 22 | public class QueryFunctions { 23 | 24 | protected static Logger log = Logger.getLogger(QueryFunctions.class); 25 | 26 | public static boolean between(String fieldValue, double left, double right) { 27 | try { 28 | Double value = Double.parseDouble(fieldValue); 29 | if (value >= left && value <= right) 30 | return true; 31 | return false; 32 | } catch (NumberFormatException nfe) { 33 | return false; 34 | } 35 | } 36 | 37 | public static boolean between(String fieldValue, long left, long right) { 38 | try { 39 | Long value = Long.parseLong(fieldValue); 40 | if (value >= left && value <= right) 41 | return true; 42 | return false; 43 | } catch (NumberFormatException nfe) { 44 | return false; 45 | } 46 | } 47 | 48 | public static Number abs(String fieldValue) { 49 | Number retval = null; 50 | try { 51 | Number value = NumberUtils.createNumber(fieldValue); 52 | if (null == value) 53 | retval = (Number) Integer.MIN_VALUE; 54 | else if (value instanceof Long) 55 | retval = Math.abs(value.longValue()); 56 | else if (value instanceof Double) 57 | retval = Math.abs(value.doubleValue()); 58 | else if (value instanceof Float) 59 | retval = Math.abs(value.floatValue()); 60 | else if (value instanceof Integer) 61 | retval = Math.abs(value.intValue()); 62 | } catch (NumberFormatException nfe) { 63 | return (Number) Integer.MIN_VALUE; 64 | } 65 | return retval; 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/util/KeyParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.util; 18 | 19 | import org.apache.accumulo.core.data.Key; 20 | 21 | public class KeyParser extends BaseKeyParser { 22 | public static final String SELECTOR_FIELD = "selector"; 23 | public static final String DATATYPE_FIELD = "dataType"; 24 | public static final String FIELDNAME_FIELD = "fieldName"; 25 | public static final String UID_FIELD = "uid"; 26 | public static final String DELIMITER = "\0"; 27 | 28 | @Override 29 | public void parse(Key key) { 30 | super.parse(key); 31 | 32 | String[] colFamParts = this.keyFields.get(BaseKeyParser.COLUMN_FAMILY_FIELD).split(DELIMITER); 33 | this.keyFields.put(FIELDNAME_FIELD, colFamParts.length >= 2 ? 
colFamParts[1] : ""); 34 | 35 | String[] colQualParts = this.keyFields.get(BaseKeyParser.COLUMN_QUALIFIER_FIELD).split(DELIMITER); 36 | this.keyFields.put(SELECTOR_FIELD, colQualParts.length >= 1 ? colQualParts[0] : ""); 37 | this.keyFields.put(DATATYPE_FIELD, colQualParts.length >= 2 ? colQualParts[1] : ""); 38 | this.keyFields.put(UID_FIELD, colQualParts.length >= 3 ? colQualParts[2] : ""); 39 | } 40 | 41 | @Override 42 | public BaseKeyParser duplicate() { 43 | return new KeyParser(); 44 | } 45 | 46 | public String getSelector() { 47 | return keyFields.get(SELECTOR_FIELD); 48 | } 49 | 50 | public String getDataType() { 51 | return keyFields.get(DATATYPE_FIELD); 52 | } 53 | 54 | public String getFieldName() { 55 | return keyFields.get(FIELDNAME_FIELD); 56 | } 57 | 58 | public String getUid() { 59 | return keyFields.get(UID_FIELD); 60 | } 61 | 62 | public String getDataTypeUid() { 63 | return getDataType() + DELIMITER + getUid(); 64 | } 65 | 66 | // An alias for getSelector 67 | public String getFieldValue() { 68 | return getSelector(); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/util/BaseKeyParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.util; 18 | 19 | import java.util.HashMap; 20 | import java.util.Map; 21 | 22 | import org.apache.accumulo.core.data.Key; 23 | 24 | public class BaseKeyParser { 25 | public static final String ROW_FIELD = "row"; 26 | public static final String COLUMN_FAMILY_FIELD = "columnFamily"; 27 | public static final String COLUMN_QUALIFIER_FIELD = "columnQualifier"; 28 | 29 | protected Map keyFields = new HashMap<>(); 30 | protected Key key = null; 31 | 32 | /** 33 | * Parses a Key object into its constituent fields. This method clears any prior values, so the 34 | * object can be reused without requiring a new instantiation. This default implementation makes 35 | * the row, columnFamily, and columnQualifier available. 
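 * 
 * A minimal usage sketch (the row/family/qualifier values are hypothetical):
 * 
 *   BaseKeyParser parser = new BaseKeyParser();
 *   parser.parse(new Key("row1", "colFam", "colQual"));
 *   parser.getRow();             // "row1"
 *   parser.getColumnFamily();    // "colFam"
 *   parser.getColumnQualifier(); // "colQual"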
36 | */ 37 | public void parse(Key key) { 38 | this.key = key; 39 | 40 | keyFields.clear(); 41 | 42 | keyFields.put(ROW_FIELD, key.getRow().toString()); 43 | keyFields.put(COLUMN_FAMILY_FIELD, key.getColumnFamily().toString()); 44 | keyFields.put(COLUMN_QUALIFIER_FIELD, key.getColumnQualifier().toString()); 45 | } 46 | 47 | public String getFieldValue(String fieldName) { 48 | return keyFields.get(fieldName); 49 | } 50 | 51 | public String[] getFieldNames() { 52 | String[] fieldNames = new String[keyFields.size()]; 53 | return keyFields.keySet().toArray(fieldNames); 54 | } 55 | 56 | public BaseKeyParser duplicate() { 57 | return new BaseKeyParser(); 58 | } 59 | 60 | public String getRow() { 61 | return keyFields.get(ROW_FIELD); 62 | } 63 | 64 | public String getColumnFamily() { 65 | return keyFields.get(COLUMN_FAMILY_FIELD); 66 | } 67 | 68 | public String getColumnQualifier() { 69 | return keyFields.get(COLUMN_QUALIFIER_FIELD); 70 | } 71 | 72 | public Key getKey() { 73 | return this.key; 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /query/src/main/resources/META-INF/ejb-jar.xml.uno: -------------------------------------------------------------------------------- 1 | 2 | 18 | 22 | 23 | 24 | Query 25 | 26 | instanceName 27 | java.lang.String 28 | uno 29 | 30 | 31 | zooKeepers 32 | java.lang.String 33 | localhost:2181 34 | 35 | 36 | username 37 | java.lang.String 38 | root 39 | 40 | 41 | password 42 | java.lang.String 43 | secret 44 | 45 | 46 | tableName 47 | java.lang.String 48 | wikipedia 49 | 50 | 51 | partitions 52 | java.lang.Integer 53 | 100 54 | 55 | 56 | threads 57 | java.lang.Integer 58 | 8 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /query/src/main/resources/META-INF/ejb-jar.xml.example: -------------------------------------------------------------------------------- 1 | 2 | 18 | 22 | 23 | 24 | Query 25 | 26 | instanceName 27 | java.lang.String 28 | 29 | 30 | 31 | zooKeepers 32 | java.lang.String 33 | 34 | 35 | 36 | username 37 | java.lang.String 38 | 39 | 40 | 41 | password 42 | java.lang.String 43 | 44 | 45 | 46 | tableName 47 | java.lang.String 48 | wiki 49 | 50 | 51 | partitions 52 | java.lang.Integer 53 | 100 54 | 55 | 56 | threads 57 | java.lang.Integer 58 | 8 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputSplitTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.ingest; 18 | 19 | import java.io.ByteArrayInputStream; 20 | import java.io.ByteArrayOutputStream; 21 | import java.io.DataInput; 22 | import java.io.IOException; 23 | import java.io.ObjectInputStream; 24 | import java.io.ObjectOutputStream; 25 | 26 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit; 27 | import org.apache.hadoop.fs.Path; 28 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 29 | import org.junit.Assert; 30 | import org.junit.Test; 31 | 32 | public class WikipediaInputSplitTest { 33 | @Test 34 | public void testSerialization() throws IOException { 35 | Path testPath = new Path("/foo/bar"); 36 | String[] hosts = new String[2]; 37 | hosts[0] = "abcd"; 38 | hosts[1] = "efgh"; 39 | FileSplit fSplit = new FileSplit(testPath, 1, 2, hosts); 40 | WikipediaInputSplit split = new WikipediaInputSplit(fSplit, 7); 41 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 42 | ObjectOutputStream out = new ObjectOutputStream(baos); 43 | split.write(out); 44 | out.close(); 45 | baos.close(); 46 | 47 | ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); 48 | DataInput in = new ObjectInputStream(bais); 49 | 50 | WikipediaInputSplit split2 = new WikipediaInputSplit(); 51 | split2.readFields(in); 52 | Assert.assertTrue(bais.available() == 0); 53 | bais.close(); 54 | 55 | Assert.assertTrue(split.getPartition() == split2.getPartition()); 56 | 57 | FileSplit fSplit2 = split2.getFileSplit(); 58 | Assert.assertTrue(fSplit.getPath().equals(fSplit2.getPath())); 59 | Assert.assertTrue(fSplit.getStart() == fSplit2.getStart()); 60 | Assert.assertTrue(fSplit.getLength() == fSplit2.getLength()); 61 | 62 | String[] hosts2 = fSplit2.getLocations(); 63 | Assert.assertEquals(hosts.length, hosts2.length); 64 | for (int i = 0; i < hosts.length; i++) { 65 | Assert.assertEquals(hosts[i], hosts2[i]); 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /ingest/conf/wikipedia_parallel.xml.example: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | wikipedia.accumulo.zookeepers 21 | 22 | 23 | 24 | wikipedia.accumulo.instance_name 25 | 26 | 27 | 28 | wikipedia.accumulo.user 29 | 30 | 31 | 32 | wikipedia.accumulo.password 33 | 34 | 35 | 36 | wikipedia.accumulo.table 37 | 38 | 39 | 40 | wikipedia.ingest.partitions 41 | 42 | 43 | 44 | wikipedia.partitioned.directory 45 | 46 | 47 | 48 | wikipedia.ingest.groups 49 | 50 | 51 | 52 | wikipedia.run.partitioner 53 | 54 | 55 | 56 | wikipedia.run.ingest 57 | 58 | 59 | 60 | wikipedia.bulk.ingest 61 | 62 | 63 | 64 | wikipedia.bulk.ingest.dir 65 | 66 | 67 | 68 | wikipedia.bulk.ingest.failure.dir 69 | 70 | 71 | 72 | wikipedia.bulk.ingest.buffer.size 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/DefaultIteratorEnvironment.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.accumulo.core.client.sample.SamplerConfiguration; 22 | import org.apache.accumulo.core.conf.AccumuloConfiguration; 23 | import org.apache.accumulo.core.conf.DefaultConfiguration; 24 | import org.apache.accumulo.core.data.Key; 25 | import org.apache.accumulo.core.data.Value; 26 | import org.apache.accumulo.core.iterators.IteratorEnvironment; 27 | import org.apache.accumulo.core.iterators.IteratorUtil.IteratorScope; 28 | import org.apache.accumulo.core.iterators.SortedKeyValueIterator; 29 | import org.apache.accumulo.core.iterators.system.MapFileIterator; 30 | import org.apache.accumulo.core.security.Authorizations; 31 | import org.apache.hadoop.conf.Configuration; 32 | import org.apache.hadoop.fs.FileSystem; 33 | 34 | public class DefaultIteratorEnvironment implements IteratorEnvironment { 35 | 36 | AccumuloConfiguration conf; 37 | 38 | public DefaultIteratorEnvironment() { 39 | this.conf = DefaultConfiguration.getInstance(); 40 | } 41 | 42 | @Override 43 | public SortedKeyValueIterator reserveMapFileReader(String mapFileName) 44 | throws IOException { 45 | Configuration conf = new Configuration(); 46 | FileSystem fs = FileSystem.get(conf); 47 | return new MapFileIterator(fs, mapFileName, conf); 48 | } 49 | 50 | @Override 51 | public AccumuloConfiguration getConfig() { 52 | return conf; 53 | } 54 | 55 | @Override 56 | public boolean isSamplingEnabled() { 57 | return false; 58 | } 59 | 60 | @Override 61 | public IteratorScope getIteratorScope() { 62 | throw new UnsupportedOperationException(); 63 | } 64 | 65 | @Override 66 | public boolean isFullMajorCompaction() { 67 | throw new UnsupportedOperationException(); 68 | } 69 | 70 | @Override 71 | public void registerSideChannel(SortedKeyValueIterator iter) { 72 | throw new UnsupportedOperationException(); 73 | } 74 | 75 | @Override 76 | public Authorizations getAuthorizations() { 77 | throw new UnsupportedOperationException(); 78 | } 79 | 80 | @Override 81 | public SamplerConfiguration getSamplerConfiguration() { 82 | throw new UnsupportedOperationException(); 83 | } 84 | 85 | @Override 86 | public IteratorEnvironment cloneWithSamplingEnabled() { 87 | throw new UnsupportedOperationException(); 88 | } 89 | 90 | @Override 91 | public boolean isUserCompaction() { 92 | return false; 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/GlobalIndexUidCombiner.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import java.io.IOException; 20 | import java.util.HashSet; 21 | import java.util.Iterator; 22 | import java.util.Map; 23 | 24 | import org.apache.accumulo.core.client.lexicoder.Encoder; 25 | import org.apache.accumulo.core.data.Key; 26 | import org.apache.accumulo.core.data.Value; 27 | import org.apache.accumulo.core.iterators.IteratorEnvironment; 28 | import org.apache.accumulo.core.iterators.SortedKeyValueIterator; 29 | import org.apache.accumulo.core.iterators.TypedValueCombiner; 30 | import org.apache.accumulo.core.iterators.ValueFormatException; 31 | import org.apache.accumulo.examples.wikisearch.protobuf.Uid; 32 | 33 | import com.google.protobuf.InvalidProtocolBufferException; 34 | 35 | /** 36 | * 37 | */ 38 | public class GlobalIndexUidCombiner extends TypedValueCombiner { 39 | public static final Encoder UID_LIST_ENCODER = new UidListEncoder(); 40 | public static final int MAX = 20; 41 | 42 | @Override 43 | public void init(SortedKeyValueIterator source, Map options, 44 | IteratorEnvironment env) throws IOException { 45 | super.init(source, options, env); 46 | setEncoder(UID_LIST_ENCODER); 47 | } 48 | 49 | @Override 50 | public Uid.List typedReduce(Key key, Iterator iter) { 51 | Uid.List.Builder builder = Uid.List.newBuilder(); 52 | HashSet uids = new HashSet<>(); 53 | boolean seenIgnore = false; 54 | long count = 0; 55 | while (iter.hasNext()) { 56 | Uid.List v = iter.next(); 57 | if (null == v) 58 | continue; 59 | count = count + v.getCOUNT(); 60 | if (v.getIGNORE()) { 61 | seenIgnore = true; 62 | } 63 | uids.addAll(v.getUIDList()); 64 | } 65 | // Special case logic 66 | // If we have aggregated more than MAX UIDs, then null out the UID list and set IGNORE to true 67 | // However, always maintain the count 68 | builder.setCOUNT(count); 69 | if (uids.size() > MAX || seenIgnore) { 70 | builder.setIGNORE(true); 71 | builder.clearUID(); 72 | } else { 73 | builder.setIGNORE(false); 74 | builder.addAllUID(uids); 75 | } 76 | return builder.build(); 77 | } 78 | 79 | public static class UidListEncoder implements Encoder { 80 | @Override 81 | public byte[] encode(Uid.List v) { 82 | return v.toByteArray(); 83 | } 84 | 85 | @Override 86 | public Uid.List decode(byte[] b) { 87 | if (b.length == 0) 88 | return null; 89 | try { 90 | return Uid.List.parseFrom(b); 91 | } catch (InvalidProtocolBufferException e) { 92 | throw new ValueFormatException("Value passed to aggregator was not of type Uid.List"); 93 | } 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitioner.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | /** 18 | * 19 | */ 20 | package org.apache.accumulo.examples.wikisearch.ingest; 21 | 22 | 23 | import java.io.ByteArrayInputStream; 24 | import java.io.IOException; 25 | import java.io.InputStreamReader; 26 | import java.nio.charset.Charset; 27 | import java.util.regex.Matcher; 28 | import java.util.regex.Pattern; 29 | 30 | import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article; 31 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit; 32 | import org.apache.hadoop.conf.Configuration; 33 | import org.apache.hadoop.io.LongWritable; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.mapreduce.Mapper; 36 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 37 | 38 | public class WikipediaPartitioner extends Mapper { 39 | 40 | // private static final Logger log = Logger.getLogger(WikipediaPartitioner.class); 41 | 42 | public final static Charset UTF8 = Charset.forName("UTF-8"); 43 | public static final String DOCUMENT_COLUMN_FAMILY = "d"; 44 | public static final String METADATA_EVENT_COLUMN_FAMILY = "e"; 45 | public static final String METADATA_INDEX_COLUMN_FAMILY = "i"; 46 | public static final String TOKENS_FIELD_NAME = "TEXT"; 47 | 48 | private final static Pattern languagePattern = Pattern.compile("([a-z_]+).*.xml(.bz2)?"); 49 | 50 | private ArticleExtractor extractor; 51 | private String language; 52 | 53 | private int myGroup = -1; 54 | private int numGroups = -1; 55 | 56 | @Override 57 | public void setup(Context context) { 58 | Configuration conf = context.getConfiguration(); 59 | 60 | WikipediaInputSplit wiSplit = (WikipediaInputSplit)context.getInputSplit(); 61 | myGroup = wiSplit.getPartition(); 62 | numGroups = WikipediaConfiguration.getNumGroups(conf); 63 | 64 | FileSplit split = wiSplit.getFileSplit(); 65 | String fileName = split.getPath().getName(); 66 | Matcher matcher = languagePattern.matcher(fileName); 67 | if (matcher.matches()) { 68 | language = matcher.group(1).replace('_', '-').toLowerCase(); 69 | } else { 70 | throw new RuntimeException("Unknown ingest language! 
" + fileName); 71 | } 72 | extractor = new ArticleExtractor(); 73 | } 74 | 75 | @Override 76 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 77 | Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes()), UTF8)); 78 | if (article != null) { 79 | int groupId = WikipediaMapper.getPartitionId(article, numGroups); 80 | if(groupId != myGroup) 81 | return; 82 | context.write(new Text(language), article); 83 | } else { 84 | context.getCounter("wikipedia", "invalid articles").increment(1); 85 | context.progress(); 86 | } 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/util/TextUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.util; 18 | 19 | import java.nio.ByteBuffer; 20 | import java.nio.charset.CharacterCodingException; 21 | 22 | import org.apache.accumulo.core.iterators.user.SummingCombiner; 23 | import org.apache.hadoop.io.Text; 24 | 25 | public class TextUtil { 26 | 27 | /** 28 | * Appends a null byte followed by the UTF-8 bytes of the given string to the given {@link Text} 29 | * 30 | * @param text 31 | * the Text to which to append 32 | * @param string 33 | * the String to append 34 | */ 35 | public static void textAppend(Text text, String string) { 36 | appendNullByte(text); 37 | textAppendNoNull(text, string); 38 | } 39 | 40 | public static void textAppend(Text text, String string, boolean replaceBadChar) { 41 | appendNullByte(text); 42 | textAppendNoNull(text, string, replaceBadChar); 43 | } 44 | 45 | public static void textAppend(Text t, long s) { 46 | t.append(nullByte, 0, 1); 47 | t.append(SummingCombiner.FIXED_LEN_ENCODER.encode(s), 0, 8); 48 | } 49 | 50 | private static final byte[] nullByte = {0}; 51 | 52 | /** 53 | * Appends a null byte to the given text 54 | * 55 | * @param text 56 | * the text to which to append the null byte 57 | */ 58 | public static void appendNullByte(Text text) { 59 | text.append(nullByte, 0, nullByte.length); 60 | } 61 | 62 | /** 63 | * Appends the UTF-8 bytes of the given string to the given {@link Text} 64 | * 65 | * @param t 66 | * the Text to which to append 67 | * @param s 68 | * the String to append 69 | */ 70 | public static void textAppendNoNull(Text t, String s) { 71 | textAppendNoNull(t, s, false); 72 | } 73 | 74 | /** 75 | * Appends the UTF-8 bytes of the given string to the given {@link Text} 76 | */ 77 | public static void textAppendNoNull(Text t, String s, boolean replaceBadChar) { 78 | try { 
79 | ByteBuffer buffer = Text.encode(s, replaceBadChar); 80 | t.append(buffer.array(), 0, buffer.limit()); 81 | } catch (CharacterCodingException cce) { 82 | throw new IllegalArgumentException(cce); 83 | } 84 | } 85 | 86 | /** 87 | * Converts the given string its UTF-8 bytes. This uses Hadoop's method for converting string to 88 | * UTF-8 and is much faster than calling {@link String#getBytes(String)}. 89 | * 90 | * @param string 91 | * the string to convert 92 | * @return the UTF-8 representation of the string 93 | */ 94 | public static byte[] toUtf8(String string) { 95 | ByteBuffer buffer; 96 | try { 97 | buffer = Text.encode(string, false); 98 | } catch (CharacterCodingException cce) { 99 | throw new IllegalArgumentException(cce); 100 | } 101 | byte[] bytes = new byte[buffer.limit()]; 102 | System.arraycopy(buffer.array(), 0, bytes, 0, bytes.length); 103 | return bytes; 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TextIndexCombiner.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import java.io.IOException; 20 | import java.util.ArrayList; 21 | import java.util.Collections; 22 | import java.util.Iterator; 23 | import java.util.List; 24 | import java.util.Map; 25 | 26 | import org.apache.accumulo.core.client.lexicoder.Encoder; 27 | import org.apache.accumulo.core.data.Key; 28 | import org.apache.accumulo.core.data.Value; 29 | import org.apache.accumulo.core.iterators.IteratorEnvironment; 30 | import org.apache.accumulo.core.iterators.SortedKeyValueIterator; 31 | import org.apache.accumulo.core.iterators.TypedValueCombiner; 32 | import org.apache.accumulo.core.iterators.ValueFormatException; 33 | import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight; 34 | 35 | import com.google.protobuf.InvalidProtocolBufferException; 36 | 37 | /** 38 | * 39 | */ 40 | public class TextIndexCombiner extends TypedValueCombiner { 41 | public static final Encoder TERMWEIGHT_INFO_ENCODER = 42 | new TermWeightInfoEncoder(); 43 | 44 | @Override 45 | public TermWeight.Info typedReduce(Key key, Iterator iter) { 46 | TermWeight.Info.Builder builder = TermWeight.Info.newBuilder(); 47 | List offsets = new ArrayList<>(); 48 | float normalizedTermFrequency = 0f; 49 | 50 | while (iter.hasNext()) { 51 | TermWeight.Info info = iter.next(); 52 | if (null == info) 53 | continue; 54 | 55 | // Add each offset into the list maintaining sorted order 56 | for (int offset : info.getWordOffsetList()) { 57 | int pos = Collections.binarySearch(offsets, offset); 58 | 59 | if (pos < 0) { 60 | // Undo the transform on the insertion point 61 | offsets.add((-1 * pos) - 1, offset); 62 | } else { 63 | offsets.add(pos, offset); 64 | } 65 | } 66 | 67 | if (info.getNormalizedTermFrequency() > 0) { 68 | normalizedTermFrequency += info.getNormalizedTermFrequency(); 69 | } 70 | } 71 | 72 | // Keep the sorted order we tried to maintain 73 | for (Integer offset : offsets) { 74 | builder.addWordOffset(offset); 75 | } 76 | 77 | builder.setNormalizedTermFrequency(normalizedTermFrequency); 78 | return builder.build(); 79 | } 80 | 81 | @Override 82 | public void init(SortedKeyValueIterator source, Map options, 83 | IteratorEnvironment env) throws IOException { 84 | super.init(source, options, env); 85 | setEncoder(TERMWEIGHT_INFO_ENCODER); 86 | } 87 | 88 | public static class TermWeightInfoEncoder implements Encoder { 89 | @Override 90 | public byte[] encode(TermWeight.Info v) { 91 | return v.toByteArray(); 92 | } 93 | 94 | @Override 95 | public TermWeight.Info decode(byte[] b) { 96 | if (b.length == 0) 97 | return null; 98 | try { 99 | return TermWeight.Info.parseFrom(b); 100 | } catch (InvalidProtocolBufferException e) { 101 | throw new ValueFormatException( 102 | "Value passed to aggregator was not of type TermWeight.Info"); 103 | } 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /ingest/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | org.apache.accumulo 22 | accumulo-wikisearch 23 | 2.0.0-SNAPSHOT 24 | 25 | wikisearch-ingest 26 | wikisearch-ingest 27 | 28 | 29 | com.google.guava 30 | guava 31 | 32 | 33 | com.google.protobuf 34 | protobuf-java 35 | 36 | 37 | commons-codec 38 | commons-codec 39 | 40 | 41 | commons-lang 42 | commons-lang 43 | 44 | 45 | log4j 46 | log4j 47 | 48 | 49 | org.apache.accumulo 50 | accumulo-core 51 | 52 | 53 | commons-digester 54 | commons-digester 55 | 56 | 57 | 58 | 59 | 
org.apache.lucene 60 | lucene-analyzers-common 61 | 62 | 63 | org.apache.zookeeper 64 | zookeeper 65 | runtime 66 | 67 | 68 | junit 69 | junit 70 | test 71 | 72 | 73 | 74 | 75 | 76 | org.apache.maven.plugins 77 | maven-dependency-plugin 78 | 79 | 80 | copy-dependencies 81 | 82 | copy-dependencies 83 | 84 | prepare-package 85 | 86 | lib 87 | 88 | 89 | commons-lang,guava,lucene-core,lucene-analyzers,lucene-wikipedia,protobuf-java,accumulo-core,hadoop-core,libthrift,zookeeper,commons-codec,accumulo-fate,accumulo-trace 90 | false 91 | 92 | 93 | 94 | 95 | 96 | maven-assembly-plugin 97 | 98 | 99 | src/assembly/dist.xml 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/SortingRFileOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.output; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.accumulo.core.conf.AccumuloConfiguration; 22 | import org.apache.accumulo.core.data.Mutation; 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.io.Text; 26 | import org.apache.hadoop.mapreduce.JobContext; 27 | import org.apache.hadoop.mapreduce.OutputCommitter; 28 | import org.apache.hadoop.mapreduce.OutputFormat; 29 | import org.apache.hadoop.mapreduce.RecordWriter; 30 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 31 | 32 | public class SortingRFileOutputFormat extends OutputFormat { 33 | 34 | // private static final Logger log = Logger.getLogger(SortingRFileOutputFormat.class); 35 | 36 | public static final String PATH_NAME = "sortingrfileoutputformat.path"; 37 | public static final String MAX_BUFFER_SIZE = "sortingrfileoutputformat.max.buffer.size"; 38 | 39 | public static void setPathName(Configuration conf, String path) { 40 | conf.set(PATH_NAME, path); 41 | } 42 | 43 | public static String getPathName(Configuration conf) { 44 | return conf.get(PATH_NAME); 45 | } 46 | 47 | public static void setMaxBufferSize(Configuration conf, long maxBufferSize) { 48 | conf.setLong(MAX_BUFFER_SIZE, maxBufferSize); 49 | } 50 | 51 | public static long getMaxBufferSize(Configuration conf) { 52 | return conf.getLong(MAX_BUFFER_SIZE, -1); 53 | } 54 | 55 | @Override 56 | public void checkOutputSpecs(JobContext job) throws IOException, InterruptedException { 57 | // TODO make sure the path is writable? 
58 | // TODO make sure the max buffer size is set and is reasonable 59 | } 60 | 61 | @Override 62 | public OutputCommitter getOutputCommitter(TaskAttemptContext arg0) throws IOException, InterruptedException { 63 | return new OutputCommitter() { 64 | 65 | @Override 66 | public void setupTask(TaskAttemptContext arg0) throws IOException { 67 | // TODO Auto-generated method stub 68 | 69 | } 70 | 71 | @Override 72 | public void setupJob(JobContext arg0) throws IOException { 73 | // TODO Auto-generated method stub 74 | 75 | } 76 | 77 | @Override 78 | public boolean needsTaskCommit(TaskAttemptContext arg0) throws IOException { 79 | // TODO Auto-generated method stub 80 | return false; 81 | } 82 | 83 | @Override 84 | public void commitTask(TaskAttemptContext arg0) throws IOException { 85 | // TODO Auto-generated method stub 86 | 87 | } 88 | 89 | @Override 90 | public void cleanupJob(JobContext arg0) throws IOException { 91 | // TODO Auto-generated method stub 92 | 93 | } 94 | 95 | @Override 96 | public void abortTask(TaskAttemptContext arg0) throws IOException { 97 | // TODO Auto-generated method stub 98 | 99 | } 100 | }; 101 | } 102 | 103 | @Override 104 | public RecordWriter getRecordWriter(TaskAttemptContext attempt) throws IOException, InterruptedException { 105 | 106 | // grab the configuration 107 | final Configuration conf = attempt.getConfiguration(); 108 | // grab the max size 109 | final long maxSize = getMaxBufferSize(conf); 110 | 111 | return new BufferingRFileRecordWriter(maxSize, conf); 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/parser/JexlOperatorConstants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.parser; 18 | 19 | import java.util.Map; 20 | import java.util.concurrent.ConcurrentHashMap; 21 | import org.apache.commons.jexl2.parser.ASTAndNode; 22 | 23 | import org.apache.commons.jexl2.parser.ASTEQNode; 24 | import org.apache.commons.jexl2.parser.ASTERNode; 25 | import org.apache.commons.jexl2.parser.ASTFunctionNode; 26 | import org.apache.commons.jexl2.parser.ASTGENode; 27 | import org.apache.commons.jexl2.parser.ASTGTNode; 28 | import org.apache.commons.jexl2.parser.ASTLENode; 29 | import org.apache.commons.jexl2.parser.ASTLTNode; 30 | import org.apache.commons.jexl2.parser.ASTNENode; 31 | import org.apache.commons.jexl2.parser.ASTNRNode; 32 | import org.apache.commons.jexl2.parser.ASTOrNode; 33 | import org.apache.commons.jexl2.parser.JexlNode; 34 | import org.apache.commons.jexl2.parser.ParserTreeConstants; 35 | 36 | public class JexlOperatorConstants implements ParserTreeConstants { 37 | 38 | private static Map,String> operatorMap = new ConcurrentHashMap,String>(); 39 | private static Map> classMap = new ConcurrentHashMap>(); 40 | private static Map jjtOperatorMap = new ConcurrentHashMap(); 41 | private static Map jjtTypeMap = new ConcurrentHashMap(); 42 | 43 | static { 44 | operatorMap.put(ASTEQNode.class, "=="); 45 | operatorMap.put(ASTNENode.class, "!="); 46 | operatorMap.put(ASTLTNode.class, "<"); 47 | operatorMap.put(ASTLENode.class, "<="); 48 | operatorMap.put(ASTGTNode.class, ">"); 49 | operatorMap.put(ASTGENode.class, ">="); 50 | operatorMap.put(ASTERNode.class, "=~"); 51 | operatorMap.put(ASTNRNode.class, "!~"); 52 | operatorMap.put(ASTFunctionNode.class, "f"); 53 | operatorMap.put(ASTAndNode.class, "and"); 54 | operatorMap.put(ASTOrNode.class, "or"); 55 | 56 | classMap.put("==", ASTEQNode.class); 57 | classMap.put("!=", ASTNENode.class); 58 | classMap.put("<", ASTLTNode.class); 59 | classMap.put("<=", ASTLENode.class); 60 | classMap.put(">", ASTGTNode.class); 61 | classMap.put(">=", ASTGENode.class); 62 | classMap.put("=~", ASTERNode.class); 63 | classMap.put("!~", ASTNRNode.class); 64 | classMap.put("f", ASTFunctionNode.class); 65 | 66 | jjtOperatorMap.put(JJTEQNODE, "=="); 67 | jjtOperatorMap.put(JJTNENODE, "!="); 68 | jjtOperatorMap.put(JJTLTNODE, "<"); 69 | jjtOperatorMap.put(JJTLENODE, "<="); 70 | jjtOperatorMap.put(JJTGTNODE, ">"); 71 | jjtOperatorMap.put(JJTGENODE, ">="); 72 | jjtOperatorMap.put(JJTERNODE, "=~"); 73 | jjtOperatorMap.put(JJTNRNODE, "!~"); 74 | jjtOperatorMap.put(JJTFUNCTIONNODE, "f"); 75 | jjtOperatorMap.put(JJTANDNODE, "and"); 76 | jjtOperatorMap.put(JJTORNODE, "or"); 77 | 78 | jjtTypeMap.put("==", JJTEQNODE); 79 | jjtTypeMap.put("!=", JJTNENODE); 80 | jjtTypeMap.put("<", JJTLTNODE); 81 | jjtTypeMap.put("<=", JJTLENODE); 82 | jjtTypeMap.put(">", JJTGTNODE); 83 | jjtTypeMap.put(">=", JJTGENODE); 84 | jjtTypeMap.put("=~", JJTERNODE); 85 | jjtTypeMap.put("!~", JJTNRNODE); 86 | jjtTypeMap.put("f", JJTFUNCTIONNODE); 87 | 88 | } 89 | 90 | public static String getOperator(Class nodeType) { 91 | return operatorMap.get(nodeType); 92 | } 93 | 94 | public static String getOperator(Integer jjtNode) { 95 | return jjtOperatorMap.get(jjtNode); 96 | } 97 | 98 | public static Class getClass(String operator) { 99 | return classMap.get(operator); 100 | } 101 | 102 | public static int getJJTNodeType(String operator) { 103 | return jjtTypeMap.get(operator); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- 
/query/src/main/java/org/apache/accumulo/examples/wikisearch/logic/ContentLogic.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.logic; 18 | 19 | import java.util.List; 20 | import java.util.Map.Entry; 21 | import java.util.regex.Matcher; 22 | import java.util.regex.Pattern; 23 | 24 | import org.apache.accumulo.core.client.Connector; 25 | import org.apache.accumulo.core.client.Scanner; 26 | import org.apache.accumulo.core.client.TableNotFoundException; 27 | import org.apache.accumulo.core.data.Key; 28 | import org.apache.accumulo.core.data.Range; 29 | import org.apache.accumulo.core.data.Value; 30 | import org.apache.accumulo.core.security.Authorizations; 31 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaMapper; 32 | import org.apache.accumulo.examples.wikisearch.sample.Document; 33 | import org.apache.accumulo.examples.wikisearch.sample.Field; 34 | import org.apache.accumulo.examples.wikisearch.sample.Results; 35 | import org.apache.commons.codec.binary.Base64; 36 | import org.apache.commons.lang.StringUtils; 37 | import org.apache.log4j.Logger; 38 | 39 | 40 | /** 41 | * This query table implementation returns a Results object that contains documents from the wiki table. The query will contain the partition id, wikitype, and 42 | * UID so that we can seek directly to the document. The document is stored as base64 compressed binary in the Accumulo table. We will decompress the data so 43 | * that it is base64 encoded binary data in the Results object. 44 | * 45 | * The query that needs to be passed to the web service is: DOCUMENT:partitionId/wikitype/uid. 
46 | * 47 | */ 48 | public class ContentLogic { 49 | 50 | private static final Logger log = Logger.getLogger(ContentLogic.class); 51 | 52 | private static final String NULL_BYTE = "\u0000"; 53 | 54 | private String tableName = null; 55 | 56 | private Pattern queryPattern = Pattern.compile("^DOCUMENT:(.*)/(.*)/(.*)$"); 57 | 58 | public String getTableName() { 59 | return tableName; 60 | } 61 | 62 | public void setTableName(String tableName) { 63 | this.tableName = tableName; 64 | } 65 | 66 | public Results runQuery(Connector connector, String query, List authorizations) { 67 | 68 | Results results = new Results(); 69 | Authorizations auths = new Authorizations(StringUtils.join(authorizations, "|")); 70 | 71 | Matcher match = queryPattern.matcher(query); 72 | if (!match.matches()) { 73 | throw new IllegalArgumentException("Query does not match the pattern: DOCUMENT:partitionId/wikitype/uid, your query: " + query.toString()); 74 | } else { 75 | String partitionId = match.group(1); 76 | String wikitype = match.group(2); 77 | String id = match.group(3); 78 | 79 | log.debug("Received pieces: " + partitionId + ", " + wikitype + ", " + id); 80 | 81 | // Create the Range 82 | Key startKey = new Key(partitionId, WikipediaMapper.DOCUMENT_COLUMN_FAMILY, wikitype + NULL_BYTE + id); 83 | Key endKey = new Key(partitionId, WikipediaMapper.DOCUMENT_COLUMN_FAMILY, wikitype + NULL_BYTE + id + NULL_BYTE); 84 | Range r = new Range(startKey, true, endKey, false); 85 | 86 | log.debug("Setting range: " + r); 87 | 88 | try { 89 | Scanner scanner = connector.createScanner(this.getTableName(), auths); 90 | scanner.setRange(r); 91 | // This should in theory only match one thing. 92 | for (Entry entry : scanner) { 93 | Document doc = new Document(); 94 | doc.setId(id); 95 | Field val = new Field(); 96 | val.setFieldName("DOCUMENT"); 97 | val.setFieldValue(new String(Base64.decodeBase64(entry.getValue().toString()))); 98 | doc.getFields().add(val); 99 | results.getResults().add(doc); 100 | } 101 | } catch (TableNotFoundException e) { 102 | throw new RuntimeException("Table not found: " + this.getTableName(), e); 103 | } 104 | 105 | } 106 | return results; 107 | } 108 | 109 | } 110 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.ingest; 18 | 19 | import java.io.DataInput; 20 | import java.io.DataOutput; 21 | import java.io.IOException; 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | 25 | import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.io.LongWritable; 28 | import org.apache.hadoop.io.Text; 29 | import org.apache.hadoop.io.Writable; 30 | import org.apache.hadoop.mapreduce.InputSplit; 31 | import org.apache.hadoop.mapreduce.JobContext; 32 | import org.apache.hadoop.mapreduce.RecordReader; 33 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 34 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 35 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 36 | 37 | 38 | public class WikipediaInputFormat extends TextInputFormat { 39 | 40 | public static class WikipediaInputSplit extends InputSplit implements Writable { 41 | 42 | public WikipediaInputSplit(){} 43 | 44 | public WikipediaInputSplit(FileSplit fileSplit, int partition) 45 | { 46 | this.fileSplit = fileSplit; 47 | this.partition = partition; 48 | } 49 | 50 | private FileSplit fileSplit = null; 51 | private int partition = -1; 52 | 53 | public int getPartition() 54 | { 55 | return partition; 56 | } 57 | 58 | public FileSplit getFileSplit() 59 | { 60 | return fileSplit; 61 | } 62 | 63 | @Override 64 | public long getLength() throws IOException, InterruptedException { 65 | return fileSplit.getLength(); 66 | } 67 | 68 | @Override 69 | public String[] getLocations() throws IOException, InterruptedException { 70 | // for highly replicated files, returning all of the locations can lead to bunching 71 | // TODO replace this with a subset of the locations 72 | return fileSplit.getLocations(); 73 | } 74 | 75 | @Override 76 | public void readFields(DataInput in) throws IOException { 77 | Path file = new Path(in.readUTF()); 78 | long start = in.readLong(); 79 | long length = in.readLong(); 80 | String [] hosts = null; 81 | if(in.readBoolean()) 82 | { 83 | int numHosts = in.readInt(); 84 | hosts = new String[numHosts]; 85 | for(int i = 0; i < numHosts; i++) 86 | hosts[i] = in.readUTF(); 87 | } 88 | fileSplit = new FileSplit(file, start, length, hosts); 89 | partition = in.readInt(); 90 | } 91 | 92 | @Override 93 | public void write(DataOutput out) throws IOException { 94 | out.writeUTF(fileSplit.getPath().toString()); 95 | out.writeLong(fileSplit.getStart()); 96 | out.writeLong(fileSplit.getLength()); 97 | String [] hosts = fileSplit.getLocations(); 98 | if(hosts == null) 99 | { 100 | out.writeBoolean(false); 101 | } 102 | else 103 | { 104 | out.writeBoolean(true); 105 | out.writeInt(hosts.length); 106 | for(String host:hosts) 107 | out.writeUTF(host); 108 | } 109 | out.writeInt(partition); 110 | } 111 | 112 | } 113 | 114 | @Override 115 | public List getSplits(JobContext job) throws IOException { 116 | List superSplits = super.getSplits(job); 117 | List splits = new ArrayList(); 118 | 119 | int numGroups = WikipediaConfiguration.getNumGroups(job.getConfiguration()); 120 | 121 | for(int group = 0; group < numGroups; group++) 122 | { 123 | for(InputSplit split:superSplits) 124 | { 125 | FileSplit fileSplit = (FileSplit)split; 126 | splits.add(new WikipediaInputSplit(fileSplit,group)); 127 | } 128 | } 129 | return splits; 130 | } 131 | 132 | @Override 133 | public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) { 134 | return new 
AggregatingRecordReader(); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/EvaluatingIterator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import java.io.IOException; 20 | import java.util.Collection; 21 | import java.util.Collections; 22 | 23 | import org.apache.accumulo.core.data.ByteSequence; 24 | import org.apache.accumulo.core.data.Key; 25 | import org.apache.accumulo.core.data.PartialKey; 26 | import org.apache.accumulo.core.data.Range; 27 | import org.apache.accumulo.core.data.Value; 28 | import org.apache.accumulo.core.iterators.IteratorEnvironment; 29 | import org.apache.accumulo.core.iterators.SortedKeyValueIterator; 30 | import org.apache.accumulo.core.security.ColumnVisibility; 31 | import org.apache.accumulo.examples.wikisearch.parser.EventFields; 32 | import org.apache.accumulo.examples.wikisearch.parser.EventFields.FieldValue; 33 | import org.apache.commons.collections.map.LRUMap; 34 | import org.apache.hadoop.io.Text; 35 | 36 | public class EvaluatingIterator extends AbstractEvaluatingIterator { 37 | 38 | public static final String NULL_BYTE_STRING = "\u0000"; 39 | LRUMap visibilityMap = new LRUMap(); 40 | 41 | public EvaluatingIterator() { 42 | super(); 43 | } 44 | 45 | public EvaluatingIterator(AbstractEvaluatingIterator other, IteratorEnvironment env) { 46 | super(other, env); 47 | } 48 | 49 | @Override 50 | public SortedKeyValueIterator deepCopy(IteratorEnvironment env) { 51 | return new EvaluatingIterator(this, env); 52 | } 53 | 54 | @Override 55 | public PartialKey getKeyComparator() { 56 | return PartialKey.ROW_COLFAM; 57 | } 58 | 59 | @Override 60 | public Key getReturnKey(Key k) { 61 | // If we were using column visibility, then we would get the merged visibility here and use it 62 | // in the key. 63 | // Remove the COLQ from the key and use the combined visibility 64 | Key r = new Key(k.getRowData().getBackingArray(), k.getColumnFamilyData().getBackingArray(), 65 | NULL_BYTE, k.getColumnVisibility().getBytes(), k.getTimestamp(), k.isDeleted(), false); 66 | return r; 67 | } 68 | 69 | @Override 70 | public void fillMap(EventFields event, Key key, Value value) { 71 | // If we were using column visibility, we would have to merge them here. 72 | 73 | // Pull the datatype from the colf in case we need to do anything datatype specific. 
74 | // String colf = key.getColumnFamily().toString(); 75 | // String datatype = colf.substring(0, colf.indexOf(NULL_BYTE_STRING)); 76 | 77 | // For the partitioned table, the field name and field value are stored in the column qualifier 78 | // separated by a \0. 79 | String colq = key.getColumnQualifier().toString();// .toLowerCase(); 80 | int idx = colq.indexOf(NULL_BYTE_STRING); 81 | String fieldName = colq.substring(0, idx); 82 | String fieldValue = colq.substring(idx + 1); 83 | 84 | event.put(fieldName, new FieldValue(getColumnVisibility(key), fieldValue.getBytes())); 85 | } 86 | 87 | /** 88 | * @return The column visibility 89 | */ 90 | public ColumnVisibility getColumnVisibility(Key key) { 91 | ColumnVisibility result = (ColumnVisibility) visibilityMap.get(key.getColumnVisibility()); 92 | if (result != null) { 93 | return result; 94 | } 95 | result = new ColumnVisibility(key.getColumnVisibility().getBytes()); 96 | visibilityMap.put(key.getColumnVisibility(), result); 97 | return result; 98 | } 99 | 100 | /** 101 | * Don't accept this key if the colf starts with 'fi' 102 | */ 103 | @Override 104 | public boolean isKeyAccepted(Key key) throws IOException { 105 | if (key.getColumnFamily().toString().startsWith("fi")) { 106 | Key copy = new Key(key.getRow(), new Text("fi\01")); 107 | Collection columnFamilies = Collections.emptyList(); 108 | this.iterator.seek(new Range(copy, copy), columnFamilies, true); 109 | if (this.iterator.hasTop()) { 110 | return isKeyAccepted(this.iterator.getTopKey()); 111 | } 112 | return true; 113 | } 114 | return true; 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/jexl/Arithmetic.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.jexl; 18 | 19 | import java.util.regex.Matcher; 20 | import java.util.regex.Pattern; 21 | 22 | import org.apache.commons.jexl2.JexlArithmetic; 23 | import org.apache.commons.lang.math.NumberUtils; 24 | 25 | public class Arithmetic extends JexlArithmetic { 26 | 27 | public Arithmetic(boolean lenient) { 28 | super(lenient); 29 | } 30 | 31 | /** 32 | * This method differs from the parent in that we are not calling String.matches() because it does not match on a newline. Instead we are handling this case. 33 | * 34 | * @param left 35 | * first value 36 | * @param right 37 | * second value 38 | * @return test result. 
39 | */ 40 | @Override 41 | public boolean matches(Object left, Object right) { 42 | if (left == null && right == null) { 43 | // if both are null L == R 44 | return true; 45 | } 46 | if (left == null || right == null) { 47 | // we know both aren't null, therefore L != R 48 | return false; 49 | } 50 | final String arg = left.toString(); 51 | if (right instanceof java.util.regex.Pattern) { 52 | return ((java.util.regex.Pattern) right).matcher(arg).matches(); 53 | } else { 54 | // return arg.matches(right.toString()); 55 | Pattern p = Pattern.compile(right.toString(), Pattern.DOTALL); 56 | Matcher m = p.matcher(arg); 57 | return m.matches(); 58 | 59 | } 60 | } 61 | 62 | /** 63 | * This method differs from the parent class in that we are going to try and do a better job of coercing the types. As a last resort we will do a string 64 | * comparison and try not to throw a NumberFormatException. The JexlArithmetic class performs coercion to a particular type if either the left or the right 65 | * match a known type. We will look at the type of the right operator and try to make the left of the same type. 66 | */ 67 | @Override 68 | public boolean equals(Object left, Object right) { 69 | Object fixedLeft = fixLeft(left, right); 70 | return super.equals(fixedLeft, right); 71 | } 72 | 73 | @Override 74 | public boolean lessThan(Object left, Object right) { 75 | Object fixedLeft = fixLeft(left, right); 76 | return super.lessThan(fixedLeft, right); 77 | } 78 | 79 | protected Object fixLeft(Object left, Object right) { 80 | 81 | if (null == left || null == right) 82 | return left; 83 | 84 | if (!(right instanceof Number) && left instanceof Number) { 85 | right = NumberUtils.createNumber(right.toString()); 86 | } 87 | 88 | if (right instanceof Number && left instanceof Number) { 89 | if (right instanceof Double) 90 | return ((Double) right).doubleValue(); 91 | else if (right instanceof Float) 92 | return ((Float) right).floatValue(); 93 | else if (right instanceof Long) 94 | return ((Long) right).longValue(); 95 | else if (right instanceof Integer) 96 | return ((Integer) right).intValue(); 97 | else if (right instanceof Short) 98 | return ((Short) right).shortValue(); 99 | else if (right instanceof Byte) 100 | return ((Byte) right).byteValue(); 101 | else 102 | return right; 103 | } 104 | if (right instanceof Number && left instanceof String) { 105 | Number num = NumberUtils.createNumber(left.toString()); 106 | // Let's try to cast left as right's type. 107 | if (this.isFloatingPointNumber(right) && this.isFloatingPointNumber(left)) 108 | return num; 109 | else if (this.isFloatingPointNumber(right)) 110 | return num.doubleValue(); 111 | else if (right instanceof Number) 112 | return num.longValue(); 113 | } else if (right instanceof Boolean && left instanceof String) { 114 | if (left.equals("true") || left.equals("false")) 115 | return Boolean.parseBoolean(left.toString()); 116 | 117 | Number num = NumberUtils.createNumber(left.toString()); 118 | if (num.intValue() == 1) 119 | return (Boolean) true; 120 | else if (num.intValue() == 0) 121 | return (Boolean) false; 122 | } 123 | return left; 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LongLineRecordReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.reader; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.fs.FSDataInputStream; 23 | import org.apache.hadoop.fs.FileSystem; 24 | import org.apache.hadoop.fs.Path; 25 | import org.apache.hadoop.io.LongWritable; 26 | import org.apache.hadoop.io.Text; 27 | import org.apache.hadoop.io.compress.CompressionCodec; 28 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 29 | import org.apache.hadoop.mapreduce.InputSplit; 30 | import org.apache.hadoop.mapreduce.RecordReader; 31 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 32 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 33 | import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; 34 | import org.apache.hadoop.util.LineReader; 35 | 36 | /** 37 | * A copy of {@link LineRecordReader} which does not discard lines longer than "mapred.linerecordreader.maxlength". Instead, it returns them, leaving it to the 38 | * mapper to decide what to do with it. It also does not treat '\r' (CR) characters as new lines -- it uses {@link LfLineReader} instead of {@link LineReader} 39 | * to read lines. 40 | */ 41 | public class LongLineRecordReader extends RecordReader { 42 | private CompressionCodecFactory compressionCodecs = null; 43 | private long start; 44 | private long pos; 45 | private long end; 46 | private LfLineReader in; 47 | private int maxLineLength; 48 | private LongWritable key = null; 49 | private Text value = null; 50 | 51 | @Override 52 | public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { 53 | FileSplit split = (FileSplit) genericSplit; 54 | Configuration job = context.getConfiguration(); 55 | this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); 56 | start = split.getStart(); 57 | end = start + split.getLength(); 58 | final Path file = split.getPath(); 59 | compressionCodecs = new CompressionCodecFactory(job); 60 | final CompressionCodec codec = compressionCodecs.getCodec(file); 61 | 62 | // open the file and seek to the start of the split 63 | FileSystem fs = file.getFileSystem(job); 64 | FSDataInputStream fileIn = fs.open(split.getPath()); 65 | boolean skipFirstLine = false; 66 | if (codec != null) { 67 | in = new LfLineReader(codec.createInputStream(fileIn), job); 68 | end = Long.MAX_VALUE; 69 | } else { 70 | if (start != 0) { 71 | skipFirstLine = true; 72 | --start; 73 | fileIn.seek(start); 74 | } 75 | in = new LfLineReader(fileIn, job); 76 | } 77 | if (skipFirstLine) { // skip first line and re-establish "start". 
78 | start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start)); 79 | } 80 | this.pos = start; 81 | } 82 | 83 | @Override 84 | public boolean nextKeyValue() throws IOException { 85 | if (key == null) { 86 | key = new LongWritable(); 87 | } 88 | key.set(pos); 89 | if (value == null) { 90 | value = new Text(); 91 | } 92 | int newSize = 0; 93 | if (pos < end) { 94 | newSize = in.readLine(value, maxLineLength, Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength)); 95 | if (newSize != 0) { 96 | pos += newSize; 97 | } 98 | } 99 | if (newSize == 0) { 100 | key = null; 101 | value = null; 102 | return false; 103 | } else { 104 | return true; 105 | } 106 | } 107 | 108 | @Override 109 | public LongWritable getCurrentKey() { 110 | return key; 111 | } 112 | 113 | @Override 114 | public Text getCurrentValue() { 115 | return value; 116 | } 117 | 118 | /** 119 | * Get the progress within the split 120 | */ 121 | @Override 122 | public float getProgress() { 123 | if (start == end) { 124 | return 0.0f; 125 | } else { 126 | return Math.min(1.0f, (pos - start) / (float) (end - start)); 127 | } 128 | } 129 | 130 | @Override 131 | public synchronized void close() throws IOException { 132 | if (in != null) { 133 | in.close(); 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | 17 | # Wikisearch Installation 18 | 19 | Instructions for installing and running the Accumulo Wikisearch example. 20 | 21 | ## Ingest 22 | 23 | ### Prerequisites 24 | 25 | 1. Accumulo, Hadoop, and ZooKeeper must be installed and running 26 | 1. Download one or more [wikipedia dump files][dump-files] and put them in an HDFS directory. 27 | You will want to grab the files with the link name of pages-articles.xml.bz2. Though not strictly 28 | required, the ingest will go more quickly if the files are decompressed: 29 | 30 | $ bunzip2 enwiki-*-pages-articles.xml.bz2 31 | $ hadoop fs -put enwiki-*-pages-articles.xml /wikipedia/enwiki-pages-articles.xml 32 | 33 | ### Instructions 34 | 35 | 1. Create a `wikipedia.xml` file (or `wikipedia_parallel.xml` if running parallel version) from 36 | [wikipedia.xml.example] or [wikipedia_parallel.xml.example] and modify for your Accumulo 37 | installation. 38 | 39 | $ cp ingest/conf 40 | $ cp wikipedia.xml.example wikipedia.xml 41 | $ vim wikipedia.xml 42 | 43 | 1. Copy `ingest/lib/wikisearch-*.jar` to `$ACCUMULO_HOME/lib/ext` 44 | 1. Run `ingest/bin/ingest.sh` (or `ingest_parallel.sh` if running parallel version) with one 45 | argument (the name of the directory in HDFS where the wikipedia XML files reside) and this will 46 | kick off a MapReduce job to ingest the data into Accumulo. 47 | 48 | ## Query 49 | 50 | ### Prerequisites 51 | 52 | 1. The query software was tested using JBoss AS 6. Install the JBoss distro and follow the instructions below 53 | to build the EJB jar and WAR file required. 54 | * To stop the JBoss warnings about WSDescriptorDeployer and JMSDescriptorDeployer, these deployers can be 55 | removed from `$JBOSS_HOME/server/default/deployers/jbossws.deployer/META-INF/stack-agnostic-jboss-beans.xml` 56 | 1. Ensure that you have successfully run `mvn clean install` at the Wikisearch top level to install the jars 57 | into your local maven repo before building the query package. 58 | 59 | ### Instructions 60 | 61 | 1. 
Create a `ejb-jar.xml` from [ejb-jar.xml.example] and modify it to contain the same information 62 | that you put into `wikipedia.xml` in the ingest steps above: 63 | 64 | cd query/src/main/resources/META-INF/ 65 | cp ejb-jar.xml.example ejb-jar.xml 66 | vim ejb-jar.xml 67 | 68 | 1. Re-build the query distribution by running `mvn package assembly:single` in the query module's directory. 69 | 1. Untar the resulting file in the `$JBOSS_HOME/server/default` directory. 70 | 71 | $ cd $JBOSS_HOME/server/default 72 | $ tar -xzf /some/path/to/wikisearch/query/target/wikisearch-query*.tar.gz 73 | 74 | This will place the dependent jars in the lib directory and the EJB jar into the deploy directory. 75 | 1. Next, copy the wikisearch*.war file in the query-war/target directory to $JBOSS_HOME/server/default/deploy. 76 | 1. Start JBoss ($JBOSS_HOME/bin/run.sh) 77 | 1. Use the Accumulo shell and give the user permissions for the wikis that you loaded: 78 | 79 | > setauths -u -s all,enwiki,eswiki,frwiki,fawiki 80 | 81 | 1. Copy the following jars to the `$ACCUMULO_HOME/lib/ext` directory from the `$JBOSS_HOME/server/default/lib` directory: 82 | 83 | kryo*.jar 84 | minlog*.jar 85 | commons-jexl*.jar 86 | 87 | 1. Copy `$JBOSS_HOME/server/default/deploy/wikisearch-query*.jar` to `$ACCUMULO_HOME/lib/ext.` 88 | 89 | 1. At this point you should be able to open a browser and view the page: 90 | 91 | http://localhost:8080/accumulo-wikisearch/ui.html 92 | 93 | You can issue the queries using this user interface or via the following REST urls: 94 | 95 | /accumulo-wikisearch/rest/Query/xml 96 | /accumulo-wikisearch/rest/Query/html 97 | /accumulo-wikisearch/rest/Query/yaml 98 | /accumulo-wikisearch/rest/Query/json. 99 | 100 | There are two parameters to the REST service, query and auths. The query parameter is the same string that you would type 101 | into the search box at ui.jsp, and the auths parameter is a comma-separated list of wikis that you want to search (i.e. 102 | enwiki,frwiki,dewiki, etc. Or you can use all) 103 | 104 | - NOTE: Ran into a [bug] that did not allow an EJB3.1 war file. The workaround is to separate the RESTEasy servlet 105 | from the EJBs by creating an EJB jar and a WAR file. 106 | 107 | [ejb-jar.xml.example]: query/src/main/resources/META-INF/ejb-jar.xml.example 108 | [dump-files]: http://dumps.wikimedia.org/backup-index.html 109 | [wikipedia.xml.example]: ingest/conf/wikipedia.xml.example 110 | [wikipedia_parallel.xml.example]: ingest/conf/wikipedia_parallel.xml.example 111 | [bug]: https://issues.jboss.org/browse/RESTEASY-531 112 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/BufferingRFileRecordWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.output; 18 | 19 | import java.io.IOException; 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | import java.util.Map.Entry; 23 | import java.util.TreeMap; 24 | 25 | import org.apache.accumulo.core.client.AccumuloException; 26 | import org.apache.accumulo.core.client.AccumuloSecurityException; 27 | import org.apache.accumulo.core.client.BatchWriter; 28 | import org.apache.accumulo.core.client.BatchWriterConfig; 29 | import org.apache.accumulo.core.client.Connector; 30 | import org.apache.accumulo.core.client.TableNotFoundException; 31 | import org.apache.accumulo.core.conf.AccumuloConfiguration; 32 | import org.apache.accumulo.core.data.ColumnUpdate; 33 | import org.apache.accumulo.core.data.Key; 34 | import org.apache.accumulo.core.data.Mutation; 35 | import org.apache.accumulo.core.data.Value; 36 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaConfiguration; 37 | import org.apache.hadoop.conf.Configuration; 38 | import org.apache.hadoop.fs.FileSystem; 39 | import org.apache.hadoop.io.Text; 40 | import org.apache.hadoop.mapreduce.RecordWriter; 41 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 42 | 43 | final class BufferingRFileRecordWriter extends RecordWriter { 44 | private final long maxSize; 45 | private final Configuration conf; 46 | private long size; 47 | 48 | private Map> buffers = new HashMap>(); 49 | private Map bufferSizes = new HashMap(); 50 | 51 | private TreeMap getBuffer(Text tablename) { 52 | TreeMap buffer = buffers.get(tablename); 53 | if (buffer == null) { 54 | buffer = new TreeMap(); 55 | buffers.put(tablename, buffer); 56 | bufferSizes.put(tablename, 0l); 57 | } 58 | return buffer; 59 | } 60 | 61 | private Text getLargestTablename() { 62 | long max = 0; 63 | Text table = null; 64 | for (Entry e : bufferSizes.entrySet()) { 65 | if (e.getValue() > max) { 66 | max = e.getValue(); 67 | table = e.getKey(); 68 | } 69 | } 70 | return table; 71 | } 72 | 73 | private void flushLargestTable() throws IOException { 74 | Text tablename = getLargestTablename(); 75 | if (tablename == null) 76 | return; 77 | long bufferSize = bufferSizes.get(tablename); 78 | TreeMap buffer = buffers.get(tablename); 79 | if (buffer.size() == 0) 80 | return; 81 | 82 | Connector conn; 83 | try { 84 | conn = WikipediaConfiguration.getConnector(conf); 85 | BatchWriterConfig bwconfig = new BatchWriterConfig(); 86 | BatchWriter writer = conn.createBatchWriter(tablename.toString(), bwconfig); 87 | for (Entry e : buffer.entrySet()) { 88 | Key k = e.getKey(); 89 | Mutation m = new Mutation(); 90 | m.put(k.getColumnFamily(), k.getColumnQualifier(), e.getValue()); 91 | writer.addMutation(m); 92 | } 93 | writer.close(); 94 | } catch (AccumuloException | AccumuloSecurityException | TableNotFoundException e) { 95 | System.err.println("Error occured in flushLargestTable: " + e.getMessage()); 96 | e.printStackTrace(); 97 | } 98 | // TODO get the table configuration for the given table? 
99 | 100 | size -= bufferSize; 101 | buffer.clear(); 102 | bufferSizes.put(tablename, 0L); 103 | } 104 | 105 | BufferingRFileRecordWriter(long maxSize, Configuration conf) { 106 | this.maxSize = maxSize; 107 | this.conf = conf; 108 | } 109 | 110 | @Override 111 | public void close(TaskAttemptContext arg0) throws IOException, InterruptedException { 112 | while (size > 0) 113 | flushLargestTable(); 114 | } 115 | 116 | @Override 117 | public void write(Text table, Mutation mutation) throws IOException, InterruptedException { 118 | TreeMap<Key,Value> buffer = getBuffer(table); 119 | int mutationSize = 0; 120 | for (ColumnUpdate update : mutation.getUpdates()) { 121 | Key k = new Key(mutation.getRow(), update.getColumnFamily(), update.getColumnQualifier(), update.getColumnVisibility(), update.getTimestamp(), 122 | update.isDeleted()); 123 | Value v = new Value(update.getValue()); 124 | // TODO account for object overhead 125 | mutationSize += k.getSize(); 126 | mutationSize += v.getSize(); 127 | buffer.put(k, v); 128 | } 129 | size += mutationSize; 130 | long bufferSize = bufferSizes.get(table); 131 | 132 | // TODO use a MutableLong instead 133 | bufferSize += mutationSize; 134 | bufferSizes.put(table, bufferSize); 135 | 136 | while (size >= maxSize) { 137 | flushLargestTable(); 138 | } 139 | } 140 | 141 | } 142 | -------------------------------------------------------------------------------- /query-war/src/main/webapp/ui.html: -------------------------------------------------------------------------------- 1 | 17 | <%@page contentType="text/html" pageEncoding="UTF-8"%> 18 | 20 | 21 | 22 | 23 | 24 | Wiki Search Page 25 | 46 | 47 | 48 |
49 |

Wiki Search using Apache Accumulo

50 |

This sample application demonstrates the ability to search documents using Apache Accumulo. The associated ingest software 51 | extracts the id, title, timestamp, and comments from each wikipedia article. In addition, the wikipedia text has been tokenized 52 | and is available for searching. You can enter a boolean expression into the search box below and select the particular set of 53 | wikipedia languages you want to search.

54 |

Fields available for searching: 55 |

    56 |   1. TEXT
 57 |   2. ID
 58 |   3. TITLE
 59 |   4. TIMESTAMP
 60 |   5. COMMENTS
 61 |
62 |

The search syntax is boolean logic, for example: TEXT == 'boy' and TITLE =~ 'Autism'. The supported operators are: 63 | ==, !=, <, >, <=, >=, =~, and !~. Likewise, grouping can be performed using parentheses and predicates can be 64 | joined using and, or, and not. 65 |
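For example, a compound query (a hypothetical illustration of the operators above) might be: ( TEXT == 'boy' or TEXT == 'girl' ) and not TITLE =~ 'Autism'.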

To highlight the cell-level access control of Apache Accumulo, the "authorization" required to view a particular cell is the language 66 | of the associated wikipedia article. 67 |

68 |
69 |
70 |
71 |
72 |
73 | 74 | 75 |
76 |
77 |
78 | 79 |
80 | 81 |
82 |
83 | 84 | 85 | 86 |
87 |
88 | 89 | 90 | 91 |
92 |
93 | 94 | 95 | 96 |
97 |
98 | 99 | 100 | n
101 |
102 |
103 | 104 |
105 |
106 |
107 |
108 | 110 |
111 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /query/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | org.apache.accumulo 22 | accumulo-wikisearch 23 | 2.0.0-SNAPSHOT 24 | 25 | wikisearch-query 26 | ejb 27 | wikisearch-query 28 | 29 | 30 | com.google.guava 31 | guava 32 | 33 | 34 | com.google.protobuf 35 | protobuf-java 36 | 37 | 38 | com.googlecode 39 | kryo 40 | 41 | 42 | com.sun.jersey 43 | jersey-core 44 | 45 | 46 | commons-codec 47 | commons-codec 48 | 49 | 50 | commons-collections 51 | commons-collections 52 | 53 | 54 | commons-configuration 55 | commons-configuration 56 | 57 | 58 | commons-lang 59 | commons-lang 60 | 61 | 62 | org.apache.accumulo 63 | accumulo-core 64 | 65 | 66 | org.apache.accumulo 67 | wikisearch-ingest 68 | 69 | 70 | org.apache.commons 71 | commons-jexl 72 | 73 | 74 | org.apache.hadoop 75 | hadoop-client 76 | 77 | 78 | javaee 79 | javaee-api 80 | provided 81 | 82 | 83 | com.googlecode 84 | minlog 85 | runtime 86 | 87 | 88 | commons-io 89 | commons-io 90 | runtime 91 | 92 | 93 | org.apache.htrace 94 | htrace-core 95 | runtime 96 | 97 | 98 | org.apache.thrift 99 | libthrift 100 | runtime 101 | 102 | 103 | org.apache.httpcomponents 104 | httpclient 105 | 106 | 107 | 108 | 109 | org.apache.zookeeper 110 | zookeeper 111 | runtime 112 | 113 | 114 | junit 115 | junit 116 | test 117 | 118 | 119 | 120 | 121 | 122 | org.apache.maven.plugins 123 | maven-dependency-plugin 124 | 125 | 126 | copy-dependencies 127 | 128 | copy-dependencies 129 | 130 | prepare-package 131 | 132 | lib 133 | 134 | commons-io,commons-configuration,commons-lang,commons-codec,protobuf-java,libthrift,zookeeper,hadoop-client,commons-jexl,guava,kryo,asm,minlog,reflectasm,wikisearch-ingest,accumulo-core,accumulo-fate,accumulo-trace,htrace-core 135 | true 136 | 137 | 138 | 139 | 140 | 141 | org.apache.maven.plugins 142 | maven-assembly-plugin 143 | 144 | 145 | src/assembly/dist.xml 146 | 147 | 148 | 149 | 150 | org.apache.maven.plugins 151 | maven-ejb-plugin 152 | 153 | 3.1 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LfLineReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.reader; 18 | 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.io.Text; 24 | 25 | /** 26 | * A class that provides a line reader from an input stream. 27 | */ 28 | public class LfLineReader { 29 | private static final int DEFAULT_BUFFER_SIZE = 64 * 1024; 30 | private int bufferSize = DEFAULT_BUFFER_SIZE; 31 | private InputStream in; 32 | private byte[] buffer; 33 | // the number of bytes of real data in the buffer 34 | private int bufferLength = 0; 35 | // the current position in the buffer 36 | private int bufferPosn = 0; 37 | 38 | private static final byte LF = '\n'; 39 | 40 | /** 41 | * Create a line reader that reads from the given stream using the default buffer-size (64k). 42 | * 43 | * @param in 44 | * The input stream 45 | */ 46 | public LfLineReader(InputStream in) { 47 | this(in, DEFAULT_BUFFER_SIZE); 48 | } 49 | 50 | /** 51 | * Create a line reader that reads from the given stream using the given buffer-size. 52 | * 53 | * @param in 54 | * The input stream 55 | * @param bufferSize 56 | * Size of the read buffer 57 | */ 58 | public LfLineReader(InputStream in, int bufferSize) { 59 | this.in = in; 60 | this.bufferSize = bufferSize; 61 | this.buffer = new byte[this.bufferSize]; 62 | } 63 | 64 | /** 65 | * Create a line reader that reads from the given stream using the 66 | * io.file.buffer.size specified in the given Configuration. 67 | * 68 | * @param in 69 | * input stream 70 | * @param conf 71 | * configuration 72 | */ 73 | public LfLineReader(InputStream in, Configuration conf) throws IOException { 74 | this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE)); 75 | } 76 | 77 | /** 78 | * Close the underlying stream. 79 | */ 80 | public void close() throws IOException { 81 | in.close(); 82 | } 83 | 84 | /** 85 | * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF). 86 | * EOF also terminates an otherwise unterminated line. 87 | * 88 | * @param str 89 | * the object to store the given line (without newline) 90 | * @param maxLineLength 91 | * the maximum number of bytes to store into str; the rest of the line is silently 92 | * discarded. 93 | * @param maxBytesToConsume 94 | * the maximum number of bytes to consume in this call. This is only a hint, because if 95 | * the line cross this threshold, we allow it to happen. It can overshoot potentially by 96 | * as much as one buffer length. 97 | * 98 | * @return the number of bytes read including the (longest) newline found. 99 | * 100 | * @throws IOException 101 | * if the underlying stream throws 102 | */ 103 | public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { 104 | /* 105 | * We're reading data from in, but the head of the stream may be already buffered in buffer, so 106 | * we have several cases: 1. No newline characters are in the buffer, so we need to copy 107 | * everything and read another buffer from the stream. 2. An unambiguously terminated line is in 108 | * buffer, so we just copy to str. 
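 * In both cases the loop below appends at most maxLineLength - txtLength bytes to str,
 * and it keeps consuming input until a newline is found, EOF is reached, or more than
 * maxBytesToConsume bytes have been read.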
109 | */ 110 | str.clear(); 111 | int txtLength = 0; // tracks str.getLength(), as an optimization 112 | int newlineLength = 0; // length of terminating newline 113 | long bytesConsumed = 0; 114 | do { 115 | int startPosn = bufferPosn; // starting from where we left off the last time 116 | if (bufferPosn >= bufferLength) { 117 | startPosn = bufferPosn = 0; 118 | bufferLength = in.read(buffer); 119 | if (bufferLength <= 0) { 120 | break; // EOF 121 | } 122 | } 123 | for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline 124 | if (buffer[bufferPosn] == LF) { 125 | newlineLength = 1; 126 | ++bufferPosn; // at next invocation proceed from following byte 127 | break; 128 | } 129 | } 130 | int readLength = bufferPosn - startPosn; 131 | bytesConsumed += readLength; 132 | int appendLength = readLength - newlineLength; 133 | if (appendLength > maxLineLength - txtLength) { 134 | appendLength = maxLineLength - txtLength; 135 | } 136 | if (appendLength > 0) { 137 | str.append(buffer, startPosn, appendLength); 138 | txtLength += appendLength; 139 | } 140 | } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); 141 | 142 | if (bytesConsumed > Integer.MAX_VALUE) { 143 | throw new IOException("Too many bytes before newline: " + bytesConsumed); 144 | } 145 | return (int) bytesConsumed; 146 | } 147 | 148 | /** 149 | * Read from the InputStream into the given Text. 150 | * 151 | * @param str 152 | * the object to store the given line 153 | * @param maxLineLength 154 | * the maximum number of bytes to store into str. 155 | * @return the number of bytes read including the newline 156 | * @throws IOException 157 | * if the underlying stream throws 158 | */ 159 | public int readLine(Text str, int maxLineLength) throws IOException { 160 | return readLine(str, maxLineLength, Integer.MAX_VALUE); 161 | } 162 | 163 | /** 164 | * Read from the InputStream into the given Text. 165 | * 166 | * @param str 167 | * the object to store the given line 168 | * @return the number of bytes read including the newline 169 | * @throws IOException 170 | * if the underlying stream throws 171 | */ 172 | public int readLine(Text str) throws IOException { 173 | return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE); 174 | } 175 | 176 | } 177 | -------------------------------------------------------------------------------- /ingest/src/test/java/org/apache/accumulo/examples/wikisearch/iterator/TextIndexTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import java.util.ArrayList; 20 | import java.util.Collections; 21 | import java.util.List; 22 | 23 | import org.apache.accumulo.core.data.Key; 24 | import org.apache.accumulo.core.data.Value; 25 | import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight; 26 | import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.Builder; 27 | import org.junit.After; 28 | import org.junit.Assert; 29 | import org.junit.Before; 30 | import org.junit.Test; 31 | 32 | import com.google.protobuf.InvalidProtocolBufferException; 33 | 34 | public class TextIndexTest { 35 | private TextIndexCombiner combiner; 36 | private List values; 37 | 38 | @Before 39 | public void setup() throws Exception { 40 | combiner = new TextIndexCombiner(); 41 | combiner.init(null, Collections.singletonMap("all", "true"), null); 42 | values = new ArrayList<>(); 43 | } 44 | 45 | @After 46 | public void cleanup() { 47 | 48 | } 49 | 50 | private TermWeight.Info.Builder createBuilder() { 51 | return TermWeight.Info.newBuilder(); 52 | } 53 | 54 | @Test 55 | public void testSingleValue() throws InvalidProtocolBufferException { 56 | Builder builder = createBuilder(); 57 | builder.addWordOffset(1); 58 | builder.addWordOffset(5); 59 | builder.setNormalizedTermFrequency(0.1f); 60 | 61 | values.add(new Value(builder.build().toByteArray())); 62 | 63 | Value result = combiner.reduce(new Key(), values.iterator()); 64 | 65 | TermWeight.Info info = TermWeight.Info.parseFrom(result.get()); 66 | 67 | Assert.assertTrue(info.getNormalizedTermFrequency() == 0.1f); 68 | 69 | List offsets = info.getWordOffsetList(); 70 | Assert.assertTrue(offsets.size() == 2); 71 | Assert.assertTrue(offsets.get(0) == 1); 72 | Assert.assertTrue(offsets.get(1) == 5); 73 | } 74 | 75 | @Test 76 | public void testAggregateTwoValues() throws InvalidProtocolBufferException { 77 | Builder builder = createBuilder(); 78 | builder.addWordOffset(1); 79 | builder.addWordOffset(5); 80 | builder.setNormalizedTermFrequency(0.1f); 81 | 82 | values.add(new Value(builder.build().toByteArray())); 83 | 84 | builder = createBuilder(); 85 | builder.addWordOffset(3); 86 | builder.setNormalizedTermFrequency(0.05f); 87 | 88 | values.add(new Value(builder.build().toByteArray())); 89 | 90 | Value result = combiner.reduce(new Key(), values.iterator()); 91 | 92 | TermWeight.Info info = TermWeight.Info.parseFrom(result.get()); 93 | 94 | Assert.assertTrue(info.getNormalizedTermFrequency() == 0.15f); 95 | 96 | List offsets = info.getWordOffsetList(); 97 | Assert.assertTrue(offsets.size() == 3); 98 | Assert.assertTrue(offsets.get(0) == 1); 99 | Assert.assertTrue(offsets.get(1) == 3); 100 | Assert.assertTrue(offsets.get(2) == 5); 101 | } 102 | 103 | @Test 104 | public void testAggregateManyValues() throws InvalidProtocolBufferException { 105 | Builder builder = createBuilder(); 106 | builder.addWordOffset(13); 107 | builder.addWordOffset(15); 108 | builder.addWordOffset(19); 109 | builder.setNormalizedTermFrequency(0.12f); 110 | 111 | values.add(new Value(builder.build().toByteArray())); 112 | 113 | builder = createBuilder(); 114 | builder.addWordOffset(1); 115 | builder.addWordOffset(5); 116 | builder.setNormalizedTermFrequency(0.1f); 117 | 118 | values.add(new Value(builder.build().toByteArray())); 119 | 120 | builder = createBuilder(); 121 | builder.addWordOffset(3); 122 | builder.setNormalizedTermFrequency(0.05f); 123 | 124 | values.add(new Value(builder.build().toByteArray())); 125 | 126 | Value 
result = combiner.reduce(new Key(), values.iterator()); 127 | 128 | TermWeight.Info info = TermWeight.Info.parseFrom(result.get()); 129 | 130 | Assert.assertTrue(info.getNormalizedTermFrequency() == 0.27f); 131 | 132 | List offsets = info.getWordOffsetList(); 133 | Assert.assertTrue(offsets.size() == 6); 134 | Assert.assertTrue(offsets.get(0) == 1); 135 | Assert.assertTrue(offsets.get(1) == 3); 136 | Assert.assertTrue(offsets.get(2) == 5); 137 | Assert.assertTrue(offsets.get(3) == 13); 138 | Assert.assertTrue(offsets.get(4) == 15); 139 | Assert.assertTrue(offsets.get(5) == 19); 140 | } 141 | 142 | @Test 143 | public void testEmptyValue() throws InvalidProtocolBufferException { 144 | Builder builder = createBuilder(); 145 | builder.addWordOffset(13); 146 | builder.addWordOffset(15); 147 | builder.addWordOffset(19); 148 | builder.setNormalizedTermFrequency(0.12f); 149 | 150 | values.add(new Value("".getBytes())); 151 | values.add(new Value(builder.build().toByteArray())); 152 | values.add(new Value("".getBytes())); 153 | 154 | builder = createBuilder(); 155 | builder.addWordOffset(1); 156 | builder.addWordOffset(5); 157 | builder.setNormalizedTermFrequency(0.1f); 158 | 159 | values.add(new Value(builder.build().toByteArray())); 160 | values.add(new Value("".getBytes())); 161 | 162 | builder = createBuilder(); 163 | builder.addWordOffset(3); 164 | builder.setNormalizedTermFrequency(0.05f); 165 | 166 | values.add(new Value(builder.build().toByteArray())); 167 | values.add(new Value("".getBytes())); 168 | 169 | Value result = combiner.reduce(new Key(), values.iterator()); 170 | 171 | TermWeight.Info info = TermWeight.Info.parseFrom(result.get()); 172 | 173 | Assert.assertTrue(info.getNormalizedTermFrequency() == 0.27f); 174 | 175 | List offsets = info.getWordOffsetList(); 176 | Assert.assertTrue(offsets.size() == 6); 177 | Assert.assertTrue(offsets.get(0) == 1); 178 | Assert.assertTrue(offsets.get(1) == 3); 179 | Assert.assertTrue(offsets.get(2) == 5); 180 | Assert.assertTrue(offsets.get(3) == 13); 181 | Assert.assertTrue(offsets.get(4) == 15); 182 | Assert.assertTrue(offsets.get(5) == 19); 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/parser/TreeNode.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.parser; 18 | 19 | import java.util.ArrayList; 20 | import java.util.Collections; 21 | import java.util.Enumeration; 22 | import java.util.List; 23 | import java.util.NoSuchElementException; 24 | import java.util.Vector; 25 | 26 | import org.apache.accumulo.examples.wikisearch.parser.QueryParser.QueryTerm; 27 | import org.apache.commons.jexl2.parser.JexlNode; 28 | 29 | 30 | import com.google.common.collect.HashMultimap; 31 | import com.google.common.collect.Multimap; 32 | 33 | public class TreeNode { 34 | 35 | private Class type = null; 36 | /* navigation elements */ 37 | private TreeNode parent = null; 38 | private List children = new ArrayList(); 39 | private Multimap terms = HashMultimap.create(); 40 | 41 | public TreeNode() { 42 | super(); 43 | } 44 | 45 | public Class getType() { 46 | return type; 47 | } 48 | 49 | public TreeNode getParent() { 50 | return parent; 51 | } 52 | 53 | public List getChildren() { 54 | return children; 55 | } 56 | 57 | public Enumeration getChildrenAsEnumeration() { 58 | return Collections.enumeration(children); 59 | } 60 | 61 | public Multimap getTerms() { 62 | return terms; 63 | } 64 | 65 | public void setType(Class type) { 66 | this.type = type; 67 | } 68 | 69 | public void setParent(TreeNode parent) { 70 | this.parent = parent; 71 | } 72 | 73 | public void setChildren(List children) { 74 | this.children = children; 75 | } 76 | 77 | public void setTerms(Multimap terms) { 78 | this.terms = terms; 79 | } 80 | 81 | public boolean isLeaf() { 82 | return children.isEmpty(); 83 | } 84 | 85 | @Override 86 | public String toString() { 87 | StringBuilder buf = new StringBuilder(); 88 | buf.append("Type: ").append(type.getSimpleName()); 89 | buf.append(" Terms: "); 90 | if (null == terms) { 91 | buf.append("null"); 92 | } else { 93 | buf.append(terms.toString()); 94 | } 95 | return buf.toString(); 96 | } 97 | 98 | public final Enumeration depthFirstEnumeration() { 99 | return new PostorderEnumeration(this); 100 | } 101 | 102 | public Enumeration breadthFirstEnumeration() { 103 | return new BreadthFirstEnumeration(this); 104 | } 105 | 106 | public final class PostorderEnumeration implements Enumeration { 107 | 108 | protected TreeNode root; 109 | protected Enumeration children; 110 | protected Enumeration subtree; 111 | 112 | public PostorderEnumeration(TreeNode rootNode) { 113 | super(); 114 | root = rootNode; 115 | children = root.getChildrenAsEnumeration(); 116 | subtree = EMPTY_ENUMERATION; 117 | } 118 | 119 | public boolean hasMoreElements() { 120 | return root != null; 121 | } 122 | 123 | public TreeNode nextElement() { 124 | TreeNode retval; 125 | 126 | if (subtree.hasMoreElements()) { 127 | retval = subtree.nextElement(); 128 | } else if (children.hasMoreElements()) { 129 | subtree = new PostorderEnumeration((TreeNode) children.nextElement()); 130 | retval = subtree.nextElement(); 131 | } else { 132 | retval = root; 133 | root = null; 134 | } 135 | 136 | return retval; 137 | } 138 | } // End of class PostorderEnumeration 139 | 140 | static public final Enumeration EMPTY_ENUMERATION = new Enumeration() { 141 | 142 | public boolean hasMoreElements() { 143 | return false; 144 | } 145 | 146 | public TreeNode nextElement() { 147 | throw new NoSuchElementException("No more elements"); 148 | } 149 | }; 150 | 151 | final class BreadthFirstEnumeration implements Enumeration { 152 | protected Queue queue; 153 | 154 | public BreadthFirstEnumeration(TreeNode rootNode) { 155 | super(); 156 | Vector v = new 
Vector(1); 157 | v.addElement(rootNode); // PENDING: don't really need a vector 158 | queue = new Queue(); 159 | queue.enqueue(v.elements()); 160 | } 161 | 162 | public boolean hasMoreElements() { 163 | return (!queue.isEmpty() && ((Enumeration) queue.firstObject()).hasMoreElements()); 164 | } 165 | 166 | public TreeNode nextElement() { 167 | Enumeration enumer = (Enumeration) queue.firstObject(); 168 | TreeNode node = (TreeNode) enumer.nextElement(); 169 | Enumeration children = node.getChildrenAsEnumeration(); 170 | 171 | if (!enumer.hasMoreElements()) { 172 | queue.dequeue(); 173 | } 174 | if (children.hasMoreElements()) { 175 | queue.enqueue(children); 176 | } 177 | return node; 178 | } 179 | 180 | // A simple queue with a linked list data structure. 181 | final class Queue { 182 | QNode head; // null if empty 183 | QNode tail; 184 | 185 | final class QNode { 186 | public Object object; 187 | public QNode next; // null if end 188 | 189 | public QNode(Object object, QNode next) { 190 | this.object = object; 191 | this.next = next; 192 | } 193 | } 194 | 195 | public void enqueue(Object anObject) { 196 | if (head == null) { 197 | head = tail = new QNode(anObject, null); 198 | } else { 199 | tail.next = new QNode(anObject, null); 200 | tail = tail.next; 201 | } 202 | } 203 | 204 | public Object dequeue() { 205 | if (head == null) { 206 | throw new NoSuchElementException("No more elements"); 207 | } 208 | 209 | Object retval = head.object; 210 | QNode oldHead = head; 211 | head = head.next; 212 | if (head == null) { 213 | tail = null; 214 | } else { 215 | oldHead.next = null; 216 | } 217 | return retval; 218 | } 219 | 220 | public Object firstObject() { 221 | if (head == null) { 222 | throw new NoSuchElementException("No more elements"); 223 | } 224 | 225 | return head.object; 226 | } 227 | 228 | public boolean isEmpty() { 229 | return head == null; 230 | } 231 | 232 | } // End of class Queue 233 | 234 | } // End of class BreadthFirstEnumeration 235 | } 236 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.ingest; 18 | 19 | import java.io.DataInput; 20 | import java.io.DataOutput; 21 | import java.io.IOException; 22 | import java.io.Reader; 23 | import java.text.ParseException; 24 | import java.text.SimpleDateFormat; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | import javax.xml.namespace.QName; 29 | import javax.xml.stream.XMLInputFactory; 30 | import javax.xml.stream.XMLStreamException; 31 | import javax.xml.stream.XMLStreamReader; 32 | 33 | import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.io.Writable; 36 | 37 | 38 | public class ArticleExtractor { 39 | 40 | public final static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'Z"); 41 | private static LcNoDiacriticsNormalizer lcdn = new LcNoDiacriticsNormalizer(); 42 | 43 | public static class Article implements Writable { 44 | int id; 45 | String title; 46 | long timestamp; 47 | String comments; 48 | String text; 49 | 50 | public Article(){} 51 | 52 | private Article(int id, String title, long timestamp, String comments, String text) { 53 | super(); 54 | this.id = id; 55 | this.title = title; 56 | this.timestamp = timestamp; 57 | this.comments = comments; 58 | this.text = text; 59 | } 60 | 61 | public int getId() { 62 | return id; 63 | } 64 | 65 | public String getTitle() { 66 | return title; 67 | } 68 | 69 | public String getComments() { 70 | return comments; 71 | } 72 | 73 | public String getText() { 74 | return text; 75 | } 76 | 77 | public long getTimestamp() { 78 | return timestamp; 79 | } 80 | 81 | public Map getFieldValues() { 82 | Map fields = new HashMap(); 83 | fields.put("ID", this.id); 84 | fields.put("TITLE", this.title); 85 | fields.put("TIMESTAMP", this.timestamp); 86 | fields.put("COMMENTS", this.comments); 87 | return fields; 88 | } 89 | 90 | public Map getNormalizedFieldValues() { 91 | Map fields = new HashMap(); 92 | //fields.put("ID", nn.normalizeFieldValue("ID", this.id)); 93 | fields.put("ID", Integer.toString(this.id)); 94 | fields.put("TITLE", lcdn.normalizeFieldValue("TITLE", this.title)); 95 | //fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp)); 96 | fields.put("TIMESTAMP", Long.toString(this.timestamp)); 97 | fields.put("COMMENTS", lcdn.normalizeFieldValue("COMMENTS", this.comments)); 98 | return fields; 99 | } 100 | 101 | @Override 102 | public void readFields(DataInput in) throws IOException { 103 | id = in.readInt(); 104 | Text foo = new Text(); 105 | foo.readFields(in); 106 | title = foo.toString(); 107 | timestamp = in.readLong(); 108 | foo.readFields(in); 109 | comments = foo.toString(); 110 | foo.readFields(in); 111 | text = foo.toString(); 112 | } 113 | 114 | @Override 115 | public void write(DataOutput out) throws IOException { 116 | out.writeInt(id); 117 | (new Text(title)).write(out); 118 | out.writeLong(timestamp); 119 | (new Text(comments)).write(out); 120 | (new Text(text)).write(out); 121 | } 122 | 123 | } 124 | 125 | public ArticleExtractor() {} 126 | 127 | private static XMLInputFactory xmlif = XMLInputFactory.newInstance(); 128 | 129 | static 130 | { 131 | xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE); 132 | } 133 | 134 | public Article extract(Reader reader) { 135 | 136 | XMLStreamReader xmlr = null; 137 | 138 | try { 139 | xmlr = xmlif.createXMLStreamReader(reader); 140 | } catch (XMLStreamException e1) { 141 | throw new 
RuntimeException(e1); 142 | } 143 | 144 | QName titleName = QName.valueOf("title"); 145 | QName textName = QName.valueOf("text"); 146 | QName revisionName = QName.valueOf("revision"); 147 | QName timestampName = QName.valueOf("timestamp"); 148 | QName commentName = QName.valueOf("comment"); 149 | QName idName = QName.valueOf("id"); 150 | 151 | Map tags = new HashMap(); 152 | for (QName tag : new QName[] {titleName, textName, timestampName, commentName, idName}) { 153 | tags.put(tag, new StringBuilder()); 154 | } 155 | 156 | StringBuilder articleText = tags.get(textName); 157 | StringBuilder titleText = tags.get(titleName); 158 | StringBuilder timestampText = tags.get(timestampName); 159 | StringBuilder commentText = tags.get(commentName); 160 | StringBuilder idText = tags.get(idName); 161 | 162 | StringBuilder current = null; 163 | boolean inRevision = false; 164 | while (true) { 165 | try { 166 | if (!xmlr.hasNext()) 167 | break; 168 | xmlr.next(); 169 | } catch (XMLStreamException e) { 170 | throw new RuntimeException(e); 171 | } 172 | QName currentName = null; 173 | if (xmlr.hasName()) { 174 | currentName = xmlr.getName(); 175 | } 176 | if (xmlr.isStartElement() && tags.containsKey(currentName)) { 177 | if (!inRevision || (!currentName.equals(revisionName) && !currentName.equals(idName))) { 178 | current = tags.get(currentName); 179 | current.setLength(0); 180 | } 181 | } else if (xmlr.isStartElement() && currentName.equals(revisionName)) { 182 | inRevision = true; 183 | } else if (xmlr.isEndElement() && currentName.equals(revisionName)) { 184 | inRevision = false; 185 | } else if (xmlr.isEndElement() && current != null) { 186 | if (textName.equals(currentName)) { 187 | 188 | String title = titleText.toString(); 189 | String text = articleText.toString(); 190 | String comment = commentText.toString(); 191 | int id = Integer.parseInt(idText.toString()); 192 | long timestamp; 193 | try { 194 | timestamp = dateFormat.parse(timestampText.append("+0000").toString()).getTime(); 195 | return new Article(id, title, timestamp, comment, text); 196 | } catch (ParseException e) { 197 | return null; 198 | } 199 | } 200 | current = null; 201 | } else if (current != null && xmlr.hasText()) { 202 | current.append(xmlr.getText()); 203 | } 204 | } 205 | return null; 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.reader; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaConfiguration; 22 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit; 23 | import org.apache.accumulo.examples.wikisearch.util.TextUtil; 24 | import org.apache.hadoop.io.LongWritable; 25 | import org.apache.hadoop.io.Text; 26 | import org.apache.hadoop.mapreduce.InputSplit; 27 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 28 | 29 | /** 30 | * This class aggregates Text values based on a start and end filter. An example use case for this 31 | * would be XML data. This will not work with data that has nested start and stop tokens. 32 | * 33 | */ 34 | public class AggregatingRecordReader extends LongLineRecordReader { 35 | 36 | public static final String START_TOKEN = "aggregating.token.start"; 37 | public static final String END_TOKEN = "aggregating.token.end"; 38 | public static final String RETURN_PARTIAL_MATCHES = "aggregating.allow.partial"; 39 | 40 | private LongWritable key = new LongWritable(); 41 | private String startToken = null; 42 | private String endToken = null; 43 | private long counter = 0; 44 | private Text aggValue = new Text(); 45 | private boolean startFound = false; 46 | private StringBuilder remainder = new StringBuilder(0); 47 | private boolean returnPartialMatches = false; 48 | 49 | @Override 50 | public LongWritable getCurrentKey() { 51 | key.set(counter); 52 | return key; 53 | } 54 | 55 | @Override 56 | public Text getCurrentValue() { 57 | return aggValue; 58 | } 59 | 60 | @Override 61 | public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { 62 | super.initialize(((WikipediaInputSplit) genericSplit).getFileSplit(), context); 63 | this.startToken = 64 | WikipediaConfiguration.isNull(context.getConfiguration(), START_TOKEN, String.class); 65 | this.endToken = 66 | WikipediaConfiguration.isNull(context.getConfiguration(), END_TOKEN, String.class); 67 | this.returnPartialMatches = 68 | context.getConfiguration().getBoolean(RETURN_PARTIAL_MATCHES, false); 69 | 70 | /* 71 | * Text-appending works almost exactly like the + operator on Strings- it creates a byte array 72 | * exactly the size of [prefix + suffix] and dumps the bytes into the new array. This module 73 | * works by doing lots of little additions, one line at a time. With most XML, the documents are 74 | * partitioned on line boundaries, so we will generally have lots of additions. Setting a large 75 | * default byte array for a text object can avoid this and give us StringBuilder-like 76 | * functionality for Text objects. 77 | */ 78 | byte[] txtBuffer = new byte[2048]; 79 | aggValue.set(txtBuffer); 80 | } 81 | 82 | @Override 83 | public boolean nextKeyValue() throws IOException { 84 | aggValue.clear(); 85 | boolean hasNext = false; 86 | boolean finished = false; 87 | // Find the start token 88 | while (!finished && (((hasNext = super.nextKeyValue()) == true) || remainder.length() > 0)) { 89 | if (hasNext) { 90 | finished = process(super.getCurrentValue()); 91 | } else { 92 | finished = process(null); 93 | } 94 | if (finished) { 95 | startFound = false; 96 | counter++; 97 | return true; 98 | } 99 | } 100 | // If we have anything loaded in the agg value (and we found a start) 101 | // then we ran out of data before finding the end. 
Just return the 102 | // data we have and if it's not valid, downstream parsing of the data 103 | // will fail. 104 | if (returnPartialMatches && startFound && aggValue.getLength() > 0) { 105 | startFound = false; 106 | counter++; 107 | return true; 108 | } 109 | return false; 110 | } 111 | 112 | /** 113 | * Populates aggValue with the contents of the Text object. 114 | * 115 | * @return true if aggValue is complete, else false and needs more data. 116 | */ 117 | private boolean process(Text t) { 118 | 119 | if (null != t) { 120 | remainder.append(t.toString()); 121 | } 122 | while (remainder.length() > 0) { 123 | if (!startFound) { 124 | // If found, then begin aggregating at the start offset 125 | int start = remainder.indexOf(startToken); 126 | if (-1 != start) { 127 | // Append the start token to the aggregate value 128 | TextUtil.textAppendNoNull(aggValue, 129 | remainder.substring(start, start + startToken.length()), false); 130 | // Remove to the end of the start token from the remainder 131 | remainder.delete(0, start + startToken.length()); 132 | startFound = true; 133 | } else { 134 | // If we are looking for the start and have not found it, then remove 135 | // the bytes 136 | remainder.delete(0, remainder.length()); 137 | } 138 | } else { 139 | // Try to find the end 140 | int end = remainder.indexOf(endToken); 141 | // Also try to find the start 142 | int start = remainder.indexOf(startToken); 143 | if (-1 == end) { 144 | if (returnPartialMatches && start >= 0) { 145 | // End token not found, but another start token was found... 146 | // The amount to copy is up to the beginning of the next start token 147 | TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false); 148 | remainder.delete(0, start); 149 | return true; 150 | } else { 151 | // Not found, aggregate the entire remainder 152 | TextUtil.textAppendNoNull(aggValue, remainder.toString(), false); 153 | // Delete all chars from remainder 154 | remainder.delete(0, remainder.length()); 155 | } 156 | } else { 157 | if (returnPartialMatches && start >= 0 && start < end) { 158 | // We found the end token, but found another start token first, so 159 | // deal with that. 160 | TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false); 161 | remainder.delete(0, start); 162 | return true; 163 | } else { 164 | // END_TOKEN was found. Extract to the end of END_TOKEN 165 | TextUtil.textAppendNoNull(aggValue, remainder.substring(0, end + endToken.length()), 166 | false); 167 | // Remove from remainder up to the end of END_TOKEN 168 | remainder.delete(0, end + endToken.length()); 169 | return true; 170 | } 171 | } 172 | } 173 | } 174 | return false; 175 | } 176 | 177 | } 178 | -------------------------------------------------------------------------------- /ingest/src/test/java/org/apache/accumulo/examples/wikisearch/iterator/GlobalIndexUidTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import static org.junit.Assert.assertTrue; 20 | 21 | import java.util.ArrayList; 22 | import java.util.Collections; 23 | import java.util.List; 24 | import java.util.UUID; 25 | 26 | import org.apache.accumulo.core.client.IteratorSetting; 27 | import org.apache.accumulo.core.data.Key; 28 | import org.apache.accumulo.core.data.Value; 29 | import org.apache.accumulo.core.iterators.Combiner; 30 | import org.apache.accumulo.examples.wikisearch.protobuf.Uid; 31 | import org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.Builder; 32 | import org.apache.log4j.Level; 33 | import org.apache.log4j.Logger; 34 | import org.junit.Before; 35 | import org.junit.Test; 36 | 37 | public class GlobalIndexUidTest { 38 | private GlobalIndexUidCombiner combiner; 39 | private List values; 40 | 41 | @Before 42 | public void setup() throws Exception { 43 | combiner = new GlobalIndexUidCombiner(); 44 | combiner.init(null, Collections.singletonMap("all", "true"), null); 45 | values = new ArrayList(); 46 | } 47 | 48 | private Uid.List.Builder createNewUidList() { 49 | return Uid.List.newBuilder(); 50 | } 51 | 52 | @Test 53 | public void testSingleUid() { 54 | Builder b = createNewUidList(); 55 | b.setCOUNT(1); 56 | b.setIGNORE(false); 57 | b.addUID(UUID.randomUUID().toString()); 58 | Uid.List uidList = b.build(); 59 | Value val = new Value(uidList.toByteArray()); 60 | values.add(val); 61 | Value result = combiner.reduce(new Key(), values.iterator()); 62 | assertTrue(val.compareTo(result.get()) == 0); 63 | } 64 | 65 | @Test 66 | public void testLessThanMax() throws Exception { 67 | List savedUUIDs = new ArrayList(); 68 | for (int i = 0; i < GlobalIndexUidCombiner.MAX - 1; i++) { 69 | Builder b = createNewUidList(); 70 | b.setIGNORE(false); 71 | String uuid = UUID.randomUUID().toString(); 72 | savedUUIDs.add(uuid); 73 | b.setCOUNT(i); 74 | b.addUID(uuid); 75 | Uid.List uidList = b.build(); 76 | Value val = new Value(uidList.toByteArray()); 77 | values.add(val); 78 | } 79 | Value result = combiner.reduce(new Key(), values.iterator()); 80 | Uid.List resultList = Uid.List.parseFrom(result.get()); 81 | assertTrue(resultList.getIGNORE() == false); 82 | assertTrue(resultList.getUIDCount() == (GlobalIndexUidCombiner.MAX - 1)); 83 | List resultListUUIDs = resultList.getUIDList(); 84 | for (String s : savedUUIDs) 85 | assertTrue(resultListUUIDs.contains(s)); 86 | } 87 | 88 | @Test 89 | public void testEqualsMax() throws Exception { 90 | List savedUUIDs = new ArrayList(); 91 | for (int i = 0; i < GlobalIndexUidCombiner.MAX; i++) { 92 | Builder b = createNewUidList(); 93 | b.setIGNORE(false); 94 | String uuid = UUID.randomUUID().toString(); 95 | savedUUIDs.add(uuid); 96 | b.setCOUNT(i); 97 | b.addUID(uuid); 98 | Uid.List uidList = b.build(); 99 | Value val = new Value(uidList.toByteArray()); 100 | values.add(val); 101 | } 102 | Value result = combiner.reduce(new Key(), values.iterator()); 103 | Uid.List resultList = Uid.List.parseFrom(result.get()); 104 | assertTrue(resultList.getIGNORE() == false); 105 | 
assertTrue(resultList.getUIDCount() == (GlobalIndexUidCombiner.MAX)); 106 | List resultListUUIDs = resultList.getUIDList(); 107 | for (String s : savedUUIDs) 108 | assertTrue(resultListUUIDs.contains(s)); 109 | } 110 | 111 | @Test 112 | public void testMoreThanMax() throws Exception { 113 | List savedUUIDs = new ArrayList(); 114 | for (int i = 0; i < GlobalIndexUidCombiner.MAX + 10; i++) { 115 | Builder b = createNewUidList(); 116 | b.setIGNORE(false); 117 | String uuid = UUID.randomUUID().toString(); 118 | savedUUIDs.add(uuid); 119 | b.setCOUNT(1); 120 | b.addUID(uuid); 121 | Uid.List uidList = b.build(); 122 | Value val = new Value(uidList.toByteArray()); 123 | values.add(val); 124 | } 125 | Value result = combiner.reduce(new Key(), values.iterator()); 126 | Uid.List resultList = Uid.List.parseFrom(result.get()); 127 | assertTrue(resultList.getIGNORE() == true); 128 | assertTrue(resultList.getUIDCount() == 0); 129 | assertTrue(resultList.getCOUNT() == (GlobalIndexUidCombiner.MAX + 10)); 130 | } 131 | 132 | @Test 133 | public void testSeenIgnore() throws Exception { 134 | Builder b = createNewUidList(); 135 | b.setIGNORE(true); 136 | b.setCOUNT(0); 137 | Uid.List uidList = b.build(); 138 | Value val = new Value(uidList.toByteArray()); 139 | values.add(val); 140 | b = createNewUidList(); 141 | b.setIGNORE(false); 142 | b.setCOUNT(1); 143 | b.addUID(UUID.randomUUID().toString()); 144 | uidList = b.build(); 145 | val = new Value(uidList.toByteArray()); 146 | values.add(val); 147 | Value result = combiner.reduce(new Key(), values.iterator()); 148 | Uid.List resultList = Uid.List.parseFrom(result.get()); 149 | assertTrue(resultList.getIGNORE() == true); 150 | assertTrue(resultList.getUIDCount() == 0); 151 | assertTrue(resultList.getCOUNT() == 1); 152 | } 153 | 154 | @Test 155 | public void testInvalidValueType() throws Exception { 156 | Combiner comb = new GlobalIndexUidCombiner(); 157 | IteratorSetting setting = new IteratorSetting(1, GlobalIndexUidCombiner.class); 158 | GlobalIndexUidCombiner.setCombineAllColumns(setting, true); 159 | GlobalIndexUidCombiner.setLossyness(setting, true); 160 | comb.init(null, setting.getOptions(), null); 161 | Logger.getLogger(GlobalIndexUidCombiner.class).setLevel(Level.OFF); 162 | Value val = new Value(UUID.randomUUID().toString().getBytes()); 163 | values.add(val); 164 | Value result = comb.reduce(new Key(), values.iterator()); 165 | Uid.List resultList = Uid.List.parseFrom(result.get()); 166 | assertTrue(resultList.getIGNORE() == false); 167 | assertTrue(resultList.getUIDCount() == 0); 168 | assertTrue(resultList.getCOUNT() == 0); 169 | } 170 | 171 | @Test 172 | public void testCount() throws Exception { 173 | UUID uuid = UUID.randomUUID(); 174 | // Collect the same UUID five times. 
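// The combiner deduplicates UIDs, so the result should contain a single
// distinct UID while the COUNT fields sum to 5, as asserted below.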
175 | for (int i = 0; i < 5; i++) { 176 | Builder b = createNewUidList(); 177 | b.setCOUNT(1); 178 | b.setIGNORE(false); 179 | b.addUID(uuid.toString()); 180 | Uid.List uidList = b.build(); 181 | Value val = new Value(uidList.toByteArray()); 182 | values.add(val); 183 | } 184 | Value result = combiner.reduce(new Key(), values.iterator()); 185 | Uid.List resultList = Uid.List.parseFrom(result.get()); 186 | assertTrue(resultList.getIGNORE() == false); 187 | assertTrue(resultList.getUIDCount() == 1); 188 | assertTrue(resultList.getCOUNT() == 5); 189 | 190 | } 191 | 192 | } 193 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/parser/EventFields.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.parser; 18 | 19 | import java.nio.ByteBuffer; 20 | import java.util.Collection; 21 | import java.util.Map; 22 | import java.util.Map.Entry; 23 | import java.util.Set; 24 | 25 | import org.apache.accumulo.core.security.ColumnVisibility; 26 | import org.apache.accumulo.examples.wikisearch.parser.EventFields.FieldValue; 27 | 28 | import com.esotericsoftware.kryo.CustomSerialization; 29 | import com.esotericsoftware.kryo.Kryo; 30 | import com.esotericsoftware.kryo.serialize.ArraySerializer; 31 | import com.esotericsoftware.kryo.serialize.IntSerializer; 32 | import com.esotericsoftware.kryo.serialize.StringSerializer; 33 | import com.google.common.collect.HashMultimap; 34 | import com.google.common.collect.Multimap; 35 | import com.google.common.collect.Multiset; 36 | import com.google.common.collect.SetMultimap; 37 | 38 | /** 39 | * Object used to hold the fields in an event. This is a multimap because fields can be repeated. 
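 * Each FieldValue pairs the raw value bytes with the Accumulo ColumnVisibility it was
 * stored under, and the Kryo read/write methods below serialize the multimap compactly.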
40 | */ 41 | public class EventFields implements SetMultimap, CustomSerialization { 42 | 43 | private static boolean kryoInitialized = false; 44 | private static ArraySerializer valueSerializer = null; 45 | 46 | private Multimap map = null; 47 | 48 | public static class FieldValue { 49 | ColumnVisibility visibility; 50 | byte[] value; 51 | 52 | public FieldValue(ColumnVisibility visibility, byte[] value) { 53 | super(); 54 | this.visibility = visibility; 55 | this.value = value; 56 | } 57 | 58 | public ColumnVisibility getVisibility() { 59 | return visibility; 60 | } 61 | 62 | public byte[] getValue() { 63 | return value; 64 | } 65 | 66 | public void setVisibility(ColumnVisibility visibility) { 67 | this.visibility = visibility; 68 | } 69 | 70 | public void setValue(byte[] value) { 71 | this.value = value; 72 | } 73 | 74 | public int size() { 75 | return visibility.flatten().length + value.length; 76 | } 77 | 78 | @Override 79 | public String toString() { 80 | StringBuilder buf = new StringBuilder(); 81 | if (null != visibility) 82 | buf.append(" visibility: ").append(new String(visibility.flatten())); 83 | if (null != value) 84 | buf.append(" value size: ").append(value.length); 85 | if (null != value) 86 | buf.append(" value: ").append(new String(value)); 87 | return buf.toString(); 88 | } 89 | 90 | } 91 | 92 | public EventFields() { 93 | map = HashMultimap.create(); 94 | } 95 | 96 | public int size() { 97 | return map.size(); 98 | } 99 | 100 | public boolean isEmpty() { 101 | return map.isEmpty(); 102 | } 103 | 104 | public boolean containsKey(Object key) { 105 | return map.containsKey(key); 106 | } 107 | 108 | public boolean containsValue(Object value) { 109 | return map.containsValue(value); 110 | } 111 | 112 | public boolean containsEntry(Object key, Object value) { 113 | return map.containsEntry(key, value); 114 | } 115 | 116 | public boolean put(String key, FieldValue value) { 117 | return map.put(key, value); 118 | } 119 | 120 | public boolean remove(Object key, Object value) { 121 | return map.remove(key, value); 122 | } 123 | 124 | public boolean putAll(String key, Iterable values) { 125 | return map.putAll(key, values); 126 | } 127 | 128 | public boolean putAll(Multimap multimap) { 129 | return map.putAll(multimap); 130 | } 131 | 132 | public void clear() { 133 | map.clear(); 134 | } 135 | 136 | public Set keySet() { 137 | return map.keySet(); 138 | } 139 | 140 | public Multiset keys() { 141 | return map.keys(); 142 | } 143 | 144 | public Collection values() { 145 | return map.values(); 146 | } 147 | 148 | public Set get(String key) { 149 | return (Set) map.get(key); 150 | } 151 | 152 | public Set removeAll(Object key) { 153 | return (Set) map.removeAll(key); 154 | } 155 | 156 | public Set replaceValues(String key, Iterable values) { 157 | return (Set) map.replaceValues(key, values); 158 | } 159 | 160 | public Set> entries() { 161 | return (Set>) map.entries(); 162 | } 163 | 164 | public Map> asMap() { 165 | return map.asMap(); 166 | } 167 | 168 | public int getByteSize() { 169 | int count = 0; 170 | for (Entry e : map.entries()) { 171 | count += e.getKey().getBytes().length + e.getValue().size(); 172 | } 173 | return count; 174 | } 175 | 176 | @Override 177 | public String toString() { 178 | StringBuilder buf = new StringBuilder(); 179 | for (Entry entry : map.entries()) { 180 | buf.append("\tkey: ").append(entry.getKey()).append(" -> ").append(entry.getValue().toString()).append("\n"); 181 | } 182 | return buf.toString(); 183 | } 184 | 185 | public static synchronized void 
initializeKryo(Kryo kryo) { 186 | if (kryoInitialized) 187 | return; 188 | valueSerializer = new ArraySerializer(kryo); 189 | valueSerializer.setDimensionCount(1); 190 | valueSerializer.setElementsAreSameType(true); 191 | valueSerializer.setCanBeNull(false); 192 | valueSerializer.setElementsCanBeNull(false); 193 | kryo.register(byte[].class, valueSerializer); 194 | kryoInitialized = true; 195 | } 196 | 197 | public void readObjectData(Kryo kryo, ByteBuffer buf) { 198 | if (!kryoInitialized) 199 | EventFields.initializeKryo(kryo); 200 | // Read in the number of map entries 201 | int entries = IntSerializer.get(buf, true); 202 | for (int i = 0; i < entries; i++) { 203 | // Read in the key 204 | String key = StringSerializer.get(buf); 205 | // Read in the fields in the value 206 | ColumnVisibility vis = new ColumnVisibility(valueSerializer.readObjectData(buf, byte[].class)); 207 | byte[] value = valueSerializer.readObjectData(buf, byte[].class); 208 | map.put(key, new FieldValue(vis, value)); 209 | } 210 | 211 | } 212 | 213 | public void writeObjectData(Kryo kryo, ByteBuffer buf) { 214 | if (!kryoInitialized) 215 | EventFields.initializeKryo(kryo); 216 | // Write out the number of entries; 217 | IntSerializer.put(buf, map.size(), true); 218 | for (Entry entry : map.entries()) { 219 | // Write the key 220 | StringSerializer.put(buf, entry.getKey()); 221 | // Write the fields in the value 222 | valueSerializer.writeObjectData(buf, entry.getValue().getVisibility().flatten()); 223 | valueSerializer.writeObjectData(buf, entry.getValue().getValue()); 224 | } 225 | } 226 | 227 | } 228 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.ingest; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.accumulo.core.client.AccumuloException; 22 | import org.apache.accumulo.core.client.AccumuloSecurityException; 23 | import org.apache.accumulo.core.client.Connector; 24 | import org.apache.accumulo.core.client.Instance; 25 | import org.apache.accumulo.core.client.ZooKeeperInstance; 26 | import org.apache.commons.lang.StringUtils; 27 | import org.apache.hadoop.conf.Configuration; 28 | import org.apache.hadoop.fs.Path; 29 | import org.apache.hadoop.util.ReflectionUtils; 30 | import org.apache.lucene.analysis.Analyzer; 31 | 32 | public class WikipediaConfiguration { 33 | public final static String INSTANCE_NAME = "wikipedia.accumulo.instance_name"; 34 | public final static String USER = "wikipedia.accumulo.user"; 35 | public final static String PASSWORD = "wikipedia.accumulo.password"; 36 | public final static String TABLE_NAME = "wikipedia.accumulo.table"; 37 | 38 | public final static String ZOOKEEPERS = "wikipedia.accumulo.zookeepers"; 39 | 40 | public final static String NAMESPACES_FILENAME = "wikipedia.namespaces.filename"; 41 | public final static String LANGUAGES_FILENAME = "wikipedia.languages.filename"; 42 | public final static String WORKING_DIRECTORY = "wikipedia.ingest.working"; 43 | 44 | public final static String ANALYZER = "wikipedia.index.analyzer"; 45 | 46 | public final static String NUM_PARTITIONS = "wikipedia.ingest.partitions"; 47 | 48 | public final static String NUM_GROUPS = "wikipedia.ingest.groups"; 49 | 50 | public final static String PARTITIONED_ARTICLES_DIRECTORY = "wikipedia.partitioned.directory"; 51 | 52 | public final static String RUN_PARTITIONER = "wikipedia.run.partitioner"; 53 | public final static String RUN_INGEST = "wikipedia.run.ingest"; 54 | public final static String BULK_INGEST = "wikipedia.bulk.ingest"; 55 | public final static String BULK_INGEST_DIR = "wikipedia.bulk.ingest.dir"; 56 | public final static String BULK_INGEST_FAILURE_DIR = "wikipedia.bulk.ingest.failure.dir"; 57 | public final static String BULK_INGEST_BUFFER_SIZE = "wikipedia.bulk.ingest.buffer.size"; 58 | public final static String PARTITIONED_INPUT_MIN_SPLIT_SIZE = "wikipedia.min.input.split.size"; 59 | 60 | public static String getUser(Configuration conf) { 61 | return conf.get(USER); 62 | } 63 | 64 | public static byte[] getPassword(Configuration conf) { 65 | String pass = conf.get(PASSWORD); 66 | if (pass == null) { 67 | return null; 68 | } 69 | return pass.getBytes(); 70 | } 71 | 72 | public static String getTableName(Configuration conf) { 73 | String tablename = conf.get(TABLE_NAME); 74 | if (tablename == null) { 75 | throw new RuntimeException("No data table name specified in " + TABLE_NAME); 76 | } 77 | return tablename; 78 | } 79 | 80 | public static String getInstanceName(Configuration conf) { 81 | return conf.get(INSTANCE_NAME); 82 | } 83 | 84 | public static String getZookeepers(Configuration conf) { 85 | String zookeepers = conf.get(ZOOKEEPERS); 86 | if (zookeepers == null) { 87 | throw new RuntimeException("No zookeepers specified in " + ZOOKEEPERS); 88 | } 89 | return zookeepers; 90 | } 91 | 92 | public static Path getNamespacesFile(Configuration conf) { 93 | String filename = conf.get(NAMESPACES_FILENAME, 94 | new Path(getWorkingDirectory(conf), "namespaces.dat").toString()); 95 | return new Path(filename); 96 | } 97 | 98 | public static Path getLanguagesFile(Configuration conf) { 99 | String filename = conf.get(LANGUAGES_FILENAME, 
100 |         new Path(getWorkingDirectory(conf), "languages.txt").toString());
101 |     return new Path(filename);
102 |   }
103 | 
104 |   public static Path getWorkingDirectory(Configuration conf) {
105 |     String filename = conf.get(WORKING_DIRECTORY);
106 |     return new Path(filename);
107 |   }
108 | 
109 |   public static Connector getConnector(Configuration conf)
110 |       throws AccumuloException, AccumuloSecurityException {
111 |     return getInstance(conf).getConnector(getUser(conf), getPassword(conf));
112 |   }
113 | 
114 |   public static Instance getInstance(Configuration conf) {
115 |     return new ZooKeeperInstance(getInstanceName(conf), getZookeepers(conf));
116 |   }
117 | 
118 |   public static int getNumPartitions(Configuration conf) {
119 |     return conf.getInt(NUM_PARTITIONS, 25);
120 |   }
121 | 
122 |   public static int getNumGroups(Configuration conf) {
123 |     return conf.getInt(NUM_GROUPS, 1);
124 |   }
125 | 
126 |   public static Path getPartitionedArticlesPath(Configuration conf) {
127 |     return new Path(conf.get(PARTITIONED_ARTICLES_DIRECTORY));
128 |   }
129 | 
130 |   public static long getMinInputSplitSize(Configuration conf) {
131 |     return conf.getLong(PARTITIONED_INPUT_MIN_SPLIT_SIZE, 1L << 27); // 128 MB default
132 |   }
133 | 
134 |   public static boolean runPartitioner(Configuration conf) {
135 |     return conf.getBoolean(RUN_PARTITIONER, false);
136 |   }
137 | 
138 |   public static boolean runIngest(Configuration conf) {
139 |     return conf.getBoolean(RUN_INGEST, true);
140 |   }
141 | 
142 |   public static boolean bulkIngest(Configuration conf) {
143 |     return conf.getBoolean(BULK_INGEST, true);
144 |   }
145 | 
146 |   public static String bulkIngestDir(Configuration conf) {
147 |     return conf.get(BULK_INGEST_DIR);
148 |   }
149 | 
150 |   public static String bulkIngestFailureDir(Configuration conf) {
151 |     return conf.get(BULK_INGEST_FAILURE_DIR);
152 |   }
153 | 
154 |   public static long bulkIngestBufferSize(Configuration conf) {
155 |     return conf.getLong(BULK_INGEST_BUFFER_SIZE, 1L << 28); // 256 MB default
156 |   }
157 | 
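158 |   // Illustrative example of calling the typed-property helper below (the variable
159 |   //   name is arbitrary): String dir = WikipediaConfiguration.isNull(conf, BULK_INGEST_DIR, String.class);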
160 |   /**
161 |    * Helper method to get a required, typed property from the Hadoop configuration.
162 |    * 
163 |    * @throws IllegalArgumentException
164 |    *           if the property is not defined, null, or empty, or if resultClass is not handled
165 |    * @return the value of the property, converted to resultClass
166 |    */
167 |   @SuppressWarnings("unchecked")
168 |   public static <T> T isNull(Configuration conf, String propertyName, Class<T> resultClass) {
169 |     String p = conf.get(propertyName);
170 |     if (StringUtils.isEmpty(p)) {
171 |       throw new IllegalArgumentException(propertyName + " must be specified");
172 |     }
173 | 
174 |     if (resultClass.equals(String.class)) {
175 |       return (T) p;
176 |     } else if (resultClass.equals(String[].class)) {
177 |       return (T) conf.getStrings(propertyName);
178 |     } else if (resultClass.equals(Boolean.class)) {
179 |       return (T) Boolean.valueOf(p);
180 |     } else if (resultClass.equals(Long.class)) {
181 |       return (T) Long.valueOf(p);
182 |     } else if (resultClass.equals(Integer.class)) {
183 |       return (T) Integer.valueOf(p);
184 |     } else if (resultClass.equals(Float.class)) {
185 |       return (T) Float.valueOf(p);
186 |     } else if (resultClass.equals(Double.class)) {
187 |       return (T) Double.valueOf(p);
188 |     } else {
189 |       throw new IllegalArgumentException(resultClass.getSimpleName() + " is unhandled.");
190 |     }
191 | 
192 |   }
193 | 
194 | }
195 | 
--------------------------------------------------------------------------------
/query/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/OptimizedQueryIterator.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package org.apache.accumulo.examples.wikisearch.iterator;
18 | 
19 | import java.io.IOException;
20 | import java.util.Collection;
21 | import java.util.HashMap;
22 | import java.util.HashSet;
23 | import java.util.Map;
24 | 
25 | import org.apache.accumulo.core.data.ByteSequence;
26 | import org.apache.accumulo.core.data.Key;
27 | import org.apache.accumulo.core.data.PartialKey;
28 | import org.apache.accumulo.core.data.Range;
29 | import org.apache.accumulo.core.data.Value;
30 | import org.apache.accumulo.core.iterators.IteratorEnvironment;
31 | import org.apache.accumulo.core.iterators.OptionDescriber;
32 | import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
33 | import org.apache.log4j.Logger;
34 | 
35 | /**
36 |  * This iterator internally uses the BooleanLogicIterator to find event UIDs in the field index portion of the partition, and the EvaluatingIterator to
37 |  * evaluate those events against an expression. The key and value emitted from this iterator are the ones produced by the EvaluatingIterator.
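38 |  * <p>
39 |  * Illustrative scan-time setup (variable names and the priority are examples, not taken from this project):
40 |  * <pre>
41 |  * IteratorSetting cfg = new IteratorSetting(30, OptimizedQueryIterator.class);
42 |  * cfg.addOption(EvaluatingIterator.QUERY_OPTION, queryExpression);
43 |  * cfg.addOption(BooleanLogicIterator.FIELD_INDEX_QUERY, fieldIndexQuery);
44 |  * scanner.addScanIterator(cfg);
45 |  * </pre>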
46 |  */
47 | public class OptimizedQueryIterator implements SortedKeyValueIterator<Key,Value>, OptionDescriber {
48 | 
49 |   private static Logger log = Logger.getLogger(OptimizedQueryIterator.class);
50 |   private EvaluatingIterator event = null;
51 |   private SortedKeyValueIterator<Key,Value> index = null;
52 |   private Key key = null;
53 |   private Value value = null;
54 |   private boolean eventSpecificRange = false;
55 | 
56 |   public IteratorOptions describeOptions() {
57 |     Map<String,String> options = new HashMap<String,String>();
58 |     options.put(EvaluatingIterator.QUERY_OPTION, "full query expression");
59 |     options.put(BooleanLogicIterator.FIELD_INDEX_QUERY, "modified query for the field index query portion");
60 |     options.put(ReadAheadIterator.QUEUE_SIZE, "parallel queue size");
61 |     options.put(ReadAheadIterator.TIMEOUT, "parallel iterator timeout");
62 |     return new IteratorOptions(getClass().getSimpleName(), "evaluates event objects against an expression using the field index", options, null);
63 |   }
64 | 
65 |   public boolean validateOptions(Map<String,String> options) {
66 |     if (options.containsKey(EvaluatingIterator.QUERY_OPTION) && options.containsKey(BooleanLogicIterator.FIELD_INDEX_QUERY)) {
67 |       return true;
68 |     }
69 |     return false;
70 |   }
71 | 
72 |   public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
73 |     if (!validateOptions(options)) {
74 |       throw new IllegalArgumentException("Invalid options");
75 |     }
76 | 
77 |     // Setup the EvaluatingIterator
78 |     event = new EvaluatingIterator();
79 |     event.init(source.deepCopy(env), options, env);
80 | 
81 |     // If queue size and timeout are set, then use the read ahead iterator
82 |     if (options.containsKey(ReadAheadIterator.QUEUE_SIZE) && options.containsKey(ReadAheadIterator.TIMEOUT)) {
83 |       BooleanLogicIterator bli = new BooleanLogicIterator();
84 |       bli.init(source, options, env);
85 |       index = new ReadAheadIterator();
86 |       index.init(bli, options, env);
87 |     } else {
88 |       index = new BooleanLogicIterator();
89 |       // index.setDebug(Level.DEBUG);
90 |       index.init(source, options, env);
91 |     }
92 | 
93 |   }
94 | 
95 |   public OptimizedQueryIterator() {}
96 | 
97 |   public OptimizedQueryIterator(OptimizedQueryIterator other, IteratorEnvironment env) {
98 |     this.event = other.event;
99 |     this.index = other.index;
100 |   }
101 | 
102 |   public SortedKeyValueIterator<Key,Value> deepCopy(IteratorEnvironment env) {
103 |     return new OptimizedQueryIterator(this, env);
104 |   }
105 | 
106 |   public Key getTopKey() {
107 |     if (log.isDebugEnabled()) {
108 |       log.debug("getTopKey: " + key);
109 |     }
110 |     return key;
111 |   }
112 | 
113 |   public Value getTopValue() {
114 |     if (log.isDebugEnabled()) {
115 |       log.debug("getTopValue: " + value);
116 |     }
117 |     return value;
118 |   }
119 | 
120 |   public boolean hasTop() {
121 |     if (log.isDebugEnabled()) {
122 |       log.debug("hasTop: returned: " + (key != null));
123 |     }
124 |     return (key != null);
125 |   }
126 | 
127 |   public void next() throws IOException {
128 |     if (log.isDebugEnabled()) {
129 |       log.debug("next");
130 |     }
131 |     if (key != null) {
132 |       key = null;
133 |       value = null;
134 |     }
135 | 
136 |     if (eventSpecificRange) {
137 |       // Then this will probably return nothing
138 |       event.next();
139 |       if (event.hasTop()) {
140 |         key = event.getTopKey();
141 |         value = event.getTopValue();
142 |       }
143 |     } else {
144 | 
145 |       do {
146 |         index.next();
147 |         // If the index has a match, then seek the event to the key
148 |         if (index.hasTop()) {
149 |           Key eventKey = index.getTopKey();
150 |           Key endKey = eventKey.followingKey(PartialKey.ROW_COLFAM);
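151 |           // The index's top key identifies a single event: its row is the partition and its
152 |           // column family is the event id, so build a range covering just that event's columns.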
153 |           Key startKey = new Key(eventKey.getRow(), eventKey.getColumnFamily());
154 |           Range eventRange = new Range(startKey, endKey);
155 |           HashSet<ByteSequence> cf = new HashSet<ByteSequence>();
156 |           cf.add(eventKey.getColumnFamilyData());
157 |           event.seek(eventRange, cf, true);
158 |           if (event.hasTop()) {
159 |             key = event.getTopKey();
160 |             value = event.getTopValue();
161 |           }
162 |         }
163 |       } while (key == null && index.hasTop());
164 |     }
165 |     // Sanity check. Make sure key and value are either both null or both non-null
166 |     if (!((key == null && value == null) || (key != null && value != null))) {
167 |       log.warn("Key: " + ((key == null) ? "null" : key.toString()));
168 |       log.warn("Value: " + ((value == null) ? "null" : value.toString()));
169 |       throw new IOException("Return values are inconsistent");
170 |     }
171 | 
172 |   }
173 | 
174 |   public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
175 |     if (log.isDebugEnabled()) {
176 |       log.debug("seek, range:" + range);
177 |     }
178 |     // Test the range to see if it is event specific.
179 |     if (null != range.getEndKey() && range.getEndKey().getColumnFamily() != null && range.getEndKey().getColumnFamily().getLength() != 0) {
180 |       if (log.isDebugEnabled()) {
181 |         log.debug("Jumping straight to the event");
182 |       }
183 |       // This range is for a specific event. We don't need the index iterator to find it; we can just
184 |       // seek to it with the event iterator and evaluate it.
185 |       eventSpecificRange = true;
186 |       event.seek(range, columnFamilies, inclusive);
187 |       if (event.hasTop()) {
188 |         key = event.getTopKey();
189 |         value = event.getTopValue();
190 |       }
191 |     } else {
192 |       if (log.isDebugEnabled()) {
193 |         log.debug("Using BooleanLogicIteratorJexl");
194 |       }
195 |       // Seek the boolean logic iterator
196 |       index.seek(range, columnFamilies, inclusive);
197 | 
198 |       // If the index has a match, then seek the event to the key
199 |       if (index.hasTop()) {
200 |         Key eventKey = index.getTopKey();
201 |         // Range eventRange = new Range(eventKey, eventKey);
202 |         Range eventRange = new Range(eventKey.getRow());
203 |         HashSet<ByteSequence> cf = new HashSet<ByteSequence>();
204 |         cf.add(eventKey.getColumnFamilyData());
205 |         event.seek(eventRange, cf, true);
206 |         if (event.hasTop()) {
207 |           key = event.getTopKey();
208 |           value = event.getTopValue();
209 |         } else {
210 |           next();
211 |         }
212 |       }
213 |     }
214 |   }
215 | }
216 | 
--------------------------------------------------------------------------------
/query/src/test/hadoop1/org/apache/accumulo/examples/wikisearch/logic/TestQueryLogic.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package org.apache.accumulo.examples.wikisearch.logic;
18 | 
19 | import static org.junit.Assert.assertEquals;
20 | 
21 | import java.io.File;
22 | import java.io.IOException;
23 | import java.net.URL;
24 | import java.util.ArrayList;
25 | import java.util.Collections;
26 | import java.util.HashMap;
27 | import java.util.List;
28 | import java.util.Map.Entry;
29 | 
30 | import junit.framework.Assert;
31 | 
32 | import org.apache.accumulo.core.client.BatchWriter;
33 | import org.apache.accumulo.core.client.Connector;
34 | import org.apache.accumulo.core.client.MutationsRejectedException;
35 | import org.apache.accumulo.core.client.Scanner;
36 | import org.apache.accumulo.core.client.mock.MockInstance;
37 | import org.apache.accumulo.core.client.security.tokens.PasswordToken;
38 | import org.apache.accumulo.core.data.Key;
39 | import org.apache.accumulo.core.data.Mutation;
40 | import org.apache.accumulo.core.data.Range;
41 | import org.apache.accumulo.core.data.Value;
42 | import org.apache.accumulo.core.security.Authorizations;
43 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaConfiguration;
44 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester;
45 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit;
46 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaMapper;
47 | import org.apache.accumulo.examples.wikisearch.parser.RangeCalculator;
48 | import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
49 | import org.apache.accumulo.examples.wikisearch.sample.Document;
50 | import org.apache.accumulo.examples.wikisearch.sample.Field;
51 | import org.apache.accumulo.examples.wikisearch.sample.Results;
52 | import org.apache.hadoop.conf.Configuration;
53 | import org.apache.hadoop.fs.Path;
54 | import org.apache.hadoop.fs.RawLocalFileSystem;
55 | import org.apache.hadoop.io.LongWritable;
56 | import org.apache.hadoop.io.Text;
57 | import org.apache.hadoop.mapreduce.Mapper;
58 | import org.apache.hadoop.mapreduce.OutputCommitter;
59 | import org.apache.hadoop.mapreduce.RecordWriter;
60 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
61 | import org.apache.hadoop.mapreduce.TaskAttemptID;
62 | import org.apache.hadoop.mapreduce.lib.input.FileSplit;
63 | import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
64 | import org.apache.log4j.Level;
65 | import org.apache.log4j.Logger;
66 | import org.junit.Before;
67 | import org.junit.Test;
68 | 
69 | public class TestQueryLogic {
70 | 
71 |   private static final String METADATA_TABLE_NAME = "wikiMetadata";
72 | 
73 |   private static final String TABLE_NAME = "wiki";
74 | 
75 |   private static final String INDEX_TABLE_NAME = "wikiIndex";
76 | 
77 |   private static final String RINDEX_TABLE_NAME = "wikiReverseIndex";
78 | 
79 |   private static final String[] TABLE_NAMES = {METADATA_TABLE_NAME, TABLE_NAME, RINDEX_TABLE_NAME, INDEX_TABLE_NAME};
80 | 
81 |   private class MockAccumuloRecordWriter extends RecordWriter<Text,Mutation> {
82 |     @Override
83 |     public void write(Text key, Mutation value) throws IOException, InterruptedException {
84 |       try {
85 |         writerMap.get(key).addMutation(value);
86 |       } catch (MutationsRejectedException e) {
87 |         throw new IOException("Error adding mutation", e);
88 |       }
89 |     }
90 | 
91 |     @Override
92 |     public void close(TaskAttemptContext context) throws IOException, InterruptedException {
93 |       try {
94 |         for (BatchWriter w : writerMap.values()) {
95 |           w.flush();
96 |           w.close();
97 |         }
98 |       } catch (MutationsRejectedException e) {
99 |         throw new IOException("Error closing Batch Writer", e);
100 |       }
101 |     }
102 | 
103 |   }
104 | 
105 |   private Connector c = null;
106 |   private Configuration conf = new Configuration();
107 |   private HashMap<Text,BatchWriter> writerMap = new HashMap<Text,BatchWriter>();
108 |   private QueryLogic table = null;
109 | 
110 |   @Before
111 |   public void setup() throws Exception {
112 | 
113 |     Logger.getLogger(AbstractQueryLogic.class).setLevel(Level.DEBUG);
114 |     Logger.getLogger(QueryLogic.class).setLevel(Level.DEBUG);
115 |     Logger.getLogger(RangeCalculator.class).setLevel(Level.DEBUG);
116 | 
117 |     conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
118 |     conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
119 |     conf.set(WikipediaConfiguration.TABLE_NAME, TABLE_NAME);
120 |     conf.set(WikipediaConfiguration.NUM_PARTITIONS, "1");
121 |     conf.set(WikipediaConfiguration.NUM_GROUPS, "1");
122 | 
123 |     MockInstance i = new MockInstance();
124 |     c = i.getConnector("root", new PasswordToken(""));
125 |     WikipediaIngester.createTables(c.tableOperations(), TABLE_NAME, false);
126 |     for (String table : TABLE_NAMES) {
127 |       writerMap.put(new Text(table), c.createBatchWriter(table, 1000L, 1000L, 1));
128 |     }
129 | 
130 |     TaskAttemptID id = new TaskAttemptID();
131 |     TaskAttemptContext context = new TaskAttemptContext(conf, id);
132 | 
133 |     RawLocalFileSystem fs = new RawLocalFileSystem();
134 |     fs.setConf(conf);
135 | 
136 |     URL url = ClassLoader.getSystemResource("enwiki-20110901-001.xml");
137 |     Assert.assertNotNull(url);
138 |     File data = new File(url.toURI());
139 |     Path tmpFile = new Path(data.getAbsolutePath());
140 | 
141 |     // Setup the Mapper
142 |     WikipediaInputSplit split = new WikipediaInputSplit(new FileSplit(tmpFile, 0, fs.pathToFile(tmpFile).length(), null), 0);
143 |     AggregatingRecordReader rr = new AggregatingRecordReader();
144 |     Path ocPath = new Path(tmpFile, "oc");
145 |     OutputCommitter oc = new FileOutputCommitter(ocPath, context);
146 |     fs.deleteOnExit(ocPath);
147 |     StandaloneStatusReporter sr = new StandaloneStatusReporter();
148 |     rr.initialize(split, context);
149 |     MockAccumuloRecordWriter rw = new MockAccumuloRecordWriter();
150 |     WikipediaMapper mapper = new WikipediaMapper();
151 | 
152 |     // Load data into Mock Accumulo
153 |     Mapper<LongWritable,Text,Text,Mutation>.Context con = mapper.new Context(conf, id, rr, rw, oc, sr, split);
154 |     mapper.run(con);
155 | 
156 |     // Flush and close record writers.
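157 |     // Closing the mock record writer flushes and closes every table's BatchWriter,
158 |     // making the mapper's mutations visible to the scans performed by the tests below.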
159 |     rw.close(context);
160 | 
161 |     table = new QueryLogic();
162 |     table.setMetadataTableName(METADATA_TABLE_NAME);
163 |     table.setTableName(TABLE_NAME);
164 |     table.setIndexTableName(INDEX_TABLE_NAME);
165 |     table.setReverseIndexTableName(RINDEX_TABLE_NAME);
166 |     table.setUseReadAheadIterator(false);
167 |     table.setUnevaluatedFields(Collections.singletonList("TEXT"));
168 |   }
169 | 
170 |   void debugQuery(String tableName) throws Exception {
171 |     Scanner s = c.createScanner(tableName, new Authorizations("all"));
172 |     Range r = new Range();
173 |     s.setRange(r);
174 |     for (Entry<Key,Value> entry : s)
175 |       System.out.println(entry.getKey().toString() + " " + entry.getValue().toString());
176 |   }
177 | 
178 |   @Test
179 |   public void testTitle() throws Exception {
180 |     Logger.getLogger(AbstractQueryLogic.class).setLevel(Level.OFF);
181 |     Logger.getLogger(RangeCalculator.class).setLevel(Level.OFF);
182 |     List<String> auths = new ArrayList<String>();
183 |     auths.add("enwiki");
184 | 
185 |     Results results = table.runQuery(c, auths, "TITLE == 'asphalt' or TITLE == 'abacus' or TITLE == 'acid' or TITLE == 'acronym'", null, null, null);
186 |     List<Document> docs = results.getResults();
187 |     assertEquals(4, docs.size());
188 | 
189 |     results = table.runQuery(c, auths, "TEXT == 'abacus'", null, null, null);
190 |     docs = results.getResults();
191 |     assertEquals(1, docs.size());
192 |     for (Document doc : docs) {
193 |       System.out.println("id: " + doc.getId());
194 |       for (Field field : doc.getFields())
195 |         System.out.println(field.getFieldName() + " -> " + field.getFieldValue());
196 |     }
197 |   }
198 | 
199 | }
200 | 
--------------------------------------------------------------------------------