├── query ├── src │ ├── main │ │ ├── resources │ │ │ └── META-INF │ │ │ │ ├── MANIFEST.MF │ │ │ │ ├── ejb-jar.xml.uno │ │ │ │ └── ejb-jar.xml.example │ │ └── java │ │ │ └── org │ │ │ └── apache │ │ │ └── accumulo │ │ │ └── examples │ │ │ └── wikisearch │ │ │ ├── sample │ │ │ ├── Results.java │ │ │ ├── Field.java │ │ │ └── Document.java │ │ │ ├── query │ │ │ └── IQuery.java │ │ │ ├── util │ │ │ ├── FieldIndexKeyParser.java │ │ │ ├── KeyParser.java │ │ │ └── BaseKeyParser.java │ │ │ ├── function │ │ │ └── QueryFunctions.java │ │ │ ├── iterator │ │ │ ├── DefaultIteratorEnvironment.java │ │ │ ├── EvaluatingIterator.java │ │ │ └── OptimizedQueryIterator.java │ │ │ ├── parser │ │ │ ├── JexlOperatorConstants.java │ │ │ ├── TreeNode.java │ │ │ └── EventFields.java │ │ │ ├── logic │ │ │ └── ContentLogic.java │ │ │ └── jexl │ │ │ └── Arithmetic.java │ ├── assembly │ │ └── dist.xml │ └── test │ │ ├── java │ │ └── org │ │ │ └── apache │ │ │ └── accumulo │ │ │ └── examples │ │ │ └── wikisearch │ │ │ └── logic │ │ │ └── StandaloneStatusReporter.java │ │ └── hadoop1 │ │ └── org │ │ └── apache │ │ └── accumulo │ │ └── examples │ │ └── wikisearch │ │ └── logic │ │ └── TestQueryLogic.java └── pom.xml ├── .gitignore ├── NOTICE ├── ingest ├── src │ ├── main │ │ ├── protobuf │ │ │ ├── compile_protos.sh │ │ │ ├── TermWeight.proto │ │ │ └── Uid.proto │ │ └── java │ │ │ └── org │ │ │ └── apache │ │ │ └── accumulo │ │ │ └── examples │ │ │ └── wikisearch │ │ │ ├── normalizer │ │ │ ├── NoOpNormalizer.java │ │ │ ├── Normalizer.java │ │ │ └── LcNoDiacriticsNormalizer.java │ │ │ ├── ingest │ │ │ ├── LRUOutputCombiner.java │ │ │ ├── WikipediaPartitioner.java │ │ │ ├── WikipediaInputFormat.java │ │ │ ├── ArticleExtractor.java │ │ │ └── WikipediaConfiguration.java │ │ │ ├── iterator │ │ │ ├── GlobalIndexUidCombiner.java │ │ │ └── TextIndexCombiner.java │ │ │ ├── util │ │ │ └── TextUtil.java │ │ │ ├── output │ │ │ ├── SortingRFileOutputFormat.java │ │ │ └── BufferingRFileRecordWriter.java │ │ │ └── reader │ │ │ ├── LongLineRecordReader.java │ │ │ ├── LfLineReader.java │ │ │ └── AggregatingRecordReader.java │ ├── assembly │ │ └── dist.xml │ └── test │ │ └── java │ │ └── org │ │ └── apache │ │ └── accumulo │ │ └── examples │ │ └── wikisearch │ │ ├── ingest │ │ └── WikipediaInputSplitTest.java │ │ └── iterator │ │ ├── TextIndexTest.java │ │ └── GlobalIndexUidTest.java ├── conf │ ├── wikipedia.xml.uno │ ├── wikipedia.xml.example │ └── wikipedia_parallel.xml.example ├── bin │ ├── ingest.sh │ └── ingest_parallel.sh └── pom.xml ├── query-war ├── src │ └── main │ │ └── webapp │ │ ├── WEB-INF │ │ ├── jboss-web.xml │ │ └── web.xml │ │ ├── style.xsl │ │ └── ui.html └── pom.xml ├── .asf.yaml ├── CONTRIBUTING.md ├── .github └── workflows │ └── maven.yaml └── INSTALL.md /query/src/main/resources/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/target 2 | .idea 3 | **/*.iml 4 | **/lib 5 | .project 6 | .settings/ 7 | .classpath 8 | wikipedia.xml 9 | ejb-jar.xml 10 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Apache Accumulo Wikisearch 2 | Copyright 2011-2019 The Apache Software Foundation 3 | 4 | This product includes software developed at 5 | 
The Apache Software Foundation (http://www.apache.org/). 6 | -------------------------------------------------------------------------------- /ingest/src/main/protobuf/compile_protos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | for PROTO in `ls -1 *proto`; do protoc --java_out ../java $PROTO; done 20 | -------------------------------------------------------------------------------- /query-war/src/main/webapp/WEB-INF/jboss-web.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | /accumulo-wikisearch 20 | 21 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.normalizer; 18 | 19 | public class NoOpNormalizer implements Normalizer { 20 | public String normalizeFieldValue(String field, Object value) { 21 | return value.toString(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /.asf.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | # https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features 21 | 22 | github: 23 | description: "Apache Accumulo Wikisearch" 24 | homepage: https://accumulo.apache.org 25 | labels: 26 | - accumulo 27 | - big-data 28 | - hacktoberfest 29 | features: 30 | wiki: false 31 | issues: true 32 | projects: true 33 | 34 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 17 | 18 | # Contributing to the Accumulo Wikisearch application 19 | 20 | Contributions to the Accumulo Wikisearch application can be made by creating a pull 21 | request to this repo on GitHub. 22 | 23 | Before creating a pull request, run `mvn clean verify`. 24 | 25 | For general information on contributing to Accumulo projects, check out the 26 | [Accumulo Contributor guide][contribute]. 27 | 28 | [contribute]: https://accumulo.apache.org/contributor/ 29 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.normalizer; 18 | 19 | public interface Normalizer { 20 | 21 | /** 22 | * Creates normalized content for ingest based upon implemented logic. 
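 * 
 * A minimal sketch of a conforming implementation (the class name here is
 * hypothetical; see NoOpNormalizer and LcNoDiacriticsNormalizer in this
 * package for the real ones):
 * 
 *   public class TrimNormalizer implements Normalizer {
 *     public String normalizeFieldValue(String field, Object value) {
 *       // ignore the field name; normalize every value the same way
 *       return value.toString().trim();
 *     }
 *   }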
23 | * 24 | * @param field 25 | * The field being normalized 26 | * @param value 27 | * The value to normalize 28 | * @return a normalized value 29 | */ 30 | public String normalizeFieldValue(String field, Object value); 31 | 32 | } 33 | -------------------------------------------------------------------------------- /ingest/src/assembly/dist.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | dist 20 | 21 | tar.gz 22 | 23 | 24 | 25 | 26 | lib 27 | 0644 28 | 29 | 30 | bin 31 | 0744 32 | 33 | 34 | conf 35 | 0644 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /ingest/src/main/protobuf/TermWeight.proto: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one or more 2 | // contributor license agreements. See the NOTICE file distributed with 3 | // this work for additional information regarding copyright ownership. 4 | // The ASF licenses this file to You under the Apache License, Version 2.0 5 | // (the "License"); you may not use this file except in compliance with 6 | // the License. You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | // compile with protoc --java_out ../java 17 | // compile extra builder util with java accumulo.data.protobuf.builder.ProtoBufBuilder -d ../java accumulo.data.protobuf.UidList 18 | // classpath for compile command should include ../../../target/classes and protobuf-java-2.2.0.jar 19 | 20 | package org.apache.accumulo.examples.wikisearch.protobuf; 21 | 22 | option java_package = "org.apache.accumulo.examples.wikisearch.protobuf"; 23 | option optimize_for = SPEED; 24 | 25 | message Info { 26 | required float normalizedTermFrequency = 1; 27 | repeated uint32 wordOffset = 2; 28 | } 29 | -------------------------------------------------------------------------------- /ingest/src/main/protobuf/Uid.proto: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one or more 2 | // contributor license agreements. See the NOTICE file distributed with 3 | // this work for additional information regarding copyright ownership. 4 | // The ASF licenses this file to You under the Apache License, Version 2.0 5 | // (the "License"); you may not use this file except in compliance with 6 | // the License. You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
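// A sketch of how ingest code can build the generated message from Java; the
// builder methods follow from the field names declared below, and the UID
// string here is hypothetical (see GlobalIndexUidCombiner for real usage):
//
//   Uid.List.Builder b = Uid.List.newBuilder();
//   b.setIGNORE(false);
//   b.setCOUNT(1);
//   b.addUID("someDataType\u0000someUid");
//   byte[] bytes = b.build().toByteArray(); // stored as an Accumulo Value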
15 | 16 | // compile with protoc --java_out ../java 17 | // compile extra builder util with java accumulo.data.protobuf.builder.ProtoBufBuilder -d ../java accumulo.data.protobuf.UidList 18 | // classpath for compile command should include ../../../target/classes and protobuf-java-2.2.0.jar 19 | 20 | package org.apache.accumulo.examples.wikisearch.protobuf; 21 | 22 | option java_package = "org.apache.accumulo.examples.wikisearch.protobuf"; 23 | option optimize_for = SPEED; 24 | 25 | message List { 26 | required bool IGNORE = 1; 27 | required uint64 COUNT = 2; 28 | repeated string UID = 3; 29 | } 30 | -------------------------------------------------------------------------------- /query/src/assembly/dist.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | dist 20 | 21 | tar.gz 22 | 23 | 24 | 25 | 26 | lib 27 | lib 28 | 29 | ${project.name}-${project.version}.jar 30 | 31 | 0644 32 | 33 | 34 | 35 | 36 | target/${project.name}-${project.version}.jar 37 | deploy 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /ingest/conf/wikipedia.xml.uno: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | wikipedia.accumulo.zookeepers 21 | localhost:2181 22 | 23 | 24 | wikipedia.accumulo.instance_name 25 | uno 26 | 27 | 28 | wikipedia.accumulo.user 29 | root 30 | 31 | 32 | wikipedia.accumulo.password 33 | secret 34 | 35 | 36 | wikipedia.accumulo.table 37 | wikipedia 38 | 39 | 40 | wikipedia.ingest.partitions 41 | 1 42 | 43 | 44 | -------------------------------------------------------------------------------- /ingest/conf/wikipedia.xml.example: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | wikipedia.accumulo.zookeepers 21 | 22 | 23 | 24 | wikipedia.accumulo.instance_name 25 | 26 | 27 | 28 | wikipedia.accumulo.user 29 | 30 | 31 | 32 | wikipedia.accumulo.password 33 | 34 | 35 | 36 | wikipedia.accumulo.table 37 | 38 | 39 | 40 | wikipedia.ingest.partitions 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /ingest/bin/ingest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
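# Usage (a sketch): ./ingest.sh <hdfs-input-dir>
# where <hdfs-input-dir> holds the Wikipedia XML dump in HDFS; it is passed to
# the M/R job below as -Dwikipedia.input. The loop and sed pipeline below keep
# two views of the same jar list: a colon-separated CLASSPATH for the local JVM
# and a comma-separated LIBJARS for Hadoop's -libjars flag, e.g.
#   CLASSPATH=":lib/a.jar:lib/b.jar"  ->  LIBJARS="lib/a.jar,lib/b.jar"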
17 | 18 | 19 | 20 | THIS_SCRIPT="$0" 21 | SCRIPT_DIR="${THIS_SCRIPT%/*}" 22 | SCRIPT_DIR=`cd $SCRIPT_DIR ; pwd` 23 | echo $SCRIPT_DIR 24 | 25 | # 26 | # Add our jars 27 | # 28 | for f in $SCRIPT_DIR/../lib/*.jar; do 29 | CLASSPATH=${CLASSPATH}:$f 30 | done 31 | 32 | # 33 | # Transform the classpath into a comma-separated list also 34 | # 35 | LIBJARS=`echo $CLASSPATH | sed 's/^://' | sed 's/:/,/g'` 36 | 37 | 38 | # 39 | # Map/Reduce job 40 | # 41 | JAR=$SCRIPT_DIR/../lib/wikisearch-ingest-*.jar 42 | CONF=$SCRIPT_DIR/../conf/wikipedia.xml 43 | HDFS_DATA_DIR=$1 44 | export HADOOP_CLASSPATH=$CLASSPATH 45 | echo "hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}" 46 | hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR} 47 | -------------------------------------------------------------------------------- /.github/workflows/maven.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | # 19 | 20 | # This workflow will build a Java project with Maven 21 | # See also: 22 | # https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 23 | 24 | name: mvn 25 | 26 | on: 27 | push: 28 | branches: [ '*' ] 29 | pull_request: 30 | branches: [ '*' ] 31 | 32 | jobs: 33 | # fast build to populate the local maven repository cache 34 | verify: 35 | runs-on: ubuntu-latest 36 | steps: 37 | - uses: actions/checkout@v4 38 | - name: Set up JDK 17 39 | uses: actions/setup-java@v4 40 | with: 41 | distribution: adopt 42 | java-version: 17 43 | cache: 'maven' 44 | - name: Show the first log message 45 | run: git log -n1 46 | - name: Build with Maven 47 | timeout-minutes: 5 48 | run: mvn -B -V -e -ntp "-Dstyle.color=always" clean verify 49 | env: 50 | MAVEN_OPTS: -Djansi.force=true 51 | 52 | -------------------------------------------------------------------------------- /query-war/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | org.apache.accumulo 22 | accumulo-wikisearch 23 | 2.0.0-SNAPSHOT 24 | 25 | 26 | wikisearch-query-war 27 | war 28 | wikisearch-query-war 29 | 30 | 31 | 32 | org.apache.maven.plugins 33 | maven-war-plugin 34 | 35 | true 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /ingest/bin/ingest_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
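# Usage (a sketch): ./ingest_parallel.sh <hdfs-input-dir>
# Same conventions as ingest.sh, but runs WikipediaPartitionedIngester against
# conf/wikipedia_parallel.xml. Note that the jar path below is pinned to a
# specific version, unlike the glob used in ingest.sh; it must match the
# artifact actually produced by the build.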
17 | 18 | 19 | 20 | THIS_SCRIPT="$0" 21 | SCRIPT_DIR="${THIS_SCRIPT%/*}" 22 | SCRIPT_DIR=`cd $SCRIPT_DIR ; pwd` 23 | echo $SCRIPT_DIR 24 | 25 | # 26 | # Add our jars 27 | # 28 | for f in $SCRIPT_DIR/../lib/*.jar; do 29 | CLASSPATH=${CLASSPATH}:$f 30 | done 31 | 32 | # 33 | # Transform the classpath into a comma-separated list also 34 | # 35 | LIBJARS=`echo $CLASSPATH | sed 's/^://' | sed 's/:/,/g'` 36 | 37 | 38 | # 39 | # Map/Reduce job 40 | # 41 | JAR=$SCRIPT_DIR/../lib/wikisearch-ingest-1.5.0.jar 42 | CONF=$SCRIPT_DIR/../conf/wikipedia_parallel.xml 43 | HDFS_DATA_DIR=$1 44 | export HADOOP_CLASSPATH=$CLASSPATH 45 | echo "hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}" 46 | hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR} 47 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/sample/Results.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.sample; 18 | 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | import javax.xml.bind.annotation.XmlAccessType; 23 | import javax.xml.bind.annotation.XmlAccessorType; 24 | import javax.xml.bind.annotation.XmlElement; 25 | import javax.xml.bind.annotation.XmlRootElement; 26 | 27 | @XmlRootElement 28 | @XmlAccessorType(XmlAccessType.FIELD) 29 | public class Results { 30 | 31 | @XmlElement 32 | private List document = new ArrayList(); 33 | 34 | public Results() { 35 | super(); 36 | } 37 | 38 | public List getResults() { 39 | return document; 40 | } 41 | 42 | public void setResults(List results) { 43 | this.document = results; 44 | } 45 | 46 | public int size() { 47 | if (null == document) 48 | return 0; 49 | else 50 | return document.size(); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/sample/Field.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.sample; 18 | 19 | import javax.xml.bind.annotation.XmlAccessType; 20 | import javax.xml.bind.annotation.XmlAccessorType; 21 | import javax.xml.bind.annotation.XmlAttribute; 22 | import javax.xml.bind.annotation.XmlValue; 23 | 24 | @XmlAccessorType(XmlAccessType.FIELD) 25 | public class Field { 26 | 27 | @XmlAttribute 28 | private String name = null; 29 | @XmlValue 30 | private String value = null; 31 | 32 | public Field() { 33 | super(); 34 | } 35 | 36 | public Field(String fieldName, String fieldValue) { 37 | super(); 38 | this.name = fieldName; 39 | this.value = fieldValue; 40 | } 41 | 42 | public String getFieldName() { 43 | return name; 44 | } 45 | 46 | public String getFieldValue() { 47 | return value; 48 | } 49 | 50 | public void setFieldName(String fieldName) { 51 | this.name = fieldName; 52 | } 53 | 54 | public void setFieldValue(String fieldValue) { 55 | this.value = fieldValue; 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/sample/Document.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.sample; 18 | 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | import javax.xml.bind.annotation.XmlAccessType; 23 | import javax.xml.bind.annotation.XmlAccessorType; 24 | import javax.xml.bind.annotation.XmlElement; 25 | 26 | @XmlAccessorType(XmlAccessType.FIELD) 27 | public class Document { 28 | 29 | @XmlElement 30 | private String id = null; 31 | 32 | @XmlElement 33 | private List field = new ArrayList(); 34 | 35 | public Document() { 36 | super(); 37 | } 38 | 39 | public Document(String id, List fields) { 40 | super(); 41 | this.id = id; 42 | this.field = fields; 43 | } 44 | 45 | public String getId() { 46 | return id; 47 | } 48 | 49 | public List getFields() { 50 | return field; 51 | } 52 | 53 | public void setId(String id) { 54 | this.id = id; 55 | } 56 | 57 | public void setFields(List fields) { 58 | this.field = fields; 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /query-war/src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 22 | 23 | 24 | resteasy.jndi.resources 25 | Query/local 26 | 27 | 28 | 29 | org.jboss.resteasy.plugins.server.servlet.ResteasyBootstrap 30 | 31 | 32 | 33 | Resteasy 34 | org.jboss.resteasy.plugins.server.servlet.HttpServletDispatcher 35 | 36 | 37 | 38 | Resteasy 39 | /rest/* 40 | 41 | 42 | 43 | resteasy.servlet.mapping.prefix 44 | /rest 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.normalizer; 18 | 19 | import java.text.Normalizer; 20 | import java.text.Normalizer.Form; 21 | import java.util.Locale; 22 | import java.util.regex.Matcher; 23 | import java.util.regex.Pattern; 24 | 25 | /** 26 | * An {@link Normalizer} which performs the following steps: 27 | *
    28 | *   1. Unicode canonical decomposition ({@link Form#NFD})
 29 | *   2. Removal of diacritical marks
 30 | *   3. Unicode canonical composition ({@link Form#NFC})
 31 | *   4. Lower-casing in the {@link Locale#ENGLISH English locale}
 32 | * 
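 * 
 * For example (a sketch): under these steps the input "Über Äpfel" becomes
 * "uber apfel".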
33 | */ 34 | public class LcNoDiacriticsNormalizer implements org.apache.accumulo.examples.wikisearch.normalizer.Normalizer { 35 | private static final Pattern diacriticals = Pattern.compile("\\p{InCombiningDiacriticalMarks}"); 36 | 37 | public String normalizeFieldValue(String fieldName, Object fieldValue) { 38 | String decomposed = Normalizer.normalize(fieldValue.toString(), Form.NFD); 39 | String noDiacriticals = removeDiacriticalMarks(decomposed); 40 | String recomposed = Normalizer.normalize(noDiacriticals, Form.NFC); 41 | return recomposed.toLowerCase(Locale.ENGLISH); 42 | } 43 | 44 | private String removeDiacriticalMarks(String str) { 45 | Matcher matcher = diacriticals.matcher(str); 46 | return matcher.replaceAll(""); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/query/IQuery.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.query; 18 | 19 | import javax.ws.rs.Consumes; 20 | import javax.ws.rs.GET; 21 | import javax.ws.rs.POST; 22 | import javax.ws.rs.Path; 23 | import javax.ws.rs.Produces; 24 | import javax.ws.rs.QueryParam; 25 | 26 | import org.apache.accumulo.examples.wikisearch.sample.Results; 27 | 28 | 29 | @Path("/Query") 30 | public interface IQuery { 31 | 32 | @GET 33 | @POST 34 | @Path("/html") 35 | @Consumes("*/*") 36 | public String html(@QueryParam("query") String query, @QueryParam("auths") String auths); 37 | 38 | @GET 39 | @POST 40 | @Path("/xml") 41 | @Consumes("*/*") 42 | @Produces("application/xml") 43 | public Results xml(@QueryParam("query") String query, @QueryParam("auths") String auths); 44 | 45 | @GET 46 | @POST 47 | @Path("/json") 48 | @Consumes("*/*") 49 | @Produces("application/json") 50 | public Results json(@QueryParam("query") String query, @QueryParam("auths") String auths); 51 | 52 | @GET 53 | @POST 54 | @Path("/yaml") 55 | @Consumes("*/*") 56 | @Produces("text/x-yaml") 57 | public Results yaml(@QueryParam("query") String query, @QueryParam("auths") String auths); 58 | 59 | @GET 60 | @POST 61 | @Path("/content") 62 | @Consumes("*/*") 63 | @Produces("application/xml") 64 | public Results content(@QueryParam("query") String query, @QueryParam("auths") String auths); 65 | 66 | } 67 | -------------------------------------------------------------------------------- /query-war/src/main/webapp/style.xsl: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | /accumulo-wikisearch/rest/Query/content?query=&auths=all 33 | 34 | 35 | 36 | 37 | _blank 38 | View Document 39 | 40 | 41 | 42 | 43 | 44 |
IdTitleTimestampCommentsDocument Link
45 | 46 |
47 |
48 | -------------------------------------------------------------------------------- /query/src/test/java/org/apache/accumulo/examples/wikisearch/logic/StandaloneStatusReporter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.logic; 18 | 19 | import org.apache.hadoop.mapreduce.Counter; 20 | import org.apache.hadoop.mapreduce.Counters; 21 | import org.apache.hadoop.mapreduce.StatusReporter; 22 | 23 | public class StandaloneStatusReporter extends StatusReporter { 24 | 25 | private Counters c = new Counters(); 26 | 27 | private long filesProcessed = 0; 28 | private long recordsProcessed = 0; 29 | 30 | public Counters getCounters() { 31 | return c; 32 | } 33 | 34 | @Override 35 | public Counter getCounter(Enum name) { 36 | return c.findCounter(name); 37 | } 38 | 39 | @Override 40 | public Counter getCounter(String group, String name) { 41 | return c.findCounter(group, name); 42 | } 43 | 44 | @Override 45 | public void progress() { 46 | // do nothing 47 | } 48 | 49 | @Override 50 | public void setStatus(String status) { 51 | // do nothing 52 | } 53 | 54 | public long getFilesProcessed() { 55 | return filesProcessed; 56 | } 57 | 58 | public long getRecordsProcessed() { 59 | return recordsProcessed; 60 | } 61 | 62 | public void incrementFilesProcessed() { 63 | filesProcessed++; 64 | recordsProcessed = 0; 65 | } 66 | 67 | public void incrementRecordsProcessed() { 68 | recordsProcessed++; 69 | } 70 | 71 | public float getProgress() { 72 | return 0; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/LRUOutputCombiner.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.ingest; 18 | 19 | import java.util.LinkedHashMap; 20 | import java.util.Map; 21 | 22 | public class LRUOutputCombiner extends LinkedHashMap { 23 | 24 | private static final long serialVersionUID = 1L; 25 | 26 | public static abstract class Fold { 27 | public abstract Value fold(Value oldValue, Value newValue); 28 | } 29 | 30 | public static abstract class Output { 31 | public abstract void output(Key key, Value value); 32 | } 33 | 34 | private final int capacity; 35 | private final Fold fold; 36 | private final Output output; 37 | 38 | private long cacheHits = 0; 39 | private long cacheMisses = 0; 40 | 41 | public LRUOutputCombiner(int capacity, Fold fold, Output output) { 42 | super(capacity + 1, 1.1f, true); 43 | this.capacity = capacity; 44 | this.fold = fold; 45 | this.output = output; 46 | } 47 | 48 | protected boolean removeEldestEntry(Map.Entry eldest) { 49 | if (size() > capacity) { 50 | output.output(eldest.getKey(), eldest.getValue()); 51 | return true; 52 | } 53 | return false; 54 | } 55 | 56 | @Override 57 | public Value put(Key key, Value value) { 58 | Value val = get(key); 59 | if (val != null) { 60 | value = fold.fold(val, value); 61 | cacheHits++; 62 | } else { 63 | cacheMisses++; 64 | } 65 | super.put(key, value); 66 | return null; 67 | } 68 | 69 | public void flush() { 70 | for (Map.Entry e : entrySet()) { 71 | output.output(e.getKey(), e.getValue()); 72 | } 73 | clear(); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/util/FieldIndexKeyParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.util; 18 | 19 | import org.apache.accumulo.core.data.Key; 20 | 21 | public class FieldIndexKeyParser extends KeyParser { 22 | 23 | public static final String DELIMITER = "\0"; 24 | 25 | @Override 26 | public void parse(Key key) { 27 | super.parse(key); 28 | 29 | String[] colFamParts = this.keyFields.get(BaseKeyParser.COLUMN_FAMILY_FIELD).split(DELIMITER); 30 | this.keyFields.put(FIELDNAME_FIELD, colFamParts.length >= 2 ? colFamParts[1] : ""); 31 | 32 | String[] colQualParts = this.keyFields.get(BaseKeyParser.COLUMN_QUALIFIER_FIELD).split(DELIMITER); 33 | this.keyFields.put(SELECTOR_FIELD, colQualParts.length >= 1 ? colQualParts[0] : ""); 34 | this.keyFields.put(DATATYPE_FIELD, colQualParts.length >= 2 ? colQualParts[1] : ""); 35 | this.keyFields.put(UID_FIELD, colQualParts.length >= 3 ? 
colQualParts[2] : ""); 36 | } 37 | 38 | @Override 39 | public BaseKeyParser duplicate() { 40 | return new FieldIndexKeyParser(); 41 | } 42 | 43 | @Override 44 | public String getSelector() { 45 | return keyFields.get(SELECTOR_FIELD); 46 | } 47 | 48 | @Override 49 | public String getDataType() { 50 | return keyFields.get(DATATYPE_FIELD); 51 | } 52 | 53 | @Override 54 | public String getFieldName() { 55 | return keyFields.get(FIELDNAME_FIELD); 56 | } 57 | 58 | @Override 59 | public String getUid() { 60 | return keyFields.get(UID_FIELD); 61 | } 62 | 63 | public String getDataTypeUid() { 64 | return getDataType() + DELIMITER + getUid(); 65 | } 66 | 67 | // An alias for getSelector 68 | public String getFieldValue() { 69 | return getSelector(); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/function/QueryFunctions.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.function; 18 | 19 | import org.apache.commons.lang.math.NumberUtils; 20 | import org.apache.log4j.Logger; 21 | 22 | public class QueryFunctions { 23 | 24 | protected static Logger log = Logger.getLogger(QueryFunctions.class); 25 | 26 | public static boolean between(String fieldValue, double left, double right) { 27 | try { 28 | Double value = Double.parseDouble(fieldValue); 29 | if (value >= left && value <= right) 30 | return true; 31 | return false; 32 | } catch (NumberFormatException nfe) { 33 | return false; 34 | } 35 | } 36 | 37 | public static boolean between(String fieldValue, long left, long right) { 38 | try { 39 | Long value = Long.parseLong(fieldValue); 40 | if (value >= left && value <= right) 41 | return true; 42 | return false; 43 | } catch (NumberFormatException nfe) { 44 | return false; 45 | } 46 | } 47 | 48 | public static Number abs(String fieldValue) { 49 | Number retval = null; 50 | try { 51 | Number value = NumberUtils.createNumber(fieldValue); 52 | if (null == value) 53 | retval = (Number) Integer.MIN_VALUE; 54 | else if (value instanceof Long) 55 | retval = Math.abs(value.longValue()); 56 | else if (value instanceof Double) 57 | retval = Math.abs(value.doubleValue()); 58 | else if (value instanceof Float) 59 | retval = Math.abs(value.floatValue()); 60 | else if (value instanceof Integer) 61 | retval = Math.abs(value.intValue()); 62 | } catch (NumberFormatException nfe) { 63 | return (Number) Integer.MIN_VALUE; 64 | } 65 | return retval; 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/util/KeyParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.util; 18 | 19 | import org.apache.accumulo.core.data.Key; 20 | 21 | public class KeyParser extends BaseKeyParser { 22 | public static final String SELECTOR_FIELD = "selector"; 23 | public static final String DATATYPE_FIELD = "dataType"; 24 | public static final String FIELDNAME_FIELD = "fieldName"; 25 | public static final String UID_FIELD = "uid"; 26 | public static final String DELIMITER = "\0"; 27 | 28 | @Override 29 | public void parse(Key key) { 30 | super.parse(key); 31 | 32 | String[] colFamParts = this.keyFields.get(BaseKeyParser.COLUMN_FAMILY_FIELD).split(DELIMITER); 33 | this.keyFields.put(FIELDNAME_FIELD, colFamParts.length >= 2 ? 
colFamParts[1] : ""); 34 | 35 | String[] colQualParts = this.keyFields.get(BaseKeyParser.COLUMN_QUALIFIER_FIELD).split(DELIMITER); 36 | this.keyFields.put(SELECTOR_FIELD, colQualParts.length >= 1 ? colQualParts[0] : ""); 37 | this.keyFields.put(DATATYPE_FIELD, colQualParts.length >= 2 ? colQualParts[1] : ""); 38 | this.keyFields.put(UID_FIELD, colQualParts.length >= 3 ? colQualParts[2] : ""); 39 | } 40 | 41 | @Override 42 | public BaseKeyParser duplicate() { 43 | return new KeyParser(); 44 | } 45 | 46 | public String getSelector() { 47 | return keyFields.get(SELECTOR_FIELD); 48 | } 49 | 50 | public String getDataType() { 51 | return keyFields.get(DATATYPE_FIELD); 52 | } 53 | 54 | public String getFieldName() { 55 | return keyFields.get(FIELDNAME_FIELD); 56 | } 57 | 58 | public String getUid() { 59 | return keyFields.get(UID_FIELD); 60 | } 61 | 62 | public String getDataTypeUid() { 63 | return getDataType() + DELIMITER + getUid(); 64 | } 65 | 66 | // An alias for getSelector 67 | public String getFieldValue() { 68 | return getSelector(); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/util/BaseKeyParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.util; 18 | 19 | import java.util.HashMap; 20 | import java.util.Map; 21 | 22 | import org.apache.accumulo.core.data.Key; 23 | 24 | public class BaseKeyParser { 25 | public static final String ROW_FIELD = "row"; 26 | public static final String COLUMN_FAMILY_FIELD = "columnFamily"; 27 | public static final String COLUMN_QUALIFIER_FIELD = "columnQualifier"; 28 | 29 | protected Map keyFields = new HashMap<>(); 30 | protected Key key = null; 31 | 32 | /** 33 | * Parses a Key object into its constituent fields. This method clears any prior values, so the 34 | * object can be reused without requiring a new instantiation. This default implementation makes 35 | * the row, columnFamily, and columnQualifier available. 
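 * 
 * A minimal usage sketch (the row/family/qualifier values are hypothetical):
 * 
 *   BaseKeyParser parser = new BaseKeyParser();
 *   parser.parse(new Key("row1", "colFam", "colQual"));
 *   parser.getRow();             // "row1"
 *   parser.getColumnFamily();    // "colFam"
 *   parser.getColumnQualifier(); // "colQual"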
36 | */ 37 | public void parse(Key key) { 38 | this.key = key; 39 | 40 | keyFields.clear(); 41 | 42 | keyFields.put(ROW_FIELD, key.getRow().toString()); 43 | keyFields.put(COLUMN_FAMILY_FIELD, key.getColumnFamily().toString()); 44 | keyFields.put(COLUMN_QUALIFIER_FIELD, key.getColumnQualifier().toString()); 45 | } 46 | 47 | public String getFieldValue(String fieldName) { 48 | return keyFields.get(fieldName); 49 | } 50 | 51 | public String[] getFieldNames() { 52 | String[] fieldNames = new String[keyFields.size()]; 53 | return keyFields.keySet().toArray(fieldNames); 54 | } 55 | 56 | public BaseKeyParser duplicate() { 57 | return new BaseKeyParser(); 58 | } 59 | 60 | public String getRow() { 61 | return keyFields.get(ROW_FIELD); 62 | } 63 | 64 | public String getColumnFamily() { 65 | return keyFields.get(COLUMN_FAMILY_FIELD); 66 | } 67 | 68 | public String getColumnQualifier() { 69 | return keyFields.get(COLUMN_QUALIFIER_FIELD); 70 | } 71 | 72 | public Key getKey() { 73 | return this.key; 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /query/src/main/resources/META-INF/ejb-jar.xml.uno: -------------------------------------------------------------------------------- 1 | 2 | 18 | 22 | 23 | 24 | Query 25 | 26 | instanceName 27 | java.lang.String 28 | uno 29 | 30 | 31 | zooKeepers 32 | java.lang.String 33 | localhost:2181 34 | 35 | 36 | username 37 | java.lang.String 38 | root 39 | 40 | 41 | password 42 | java.lang.String 43 | secret 44 | 45 | 46 | tableName 47 | java.lang.String 48 | wikipedia 49 | 50 | 51 | partitions 52 | java.lang.Integer 53 | 100 54 | 55 | 56 | threads 57 | java.lang.Integer 58 | 8 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /query/src/main/resources/META-INF/ejb-jar.xml.example: -------------------------------------------------------------------------------- 1 | 2 | 18 | 22 | 23 | 24 | Query 25 | 26 | instanceName 27 | java.lang.String 28 | 29 | 30 | 31 | zooKeepers 32 | java.lang.String 33 | 34 | 35 | 36 | username 37 | java.lang.String 38 | 39 | 40 | 41 | password 42 | java.lang.String 43 | 44 | 45 | 46 | tableName 47 | java.lang.String 48 | wiki 49 | 50 | 51 | partitions 52 | java.lang.Integer 53 | 100 54 | 55 | 56 | threads 57 | java.lang.Integer 58 | 8 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputSplitTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.ingest; 18 | 19 | import java.io.ByteArrayInputStream; 20 | import java.io.ByteArrayOutputStream; 21 | import java.io.DataInput; 22 | import java.io.IOException; 23 | import java.io.ObjectInputStream; 24 | import java.io.ObjectOutputStream; 25 | 26 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit; 27 | import org.apache.hadoop.fs.Path; 28 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 29 | import org.junit.Assert; 30 | import org.junit.Test; 31 | 32 | public class WikipediaInputSplitTest { 33 | @Test 34 | public void testSerialization() throws IOException { 35 | Path testPath = new Path("/foo/bar"); 36 | String[] hosts = new String[2]; 37 | hosts[0] = "abcd"; 38 | hosts[1] = "efgh"; 39 | FileSplit fSplit = new FileSplit(testPath, 1, 2, hosts); 40 | WikipediaInputSplit split = new WikipediaInputSplit(fSplit, 7); 41 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 42 | ObjectOutputStream out = new ObjectOutputStream(baos); 43 | split.write(out); 44 | out.close(); 45 | baos.close(); 46 | 47 | ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); 48 | DataInput in = new ObjectInputStream(bais); 49 | 50 | WikipediaInputSplit split2 = new WikipediaInputSplit(); 51 | split2.readFields(in); 52 | Assert.assertTrue(bais.available() == 0); 53 | bais.close(); 54 | 55 | Assert.assertTrue(split.getPartition() == split2.getPartition()); 56 | 57 | FileSplit fSplit2 = split2.getFileSplit(); 58 | Assert.assertTrue(fSplit.getPath().equals(fSplit2.getPath())); 59 | Assert.assertTrue(fSplit.getStart() == fSplit2.getStart()); 60 | Assert.assertTrue(fSplit.getLength() == fSplit2.getLength()); 61 | 62 | String[] hosts2 = fSplit2.getLocations(); 63 | Assert.assertEquals(hosts.length, hosts2.length); 64 | for (int i = 0; i < hosts.length; i++) { 65 | Assert.assertEquals(hosts[i], hosts2[i]); 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /ingest/conf/wikipedia_parallel.xml.example: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | wikipedia.accumulo.zookeepers 21 | 22 | 23 | 24 | wikipedia.accumulo.instance_name 25 | 26 | 27 | 28 | wikipedia.accumulo.user 29 | 30 | 31 | 32 | wikipedia.accumulo.password 33 | 34 | 35 | 36 | wikipedia.accumulo.table 37 | 38 | 39 | 40 | wikipedia.ingest.partitions 41 | 42 | 43 | 44 | wikipedia.partitioned.directory 45 | 46 | 47 | 48 | wikipedia.ingest.groups 49 | 50 | 51 | 52 | wikipedia.run.partitioner 53 | 54 | 55 | 56 | wikipedia.run.ingest 57 | 58 | 59 | 60 | wikipedia.bulk.ingest 61 | 62 | 63 | 64 | wikipedia.bulk.ingest.dir 65 | 66 | 67 | 68 | wikipedia.bulk.ingest.failure.dir 69 | 70 | 71 | 72 | wikipedia.bulk.ingest.buffer.size 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/DefaultIteratorEnvironment.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.accumulo.core.client.sample.SamplerConfiguration; 22 | import org.apache.accumulo.core.conf.AccumuloConfiguration; 23 | import org.apache.accumulo.core.conf.DefaultConfiguration; 24 | import org.apache.accumulo.core.data.Key; 25 | import org.apache.accumulo.core.data.Value; 26 | import org.apache.accumulo.core.iterators.IteratorEnvironment; 27 | import org.apache.accumulo.core.iterators.IteratorUtil.IteratorScope; 28 | import org.apache.accumulo.core.iterators.SortedKeyValueIterator; 29 | import org.apache.accumulo.core.iterators.system.MapFileIterator; 30 | import org.apache.accumulo.core.security.Authorizations; 31 | import org.apache.hadoop.conf.Configuration; 32 | import org.apache.hadoop.fs.FileSystem; 33 | 34 | public class DefaultIteratorEnvironment implements IteratorEnvironment { 35 | 36 | AccumuloConfiguration conf; 37 | 38 | public DefaultIteratorEnvironment() { 39 | this.conf = DefaultConfiguration.getInstance(); 40 | } 41 | 42 | @Override 43 | public SortedKeyValueIterator reserveMapFileReader(String mapFileName) 44 | throws IOException { 45 | Configuration conf = new Configuration(); 46 | FileSystem fs = FileSystem.get(conf); 47 | return new MapFileIterator(fs, mapFileName, conf); 48 | } 49 | 50 | @Override 51 | public AccumuloConfiguration getConfig() { 52 | return conf; 53 | } 54 | 55 | @Override 56 | public boolean isSamplingEnabled() { 57 | return false; 58 | } 59 | 60 | @Override 61 | public IteratorScope getIteratorScope() { 62 | throw new UnsupportedOperationException(); 63 | } 64 | 65 | @Override 66 | public boolean isFullMajorCompaction() { 67 | throw new UnsupportedOperationException(); 68 | } 69 | 70 | @Override 71 | public void registerSideChannel(SortedKeyValueIterator iter) { 72 | throw new UnsupportedOperationException(); 73 | } 74 | 75 | @Override 76 | public Authorizations getAuthorizations() { 77 | throw new UnsupportedOperationException(); 78 | } 79 | 80 | @Override 81 | public SamplerConfiguration getSamplerConfiguration() { 82 | throw new UnsupportedOperationException(); 83 | } 84 | 85 | @Override 86 | public IteratorEnvironment cloneWithSamplingEnabled() { 87 | throw new UnsupportedOperationException(); 88 | } 89 | 90 | @Override 91 | public boolean isUserCompaction() { 92 | return false; 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/GlobalIndexUidCombiner.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import java.io.IOException; 20 | import java.util.HashSet; 21 | import java.util.Iterator; 22 | import java.util.Map; 23 | 24 | import org.apache.accumulo.core.client.lexicoder.Encoder; 25 | import org.apache.accumulo.core.data.Key; 26 | import org.apache.accumulo.core.data.Value; 27 | import org.apache.accumulo.core.iterators.IteratorEnvironment; 28 | import org.apache.accumulo.core.iterators.SortedKeyValueIterator; 29 | import org.apache.accumulo.core.iterators.TypedValueCombiner; 30 | import org.apache.accumulo.core.iterators.ValueFormatException; 31 | import org.apache.accumulo.examples.wikisearch.protobuf.Uid; 32 | 33 | import com.google.protobuf.InvalidProtocolBufferException; 34 | 35 | /** 36 | * 37 | */ 38 | public class GlobalIndexUidCombiner extends TypedValueCombiner { 39 | public static final Encoder UID_LIST_ENCODER = new UidListEncoder(); 40 | public static final int MAX = 20; 41 | 42 | @Override 43 | public void init(SortedKeyValueIterator source, Map options, 44 | IteratorEnvironment env) throws IOException { 45 | super.init(source, options, env); 46 | setEncoder(UID_LIST_ENCODER); 47 | } 48 | 49 | @Override 50 | public Uid.List typedReduce(Key key, Iterator iter) { 51 | Uid.List.Builder builder = Uid.List.newBuilder(); 52 | HashSet uids = new HashSet<>(); 53 | boolean seenIgnore = false; 54 | long count = 0; 55 | while (iter.hasNext()) { 56 | Uid.List v = iter.next(); 57 | if (null == v) 58 | continue; 59 | count = count + v.getCOUNT(); 60 | if (v.getIGNORE()) { 61 | seenIgnore = true; 62 | } 63 | uids.addAll(v.getUIDList()); 64 | } 65 | // Special case logic 66 | // If we have aggregated more than MAX UIDs, then null out the UID list and set IGNORE to true 67 | // However, always maintain the count 68 | builder.setCOUNT(count); 69 | if (uids.size() > MAX || seenIgnore) { 70 | builder.setIGNORE(true); 71 | builder.clearUID(); 72 | } else { 73 | builder.setIGNORE(false); 74 | builder.addAllUID(uids); 75 | } 76 | return builder.build(); 77 | } 78 | 79 | public static class UidListEncoder implements Encoder { 80 | @Override 81 | public byte[] encode(Uid.List v) { 82 | return v.toByteArray(); 83 | } 84 | 85 | @Override 86 | public Uid.List decode(byte[] b) { 87 | if (b.length == 0) 88 | return null; 89 | try { 90 | return Uid.List.parseFrom(b); 91 | } catch (InvalidProtocolBufferException e) { 92 | throw new ValueFormatException("Value passed to aggregator was not of type Uid.List"); 93 | } 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitioner.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | /** 18 | * 19 | */ 20 | package org.apache.accumulo.examples.wikisearch.ingest; 21 | 22 | 23 | import java.io.ByteArrayInputStream; 24 | import java.io.IOException; 25 | import java.io.InputStreamReader; 26 | import java.nio.charset.Charset; 27 | import java.util.regex.Matcher; 28 | import java.util.regex.Pattern; 29 | 30 | import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article; 31 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit; 32 | import org.apache.hadoop.conf.Configuration; 33 | import org.apache.hadoop.io.LongWritable; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.mapreduce.Mapper; 36 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 37 | 38 | public class WikipediaPartitioner extends Mapper { 39 | 40 | // private static final Logger log = Logger.getLogger(WikipediaPartitioner.class); 41 | 42 | public final static Charset UTF8 = Charset.forName("UTF-8"); 43 | public static final String DOCUMENT_COLUMN_FAMILY = "d"; 44 | public static final String METADATA_EVENT_COLUMN_FAMILY = "e"; 45 | public static final String METADATA_INDEX_COLUMN_FAMILY = "i"; 46 | public static final String TOKENS_FIELD_NAME = "TEXT"; 47 | 48 | private final static Pattern languagePattern = Pattern.compile("([a-z_]+).*.xml(.bz2)?"); 49 | 50 | private ArticleExtractor extractor; 51 | private String language; 52 | 53 | private int myGroup = -1; 54 | private int numGroups = -1; 55 | 56 | @Override 57 | public void setup(Context context) { 58 | Configuration conf = context.getConfiguration(); 59 | 60 | WikipediaInputSplit wiSplit = (WikipediaInputSplit)context.getInputSplit(); 61 | myGroup = wiSplit.getPartition(); 62 | numGroups = WikipediaConfiguration.getNumGroups(conf); 63 | 64 | FileSplit split = wiSplit.getFileSplit(); 65 | String fileName = split.getPath().getName(); 66 | Matcher matcher = languagePattern.matcher(fileName); 67 | if (matcher.matches()) { 68 | language = matcher.group(1).replace('_', '-').toLowerCase(); 69 | } else { 70 | throw new RuntimeException("Unknown ingest language! 
" + fileName); 71 | } 72 | extractor = new ArticleExtractor(); 73 | } 74 | 75 | @Override 76 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 77 | Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes()), UTF8)); 78 | if (article != null) { 79 | int groupId = WikipediaMapper.getPartitionId(article, numGroups); 80 | if(groupId != myGroup) 81 | return; 82 | context.write(new Text(language), article); 83 | } else { 84 | context.getCounter("wikipedia", "invalid articles").increment(1); 85 | context.progress(); 86 | } 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/util/TextUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.util; 18 | 19 | import java.nio.ByteBuffer; 20 | import java.nio.charset.CharacterCodingException; 21 | 22 | import org.apache.accumulo.core.iterators.user.SummingCombiner; 23 | import org.apache.hadoop.io.Text; 24 | 25 | public class TextUtil { 26 | 27 | /** 28 | * Appends a null byte followed by the UTF-8 bytes of the given string to the given {@link Text} 29 | * 30 | * @param text 31 | * the Text to which to append 32 | * @param string 33 | * the String to append 34 | */ 35 | public static void textAppend(Text text, String string) { 36 | appendNullByte(text); 37 | textAppendNoNull(text, string); 38 | } 39 | 40 | public static void textAppend(Text text, String string, boolean replaceBadChar) { 41 | appendNullByte(text); 42 | textAppendNoNull(text, string, replaceBadChar); 43 | } 44 | 45 | public static void textAppend(Text t, long s) { 46 | t.append(nullByte, 0, 1); 47 | t.append(SummingCombiner.FIXED_LEN_ENCODER.encode(s), 0, 8); 48 | } 49 | 50 | private static final byte[] nullByte = {0}; 51 | 52 | /** 53 | * Appends a null byte to the given text 54 | * 55 | * @param text 56 | * the text to which to append the null byte 57 | */ 58 | public static void appendNullByte(Text text) { 59 | text.append(nullByte, 0, nullByte.length); 60 | } 61 | 62 | /** 63 | * Appends the UTF-8 bytes of the given string to the given {@link Text} 64 | * 65 | * @param t 66 | * the Text to which to append 67 | * @param s 68 | * the String to append 69 | */ 70 | public static void textAppendNoNull(Text t, String s) { 71 | textAppendNoNull(t, s, false); 72 | } 73 | 74 | /** 75 | * Appends the UTF-8 bytes of the given string to the given {@link Text} 76 | */ 77 | public static void textAppendNoNull(Text t, String s, boolean replaceBadChar) { 78 | try { 
79 | ByteBuffer buffer = Text.encode(s, replaceBadChar); 80 | t.append(buffer.array(), 0, buffer.limit()); 81 | } catch (CharacterCodingException cce) { 82 | throw new IllegalArgumentException(cce); 83 | } 84 | } 85 | 86 | /** 87 | * Converts the given string its UTF-8 bytes. This uses Hadoop's method for converting string to 88 | * UTF-8 and is much faster than calling {@link String#getBytes(String)}. 89 | * 90 | * @param string 91 | * the string to convert 92 | * @return the UTF-8 representation of the string 93 | */ 94 | public static byte[] toUtf8(String string) { 95 | ByteBuffer buffer; 96 | try { 97 | buffer = Text.encode(string, false); 98 | } catch (CharacterCodingException cce) { 99 | throw new IllegalArgumentException(cce); 100 | } 101 | byte[] bytes = new byte[buffer.limit()]; 102 | System.arraycopy(buffer.array(), 0, bytes, 0, bytes.length); 103 | return bytes; 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TextIndexCombiner.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import java.io.IOException; 20 | import java.util.ArrayList; 21 | import java.util.Collections; 22 | import java.util.Iterator; 23 | import java.util.List; 24 | import java.util.Map; 25 | 26 | import org.apache.accumulo.core.client.lexicoder.Encoder; 27 | import org.apache.accumulo.core.data.Key; 28 | import org.apache.accumulo.core.data.Value; 29 | import org.apache.accumulo.core.iterators.IteratorEnvironment; 30 | import org.apache.accumulo.core.iterators.SortedKeyValueIterator; 31 | import org.apache.accumulo.core.iterators.TypedValueCombiner; 32 | import org.apache.accumulo.core.iterators.ValueFormatException; 33 | import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight; 34 | 35 | import com.google.protobuf.InvalidProtocolBufferException; 36 | 37 | /** 38 | * 39 | */ 40 | public class TextIndexCombiner extends TypedValueCombiner { 41 | public static final Encoder TERMWEIGHT_INFO_ENCODER = 42 | new TermWeightInfoEncoder(); 43 | 44 | @Override 45 | public TermWeight.Info typedReduce(Key key, Iterator iter) { 46 | TermWeight.Info.Builder builder = TermWeight.Info.newBuilder(); 47 | List offsets = new ArrayList<>(); 48 | float normalizedTermFrequency = 0f; 49 | 50 | while (iter.hasNext()) { 51 | TermWeight.Info info = iter.next(); 52 | if (null == info) 53 | continue; 54 | 55 | // Add each offset into the list maintaining sorted order 56 | for (int offset : info.getWordOffsetList()) { 57 | int pos = Collections.binarySearch(offsets, offset); 58 | 59 | if (pos < 0) { 60 | // Undo the transform on the insertion point 61 | offsets.add((-1 * pos) - 1, offset); 62 | } else { 63 | offsets.add(pos, offset); 64 | } 65 | } 66 | 67 | if (info.getNormalizedTermFrequency() > 0) { 68 | normalizedTermFrequency += info.getNormalizedTermFrequency(); 69 | } 70 | } 71 | 72 | // Keep the sorted order we tried to maintain 73 | for (Integer offset : offsets) { 74 | builder.addWordOffset(offset); 75 | } 76 | 77 | builder.setNormalizedTermFrequency(normalizedTermFrequency); 78 | return builder.build(); 79 | } 80 | 81 | @Override 82 | public void init(SortedKeyValueIterator source, Map options, 83 | IteratorEnvironment env) throws IOException { 84 | super.init(source, options, env); 85 | setEncoder(TERMWEIGHT_INFO_ENCODER); 86 | } 87 | 88 | public static class TermWeightInfoEncoder implements Encoder { 89 | @Override 90 | public byte[] encode(TermWeight.Info v) { 91 | return v.toByteArray(); 92 | } 93 | 94 | @Override 95 | public TermWeight.Info decode(byte[] b) { 96 | if (b.length == 0) 97 | return null; 98 | try { 99 | return TermWeight.Info.parseFrom(b); 100 | } catch (InvalidProtocolBufferException e) { 101 | throw new ValueFormatException( 102 | "Value passed to aggregator was not of type TermWeight.Info"); 103 | } 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /ingest/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | org.apache.accumulo 22 | accumulo-wikisearch 23 | 2.0.0-SNAPSHOT 24 | 25 | wikisearch-ingest 26 | wikisearch-ingest 27 | 28 | 29 | com.google.guava 30 | guava 31 | 32 | 33 | com.google.protobuf 34 | protobuf-java 35 | 36 | 37 | commons-codec 38 | commons-codec 39 | 40 | 41 | commons-lang 42 | commons-lang 43 | 44 | 45 | log4j 46 | log4j 47 | 48 | 49 | org.apache.accumulo 50 | accumulo-core 51 | 52 | 53 | commons-digester 54 | commons-digester 55 | 56 | 57 | 58 | 59 | 
org.apache.lucene 60 | lucene-analyzers-common 61 | 62 | 63 | org.apache.zookeeper 64 | zookeeper 65 | runtime 66 | 67 | 68 | junit 69 | junit 70 | test 71 | 72 | 73 | 74 | 75 | 76 | org.apache.maven.plugins 77 | maven-dependency-plugin 78 | 79 | 80 | copy-dependencies 81 | 82 | copy-dependencies 83 | 84 | prepare-package 85 | 86 | lib 87 | 88 | 89 | commons-lang,guava,lucene-core,lucene-analyzers,lucene-wikipedia,protobuf-java,accumulo-core,hadoop-core,libthrift,zookeeper,commons-codec,accumulo-fate,accumulo-trace 90 | false 91 | 92 | 93 | 94 | 95 | 96 | maven-assembly-plugin 97 | 98 | 99 | src/assembly/dist.xml 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/SortingRFileOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.output; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.accumulo.core.conf.AccumuloConfiguration; 22 | import org.apache.accumulo.core.data.Mutation; 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.io.Text; 26 | import org.apache.hadoop.mapreduce.JobContext; 27 | import org.apache.hadoop.mapreduce.OutputCommitter; 28 | import org.apache.hadoop.mapreduce.OutputFormat; 29 | import org.apache.hadoop.mapreduce.RecordWriter; 30 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 31 | 32 | public class SortingRFileOutputFormat extends OutputFormat { 33 | 34 | // private static final Logger log = Logger.getLogger(SortingRFileOutputFormat.class); 35 | 36 | public static final String PATH_NAME = "sortingrfileoutputformat.path"; 37 | public static final String MAX_BUFFER_SIZE = "sortingrfileoutputformat.max.buffer.size"; 38 | 39 | public static void setPathName(Configuration conf, String path) { 40 | conf.set(PATH_NAME, path); 41 | } 42 | 43 | public static String getPathName(Configuration conf) { 44 | return conf.get(PATH_NAME); 45 | } 46 | 47 | public static void setMaxBufferSize(Configuration conf, long maxBufferSize) { 48 | conf.setLong(MAX_BUFFER_SIZE, maxBufferSize); 49 | } 50 | 51 | public static long getMaxBufferSize(Configuration conf) { 52 | return conf.getLong(MAX_BUFFER_SIZE, -1); 53 | } 54 | 55 | @Override 56 | public void checkOutputSpecs(JobContext job) throws IOException, InterruptedException { 57 | // TODO make sure the path is writable? 
58 | // TODO make sure the max buffer size is set and is reasonable 59 | } 60 | 61 | @Override 62 | public OutputCommitter getOutputCommitter(TaskAttemptContext arg0) throws IOException, InterruptedException { 63 | return new OutputCommitter() { 64 | 65 | @Override 66 | public void setupTask(TaskAttemptContext arg0) throws IOException { 67 | // TODO Auto-generated method stub 68 | 69 | } 70 | 71 | @Override 72 | public void setupJob(JobContext arg0) throws IOException { 73 | // TODO Auto-generated method stub 74 | 75 | } 76 | 77 | @Override 78 | public boolean needsTaskCommit(TaskAttemptContext arg0) throws IOException { 79 | // TODO Auto-generated method stub 80 | return false; 81 | } 82 | 83 | @Override 84 | public void commitTask(TaskAttemptContext arg0) throws IOException { 85 | // TODO Auto-generated method stub 86 | 87 | } 88 | 89 | @Override 90 | public void cleanupJob(JobContext arg0) throws IOException { 91 | // TODO Auto-generated method stub 92 | 93 | } 94 | 95 | @Override 96 | public void abortTask(TaskAttemptContext arg0) throws IOException { 97 | // TODO Auto-generated method stub 98 | 99 | } 100 | }; 101 | } 102 | 103 | @Override 104 | public RecordWriter getRecordWriter(TaskAttemptContext attempt) throws IOException, InterruptedException { 105 | 106 | // grab the configuration 107 | final Configuration conf = attempt.getConfiguration(); 108 | // grab the max size 109 | final long maxSize = getMaxBufferSize(conf); 110 | 111 | return new BufferingRFileRecordWriter(maxSize, conf); 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/parser/JexlOperatorConstants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.parser; 18 | 19 | import java.util.Map; 20 | import java.util.concurrent.ConcurrentHashMap; 21 | import org.apache.commons.jexl2.parser.ASTAndNode; 22 | 23 | import org.apache.commons.jexl2.parser.ASTEQNode; 24 | import org.apache.commons.jexl2.parser.ASTERNode; 25 | import org.apache.commons.jexl2.parser.ASTFunctionNode; 26 | import org.apache.commons.jexl2.parser.ASTGENode; 27 | import org.apache.commons.jexl2.parser.ASTGTNode; 28 | import org.apache.commons.jexl2.parser.ASTLENode; 29 | import org.apache.commons.jexl2.parser.ASTLTNode; 30 | import org.apache.commons.jexl2.parser.ASTNENode; 31 | import org.apache.commons.jexl2.parser.ASTNRNode; 32 | import org.apache.commons.jexl2.parser.ASTOrNode; 33 | import org.apache.commons.jexl2.parser.JexlNode; 34 | import org.apache.commons.jexl2.parser.ParserTreeConstants; 35 | 36 | public class JexlOperatorConstants implements ParserTreeConstants { 37 | 38 | private static Map,String> operatorMap = new ConcurrentHashMap,String>(); 39 | private static Map> classMap = new ConcurrentHashMap>(); 40 | private static Map jjtOperatorMap = new ConcurrentHashMap(); 41 | private static Map jjtTypeMap = new ConcurrentHashMap(); 42 | 43 | static { 44 | operatorMap.put(ASTEQNode.class, "=="); 45 | operatorMap.put(ASTNENode.class, "!="); 46 | operatorMap.put(ASTLTNode.class, "<"); 47 | operatorMap.put(ASTLENode.class, "<="); 48 | operatorMap.put(ASTGTNode.class, ">"); 49 | operatorMap.put(ASTGENode.class, ">="); 50 | operatorMap.put(ASTERNode.class, "=~"); 51 | operatorMap.put(ASTNRNode.class, "!~"); 52 | operatorMap.put(ASTFunctionNode.class, "f"); 53 | operatorMap.put(ASTAndNode.class, "and"); 54 | operatorMap.put(ASTOrNode.class, "or"); 55 | 56 | classMap.put("==", ASTEQNode.class); 57 | classMap.put("!=", ASTNENode.class); 58 | classMap.put("<", ASTLTNode.class); 59 | classMap.put("<=", ASTLENode.class); 60 | classMap.put(">", ASTGTNode.class); 61 | classMap.put(">=", ASTGENode.class); 62 | classMap.put("=~", ASTERNode.class); 63 | classMap.put("!~", ASTNRNode.class); 64 | classMap.put("f", ASTFunctionNode.class); 65 | 66 | jjtOperatorMap.put(JJTEQNODE, "=="); 67 | jjtOperatorMap.put(JJTNENODE, "!="); 68 | jjtOperatorMap.put(JJTLTNODE, "<"); 69 | jjtOperatorMap.put(JJTLENODE, "<="); 70 | jjtOperatorMap.put(JJTGTNODE, ">"); 71 | jjtOperatorMap.put(JJTGENODE, ">="); 72 | jjtOperatorMap.put(JJTERNODE, "=~"); 73 | jjtOperatorMap.put(JJTNRNODE, "!~"); 74 | jjtOperatorMap.put(JJTFUNCTIONNODE, "f"); 75 | jjtOperatorMap.put(JJTANDNODE, "and"); 76 | jjtOperatorMap.put(JJTORNODE, "or"); 77 | 78 | jjtTypeMap.put("==", JJTEQNODE); 79 | jjtTypeMap.put("!=", JJTNENODE); 80 | jjtTypeMap.put("<", JJTLTNODE); 81 | jjtTypeMap.put("<=", JJTLENODE); 82 | jjtTypeMap.put(">", JJTGTNODE); 83 | jjtTypeMap.put(">=", JJTGENODE); 84 | jjtTypeMap.put("=~", JJTERNODE); 85 | jjtTypeMap.put("!~", JJTNRNODE); 86 | jjtTypeMap.put("f", JJTFUNCTIONNODE); 87 | 88 | } 89 | 90 | public static String getOperator(Class nodeType) { 91 | return operatorMap.get(nodeType); 92 | } 93 | 94 | public static String getOperator(Integer jjtNode) { 95 | return jjtOperatorMap.get(jjtNode); 96 | } 97 | 98 | public static Class getClass(String operator) { 99 | return classMap.get(operator); 100 | } 101 | 102 | public static int getJJTNodeType(String operator) { 103 | return jjtTypeMap.get(operator); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- 
/query/src/main/java/org/apache/accumulo/examples/wikisearch/logic/ContentLogic.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.logic; 18 | 19 | import java.util.List; 20 | import java.util.Map.Entry; 21 | import java.util.regex.Matcher; 22 | import java.util.regex.Pattern; 23 | 24 | import org.apache.accumulo.core.client.Connector; 25 | import org.apache.accumulo.core.client.Scanner; 26 | import org.apache.accumulo.core.client.TableNotFoundException; 27 | import org.apache.accumulo.core.data.Key; 28 | import org.apache.accumulo.core.data.Range; 29 | import org.apache.accumulo.core.data.Value; 30 | import org.apache.accumulo.core.security.Authorizations; 31 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaMapper; 32 | import org.apache.accumulo.examples.wikisearch.sample.Document; 33 | import org.apache.accumulo.examples.wikisearch.sample.Field; 34 | import org.apache.accumulo.examples.wikisearch.sample.Results; 35 | import org.apache.commons.codec.binary.Base64; 36 | import org.apache.commons.lang.StringUtils; 37 | import org.apache.log4j.Logger; 38 | 39 | 40 | /** 41 | * This query table implementation returns a Results object that contains documents from the wiki table. The query will contain the partition id, wikitype, and 42 | * UID so that we can seek directly to the document. The document is stored as base64 compressed binary in the Accumulo table. We will decompress the data so 43 | * that it is base64 encoded binary data in the Results object. 44 | * 45 | * The query that needs to be passed to the web service is: DOCUMENT:partitionId/wikitype/uid. 
46 | * 47 | */ 48 | public class ContentLogic { 49 | 50 | private static final Logger log = Logger.getLogger(ContentLogic.class); 51 | 52 | private static final String NULL_BYTE = "\u0000"; 53 | 54 | private String tableName = null; 55 | 56 | private Pattern queryPattern = Pattern.compile("^DOCUMENT:(.*)/(.*)/(.*)$"); 57 | 58 | public String getTableName() { 59 | return tableName; 60 | } 61 | 62 | public void setTableName(String tableName) { 63 | this.tableName = tableName; 64 | } 65 | 66 | public Results runQuery(Connector connector, String query, List authorizations) { 67 | 68 | Results results = new Results(); 69 | Authorizations auths = new Authorizations(StringUtils.join(authorizations, "|")); 70 | 71 | Matcher match = queryPattern.matcher(query); 72 | if (!match.matches()) { 73 | throw new IllegalArgumentException("Query does not match the pattern: DOCUMENT:partitionId/wikitype/uid, your query: " + query.toString()); 74 | } else { 75 | String partitionId = match.group(1); 76 | String wikitype = match.group(2); 77 | String id = match.group(3); 78 | 79 | log.debug("Received pieces: " + partitionId + ", " + wikitype + ", " + id); 80 | 81 | // Create the Range 82 | Key startKey = new Key(partitionId, WikipediaMapper.DOCUMENT_COLUMN_FAMILY, wikitype + NULL_BYTE + id); 83 | Key endKey = new Key(partitionId, WikipediaMapper.DOCUMENT_COLUMN_FAMILY, wikitype + NULL_BYTE + id + NULL_BYTE); 84 | Range r = new Range(startKey, true, endKey, false); 85 | 86 | log.debug("Setting range: " + r); 87 | 88 | try { 89 | Scanner scanner = connector.createScanner(this.getTableName(), auths); 90 | scanner.setRange(r); 91 | // This should in theory only match one thing. 92 | for (Entry entry : scanner) { 93 | Document doc = new Document(); 94 | doc.setId(id); 95 | Field val = new Field(); 96 | val.setFieldName("DOCUMENT"); 97 | val.setFieldValue(new String(Base64.decodeBase64(entry.getValue().toString()))); 98 | doc.getFields().add(val); 99 | results.getResults().add(doc); 100 | } 101 | } catch (TableNotFoundException e) { 102 | throw new RuntimeException("Table not found: " + this.getTableName(), e); 103 | } 104 | 105 | } 106 | return results; 107 | } 108 | 109 | } 110 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.ingest; 18 | 19 | import java.io.DataInput; 20 | import java.io.DataOutput; 21 | import java.io.IOException; 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | 25 | import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.io.LongWritable; 28 | import org.apache.hadoop.io.Text; 29 | import org.apache.hadoop.io.Writable; 30 | import org.apache.hadoop.mapreduce.InputSplit; 31 | import org.apache.hadoop.mapreduce.JobContext; 32 | import org.apache.hadoop.mapreduce.RecordReader; 33 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 34 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 35 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 36 | 37 | 38 | public class WikipediaInputFormat extends TextInputFormat { 39 | 40 | public static class WikipediaInputSplit extends InputSplit implements Writable { 41 | 42 | public WikipediaInputSplit(){} 43 | 44 | public WikipediaInputSplit(FileSplit fileSplit, int partition) 45 | { 46 | this.fileSplit = fileSplit; 47 | this.partition = partition; 48 | } 49 | 50 | private FileSplit fileSplit = null; 51 | private int partition = -1; 52 | 53 | public int getPartition() 54 | { 55 | return partition; 56 | } 57 | 58 | public FileSplit getFileSplit() 59 | { 60 | return fileSplit; 61 | } 62 | 63 | @Override 64 | public long getLength() throws IOException, InterruptedException { 65 | return fileSplit.getLength(); 66 | } 67 | 68 | @Override 69 | public String[] getLocations() throws IOException, InterruptedException { 70 | // for highly replicated files, returning all of the locations can lead to bunching 71 | // TODO replace this with a subset of the locations 72 | return fileSplit.getLocations(); 73 | } 74 | 75 | @Override 76 | public void readFields(DataInput in) throws IOException { 77 | Path file = new Path(in.readUTF()); 78 | long start = in.readLong(); 79 | long length = in.readLong(); 80 | String [] hosts = null; 81 | if(in.readBoolean()) 82 | { 83 | int numHosts = in.readInt(); 84 | hosts = new String[numHosts]; 85 | for(int i = 0; i < numHosts; i++) 86 | hosts[i] = in.readUTF(); 87 | } 88 | fileSplit = new FileSplit(file, start, length, hosts); 89 | partition = in.readInt(); 90 | } 91 | 92 | @Override 93 | public void write(DataOutput out) throws IOException { 94 | out.writeUTF(fileSplit.getPath().toString()); 95 | out.writeLong(fileSplit.getStart()); 96 | out.writeLong(fileSplit.getLength()); 97 | String [] hosts = fileSplit.getLocations(); 98 | if(hosts == null) 99 | { 100 | out.writeBoolean(false); 101 | } 102 | else 103 | { 104 | out.writeBoolean(true); 105 | out.writeInt(hosts.length); 106 | for(String host:hosts) 107 | out.writeUTF(host); 108 | } 109 | out.writeInt(partition); 110 | } 111 | 112 | } 113 | 114 | @Override 115 | public List getSplits(JobContext job) throws IOException { 116 | List superSplits = super.getSplits(job); 117 | List splits = new ArrayList(); 118 | 119 | int numGroups = WikipediaConfiguration.getNumGroups(job.getConfiguration()); 120 | 121 | for(int group = 0; group < numGroups; group++) 122 | { 123 | for(InputSplit split:superSplits) 124 | { 125 | FileSplit fileSplit = (FileSplit)split; 126 | splits.add(new WikipediaInputSplit(fileSplit,group)); 127 | } 128 | } 129 | return splits; 130 | } 131 | 132 | @Override 133 | public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) { 134 | return new 
AggregatingRecordReader(); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/EvaluatingIterator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import java.io.IOException; 20 | import java.util.Collection; 21 | import java.util.Collections; 22 | 23 | import org.apache.accumulo.core.data.ByteSequence; 24 | import org.apache.accumulo.core.data.Key; 25 | import org.apache.accumulo.core.data.PartialKey; 26 | import org.apache.accumulo.core.data.Range; 27 | import org.apache.accumulo.core.data.Value; 28 | import org.apache.accumulo.core.iterators.IteratorEnvironment; 29 | import org.apache.accumulo.core.iterators.SortedKeyValueIterator; 30 | import org.apache.accumulo.core.security.ColumnVisibility; 31 | import org.apache.accumulo.examples.wikisearch.parser.EventFields; 32 | import org.apache.accumulo.examples.wikisearch.parser.EventFields.FieldValue; 33 | import org.apache.commons.collections.map.LRUMap; 34 | import org.apache.hadoop.io.Text; 35 | 36 | public class EvaluatingIterator extends AbstractEvaluatingIterator { 37 | 38 | public static final String NULL_BYTE_STRING = "\u0000"; 39 | LRUMap visibilityMap = new LRUMap(); 40 | 41 | public EvaluatingIterator() { 42 | super(); 43 | } 44 | 45 | public EvaluatingIterator(AbstractEvaluatingIterator other, IteratorEnvironment env) { 46 | super(other, env); 47 | } 48 | 49 | @Override 50 | public SortedKeyValueIterator deepCopy(IteratorEnvironment env) { 51 | return new EvaluatingIterator(this, env); 52 | } 53 | 54 | @Override 55 | public PartialKey getKeyComparator() { 56 | return PartialKey.ROW_COLFAM; 57 | } 58 | 59 | @Override 60 | public Key getReturnKey(Key k) { 61 | // If we were using column visibility, then we would get the merged visibility here and use it 62 | // in the key. 63 | // Remove the COLQ from the key and use the combined visibility 64 | Key r = new Key(k.getRowData().getBackingArray(), k.getColumnFamilyData().getBackingArray(), 65 | NULL_BYTE, k.getColumnVisibility().getBytes(), k.getTimestamp(), k.isDeleted(), false); 66 | return r; 67 | } 68 | 69 | @Override 70 | public void fillMap(EventFields event, Key key, Value value) { 71 | // If we were using column visibility, we would have to merge them here. 72 | 73 | // Pull the datatype from the colf in case we need to do anything datatype specific. 
74 | // String colf = key.getColumnFamily().toString(); 75 | // String datatype = colf.substring(0, colf.indexOf(NULL_BYTE_STRING)); 76 | 77 | // For the partitioned table, the field name and field value are stored in the column qualifier 78 | // separated by a \0. 79 | String colq = key.getColumnQualifier().toString();// .toLowerCase(); 80 | int idx = colq.indexOf(NULL_BYTE_STRING); 81 | String fieldName = colq.substring(0, idx); 82 | String fieldValue = colq.substring(idx + 1); 83 | 84 | event.put(fieldName, new FieldValue(getColumnVisibility(key), fieldValue.getBytes())); 85 | } 86 | 87 | /** 88 | * @return The column visibility 89 | */ 90 | public ColumnVisibility getColumnVisibility(Key key) { 91 | ColumnVisibility result = (ColumnVisibility) visibilityMap.get(key.getColumnVisibility()); 92 | if (result != null) { 93 | return result; 94 | } 95 | result = new ColumnVisibility(key.getColumnVisibility().getBytes()); 96 | visibilityMap.put(key.getColumnVisibility(), result); 97 | return result; 98 | } 99 | 100 | /** 101 | * Don't accept this key if the colf starts with 'fi' 102 | */ 103 | @Override 104 | public boolean isKeyAccepted(Key key) throws IOException { 105 | if (key.getColumnFamily().toString().startsWith("fi")) { 106 | Key copy = new Key(key.getRow(), new Text("fi\01")); 107 | Collection columnFamilies = Collections.emptyList(); 108 | this.iterator.seek(new Range(copy, copy), columnFamilies, true); 109 | if (this.iterator.hasTop()) { 110 | return isKeyAccepted(this.iterator.getTopKey()); 111 | } 112 | return true; 113 | } 114 | return true; 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/jexl/Arithmetic.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.jexl; 18 | 19 | import java.util.regex.Matcher; 20 | import java.util.regex.Pattern; 21 | 22 | import org.apache.commons.jexl2.JexlArithmetic; 23 | import org.apache.commons.lang.math.NumberUtils; 24 | 25 | public class Arithmetic extends JexlArithmetic { 26 | 27 | public Arithmetic(boolean lenient) { 28 | super(lenient); 29 | } 30 | 31 | /** 32 | * This method differs from the parent in that we are not calling String.matches() because it does not match on a newline. Instead we are handling this case. 33 | * 34 | * @param left 35 | * first value 36 | * @param right 37 | * second value 38 | * @return test result. 
39 | */ 40 | @Override 41 | public boolean matches(Object left, Object right) { 42 | if (left == null && right == null) { 43 | // if both are null L == R 44 | return true; 45 | } 46 | if (left == null || right == null) { 47 | // we know both aren't null, therefore L != R 48 | return false; 49 | } 50 | final String arg = left.toString(); 51 | if (right instanceof java.util.regex.Pattern) { 52 | return ((java.util.regex.Pattern) right).matcher(arg).matches(); 53 | } else { 54 | // return arg.matches(right.toString()); 55 | Pattern p = Pattern.compile(right.toString(), Pattern.DOTALL); 56 | Matcher m = p.matcher(arg); 57 | return m.matches(); 58 | 59 | } 60 | } 61 | 62 | /** 63 | * This method differs from the parent class in that we are going to try and do a better job of coercing the types. As a last resort we will do a string 64 | * comparison and try not to throw a NumberFormatException. The JexlArithmetic class performs coercion to a particular type if either the left or the right 65 | * match a known type. We will look at the type of the right operator and try to make the left of the same type. 66 | */ 67 | @Override 68 | public boolean equals(Object left, Object right) { 69 | Object fixedLeft = fixLeft(left, right); 70 | return super.equals(fixedLeft, right); 71 | } 72 | 73 | @Override 74 | public boolean lessThan(Object left, Object right) { 75 | Object fixedLeft = fixLeft(left, right); 76 | return super.lessThan(fixedLeft, right); 77 | } 78 | 79 | protected Object fixLeft(Object left, Object right) { 80 | 81 | if (null == left || null == right) 82 | return left; 83 | 84 | if (!(right instanceof Number) && left instanceof Number) { 85 | right = NumberUtils.createNumber(right.toString()); 86 | } 87 | 88 | if (right instanceof Number && left instanceof Number) { 89 | if (right instanceof Double) 90 | return ((Double) right).doubleValue(); 91 | else if (right instanceof Float) 92 | return ((Float) right).floatValue(); 93 | else if (right instanceof Long) 94 | return ((Long) right).longValue(); 95 | else if (right instanceof Integer) 96 | return ((Integer) right).intValue(); 97 | else if (right instanceof Short) 98 | return ((Short) right).shortValue(); 99 | else if (right instanceof Byte) 100 | return ((Byte) right).byteValue(); 101 | else 102 | return right; 103 | } 104 | if (right instanceof Number && left instanceof String) { 105 | Number num = NumberUtils.createNumber(left.toString()); 106 | // Let's try to cast left as right's type. 107 | if (this.isFloatingPointNumber(right) && this.isFloatingPointNumber(left)) 108 | return num; 109 | else if (this.isFloatingPointNumber(right)) 110 | return num.doubleValue(); 111 | else if (right instanceof Number) 112 | return num.longValue(); 113 | } else if (right instanceof Boolean && left instanceof String) { 114 | if (left.equals("true") || left.equals("false")) 115 | return Boolean.parseBoolean(left.toString()); 116 | 117 | Number num = NumberUtils.createNumber(left.toString()); 118 | if (num.intValue() == 1) 119 | return (Boolean) true; 120 | else if (num.intValue() == 0) 121 | return (Boolean) false; 122 | } 123 | return left; 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LongLineRecordReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.reader; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.fs.FSDataInputStream; 23 | import org.apache.hadoop.fs.FileSystem; 24 | import org.apache.hadoop.fs.Path; 25 | import org.apache.hadoop.io.LongWritable; 26 | import org.apache.hadoop.io.Text; 27 | import org.apache.hadoop.io.compress.CompressionCodec; 28 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 29 | import org.apache.hadoop.mapreduce.InputSplit; 30 | import org.apache.hadoop.mapreduce.RecordReader; 31 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 32 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 33 | import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; 34 | import org.apache.hadoop.util.LineReader; 35 | 36 | /** 37 | * A copy of {@link LineRecordReader} which does not discard lines longer than "mapred.linerecordreader.maxlength". Instead, it returns them, leaving it to the 38 | * mapper to decide what to do with it. It also does not treat '\r' (CR) characters as new lines -- it uses {@link LfLineReader} instead of {@link LineReader} 39 | * to read lines. 40 | */ 41 | public class LongLineRecordReader extends RecordReader { 42 | private CompressionCodecFactory compressionCodecs = null; 43 | private long start; 44 | private long pos; 45 | private long end; 46 | private LfLineReader in; 47 | private int maxLineLength; 48 | private LongWritable key = null; 49 | private Text value = null; 50 | 51 | @Override 52 | public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { 53 | FileSplit split = (FileSplit) genericSplit; 54 | Configuration job = context.getConfiguration(); 55 | this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); 56 | start = split.getStart(); 57 | end = start + split.getLength(); 58 | final Path file = split.getPath(); 59 | compressionCodecs = new CompressionCodecFactory(job); 60 | final CompressionCodec codec = compressionCodecs.getCodec(file); 61 | 62 | // open the file and seek to the start of the split 63 | FileSystem fs = file.getFileSystem(job); 64 | FSDataInputStream fileIn = fs.open(split.getPath()); 65 | boolean skipFirstLine = false; 66 | if (codec != null) { 67 | in = new LfLineReader(codec.createInputStream(fileIn), job); 68 | end = Long.MAX_VALUE; 69 | } else { 70 | if (start != 0) { 71 | skipFirstLine = true; 72 | --start; 73 | fileIn.seek(start); 74 | } 75 | in = new LfLineReader(fileIn, job); 76 | } 77 | if (skipFirstLine) { // skip first line and re-establish "start". 
78 | start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start)); 79 | } 80 | this.pos = start; 81 | } 82 | 83 | @Override 84 | public boolean nextKeyValue() throws IOException { 85 | if (key == null) { 86 | key = new LongWritable(); 87 | } 88 | key.set(pos); 89 | if (value == null) { 90 | value = new Text(); 91 | } 92 | int newSize = 0; 93 | if (pos < end) { 94 | newSize = in.readLine(value, maxLineLength, Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength)); 95 | if (newSize != 0) { 96 | pos += newSize; 97 | } 98 | } 99 | if (newSize == 0) { 100 | key = null; 101 | value = null; 102 | return false; 103 | } else { 104 | return true; 105 | } 106 | } 107 | 108 | @Override 109 | public LongWritable getCurrentKey() { 110 | return key; 111 | } 112 | 113 | @Override 114 | public Text getCurrentValue() { 115 | return value; 116 | } 117 | 118 | /** 119 | * Get the progress within the split 120 | */ 121 | @Override 122 | public float getProgress() { 123 | if (start == end) { 124 | return 0.0f; 125 | } else { 126 | return Math.min(1.0f, (pos - start) / (float) (end - start)); 127 | } 128 | } 129 | 130 | @Override 131 | public synchronized void close() throws IOException { 132 | if (in != null) { 133 | in.close(); 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | 17 | # Wikisearch Installation 18 | 19 | Instructions for installing and running the Accumulo Wikisearch example. 20 | 21 | ## Ingest 22 | 23 | ### Prerequisites 24 | 25 | 1. Accumulo, Hadoop, and ZooKeeper must be installed and running 26 | 1. Download one or more [wikipedia dump files][dump-files] and put them in an HDFS directory. 27 | You will want to grab the files with the link name of pages-articles.xml.bz2. Though not strictly 28 | required, the ingest will go more quickly if the files are decompressed: 29 | 30 | $ bunzip2 enwiki-*-pages-articles.xml.bz2 31 | $ hadoop fs -put enwiki-*-pages-articles.xml /wikipedia/enwiki-pages-articles.xml 32 | 33 | ### Instructions 34 | 35 | 1. Create a `wikipedia.xml` file (or `wikipedia_parallel.xml` if running parallel version) from 36 | [wikipedia.xml.example] or [wikipedia_parallel.xml.example] and modify for your Accumulo 37 | installation. 38 | 39 | $ cp ingest/conf 40 | $ cp wikipedia.xml.example wikipedia.xml 41 | $ vim wikipedia.xml 42 | 43 | 1. Copy `ingest/lib/wikisearch-*.jar` to `$ACCUMULO_HOME/lib/ext` 44 | 1. Run `ingest/bin/ingest.sh` (or `ingest_parallel.sh` if running parallel version) with one 45 | argument (the name of the directory in HDFS where the wikipedia XML files reside) and this will 46 | kick off a MapReduce job to ingest the data into Accumulo. 47 | 48 | ## Query 49 | 50 | ### Prerequisites 51 | 52 | 1. The query software was tested using JBoss AS 6. Install the JBoss distro and follow the instructions below 53 | to build the EJB jar and WAR file required. 54 | * To stop the JBoss warnings about WSDescriptorDeployer and JMSDescriptorDeployer, these deployers can be 55 | removed from `$JBOSS_HOME/server/default/deployers/jbossws.deployer/META-INF/stack-agnostic-jboss-beans.xml` 56 | 1. Ensure that you have successfully run `mvn clean install` at the Wikisearch top level to install the jars 57 | into your local maven repo before building the query package. 58 | 59 | ### Instructions 60 | 61 | 1. 
Create a `ejb-jar.xml` from [ejb-jar.xml.example] and modify it to contain the same information 62 | that you put into `wikipedia.xml` in the ingest steps above: 63 | 64 | cd query/src/main/resources/META-INF/ 65 | cp ejb-jar.xml.example ejb-jar.xml 66 | vim ejb-jar.xml 67 | 68 | 1. Re-build the query distribution by running `mvn package assembly:single` in the query module's directory. 69 | 1. Untar the resulting file in the `$JBOSS_HOME/server/default` directory. 70 | 71 | $ cd $JBOSS_HOME/server/default 72 | $ tar -xzf /some/path/to/wikisearch/query/target/wikisearch-query*.tar.gz 73 | 74 | This will place the dependent jars in the lib directory and the EJB jar into the deploy directory. 75 | 1. Next, copy the wikisearch*.war file in the query-war/target directory to $JBOSS_HOME/server/default/deploy. 76 | 1. Start JBoss ($JBOSS_HOME/bin/run.sh) 77 | 1. Use the Accumulo shell and give the user permissions for the wikis that you loaded: 78 | 79 | > setauths -u -s all,enwiki,eswiki,frwiki,fawiki 80 | 81 | 1. Copy the following jars to the `$ACCUMULO_HOME/lib/ext` directory from the `$JBOSS_HOME/server/default/lib` directory: 82 | 83 | kryo*.jar 84 | minlog*.jar 85 | commons-jexl*.jar 86 | 87 | 1. Copy `$JBOSS_HOME/server/default/deploy/wikisearch-query*.jar` to `$ACCUMULO_HOME/lib/ext.` 88 | 89 | 1. At this point you should be able to open a browser and view the page: 90 | 91 | http://localhost:8080/accumulo-wikisearch/ui.html 92 | 93 | You can issue the queries using this user interface or via the following REST urls: 94 | 95 | /accumulo-wikisearch/rest/Query/xml 96 | /accumulo-wikisearch/rest/Query/html 97 | /accumulo-wikisearch/rest/Query/yaml 98 | /accumulo-wikisearch/rest/Query/json. 99 | 100 | There are two parameters to the REST service, query and auths. The query parameter is the same string that you would type 101 | into the search box at ui.jsp, and the auths parameter is a comma-separated list of wikis that you want to search (i.e. 102 | enwiki,frwiki,dewiki, etc. Or you can use all) 103 | 104 | - NOTE: Ran into a [bug] that did not allow an EJB3.1 war file. The workaround is to separate the RESTEasy servlet 105 | from the EJBs by creating an EJB jar and a WAR file. 106 | 107 | [ejb-jar.xml.example]: query/src/main/resources/META-INF/ejb-jar.xml.example 108 | [dump-files]: http://dumps.wikimedia.org/backup-index.html 109 | [wikipedia.xml.example]: ingest/conf/wikipedia.xml.example 110 | [wikipedia_parallel.xml.example]: ingest/conf/wikipedia_parallel.xml.example 111 | [bug]: https://issues.jboss.org/browse/RESTEASY-531 112 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/BufferingRFileRecordWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.output; 18 | 19 | import java.io.IOException; 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | import java.util.Map.Entry; 23 | import java.util.TreeMap; 24 | 25 | import org.apache.accumulo.core.client.AccumuloException; 26 | import org.apache.accumulo.core.client.AccumuloSecurityException; 27 | import org.apache.accumulo.core.client.BatchWriter; 28 | import org.apache.accumulo.core.client.BatchWriterConfig; 29 | import org.apache.accumulo.core.client.Connector; 30 | import org.apache.accumulo.core.client.TableNotFoundException; 31 | import org.apache.accumulo.core.conf.AccumuloConfiguration; 32 | import org.apache.accumulo.core.data.ColumnUpdate; 33 | import org.apache.accumulo.core.data.Key; 34 | import org.apache.accumulo.core.data.Mutation; 35 | import org.apache.accumulo.core.data.Value; 36 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaConfiguration; 37 | import org.apache.hadoop.conf.Configuration; 38 | import org.apache.hadoop.fs.FileSystem; 39 | import org.apache.hadoop.io.Text; 40 | import org.apache.hadoop.mapreduce.RecordWriter; 41 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 42 | 43 | final class BufferingRFileRecordWriter extends RecordWriter { 44 | private final long maxSize; 45 | private final Configuration conf; 46 | private long size; 47 | 48 | private Map> buffers = new HashMap>(); 49 | private Map bufferSizes = new HashMap(); 50 | 51 | private TreeMap getBuffer(Text tablename) { 52 | TreeMap buffer = buffers.get(tablename); 53 | if (buffer == null) { 54 | buffer = new TreeMap(); 55 | buffers.put(tablename, buffer); 56 | bufferSizes.put(tablename, 0l); 57 | } 58 | return buffer; 59 | } 60 | 61 | private Text getLargestTablename() { 62 | long max = 0; 63 | Text table = null; 64 | for (Entry e : bufferSizes.entrySet()) { 65 | if (e.getValue() > max) { 66 | max = e.getValue(); 67 | table = e.getKey(); 68 | } 69 | } 70 | return table; 71 | } 72 | 73 | private void flushLargestTable() throws IOException { 74 | Text tablename = getLargestTablename(); 75 | if (tablename == null) 76 | return; 77 | long bufferSize = bufferSizes.get(tablename); 78 | TreeMap buffer = buffers.get(tablename); 79 | if (buffer.size() == 0) 80 | return; 81 | 82 | Connector conn; 83 | try { 84 | conn = WikipediaConfiguration.getConnector(conf); 85 | BatchWriterConfig bwconfig = new BatchWriterConfig(); 86 | BatchWriter writer = conn.createBatchWriter(tablename.toString(), bwconfig); 87 | for (Entry e : buffer.entrySet()) { 88 | Key k = e.getKey(); 89 | Mutation m = new Mutation(); 90 | m.put(k.getColumnFamily(), k.getColumnQualifier(), e.getValue()); 91 | writer.addMutation(m); 92 | } 93 | writer.close(); 94 | } catch (AccumuloException | AccumuloSecurityException | TableNotFoundException e) { 95 | System.err.println("Error occured in flushLargestTable: " + e.getMessage()); 96 | e.printStackTrace(); 97 | } 98 | // TODO get the table configuration for the given table? 
99 | 100 | size -= bufferSize; 101 | buffer.clear(); 102 | bufferSizes.put(tablename, 0L); 103 | } 104 | 105 | BufferingRFileRecordWriter(long maxSize, Configuration conf) { 106 | this.maxSize = maxSize; 107 | this.conf = conf; 108 | } 109 | 110 | @Override 111 | public void close(TaskAttemptContext arg0) throws IOException, InterruptedException { 112 | while (size > 0) 113 | flushLargestTable(); 114 | } 115 | 116 | @Override 117 | public void write(Text table, Mutation mutation) throws IOException, InterruptedException { 118 | TreeMap<Key,Value> buffer = getBuffer(table); 119 | int mutationSize = 0; 120 | for (ColumnUpdate update : mutation.getUpdates()) { 121 | Key k = new Key(mutation.getRow(), update.getColumnFamily(), update.getColumnQualifier(), update.getColumnVisibility(), update.getTimestamp(), 122 | update.isDeleted()); 123 | Value v = new Value(update.getValue()); 124 | // TODO account for object overhead 125 | mutationSize += k.getSize(); 126 | mutationSize += v.getSize(); 127 | buffer.put(k, v); 128 | } 129 | size += mutationSize; 130 | long bufferSize = bufferSizes.get(table); 131 | 132 | // TODO use a MutableLong instead 133 | bufferSize += mutationSize; 134 | bufferSizes.put(table, bufferSize); 135 | 136 | while (size >= maxSize) { 137 | flushLargestTable(); 138 | } 139 | } 140 | 141 | } 142 | -------------------------------------------------------------------------------- /query-war/src/main/webapp/ui.html: -------------------------------------------------------------------------------- 1 | 17 | <%@page contentType="text/html" pageEncoding="UTF-8"%> 18 | 20 | 21 | 22 | 23 | 24 | Wiki Search Page 25 | 46 | 47 | 48 |
49 |

Wiki Search using Apache Accumulo

50 |

This sample application demonstrates the ability to search documents using Apache Accumulo. The associated ingest software 51 | extracts the id, title, timestamp, and comments from each wikipedia article. In addition, the wikipedia text has been tokenized 52 | and is available for searching. You can enter a boolean expression into the search box below and select the particular set of 53 | wikipedia languages you want to search.

54 |

Fields available for searching: 55 |

    56 |   1. TEXT
 57 |   2. ID
 58 |   3. TITLE
 59 |   4. TIMESTAMP
 60 |   5. COMMENTS
 61 |
62 |

The search syntax is boolean logic, for example: TEXT == 'boy' and TITLE =~ 'Autism'. The supported operators are: 63 | ==, !=, <, >, <=, >=, =~, and !~. Likewise, grouping can be performed using parentheses and predicates can be 64 | joined using and, or, and not. 65 |
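For example, a compound query (a hypothetical illustration of the operators above) might be: ( TEXT == 'boy' or TEXT == 'girl' ) and not TITLE =~ 'Autism'.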

To highlight the cell-level access control of Apache Accumulo, the "authorization" required to view a particular cell is the language 66 | of the associated wikipedia article. 67 |

68 |
69 |
70 |
71 |
72 |
73 | 74 | 75 |
76 |
77 |
78 | 79 |
80 | 81 |
82 |
83 | 84 | 85 | 86 |
87 |
88 | 89 | 90 | 91 |
92 |
93 | 94 | 95 | 96 |
97 |
98 | 99 | 100 | n
101 |
102 |
103 | 104 |
105 |
106 |
107 |
108 | 110 |
111 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /query/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | org.apache.accumulo 22 | accumulo-wikisearch 23 | 2.0.0-SNAPSHOT 24 | 25 | wikisearch-query 26 | ejb 27 | wikisearch-query 28 | 29 | 30 | com.google.guava 31 | guava 32 | 33 | 34 | com.google.protobuf 35 | protobuf-java 36 | 37 | 38 | com.googlecode 39 | kryo 40 | 41 | 42 | com.sun.jersey 43 | jersey-core 44 | 45 | 46 | commons-codec 47 | commons-codec 48 | 49 | 50 | commons-collections 51 | commons-collections 52 | 53 | 54 | commons-configuration 55 | commons-configuration 56 | 57 | 58 | commons-lang 59 | commons-lang 60 | 61 | 62 | org.apache.accumulo 63 | accumulo-core 64 | 65 | 66 | org.apache.accumulo 67 | wikisearch-ingest 68 | 69 | 70 | org.apache.commons 71 | commons-jexl 72 | 73 | 74 | org.apache.hadoop 75 | hadoop-client 76 | 77 | 78 | javaee 79 | javaee-api 80 | provided 81 | 82 | 83 | com.googlecode 84 | minlog 85 | runtime 86 | 87 | 88 | commons-io 89 | commons-io 90 | runtime 91 | 92 | 93 | org.apache.htrace 94 | htrace-core 95 | runtime 96 | 97 | 98 | org.apache.thrift 99 | libthrift 100 | runtime 101 | 102 | 103 | org.apache.httpcomponents 104 | httpclient 105 | 106 | 107 | 108 | 109 | org.apache.zookeeper 110 | zookeeper 111 | runtime 112 | 113 | 114 | junit 115 | junit 116 | test 117 | 118 | 119 | 120 | 121 | 122 | org.apache.maven.plugins 123 | maven-dependency-plugin 124 | 125 | 126 | copy-dependencies 127 | 128 | copy-dependencies 129 | 130 | prepare-package 131 | 132 | lib 133 | 134 | commons-io,commons-configuration,commons-lang,commons-codec,protobuf-java,libthrift,zookeeper,hadoop-client,commons-jexl,guava,kryo,asm,minlog,reflectasm,wikisearch-ingest,accumulo-core,accumulo-fate,accumulo-trace,htrace-core 135 | true 136 | 137 | 138 | 139 | 140 | 141 | org.apache.maven.plugins 142 | maven-assembly-plugin 143 | 144 | 145 | src/assembly/dist.xml 146 | 147 | 148 | 149 | 150 | org.apache.maven.plugins 151 | maven-ejb-plugin 152 | 153 | 3.1 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LfLineReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.reader; 18 | 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.io.Text; 24 | 25 | /** 26 | * A class that provides a line reader from an input stream. 27 | */ 28 | public class LfLineReader { 29 | private static final int DEFAULT_BUFFER_SIZE = 64 * 1024; 30 | private int bufferSize = DEFAULT_BUFFER_SIZE; 31 | private InputStream in; 32 | private byte[] buffer; 33 | // the number of bytes of real data in the buffer 34 | private int bufferLength = 0; 35 | // the current position in the buffer 36 | private int bufferPosn = 0; 37 | 38 | private static final byte LF = '\n'; 39 | 40 | /** 41 | * Create a line reader that reads from the given stream using the default buffer-size (64k). 42 | * 43 | * @param in 44 | * The input stream 45 | */ 46 | public LfLineReader(InputStream in) { 47 | this(in, DEFAULT_BUFFER_SIZE); 48 | } 49 | 50 | /** 51 | * Create a line reader that reads from the given stream using the given buffer-size. 52 | * 53 | * @param in 54 | * The input stream 55 | * @param bufferSize 56 | * Size of the read buffer 57 | */ 58 | public LfLineReader(InputStream in, int bufferSize) { 59 | this.in = in; 60 | this.bufferSize = bufferSize; 61 | this.buffer = new byte[this.bufferSize]; 62 | } 63 | 64 | /** 65 | * Create a line reader that reads from the given stream using the 66 | * io.file.buffer.size specified in the given Configuration. 67 | * 68 | * @param in 69 | * input stream 70 | * @param conf 71 | * configuration 72 | */ 73 | public LfLineReader(InputStream in, Configuration conf) throws IOException { 74 | this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE)); 75 | } 76 | 77 | /** 78 | * Close the underlying stream. 79 | */ 80 | public void close() throws IOException { 81 | in.close(); 82 | } 83 | 84 | /** 85 | * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF). 86 | * EOF also terminates an otherwise unterminated line. 87 | * 88 | * @param str 89 | * the object to store the given line (without newline) 90 | * @param maxLineLength 91 | * the maximum number of bytes to store into str; the rest of the line is silently 92 | * discarded. 93 | * @param maxBytesToConsume 94 | * the maximum number of bytes to consume in this call. This is only a hint, because if 95 | * the line cross this threshold, we allow it to happen. It can overshoot potentially by 96 | * as much as one buffer length. 97 | * 98 | * @return the number of bytes read including the (longest) newline found. 99 | * 100 | * @throws IOException 101 | * if the underlying stream throws 102 | */ 103 | public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { 104 | /* 105 | * We're reading data from in, but the head of the stream may be already buffered in buffer, so 106 | * we have several cases: 1. No newline characters are in the buffer, so we need to copy 107 | * everything and read another buffer from the stream. 2. An unambiguously terminated line is in 108 | * buffer, so we just copy to str. 
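 * In both cases the loop below appends at most maxLineLength - txtLength bytes to str,
 * and it keeps consuming input until a newline is found, EOF is reached, or more than
 * maxBytesToConsume bytes have been read.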
109 | */ 110 | str.clear(); 111 | int txtLength = 0; // tracks str.getLength(), as an optimization 112 | int newlineLength = 0; // length of terminating newline 113 | long bytesConsumed = 0; 114 | do { 115 | int startPosn = bufferPosn; // starting from where we left off the last time 116 | if (bufferPosn >= bufferLength) { 117 | startPosn = bufferPosn = 0; 118 | bufferLength = in.read(buffer); 119 | if (bufferLength <= 0) { 120 | break; // EOF 121 | } 122 | } 123 | for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline 124 | if (buffer[bufferPosn] == LF) { 125 | newlineLength = 1; 126 | ++bufferPosn; // at next invocation proceed from following byte 127 | break; 128 | } 129 | } 130 | int readLength = bufferPosn - startPosn; 131 | bytesConsumed += readLength; 132 | int appendLength = readLength - newlineLength; 133 | if (appendLength > maxLineLength - txtLength) { 134 | appendLength = maxLineLength - txtLength; 135 | } 136 | if (appendLength > 0) { 137 | str.append(buffer, startPosn, appendLength); 138 | txtLength += appendLength; 139 | } 140 | } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); 141 | 142 | if (bytesConsumed > Integer.MAX_VALUE) { 143 | throw new IOException("Too many bytes before newline: " + bytesConsumed); 144 | } 145 | return (int) bytesConsumed; 146 | } 147 | 148 | /** 149 | * Read from the InputStream into the given Text. 150 | * 151 | * @param str 152 | * the object to store the given line 153 | * @param maxLineLength 154 | * the maximum number of bytes to store into str. 155 | * @return the number of bytes read including the newline 156 | * @throws IOException 157 | * if the underlying stream throws 158 | */ 159 | public int readLine(Text str, int maxLineLength) throws IOException { 160 | return readLine(str, maxLineLength, Integer.MAX_VALUE); 161 | } 162 | 163 | /** 164 | * Read from the InputStream into the given Text. 165 | * 166 | * @param str 167 | * the object to store the given line 168 | * @return the number of bytes read including the newline 169 | * @throws IOException 170 | * if the underlying stream throws 171 | */ 172 | public int readLine(Text str) throws IOException { 173 | return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE); 174 | } 175 | 176 | } 177 | -------------------------------------------------------------------------------- /ingest/src/test/java/org/apache/accumulo/examples/wikisearch/iterator/TextIndexTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import java.util.ArrayList; 20 | import java.util.Collections; 21 | import java.util.List; 22 | 23 | import org.apache.accumulo.core.data.Key; 24 | import org.apache.accumulo.core.data.Value; 25 | import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight; 26 | import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.Builder; 27 | import org.junit.After; 28 | import org.junit.Assert; 29 | import org.junit.Before; 30 | import org.junit.Test; 31 | 32 | import com.google.protobuf.InvalidProtocolBufferException; 33 | 34 | public class TextIndexTest { 35 | private TextIndexCombiner combiner; 36 | private List values; 37 | 38 | @Before 39 | public void setup() throws Exception { 40 | combiner = new TextIndexCombiner(); 41 | combiner.init(null, Collections.singletonMap("all", "true"), null); 42 | values = new ArrayList<>(); 43 | } 44 | 45 | @After 46 | public void cleanup() { 47 | 48 | } 49 | 50 | private TermWeight.Info.Builder createBuilder() { 51 | return TermWeight.Info.newBuilder(); 52 | } 53 | 54 | @Test 55 | public void testSingleValue() throws InvalidProtocolBufferException { 56 | Builder builder = createBuilder(); 57 | builder.addWordOffset(1); 58 | builder.addWordOffset(5); 59 | builder.setNormalizedTermFrequency(0.1f); 60 | 61 | values.add(new Value(builder.build().toByteArray())); 62 | 63 | Value result = combiner.reduce(new Key(), values.iterator()); 64 | 65 | TermWeight.Info info = TermWeight.Info.parseFrom(result.get()); 66 | 67 | Assert.assertTrue(info.getNormalizedTermFrequency() == 0.1f); 68 | 69 | List offsets = info.getWordOffsetList(); 70 | Assert.assertTrue(offsets.size() == 2); 71 | Assert.assertTrue(offsets.get(0) == 1); 72 | Assert.assertTrue(offsets.get(1) == 5); 73 | } 74 | 75 | @Test 76 | public void testAggregateTwoValues() throws InvalidProtocolBufferException { 77 | Builder builder = createBuilder(); 78 | builder.addWordOffset(1); 79 | builder.addWordOffset(5); 80 | builder.setNormalizedTermFrequency(0.1f); 81 | 82 | values.add(new Value(builder.build().toByteArray())); 83 | 84 | builder = createBuilder(); 85 | builder.addWordOffset(3); 86 | builder.setNormalizedTermFrequency(0.05f); 87 | 88 | values.add(new Value(builder.build().toByteArray())); 89 | 90 | Value result = combiner.reduce(new Key(), values.iterator()); 91 | 92 | TermWeight.Info info = TermWeight.Info.parseFrom(result.get()); 93 | 94 | Assert.assertTrue(info.getNormalizedTermFrequency() == 0.15f); 95 | 96 | List offsets = info.getWordOffsetList(); 97 | Assert.assertTrue(offsets.size() == 3); 98 | Assert.assertTrue(offsets.get(0) == 1); 99 | Assert.assertTrue(offsets.get(1) == 3); 100 | Assert.assertTrue(offsets.get(2) == 5); 101 | } 102 | 103 | @Test 104 | public void testAggregateManyValues() throws InvalidProtocolBufferException { 105 | Builder builder = createBuilder(); 106 | builder.addWordOffset(13); 107 | builder.addWordOffset(15); 108 | builder.addWordOffset(19); 109 | builder.setNormalizedTermFrequency(0.12f); 110 | 111 | values.add(new Value(builder.build().toByteArray())); 112 | 113 | builder = createBuilder(); 114 | builder.addWordOffset(1); 115 | builder.addWordOffset(5); 116 | builder.setNormalizedTermFrequency(0.1f); 117 | 118 | values.add(new Value(builder.build().toByteArray())); 119 | 120 | builder = createBuilder(); 121 | builder.addWordOffset(3); 122 | builder.setNormalizedTermFrequency(0.05f); 123 | 124 | values.add(new Value(builder.build().toByteArray())); 125 | 126 | Value 
result = combiner.reduce(new Key(), values.iterator()); 127 | 128 | TermWeight.Info info = TermWeight.Info.parseFrom(result.get()); 129 | 130 | Assert.assertTrue(info.getNormalizedTermFrequency() == 0.27f); 131 | 132 | List offsets = info.getWordOffsetList(); 133 | Assert.assertTrue(offsets.size() == 6); 134 | Assert.assertTrue(offsets.get(0) == 1); 135 | Assert.assertTrue(offsets.get(1) == 3); 136 | Assert.assertTrue(offsets.get(2) == 5); 137 | Assert.assertTrue(offsets.get(3) == 13); 138 | Assert.assertTrue(offsets.get(4) == 15); 139 | Assert.assertTrue(offsets.get(5) == 19); 140 | } 141 | 142 | @Test 143 | public void testEmptyValue() throws InvalidProtocolBufferException { 144 | Builder builder = createBuilder(); 145 | builder.addWordOffset(13); 146 | builder.addWordOffset(15); 147 | builder.addWordOffset(19); 148 | builder.setNormalizedTermFrequency(0.12f); 149 | 150 | values.add(new Value("".getBytes())); 151 | values.add(new Value(builder.build().toByteArray())); 152 | values.add(new Value("".getBytes())); 153 | 154 | builder = createBuilder(); 155 | builder.addWordOffset(1); 156 | builder.addWordOffset(5); 157 | builder.setNormalizedTermFrequency(0.1f); 158 | 159 | values.add(new Value(builder.build().toByteArray())); 160 | values.add(new Value("".getBytes())); 161 | 162 | builder = createBuilder(); 163 | builder.addWordOffset(3); 164 | builder.setNormalizedTermFrequency(0.05f); 165 | 166 | values.add(new Value(builder.build().toByteArray())); 167 | values.add(new Value("".getBytes())); 168 | 169 | Value result = combiner.reduce(new Key(), values.iterator()); 170 | 171 | TermWeight.Info info = TermWeight.Info.parseFrom(result.get()); 172 | 173 | Assert.assertTrue(info.getNormalizedTermFrequency() == 0.27f); 174 | 175 | List offsets = info.getWordOffsetList(); 176 | Assert.assertTrue(offsets.size() == 6); 177 | Assert.assertTrue(offsets.get(0) == 1); 178 | Assert.assertTrue(offsets.get(1) == 3); 179 | Assert.assertTrue(offsets.get(2) == 5); 180 | Assert.assertTrue(offsets.get(3) == 13); 181 | Assert.assertTrue(offsets.get(4) == 15); 182 | Assert.assertTrue(offsets.get(5) == 19); 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/parser/TreeNode.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.parser; 18 | 19 | import java.util.ArrayList; 20 | import java.util.Collections; 21 | import java.util.Enumeration; 22 | import java.util.List; 23 | import java.util.NoSuchElementException; 24 | import java.util.Vector; 25 | 26 | import org.apache.accumulo.examples.wikisearch.parser.QueryParser.QueryTerm; 27 | import org.apache.commons.jexl2.parser.JexlNode; 28 | 29 | 30 | import com.google.common.collect.HashMultimap; 31 | import com.google.common.collect.Multimap; 32 | 33 | public class TreeNode { 34 | 35 | private Class type = null; 36 | /* navigation elements */ 37 | private TreeNode parent = null; 38 | private List children = new ArrayList(); 39 | private Multimap terms = HashMultimap.create(); 40 | 41 | public TreeNode() { 42 | super(); 43 | } 44 | 45 | public Class getType() { 46 | return type; 47 | } 48 | 49 | public TreeNode getParent() { 50 | return parent; 51 | } 52 | 53 | public List getChildren() { 54 | return children; 55 | } 56 | 57 | public Enumeration getChildrenAsEnumeration() { 58 | return Collections.enumeration(children); 59 | } 60 | 61 | public Multimap getTerms() { 62 | return terms; 63 | } 64 | 65 | public void setType(Class type) { 66 | this.type = type; 67 | } 68 | 69 | public void setParent(TreeNode parent) { 70 | this.parent = parent; 71 | } 72 | 73 | public void setChildren(List children) { 74 | this.children = children; 75 | } 76 | 77 | public void setTerms(Multimap terms) { 78 | this.terms = terms; 79 | } 80 | 81 | public boolean isLeaf() { 82 | return children.isEmpty(); 83 | } 84 | 85 | @Override 86 | public String toString() { 87 | StringBuilder buf = new StringBuilder(); 88 | buf.append("Type: ").append(type.getSimpleName()); 89 | buf.append(" Terms: "); 90 | if (null == terms) { 91 | buf.append("null"); 92 | } else { 93 | buf.append(terms.toString()); 94 | } 95 | return buf.toString(); 96 | } 97 | 98 | public final Enumeration depthFirstEnumeration() { 99 | return new PostorderEnumeration(this); 100 | } 101 | 102 | public Enumeration breadthFirstEnumeration() { 103 | return new BreadthFirstEnumeration(this); 104 | } 105 | 106 | public final class PostorderEnumeration implements Enumeration { 107 | 108 | protected TreeNode root; 109 | protected Enumeration children; 110 | protected Enumeration subtree; 111 | 112 | public PostorderEnumeration(TreeNode rootNode) { 113 | super(); 114 | root = rootNode; 115 | children = root.getChildrenAsEnumeration(); 116 | subtree = EMPTY_ENUMERATION; 117 | } 118 | 119 | public boolean hasMoreElements() { 120 | return root != null; 121 | } 122 | 123 | public TreeNode nextElement() { 124 | TreeNode retval; 125 | 126 | if (subtree.hasMoreElements()) { 127 | retval = subtree.nextElement(); 128 | } else if (children.hasMoreElements()) { 129 | subtree = new PostorderEnumeration((TreeNode) children.nextElement()); 130 | retval = subtree.nextElement(); 131 | } else { 132 | retval = root; 133 | root = null; 134 | } 135 | 136 | return retval; 137 | } 138 | } // End of class PostorderEnumeration 139 | 140 | static public final Enumeration EMPTY_ENUMERATION = new Enumeration() { 141 | 142 | public boolean hasMoreElements() { 143 | return false; 144 | } 145 | 146 | public TreeNode nextElement() { 147 | throw new NoSuchElementException("No more elements"); 148 | } 149 | }; 150 | 151 | final class BreadthFirstEnumeration implements Enumeration { 152 | protected Queue queue; 153 | 154 | public BreadthFirstEnumeration(TreeNode rootNode) { 155 | super(); 156 | Vector v = new 
Vector(1); 157 | v.addElement(rootNode); // PENDING: don't really need a vector 158 | queue = new Queue(); 159 | queue.enqueue(v.elements()); 160 | } 161 | 162 | public boolean hasMoreElements() { 163 | return (!queue.isEmpty() && ((Enumeration) queue.firstObject()).hasMoreElements()); 164 | } 165 | 166 | public TreeNode nextElement() { 167 | Enumeration enumer = (Enumeration) queue.firstObject(); 168 | TreeNode node = (TreeNode) enumer.nextElement(); 169 | Enumeration children = node.getChildrenAsEnumeration(); 170 | 171 | if (!enumer.hasMoreElements()) { 172 | queue.dequeue(); 173 | } 174 | if (children.hasMoreElements()) { 175 | queue.enqueue(children); 176 | } 177 | return node; 178 | } 179 | 180 | // A simple queue with a linked list data structure. 181 | final class Queue { 182 | QNode head; // null if empty 183 | QNode tail; 184 | 185 | final class QNode { 186 | public Object object; 187 | public QNode next; // null if end 188 | 189 | public QNode(Object object, QNode next) { 190 | this.object = object; 191 | this.next = next; 192 | } 193 | } 194 | 195 | public void enqueue(Object anObject) { 196 | if (head == null) { 197 | head = tail = new QNode(anObject, null); 198 | } else { 199 | tail.next = new QNode(anObject, null); 200 | tail = tail.next; 201 | } 202 | } 203 | 204 | public Object dequeue() { 205 | if (head == null) { 206 | throw new NoSuchElementException("No more elements"); 207 | } 208 | 209 | Object retval = head.object; 210 | QNode oldHead = head; 211 | head = head.next; 212 | if (head == null) { 213 | tail = null; 214 | } else { 215 | oldHead.next = null; 216 | } 217 | return retval; 218 | } 219 | 220 | public Object firstObject() { 221 | if (head == null) { 222 | throw new NoSuchElementException("No more elements"); 223 | } 224 | 225 | return head.object; 226 | } 227 | 228 | public boolean isEmpty() { 229 | return head == null; 230 | } 231 | 232 | } // End of class Queue 233 | 234 | } // End of class BreadthFirstEnumeration 235 | } 236 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.ingest; 18 | 19 | import java.io.DataInput; 20 | import java.io.DataOutput; 21 | import java.io.IOException; 22 | import java.io.Reader; 23 | import java.text.ParseException; 24 | import java.text.SimpleDateFormat; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | import javax.xml.namespace.QName; 29 | import javax.xml.stream.XMLInputFactory; 30 | import javax.xml.stream.XMLStreamException; 31 | import javax.xml.stream.XMLStreamReader; 32 | 33 | import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.io.Writable; 36 | 37 | 38 | public class ArticleExtractor { 39 | 40 | public final static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'Z"); 41 | private static LcNoDiacriticsNormalizer lcdn = new LcNoDiacriticsNormalizer(); 42 | 43 | public static class Article implements Writable { 44 | int id; 45 | String title; 46 | long timestamp; 47 | String comments; 48 | String text; 49 | 50 | public Article(){} 51 | 52 | private Article(int id, String title, long timestamp, String comments, String text) { 53 | super(); 54 | this.id = id; 55 | this.title = title; 56 | this.timestamp = timestamp; 57 | this.comments = comments; 58 | this.text = text; 59 | } 60 | 61 | public int getId() { 62 | return id; 63 | } 64 | 65 | public String getTitle() { 66 | return title; 67 | } 68 | 69 | public String getComments() { 70 | return comments; 71 | } 72 | 73 | public String getText() { 74 | return text; 75 | } 76 | 77 | public long getTimestamp() { 78 | return timestamp; 79 | } 80 | 81 | public Map getFieldValues() { 82 | Map fields = new HashMap(); 83 | fields.put("ID", this.id); 84 | fields.put("TITLE", this.title); 85 | fields.put("TIMESTAMP", this.timestamp); 86 | fields.put("COMMENTS", this.comments); 87 | return fields; 88 | } 89 | 90 | public Map getNormalizedFieldValues() { 91 | Map fields = new HashMap(); 92 | //fields.put("ID", nn.normalizeFieldValue("ID", this.id)); 93 | fields.put("ID", Integer.toString(this.id)); 94 | fields.put("TITLE", lcdn.normalizeFieldValue("TITLE", this.title)); 95 | //fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp)); 96 | fields.put("TIMESTAMP", Long.toString(this.timestamp)); 97 | fields.put("COMMENTS", lcdn.normalizeFieldValue("COMMENTS", this.comments)); 98 | return fields; 99 | } 100 | 101 | @Override 102 | public void readFields(DataInput in) throws IOException { 103 | id = in.readInt(); 104 | Text foo = new Text(); 105 | foo.readFields(in); 106 | title = foo.toString(); 107 | timestamp = in.readLong(); 108 | foo.readFields(in); 109 | comments = foo.toString(); 110 | foo.readFields(in); 111 | text = foo.toString(); 112 | } 113 | 114 | @Override 115 | public void write(DataOutput out) throws IOException { 116 | out.writeInt(id); 117 | (new Text(title)).write(out); 118 | out.writeLong(timestamp); 119 | (new Text(comments)).write(out); 120 | (new Text(text)).write(out); 121 | } 122 | 123 | } 124 | 125 | public ArticleExtractor() {} 126 | 127 | private static XMLInputFactory xmlif = XMLInputFactory.newInstance(); 128 | 129 | static 130 | { 131 | xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE); 132 | } 133 | 134 | public Article extract(Reader reader) { 135 | 136 | XMLStreamReader xmlr = null; 137 | 138 | try { 139 | xmlr = xmlif.createXMLStreamReader(reader); 140 | } catch (XMLStreamException e1) { 141 | throw new 
RuntimeException(e1); 142 | } 143 | 144 | QName titleName = QName.valueOf("title"); 145 | QName textName = QName.valueOf("text"); 146 | QName revisionName = QName.valueOf("revision"); 147 | QName timestampName = QName.valueOf("timestamp"); 148 | QName commentName = QName.valueOf("comment"); 149 | QName idName = QName.valueOf("id"); 150 | 151 | Map tags = new HashMap(); 152 | for (QName tag : new QName[] {titleName, textName, timestampName, commentName, idName}) { 153 | tags.put(tag, new StringBuilder()); 154 | } 155 | 156 | StringBuilder articleText = tags.get(textName); 157 | StringBuilder titleText = tags.get(titleName); 158 | StringBuilder timestampText = tags.get(timestampName); 159 | StringBuilder commentText = tags.get(commentName); 160 | StringBuilder idText = tags.get(idName); 161 | 162 | StringBuilder current = null; 163 | boolean inRevision = false; 164 | while (true) { 165 | try { 166 | if (!xmlr.hasNext()) 167 | break; 168 | xmlr.next(); 169 | } catch (XMLStreamException e) { 170 | throw new RuntimeException(e); 171 | } 172 | QName currentName = null; 173 | if (xmlr.hasName()) { 174 | currentName = xmlr.getName(); 175 | } 176 | if (xmlr.isStartElement() && tags.containsKey(currentName)) { 177 | if (!inRevision || (!currentName.equals(revisionName) && !currentName.equals(idName))) { 178 | current = tags.get(currentName); 179 | current.setLength(0); 180 | } 181 | } else if (xmlr.isStartElement() && currentName.equals(revisionName)) { 182 | inRevision = true; 183 | } else if (xmlr.isEndElement() && currentName.equals(revisionName)) { 184 | inRevision = false; 185 | } else if (xmlr.isEndElement() && current != null) { 186 | if (textName.equals(currentName)) { 187 | 188 | String title = titleText.toString(); 189 | String text = articleText.toString(); 190 | String comment = commentText.toString(); 191 | int id = Integer.parseInt(idText.toString()); 192 | long timestamp; 193 | try { 194 | timestamp = dateFormat.parse(timestampText.append("+0000").toString()).getTime(); 195 | return new Article(id, title, timestamp, comment, text); 196 | } catch (ParseException e) { 197 | return null; 198 | } 199 | } 200 | current = null; 201 | } else if (current != null && xmlr.hasText()) { 202 | current.append(xmlr.getText()); 203 | } 204 | } 205 | return null; 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.reader; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaConfiguration; 22 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit; 23 | import org.apache.accumulo.examples.wikisearch.util.TextUtil; 24 | import org.apache.hadoop.io.LongWritable; 25 | import org.apache.hadoop.io.Text; 26 | import org.apache.hadoop.mapreduce.InputSplit; 27 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 28 | 29 | /** 30 | * This class aggregates Text values based on a start and end filter. An example use case for this 31 | * would be XML data. This will not work with data that has nested start and stop tokens. 32 | * 33 | */ 34 | public class AggregatingRecordReader extends LongLineRecordReader { 35 | 36 | public static final String START_TOKEN = "aggregating.token.start"; 37 | public static final String END_TOKEN = "aggregating.token.end"; 38 | public static final String RETURN_PARTIAL_MATCHES = "aggregating.allow.partial"; 39 | 40 | private LongWritable key = new LongWritable(); 41 | private String startToken = null; 42 | private String endToken = null; 43 | private long counter = 0; 44 | private Text aggValue = new Text(); 45 | private boolean startFound = false; 46 | private StringBuilder remainder = new StringBuilder(0); 47 | private boolean returnPartialMatches = false; 48 | 49 | @Override 50 | public LongWritable getCurrentKey() { 51 | key.set(counter); 52 | return key; 53 | } 54 | 55 | @Override 56 | public Text getCurrentValue() { 57 | return aggValue; 58 | } 59 | 60 | @Override 61 | public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { 62 | super.initialize(((WikipediaInputSplit) genericSplit).getFileSplit(), context); 63 | this.startToken = 64 | WikipediaConfiguration.isNull(context.getConfiguration(), START_TOKEN, String.class); 65 | this.endToken = 66 | WikipediaConfiguration.isNull(context.getConfiguration(), END_TOKEN, String.class); 67 | this.returnPartialMatches = 68 | context.getConfiguration().getBoolean(RETURN_PARTIAL_MATCHES, false); 69 | 70 | /* 71 | * Text-appending works almost exactly like the + operator on Strings- it creates a byte array 72 | * exactly the size of [prefix + suffix] and dumps the bytes into the new array. This module 73 | * works by doing lots of little additions, one line at a time. With most XML, the documents are 74 | * partitioned on line boundaries, so we will generally have lots of additions. Setting a large 75 | * default byte array for a text object can avoid this and give us StringBuilder-like 76 | * functionality for Text objects. 77 | */ 78 | byte[] txtBuffer = new byte[2048]; 79 | aggValue.set(txtBuffer); 80 | } 81 | 82 | @Override 83 | public boolean nextKeyValue() throws IOException { 84 | aggValue.clear(); 85 | boolean hasNext = false; 86 | boolean finished = false; 87 | // Find the start token 88 | while (!finished && (((hasNext = super.nextKeyValue()) == true) || remainder.length() > 0)) { 89 | if (hasNext) { 90 | finished = process(super.getCurrentValue()); 91 | } else { 92 | finished = process(null); 93 | } 94 | if (finished) { 95 | startFound = false; 96 | counter++; 97 | return true; 98 | } 99 | } 100 | // If we have anything loaded in the agg value (and we found a start) 101 | // then we ran out of data before finding the end. 
Just return the 102 | // data we have and if it's not valid, downstream parsing of the data 103 | // will fail. 104 | if (returnPartialMatches && startFound && aggValue.getLength() > 0) { 105 | startFound = false; 106 | counter++; 107 | return true; 108 | } 109 | return false; 110 | } 111 | 112 | /** 113 | * Populates aggValue with the contents of the Text object. 114 | * 115 | * @return true if aggValue is complete, else false and needs more data. 116 | */ 117 | private boolean process(Text t) { 118 | 119 | if (null != t) { 120 | remainder.append(t.toString()); 121 | } 122 | while (remainder.length() > 0) { 123 | if (!startFound) { 124 | // If found, then begin aggregating at the start offset 125 | int start = remainder.indexOf(startToken); 126 | if (-1 != start) { 127 | // Append the start token to the aggregate value 128 | TextUtil.textAppendNoNull(aggValue, 129 | remainder.substring(start, start + startToken.length()), false); 130 | // Remove to the end of the start token from the remainder 131 | remainder.delete(0, start + startToken.length()); 132 | startFound = true; 133 | } else { 134 | // If we are looking for the start and have not found it, then remove 135 | // the bytes 136 | remainder.delete(0, remainder.length()); 137 | } 138 | } else { 139 | // Try to find the end 140 | int end = remainder.indexOf(endToken); 141 | // Also try to find the start 142 | int start = remainder.indexOf(startToken); 143 | if (-1 == end) { 144 | if (returnPartialMatches && start >= 0) { 145 | // End token not found, but another start token was found... 146 | // The amount to copy is up to the beginning of the next start token 147 | TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false); 148 | remainder.delete(0, start); 149 | return true; 150 | } else { 151 | // Not found, aggregate the entire remainder 152 | TextUtil.textAppendNoNull(aggValue, remainder.toString(), false); 153 | // Delete all chars from remainder 154 | remainder.delete(0, remainder.length()); 155 | } 156 | } else { 157 | if (returnPartialMatches && start >= 0 && start < end) { 158 | // We found the end token, but found another start token first, so 159 | // deal with that. 160 | TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false); 161 | remainder.delete(0, start); 162 | return true; 163 | } else { 164 | // END_TOKEN was found. Extract to the end of END_TOKEN 165 | TextUtil.textAppendNoNull(aggValue, remainder.substring(0, end + endToken.length()), 166 | false); 167 | // Remove from remainder up to the end of END_TOKEN 168 | remainder.delete(0, end + endToken.length()); 169 | return true; 170 | } 171 | } 172 | } 173 | } 174 | return false; 175 | } 176 | 177 | } 178 | -------------------------------------------------------------------------------- /ingest/src/test/java/org/apache/accumulo/examples/wikisearch/iterator/GlobalIndexUidTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.iterator; 18 | 19 | import static org.junit.Assert.assertTrue; 20 | 21 | import java.util.ArrayList; 22 | import java.util.Collections; 23 | import java.util.List; 24 | import java.util.UUID; 25 | 26 | import org.apache.accumulo.core.client.IteratorSetting; 27 | import org.apache.accumulo.core.data.Key; 28 | import org.apache.accumulo.core.data.Value; 29 | import org.apache.accumulo.core.iterators.Combiner; 30 | import org.apache.accumulo.examples.wikisearch.protobuf.Uid; 31 | import org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.Builder; 32 | import org.apache.log4j.Level; 33 | import org.apache.log4j.Logger; 34 | import org.junit.Before; 35 | import org.junit.Test; 36 | 37 | public class GlobalIndexUidTest { 38 | private GlobalIndexUidCombiner combiner; 39 | private List values; 40 | 41 | @Before 42 | public void setup() throws Exception { 43 | combiner = new GlobalIndexUidCombiner(); 44 | combiner.init(null, Collections.singletonMap("all", "true"), null); 45 | values = new ArrayList(); 46 | } 47 | 48 | private Uid.List.Builder createNewUidList() { 49 | return Uid.List.newBuilder(); 50 | } 51 | 52 | @Test 53 | public void testSingleUid() { 54 | Builder b = createNewUidList(); 55 | b.setCOUNT(1); 56 | b.setIGNORE(false); 57 | b.addUID(UUID.randomUUID().toString()); 58 | Uid.List uidList = b.build(); 59 | Value val = new Value(uidList.toByteArray()); 60 | values.add(val); 61 | Value result = combiner.reduce(new Key(), values.iterator()); 62 | assertTrue(val.compareTo(result.get()) == 0); 63 | } 64 | 65 | @Test 66 | public void testLessThanMax() throws Exception { 67 | List savedUUIDs = new ArrayList(); 68 | for (int i = 0; i < GlobalIndexUidCombiner.MAX - 1; i++) { 69 | Builder b = createNewUidList(); 70 | b.setIGNORE(false); 71 | String uuid = UUID.randomUUID().toString(); 72 | savedUUIDs.add(uuid); 73 | b.setCOUNT(i); 74 | b.addUID(uuid); 75 | Uid.List uidList = b.build(); 76 | Value val = new Value(uidList.toByteArray()); 77 | values.add(val); 78 | } 79 | Value result = combiner.reduce(new Key(), values.iterator()); 80 | Uid.List resultList = Uid.List.parseFrom(result.get()); 81 | assertTrue(resultList.getIGNORE() == false); 82 | assertTrue(resultList.getUIDCount() == (GlobalIndexUidCombiner.MAX - 1)); 83 | List resultListUUIDs = resultList.getUIDList(); 84 | for (String s : savedUUIDs) 85 | assertTrue(resultListUUIDs.contains(s)); 86 | } 87 | 88 | @Test 89 | public void testEqualsMax() throws Exception { 90 | List savedUUIDs = new ArrayList(); 91 | for (int i = 0; i < GlobalIndexUidCombiner.MAX; i++) { 92 | Builder b = createNewUidList(); 93 | b.setIGNORE(false); 94 | String uuid = UUID.randomUUID().toString(); 95 | savedUUIDs.add(uuid); 96 | b.setCOUNT(i); 97 | b.addUID(uuid); 98 | Uid.List uidList = b.build(); 99 | Value val = new Value(uidList.toByteArray()); 100 | values.add(val); 101 | } 102 | Value result = combiner.reduce(new Key(), values.iterator()); 103 | Uid.List resultList = Uid.List.parseFrom(result.get()); 104 | assertTrue(resultList.getIGNORE() == false); 105 | 
assertTrue(resultList.getUIDCount() == (GlobalIndexUidCombiner.MAX)); 106 | List resultListUUIDs = resultList.getUIDList(); 107 | for (String s : savedUUIDs) 108 | assertTrue(resultListUUIDs.contains(s)); 109 | } 110 | 111 | @Test 112 | public void testMoreThanMax() throws Exception { 113 | List savedUUIDs = new ArrayList(); 114 | for (int i = 0; i < GlobalIndexUidCombiner.MAX + 10; i++) { 115 | Builder b = createNewUidList(); 116 | b.setIGNORE(false); 117 | String uuid = UUID.randomUUID().toString(); 118 | savedUUIDs.add(uuid); 119 | b.setCOUNT(1); 120 | b.addUID(uuid); 121 | Uid.List uidList = b.build(); 122 | Value val = new Value(uidList.toByteArray()); 123 | values.add(val); 124 | } 125 | Value result = combiner.reduce(new Key(), values.iterator()); 126 | Uid.List resultList = Uid.List.parseFrom(result.get()); 127 | assertTrue(resultList.getIGNORE() == true); 128 | assertTrue(resultList.getUIDCount() == 0); 129 | assertTrue(resultList.getCOUNT() == (GlobalIndexUidCombiner.MAX + 10)); 130 | } 131 | 132 | @Test 133 | public void testSeenIgnore() throws Exception { 134 | Builder b = createNewUidList(); 135 | b.setIGNORE(true); 136 | b.setCOUNT(0); 137 | Uid.List uidList = b.build(); 138 | Value val = new Value(uidList.toByteArray()); 139 | values.add(val); 140 | b = createNewUidList(); 141 | b.setIGNORE(false); 142 | b.setCOUNT(1); 143 | b.addUID(UUID.randomUUID().toString()); 144 | uidList = b.build(); 145 | val = new Value(uidList.toByteArray()); 146 | values.add(val); 147 | Value result = combiner.reduce(new Key(), values.iterator()); 148 | Uid.List resultList = Uid.List.parseFrom(result.get()); 149 | assertTrue(resultList.getIGNORE() == true); 150 | assertTrue(resultList.getUIDCount() == 0); 151 | assertTrue(resultList.getCOUNT() == 1); 152 | } 153 | 154 | @Test 155 | public void testInvalidValueType() throws Exception { 156 | Combiner comb = new GlobalIndexUidCombiner(); 157 | IteratorSetting setting = new IteratorSetting(1, GlobalIndexUidCombiner.class); 158 | GlobalIndexUidCombiner.setCombineAllColumns(setting, true); 159 | GlobalIndexUidCombiner.setLossyness(setting, true); 160 | comb.init(null, setting.getOptions(), null); 161 | Logger.getLogger(GlobalIndexUidCombiner.class).setLevel(Level.OFF); 162 | Value val = new Value(UUID.randomUUID().toString().getBytes()); 163 | values.add(val); 164 | Value result = comb.reduce(new Key(), values.iterator()); 165 | Uid.List resultList = Uid.List.parseFrom(result.get()); 166 | assertTrue(resultList.getIGNORE() == false); 167 | assertTrue(resultList.getUIDCount() == 0); 168 | assertTrue(resultList.getCOUNT() == 0); 169 | } 170 | 171 | @Test 172 | public void testCount() throws Exception { 173 | UUID uuid = UUID.randomUUID(); 174 | // Collect the same UUID five times. 
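// The combiner deduplicates UIDs, so the result should contain a single
// distinct UID while the COUNT fields sum to 5, as asserted below.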
175 | for (int i = 0; i < 5; i++) { 176 | Builder b = createNewUidList(); 177 | b.setCOUNT(1); 178 | b.setIGNORE(false); 179 | b.addUID(uuid.toString()); 180 | Uid.List uidList = b.build(); 181 | Value val = new Value(uidList.toByteArray()); 182 | values.add(val); 183 | } 184 | Value result = combiner.reduce(new Key(), values.iterator()); 185 | Uid.List resultList = Uid.List.parseFrom(result.get()); 186 | assertTrue(resultList.getIGNORE() == false); 187 | assertTrue(resultList.getUIDCount() == 1); 188 | assertTrue(resultList.getCOUNT() == 5); 189 | 190 | } 191 | 192 | } 193 | -------------------------------------------------------------------------------- /query/src/main/java/org/apache/accumulo/examples/wikisearch/parser/EventFields.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.accumulo.examples.wikisearch.parser; 18 | 19 | import java.nio.ByteBuffer; 20 | import java.util.Collection; 21 | import java.util.Map; 22 | import java.util.Map.Entry; 23 | import java.util.Set; 24 | 25 | import org.apache.accumulo.core.security.ColumnVisibility; 26 | import org.apache.accumulo.examples.wikisearch.parser.EventFields.FieldValue; 27 | 28 | import com.esotericsoftware.kryo.CustomSerialization; 29 | import com.esotericsoftware.kryo.Kryo; 30 | import com.esotericsoftware.kryo.serialize.ArraySerializer; 31 | import com.esotericsoftware.kryo.serialize.IntSerializer; 32 | import com.esotericsoftware.kryo.serialize.StringSerializer; 33 | import com.google.common.collect.HashMultimap; 34 | import com.google.common.collect.Multimap; 35 | import com.google.common.collect.Multiset; 36 | import com.google.common.collect.SetMultimap; 37 | 38 | /** 39 | * Object used to hold the fields in an event. This is a multimap because fields can be repeated. 
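 * Each FieldValue pairs the raw value bytes with the Accumulo ColumnVisibility it was
 * stored under, and the Kryo read/write methods below serialize the multimap compactly.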
40 | */ 41 | public class EventFields implements SetMultimap, CustomSerialization { 42 | 43 | private static boolean kryoInitialized = false; 44 | private static ArraySerializer valueSerializer = null; 45 | 46 | private Multimap map = null; 47 | 48 | public static class FieldValue { 49 | ColumnVisibility visibility; 50 | byte[] value; 51 | 52 | public FieldValue(ColumnVisibility visibility, byte[] value) { 53 | super(); 54 | this.visibility = visibility; 55 | this.value = value; 56 | } 57 | 58 | public ColumnVisibility getVisibility() { 59 | return visibility; 60 | } 61 | 62 | public byte[] getValue() { 63 | return value; 64 | } 65 | 66 | public void setVisibility(ColumnVisibility visibility) { 67 | this.visibility = visibility; 68 | } 69 | 70 | public void setValue(byte[] value) { 71 | this.value = value; 72 | } 73 | 74 | public int size() { 75 | return visibility.flatten().length + value.length; 76 | } 77 | 78 | @Override 79 | public String toString() { 80 | StringBuilder buf = new StringBuilder(); 81 | if (null != visibility) 82 | buf.append(" visibility: ").append(new String(visibility.flatten())); 83 | if (null != value) 84 | buf.append(" value size: ").append(value.length); 85 | if (null != value) 86 | buf.append(" value: ").append(new String(value)); 87 | return buf.toString(); 88 | } 89 | 90 | } 91 | 92 | public EventFields() { 93 | map = HashMultimap.create(); 94 | } 95 | 96 | public int size() { 97 | return map.size(); 98 | } 99 | 100 | public boolean isEmpty() { 101 | return map.isEmpty(); 102 | } 103 | 104 | public boolean containsKey(Object key) { 105 | return map.containsKey(key); 106 | } 107 | 108 | public boolean containsValue(Object value) { 109 | return map.containsValue(value); 110 | } 111 | 112 | public boolean containsEntry(Object key, Object value) { 113 | return map.containsEntry(key, value); 114 | } 115 | 116 | public boolean put(String key, FieldValue value) { 117 | return map.put(key, value); 118 | } 119 | 120 | public boolean remove(Object key, Object value) { 121 | return map.remove(key, value); 122 | } 123 | 124 | public boolean putAll(String key, Iterable values) { 125 | return map.putAll(key, values); 126 | } 127 | 128 | public boolean putAll(Multimap multimap) { 129 | return map.putAll(multimap); 130 | } 131 | 132 | public void clear() { 133 | map.clear(); 134 | } 135 | 136 | public Set keySet() { 137 | return map.keySet(); 138 | } 139 | 140 | public Multiset keys() { 141 | return map.keys(); 142 | } 143 | 144 | public Collection values() { 145 | return map.values(); 146 | } 147 | 148 | public Set get(String key) { 149 | return (Set) map.get(key); 150 | } 151 | 152 | public Set removeAll(Object key) { 153 | return (Set) map.removeAll(key); 154 | } 155 | 156 | public Set replaceValues(String key, Iterable values) { 157 | return (Set) map.replaceValues(key, values); 158 | } 159 | 160 | public Set> entries() { 161 | return (Set>) map.entries(); 162 | } 163 | 164 | public Map> asMap() { 165 | return map.asMap(); 166 | } 167 | 168 | public int getByteSize() { 169 | int count = 0; 170 | for (Entry e : map.entries()) { 171 | count += e.getKey().getBytes().length + e.getValue().size(); 172 | } 173 | return count; 174 | } 175 | 176 | @Override 177 | public String toString() { 178 | StringBuilder buf = new StringBuilder(); 179 | for (Entry entry : map.entries()) { 180 | buf.append("\tkey: ").append(entry.getKey()).append(" -> ").append(entry.getValue().toString()).append("\n"); 181 | } 182 | return buf.toString(); 183 | } 184 | 185 | public static synchronized void 
initializeKryo(Kryo kryo) { 186 | if (kryoInitialized) 187 | return; 188 | valueSerializer = new ArraySerializer(kryo); 189 | valueSerializer.setDimensionCount(1); 190 | valueSerializer.setElementsAreSameType(true); 191 | valueSerializer.setCanBeNull(false); 192 | valueSerializer.setElementsCanBeNull(false); 193 | kryo.register(byte[].class, valueSerializer); 194 | kryoInitialized = true; 195 | } 196 | 197 | public void readObjectData(Kryo kryo, ByteBuffer buf) { 198 | if (!kryoInitialized) 199 | EventFields.initializeKryo(kryo); 200 | // Read in the number of map entries 201 | int entries = IntSerializer.get(buf, true); 202 | for (int i = 0; i < entries; i++) { 203 | // Read in the key 204 | String key = StringSerializer.get(buf); 205 | // Read in the fields in the value 206 | ColumnVisibility vis = new ColumnVisibility(valueSerializer.readObjectData(buf, byte[].class)); 207 | byte[] value = valueSerializer.readObjectData(buf, byte[].class); 208 | map.put(key, new FieldValue(vis, value)); 209 | } 210 | 211 | } 212 | 213 | public void writeObjectData(Kryo kryo, ByteBuffer buf) { 214 | if (!kryoInitialized) 215 | EventFields.initializeKryo(kryo); 216 | // Write out the number of entries; 217 | IntSerializer.put(buf, map.size(), true); 218 | for (Entry entry : map.entries()) { 219 | // Write the key 220 | StringSerializer.put(buf, entry.getKey()); 221 | // Write the fields in the value 222 | valueSerializer.writeObjectData(buf, entry.getValue().getVisibility().flatten()); 223 | valueSerializer.writeObjectData(buf, entry.getValue().getValue()); 224 | } 225 | } 226 | 227 | } 228 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.accumulo.examples.wikisearch.ingest; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.accumulo.core.client.AccumuloException; 22 | import org.apache.accumulo.core.client.AccumuloSecurityException; 23 | import org.apache.accumulo.core.client.Connector; 24 | import org.apache.accumulo.core.client.Instance; 25 | import org.apache.accumulo.core.client.ZooKeeperInstance; 26 | import org.apache.commons.lang.StringUtils; 27 | import org.apache.hadoop.conf.Configuration; 28 | import org.apache.hadoop.fs.Path; 29 | import org.apache.hadoop.util.ReflectionUtils; 30 | import org.apache.lucene.analysis.Analyzer; 31 | 32 | public class WikipediaConfiguration { 33 | public final static String INSTANCE_NAME = "wikipedia.accumulo.instance_name"; 34 | public final static String USER = "wikipedia.accumulo.user"; 35 | public final static String PASSWORD = "wikipedia.accumulo.password"; 36 | public final static String TABLE_NAME = "wikipedia.accumulo.table"; 37 | 38 | public final static String ZOOKEEPERS = "wikipedia.accumulo.zookeepers"; 39 | 40 | public final static String NAMESPACES_FILENAME = "wikipedia.namespaces.filename"; 41 | public final static String LANGUAGES_FILENAME = "wikipedia.languages.filename"; 42 | public final static String WORKING_DIRECTORY = "wikipedia.ingest.working"; 43 | 44 | public final static String ANALYZER = "wikipedia.index.analyzer"; 45 | 46 | public final static String NUM_PARTITIONS = "wikipedia.ingest.partitions"; 47 | 48 | public final static String NUM_GROUPS = "wikipedia.ingest.groups"; 49 | 50 | public final static String PARTITIONED_ARTICLES_DIRECTORY = "wikipedia.partitioned.directory"; 51 | 52 | public final static String RUN_PARTITIONER = "wikipedia.run.partitioner"; 53 | public final static String RUN_INGEST = "wikipedia.run.ingest"; 54 | public final static String BULK_INGEST = "wikipedia.bulk.ingest"; 55 | public final static String BULK_INGEST_DIR = "wikipedia.bulk.ingest.dir"; 56 | public final static String BULK_INGEST_FAILURE_DIR = "wikipedia.bulk.ingest.failure.dir"; 57 | public final static String BULK_INGEST_BUFFER_SIZE = "wikipedia.bulk.ingest.buffer.size"; 58 | public final static String PARTITIONED_INPUT_MIN_SPLIT_SIZE = "wikipedia.min.input.split.size"; 59 | 60 | public static String getUser(Configuration conf) { 61 | return conf.get(USER); 62 | } 63 | 64 | public static byte[] getPassword(Configuration conf) { 65 | String pass = conf.get(PASSWORD); 66 | if (pass == null) { 67 | return null; 68 | } 69 | return pass.getBytes(); 70 | } 71 | 72 | public static String getTableName(Configuration conf) { 73 | String tablename = conf.get(TABLE_NAME); 74 | if (tablename == null) { 75 | throw new RuntimeException("No data table name specified in " + TABLE_NAME); 76 | } 77 | return tablename; 78 | } 79 | 80 | public static String getInstanceName(Configuration conf) { 81 | return conf.get(INSTANCE_NAME); 82 | } 83 | 84 | public static String getZookeepers(Configuration conf) { 85 | String zookeepers = conf.get(ZOOKEEPERS); 86 | if (zookeepers == null) { 87 | throw new RuntimeException("No zookeepers specified in " + ZOOKEEPERS); 88 | } 89 | return zookeepers; 90 | } 91 | 92 | public static Path getNamespacesFile(Configuration conf) { 93 | String filename = conf.get(NAMESPACES_FILENAME, 94 | new Path(getWorkingDirectory(conf), "namespaces.dat").toString()); 95 | return new Path(filename); 96 | } 97 | 98 | public static Path getLanguagesFile(Configuration conf) { 99 | String filename = conf.get(LANGUAGES_FILENAME, 
100 |         new Path(getWorkingDirectory(conf), "languages.txt").toString());
101 |     return new Path(filename);
102 |   }
103 | 
104 |   public static Path getWorkingDirectory(Configuration conf) {
105 |     String filename = conf.get(WORKING_DIRECTORY);
106 |     return new Path(filename);
107 |   }
108 | 
109 |   public static Connector getConnector(Configuration conf)
110 |       throws AccumuloException, AccumuloSecurityException {
111 |     return getInstance(conf).getConnector(getUser(conf), getPassword(conf));
112 |   }
113 | 
114 |   public static Instance getInstance(Configuration conf) {
115 |     return new ZooKeeperInstance(getInstanceName(conf), getZookeepers(conf));
116 |   }
117 | 
118 |   public static int getNumPartitions(Configuration conf) {
119 |     return conf.getInt(NUM_PARTITIONS, 25);
120 |   }
121 | 
122 |   public static int getNumGroups(Configuration conf) {
123 |     return conf.getInt(NUM_GROUPS, 1);
124 |   }
125 | 
126 |   public static Path getPartitionedArticlesPath(Configuration conf) {
127 |     return new Path(conf.get(PARTITIONED_ARTICLES_DIRECTORY));
128 |   }
129 | 
130 |   public static long getMinInputSplitSize(Configuration conf) {
131 |     return conf.getLong(PARTITIONED_INPUT_MIN_SPLIT_SIZE, 1L << 27); // 128 MB default
132 |   }
133 | 
134 |   public static boolean runPartitioner(Configuration conf) {
135 |     return conf.getBoolean(RUN_PARTITIONER, false);
136 |   }
137 | 
138 |   public static boolean runIngest(Configuration conf) {
139 |     return conf.getBoolean(RUN_INGEST, true);
140 |   }
141 | 
142 |   public static boolean bulkIngest(Configuration conf) {
143 |     return conf.getBoolean(BULK_INGEST, true);
144 |   }
145 | 
146 |   public static String bulkIngestDir(Configuration conf) {
147 |     return conf.get(BULK_INGEST_DIR);
148 |   }
149 | 
150 |   public static String bulkIngestFailureDir(Configuration conf) {
151 |     return conf.get(BULK_INGEST_FAILURE_DIR);
152 |   }
153 | 
154 |   public static long bulkIngestBufferSize(Configuration conf) {
155 |     return conf.getLong(BULK_INGEST_BUFFER_SIZE, 1L << 28); // 256 MB default
156 |   }
157 | 
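158 |   // Illustrative example of calling the typed-property helper below (the variable
159 |   //   name is arbitrary): String dir = WikipediaConfiguration.isNull(conf, BULK_INGEST_DIR, String.class);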
160 |   /**
161 |    * Helper method to get a required, typed property from the Hadoop configuration.
162 |    * 
163 |    * @throws IllegalArgumentException
164 |    *           if the property is not defined, null, or empty, or if resultClass is not handled
165 |    * @return the value of the property, converted to resultClass
166 |    */
167 |   @SuppressWarnings("unchecked")
168 |   public static <T> T isNull(Configuration conf, String propertyName, Class<T> resultClass) {
169 |     String p = conf.get(propertyName);
170 |     if (StringUtils.isEmpty(p)) {
171 |       throw new IllegalArgumentException(propertyName + " must be specified");
172 |     }
173 | 
174 |     if (resultClass.equals(String.class)) {
175 |       return (T) p;
176 |     } else if (resultClass.equals(String[].class)) {
177 |       return (T) conf.getStrings(propertyName);
178 |     } else if (resultClass.equals(Boolean.class)) {
179 |       return (T) Boolean.valueOf(p);
180 |     } else if (resultClass.equals(Long.class)) {
181 |       return (T) Long.valueOf(p);
182 |     } else if (resultClass.equals(Integer.class)) {
183 |       return (T) Integer.valueOf(p);
184 |     } else if (resultClass.equals(Float.class)) {
185 |       return (T) Float.valueOf(p);
186 |     } else if (resultClass.equals(Double.class)) {
187 |       return (T) Double.valueOf(p);
188 |     } else {
189 |       throw new IllegalArgumentException(resultClass.getSimpleName() + " is unhandled.");
190 |     }
191 | 
192 |   }
193 | 
194 | }
195 | 
--------------------------------------------------------------------------------
/query/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/OptimizedQueryIterator.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package org.apache.accumulo.examples.wikisearch.iterator;
18 | 
19 | import java.io.IOException;
20 | import java.util.Collection;
21 | import java.util.HashMap;
22 | import java.util.HashSet;
23 | import java.util.Map;
24 | 
25 | import org.apache.accumulo.core.data.ByteSequence;
26 | import org.apache.accumulo.core.data.Key;
27 | import org.apache.accumulo.core.data.PartialKey;
28 | import org.apache.accumulo.core.data.Range;
29 | import org.apache.accumulo.core.data.Value;
30 | import org.apache.accumulo.core.iterators.IteratorEnvironment;
31 | import org.apache.accumulo.core.iterators.OptionDescriber;
32 | import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
33 | import org.apache.log4j.Logger;
34 | 
35 | /**
36 |  * This iterator internally uses the BooleanLogicIterator to find event UIDs in the field index portion of the partition, and the EvaluatingIterator to
37 |  * evaluate those events against an expression. The key and value emitted from this iterator are the ones produced by the EvaluatingIterator.
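38 |  * <p>
39 |  * Illustrative scan-time setup (variable names and the priority are examples, not taken from this project):
40 |  * <pre>
41 |  * IteratorSetting cfg = new IteratorSetting(30, OptimizedQueryIterator.class);
42 |  * cfg.addOption(EvaluatingIterator.QUERY_OPTION, queryExpression);
43 |  * cfg.addOption(BooleanLogicIterator.FIELD_INDEX_QUERY, fieldIndexQuery);
44 |  * scanner.addScanIterator(cfg);
45 |  * </pre>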
46 |  */
47 | public class OptimizedQueryIterator implements SortedKeyValueIterator<Key,Value>, OptionDescriber {
48 | 
49 |   private static Logger log = Logger.getLogger(OptimizedQueryIterator.class);
50 |   private EvaluatingIterator event = null;
51 |   private SortedKeyValueIterator<Key,Value> index = null;
52 |   private Key key = null;
53 |   private Value value = null;
54 |   private boolean eventSpecificRange = false;
55 | 
56 |   public IteratorOptions describeOptions() {
57 |     Map<String,String> options = new HashMap<String,String>();
58 |     options.put(EvaluatingIterator.QUERY_OPTION, "full query expression");
59 |     options.put(BooleanLogicIterator.FIELD_INDEX_QUERY, "modified query for the field index query portion");
60 |     options.put(ReadAheadIterator.QUEUE_SIZE, "parallel queue size");
61 |     options.put(ReadAheadIterator.TIMEOUT, "parallel iterator timeout");
62 |     return new IteratorOptions(getClass().getSimpleName(), "evaluates event objects against an expression using the field index", options, null);
63 |   }
64 | 
65 |   public boolean validateOptions(Map<String,String> options) {
66 |     if (options.containsKey(EvaluatingIterator.QUERY_OPTION) && options.containsKey(BooleanLogicIterator.FIELD_INDEX_QUERY)) {
67 |       return true;
68 |     }
69 |     return false;
70 |   }
71 | 
72 |   public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
73 |     if (!validateOptions(options)) {
74 |       throw new IllegalArgumentException("Invalid options");
75 |     }
76 | 
77 |     // Setup the EvaluatingIterator
78 |     event = new EvaluatingIterator();
79 |     event.init(source.deepCopy(env), options, env);
80 | 
81 |     // If queue size and timeout are set, then use the read ahead iterator
82 |     if (options.containsKey(ReadAheadIterator.QUEUE_SIZE) && options.containsKey(ReadAheadIterator.TIMEOUT)) {
83 |       BooleanLogicIterator bli = new BooleanLogicIterator();
84 |       bli.init(source, options, env);
85 |       index = new ReadAheadIterator();
86 |       index.init(bli, options, env);
87 |     } else {
88 |       index = new BooleanLogicIterator();
89 |       // index.setDebug(Level.DEBUG);
90 |       index.init(source, options, env);
91 |     }
92 | 
93 |   }
94 | 
95 |   public OptimizedQueryIterator() {}
96 | 
97 |   public OptimizedQueryIterator(OptimizedQueryIterator other, IteratorEnvironment env) {
98 |     this.event = other.event;
99 |     this.index = other.index;
100 |   }
101 | 
102 |   public SortedKeyValueIterator<Key,Value> deepCopy(IteratorEnvironment env) {
103 |     return new OptimizedQueryIterator(this, env);
104 |   }
105 | 
106 |   public Key getTopKey() {
107 |     if (log.isDebugEnabled()) {
108 |       log.debug("getTopKey: " + key);
109 |     }
110 |     return key;
111 |   }
112 | 
113 |   public Value getTopValue() {
114 |     if (log.isDebugEnabled()) {
115 |       log.debug("getTopValue: " + value);
116 |     }
117 |     return value;
118 |   }
119 | 
120 |   public boolean hasTop() {
121 |     if (log.isDebugEnabled()) {
122 |       log.debug("hasTop: returned: " + (key != null));
123 |     }
124 |     return (key != null);
125 |   }
126 | 
127 |   public void next() throws IOException {
128 |     if (log.isDebugEnabled()) {
129 |       log.debug("next");
130 |     }
131 |     if (key != null) {
132 |       key = null;
133 |       value = null;
134 |     }
135 | 
136 |     if (eventSpecificRange) {
137 |       // Then this will probably return nothing
138 |       event.next();
139 |       if (event.hasTop()) {
140 |         key = event.getTopKey();
141 |         value = event.getTopValue();
142 |       }
143 |     } else {
144 | 
145 |       do {
146 |         index.next();
147 |         // If the index has a match, then seek the event to the key
148 |         if (index.hasTop()) {
149 |           Key eventKey = index.getTopKey();
150 |           Key endKey = eventKey.followingKey(PartialKey.ROW_COLFAM);
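151 |           // The index's top key identifies a single event: its row is the partition and its
152 |           // column family is the event id, so build a range covering just that event's columns.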
153 |           Key startKey = new Key(eventKey.getRow(), eventKey.getColumnFamily());
154 |           Range eventRange = new Range(startKey, endKey);
155 |           HashSet<ByteSequence> cf = new HashSet<ByteSequence>();
156 |           cf.add(eventKey.getColumnFamilyData());
157 |           event.seek(eventRange, cf, true);
158 |           if (event.hasTop()) {
159 |             key = event.getTopKey();
160 |             value = event.getTopValue();
161 |           }
162 |         }
163 |       } while (key == null && index.hasTop());
164 |     }
165 |     // Sanity check. Make sure key and value are either both null or both non-null
166 |     if (!((key == null && value == null) || (key != null && value != null))) {
167 |       log.warn("Key: " + ((key == null) ? "null" : key.toString()));
168 |       log.warn("Value: " + ((value == null) ? "null" : value.toString()));
169 |       throw new IOException("Return values are inconsistent");
170 |     }
171 | 
172 |   }
173 | 
174 |   public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
175 |     if (log.isDebugEnabled()) {
176 |       log.debug("seek, range:" + range);
177 |     }
178 |     // Test the range to see if it is event specific.
179 |     if (null != range.getEndKey() && range.getEndKey().getColumnFamily() != null && range.getEndKey().getColumnFamily().getLength() != 0) {
180 |       if (log.isDebugEnabled()) {
181 |         log.debug("Jumping straight to the event");
182 |       }
183 |       // This range is for a specific event. We don't need the index iterator to find it; we can just
184 |       // seek to it with the event iterator and evaluate it.
185 |       eventSpecificRange = true;
186 |       event.seek(range, columnFamilies, inclusive);
187 |       if (event.hasTop()) {
188 |         key = event.getTopKey();
189 |         value = event.getTopValue();
190 |       }
191 |     } else {
192 |       if (log.isDebugEnabled()) {
193 |         log.debug("Using BooleanLogicIteratorJexl");
194 |       }
195 |       // Seek the boolean logic iterator
196 |       index.seek(range, columnFamilies, inclusive);
197 | 
198 |       // If the index has a match, then seek the event to the key
199 |       if (index.hasTop()) {
200 |         Key eventKey = index.getTopKey();
201 |         // Range eventRange = new Range(eventKey, eventKey);
202 |         Range eventRange = new Range(eventKey.getRow());
203 |         HashSet<ByteSequence> cf = new HashSet<ByteSequence>();
204 |         cf.add(eventKey.getColumnFamilyData());
205 |         event.seek(eventRange, cf, true);
206 |         if (event.hasTop()) {
207 |           key = event.getTopKey();
208 |           value = event.getTopValue();
209 |         } else {
210 |           next();
211 |         }
212 |       }
213 |     }
214 |   }
215 | }
216 | 
--------------------------------------------------------------------------------
/query/src/test/hadoop1/org/apache/accumulo/examples/wikisearch/logic/TestQueryLogic.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package org.apache.accumulo.examples.wikisearch.logic;
18 | 
19 | import static org.junit.Assert.assertEquals;
20 | 
21 | import java.io.File;
22 | import java.io.IOException;
23 | import java.net.URL;
24 | import java.util.ArrayList;
25 | import java.util.Collections;
26 | import java.util.HashMap;
27 | import java.util.List;
28 | import java.util.Map.Entry;
29 | 
30 | import junit.framework.Assert;
31 | 
32 | import org.apache.accumulo.core.client.BatchWriter;
33 | import org.apache.accumulo.core.client.Connector;
34 | import org.apache.accumulo.core.client.MutationsRejectedException;
35 | import org.apache.accumulo.core.client.Scanner;
36 | import org.apache.accumulo.core.client.mock.MockInstance;
37 | import org.apache.accumulo.core.client.security.tokens.PasswordToken;
38 | import org.apache.accumulo.core.data.Key;
39 | import org.apache.accumulo.core.data.Mutation;
40 | import org.apache.accumulo.core.data.Range;
41 | import org.apache.accumulo.core.data.Value;
42 | import org.apache.accumulo.core.security.Authorizations;
43 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaConfiguration;
44 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester;
45 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit;
46 | import org.apache.accumulo.examples.wikisearch.ingest.WikipediaMapper;
47 | import org.apache.accumulo.examples.wikisearch.parser.RangeCalculator;
48 | import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
49 | import org.apache.accumulo.examples.wikisearch.sample.Document;
50 | import org.apache.accumulo.examples.wikisearch.sample.Field;
51 | import org.apache.accumulo.examples.wikisearch.sample.Results;
52 | import org.apache.hadoop.conf.Configuration;
53 | import org.apache.hadoop.fs.Path;
54 | import org.apache.hadoop.fs.RawLocalFileSystem;
55 | import org.apache.hadoop.io.LongWritable;
56 | import org.apache.hadoop.io.Text;
57 | import org.apache.hadoop.mapreduce.Mapper;
58 | import org.apache.hadoop.mapreduce.OutputCommitter;
59 | import org.apache.hadoop.mapreduce.RecordWriter;
60 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
61 | import org.apache.hadoop.mapreduce.TaskAttemptID;
62 | import org.apache.hadoop.mapreduce.lib.input.FileSplit;
63 | import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
64 | import org.apache.log4j.Level;
65 | import org.apache.log4j.Logger;
66 | import org.junit.Before;
67 | import org.junit.Test;
68 | 
69 | public class TestQueryLogic {
70 | 
71 |   private static final String METADATA_TABLE_NAME = "wikiMetadata";
72 | 
73 |   private static final String TABLE_NAME = "wiki";
74 | 
75 |   private static final String INDEX_TABLE_NAME = "wikiIndex";
76 | 
77 |   private static final String RINDEX_TABLE_NAME = "wikiReverseIndex";
78 | 
79 |   private static final String[] TABLE_NAMES = {METADATA_TABLE_NAME, TABLE_NAME, RINDEX_TABLE_NAME, INDEX_TABLE_NAME};
80 | 
81 |   private class MockAccumuloRecordWriter extends RecordWriter<Text,Mutation> {
82 |     @Override
83 |     public void write(Text key, Mutation value) throws IOException, InterruptedException {
84 |       try {
85 |         writerMap.get(key).addMutation(value);
86 |       } catch (MutationsRejectedException e) {
87 |         throw new IOException("Error adding mutation", e);
88 |       }
89 |     }
90 | 
91 |     @Override
92 |     public void close(TaskAttemptContext context) throws IOException, InterruptedException {
93 |       try {
94 |         for (BatchWriter w : writerMap.values()) {
95 |           w.flush();
96 |           w.close();
97 |         }
98 |       } catch (MutationsRejectedException e) {
99 |         throw new IOException("Error closing Batch Writer", e);
100 |       }
101 |     }
102 | 
103 |   }
104 | 
105 |   private Connector c = null;
106 |   private Configuration conf = new Configuration();
107 |   private HashMap<Text,BatchWriter> writerMap = new HashMap<Text,BatchWriter>();
108 |   private QueryLogic table = null;
109 | 
110 |   @Before
111 |   public void setup() throws Exception {
112 | 
113 |     Logger.getLogger(AbstractQueryLogic.class).setLevel(Level.DEBUG);
114 |     Logger.getLogger(QueryLogic.class).setLevel(Level.DEBUG);
115 |     Logger.getLogger(RangeCalculator.class).setLevel(Level.DEBUG);
116 | 
117 |     conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
118 |     conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
119 |     conf.set(WikipediaConfiguration.TABLE_NAME, TABLE_NAME);
120 |     conf.set(WikipediaConfiguration.NUM_PARTITIONS, "1");
121 |     conf.set(WikipediaConfiguration.NUM_GROUPS, "1");
122 | 
123 |     MockInstance i = new MockInstance();
124 |     c = i.getConnector("root", new PasswordToken(""));
125 |     WikipediaIngester.createTables(c.tableOperations(), TABLE_NAME, false);
126 |     for (String table : TABLE_NAMES) {
127 |       writerMap.put(new Text(table), c.createBatchWriter(table, 1000L, 1000L, 1));
128 |     }
129 | 
130 |     TaskAttemptID id = new TaskAttemptID();
131 |     TaskAttemptContext context = new TaskAttemptContext(conf, id);
132 | 
133 |     RawLocalFileSystem fs = new RawLocalFileSystem();
134 |     fs.setConf(conf);
135 | 
136 |     URL url = ClassLoader.getSystemResource("enwiki-20110901-001.xml");
137 |     Assert.assertNotNull(url);
138 |     File data = new File(url.toURI());
139 |     Path tmpFile = new Path(data.getAbsolutePath());
140 | 
141 |     // Setup the Mapper
142 |     WikipediaInputSplit split = new WikipediaInputSplit(new FileSplit(tmpFile, 0, fs.pathToFile(tmpFile).length(), null), 0);
143 |     AggregatingRecordReader rr = new AggregatingRecordReader();
144 |     Path ocPath = new Path(tmpFile, "oc");
145 |     OutputCommitter oc = new FileOutputCommitter(ocPath, context);
146 |     fs.deleteOnExit(ocPath);
147 |     StandaloneStatusReporter sr = new StandaloneStatusReporter();
148 |     rr.initialize(split, context);
149 |     MockAccumuloRecordWriter rw = new MockAccumuloRecordWriter();
150 |     WikipediaMapper mapper = new WikipediaMapper();
151 | 
152 |     // Load data into Mock Accumulo
153 |     Mapper<LongWritable,Text,Text,Mutation>.Context con = mapper.new Context(conf, id, rr, rw, oc, sr, split);
154 |     mapper.run(con);
155 | 
156 |     // Flush and close record writers.
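157 |     // Closing the mock record writer flushes and closes every table's BatchWriter,
158 |     // making the mapper's mutations visible to the scans performed by the tests below.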
159 |     rw.close(context);
160 | 
161 |     table = new QueryLogic();
162 |     table.setMetadataTableName(METADATA_TABLE_NAME);
163 |     table.setTableName(TABLE_NAME);
164 |     table.setIndexTableName(INDEX_TABLE_NAME);
165 |     table.setReverseIndexTableName(RINDEX_TABLE_NAME);
166 |     table.setUseReadAheadIterator(false);
167 |     table.setUnevaluatedFields(Collections.singletonList("TEXT"));
168 |   }
169 | 
170 |   void debugQuery(String tableName) throws Exception {
171 |     Scanner s = c.createScanner(tableName, new Authorizations("all"));
172 |     Range r = new Range();
173 |     s.setRange(r);
174 |     for (Entry<Key,Value> entry : s)
175 |       System.out.println(entry.getKey().toString() + " " + entry.getValue().toString());
176 |   }
177 | 
178 |   @Test
179 |   public void testTitle() throws Exception {
180 |     Logger.getLogger(AbstractQueryLogic.class).setLevel(Level.OFF);
181 |     Logger.getLogger(RangeCalculator.class).setLevel(Level.OFF);
182 |     List<String> auths = new ArrayList<String>();
183 |     auths.add("enwiki");
184 | 
185 |     Results results = table.runQuery(c, auths, "TITLE == 'asphalt' or TITLE == 'abacus' or TITLE == 'acid' or TITLE == 'acronym'", null, null, null);
186 |     List<Document> docs = results.getResults();
187 |     assertEquals(4, docs.size());
188 | 
189 |     results = table.runQuery(c, auths, "TEXT == 'abacus'", null, null, null);
190 |     docs = results.getResults();
191 |     assertEquals(1, docs.size());
192 |     for (Document doc : docs) {
193 |       System.out.println("id: " + doc.getId());
194 |       for (Field field : doc.getFields())
195 |         System.out.println(field.getFieldName() + " -> " + field.getFieldValue());
196 |     }
197 |   }
198 | 
199 | }
200 | 
--------------------------------------------------------------------------------