├── .asf.yaml ├── .github └── workflows │ └── maven.yaml ├── .gitignore ├── LICENSE ├── README.md ├── phrasecount ├── .travis.yml ├── LICENSE ├── README.md ├── bin │ ├── copy-jars.sh │ ├── load.sh │ ├── mini.sh │ ├── print.sh │ └── run.sh ├── pom.xml └── src │ ├── main │ └── java │ │ └── phrasecount │ │ ├── Application.java │ │ ├── Constants.java │ │ ├── DocumentLoader.java │ │ ├── DocumentObserver.java │ │ ├── PhraseExporter.java │ │ ├── PhraseMap.java │ │ ├── cmd │ │ ├── Load.java │ │ ├── Mini.java │ │ ├── Print.java │ │ ├── Setup.java │ │ └── Split.java │ │ ├── pojos │ │ ├── Counts.java │ │ ├── Document.java │ │ ├── PcKryoFactory.java │ │ └── PhraseAndCounts.java │ │ └── query │ │ ├── PhraseCountTable.java │ │ └── RowTransform.java │ └── test │ ├── java │ └── phrasecount │ │ └── PhraseCounterTest.java │ └── resources │ └── log4j.properties ├── pom.xml ├── stresso ├── .gitignore ├── .travis.yml ├── AUTHORS ├── LICENSE ├── README.md ├── bin │ ├── build.sh │ ├── bulk_load.sh │ ├── compact-ll.sh │ ├── diff.sh │ ├── generate.sh │ ├── load-env.sh │ ├── load.sh │ ├── print.sh │ ├── run-test.sh │ ├── split.sh │ └── unique.sh ├── conf │ ├── .gitignore │ ├── env.sh.example │ ├── fluo-app.properties │ ├── fluo-app.properties.example │ └── log4j.xml ├── pom.xml └── src │ ├── main │ └── java │ │ └── stresso │ │ └── trie │ │ ├── AccumuloUtil.java │ │ ├── CompactLL.java │ │ ├── Constants.java │ │ ├── Diff.java │ │ ├── Generate.java │ │ ├── Init.java │ │ ├── Load.java │ │ ├── Node.java │ │ ├── NodeObserver.java │ │ ├── NumberLoader.java │ │ ├── Print.java │ │ ├── Split.java │ │ ├── StressoConfig.java │ │ ├── StressoObserverProvider.java │ │ └── Unique.java │ └── test │ ├── java │ └── stresso │ │ ├── ITBase.java │ │ ├── TrieBasicIT.java │ │ ├── TrieMapRedIT.java │ │ └── TrieStopLevelIT.java │ └── resources │ └── log4j.properties └── webindex ├── .gitignore ├── .travis.yml ├── AUTHORS ├── LICENSE ├── README.md ├── bin ├── impl │ ├── base.sh │ └── init.sh └── webindex ├── conf ├── 
.gitignore └── examples │ ├── log4j.properties │ ├── webindex-env.sh │ └── webindex.yml ├── contrib ├── webindex-dashboard.json ├── webindex.png └── webindex.svg ├── docs ├── code-guide.md ├── install.md ├── tables.md └── webindex_graphic.png ├── modules ├── core │ ├── pom.xml │ └── src │ │ ├── main │ │ └── java │ │ │ └── webindex │ │ │ └── core │ │ │ ├── Constants.java │ │ │ ├── IndexClient.java │ │ │ ├── WebIndexConfig.java │ │ │ ├── models │ │ │ ├── DomainStats.java │ │ │ ├── Link.java │ │ │ ├── Links.java │ │ │ ├── Page.java │ │ │ ├── Pages.java │ │ │ ├── TopResults.java │ │ │ ├── URL.java │ │ │ ├── UriInfo.java │ │ │ └── export │ │ │ │ ├── DomainUpdate.java │ │ │ │ ├── IndexUpdate.java │ │ │ │ ├── PageUpdate.java │ │ │ │ └── UriUpdate.java │ │ │ └── util │ │ │ └── Pager.java │ │ └── test │ │ ├── java │ │ └── webindex │ │ │ └── core │ │ │ ├── WebIndexConfigTest.java │ │ │ └── models │ │ │ ├── LinkTest.java │ │ │ ├── PageTest.java │ │ │ └── URLTest.java │ │ └── resources │ │ └── log4j.properties ├── data │ ├── pom.xml │ └── src │ │ ├── main │ │ ├── java │ │ │ └── webindex │ │ │ │ ├── data │ │ │ │ ├── CalcSplits.java │ │ │ │ ├── Configure.java │ │ │ │ ├── Copy.java │ │ │ │ ├── FluoApp.java │ │ │ │ ├── Init.java │ │ │ │ ├── LoadHdfs.java │ │ │ │ ├── LoadS3.java │ │ │ │ ├── TestParser.java │ │ │ │ ├── fluo │ │ │ │ │ ├── DomainCombineQ.java │ │ │ │ │ ├── IndexUpdateTranslator.java │ │ │ │ │ ├── PageLoader.java │ │ │ │ │ ├── PageObserver.java │ │ │ │ │ ├── UriCombineQ.java │ │ │ │ │ └── WebindexObservers.java │ │ │ │ ├── spark │ │ │ │ │ ├── IndexEnv.java │ │ │ │ │ ├── IndexStats.java │ │ │ │ │ └── IndexUtil.java │ │ │ │ └── util │ │ │ │ │ ├── ArchiveUtil.java │ │ │ │ │ ├── WARCFileInputFormat.java │ │ │ │ │ └── WARCFileRecordReader.java │ │ │ │ └── serialization │ │ │ │ └── WebindexKryoFactory.java │ │ └── resources │ │ │ └── splits │ │ │ └── accumulo-default.txt │ │ └── test │ │ ├── java │ │ └── webindex │ │ │ └── data │ │ │ ├── SparkTestUtil.java │ │ │ ├── fluo │ │ 
│ └── it │ │ │ │ └── IndexIT.java │ │ │ ├── spark │ │ │ ├── Hex.java │ │ │ ├── IndexEnvTest.java │ │ │ └── IndexUtilTest.java │ │ │ └── util │ │ │ └── ArchiveUtilTest.java │ │ └── resources │ │ ├── data │ │ └── set1 │ │ │ ├── accumulo-data.txt │ │ │ └── fluo-data.txt │ │ ├── log4j.properties │ │ ├── wat-18.warc │ │ └── wat.warc ├── integration │ ├── pom.xml │ └── src │ │ ├── main │ │ └── java │ │ │ └── webindex │ │ │ └── integration │ │ │ ├── DevServer.java │ │ │ ├── DevServerOpts.java │ │ │ └── SampleData.java │ │ └── test │ │ ├── java │ │ └── webindex │ │ │ └── integration │ │ │ └── DevServerIT.java │ │ └── resources │ │ ├── 5-pages.txt │ │ └── log4j.properties └── ui │ ├── .gitignore │ ├── pom.xml │ └── src │ └── main │ ├── java │ └── webindex │ │ └── ui │ │ └── WebServer.java │ └── resources │ ├── assets │ └── img │ │ └── webindex.png │ └── spark │ └── template │ └── freemarker │ ├── 404.ftl │ ├── common │ ├── footer.ftl │ ├── head.ftl │ └── header.ftl │ ├── home.ftl │ ├── links.ftl │ ├── page.ftl │ ├── pages.ftl │ └── top.ftl └── pom.xml /.asf.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. 
See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | # https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features 21 | 22 | github: 23 | description: "Apache Fluo Examples" 24 | homepage: https://fluo.apache.org 25 | labels: 26 | - fluo 27 | - accumulo 28 | - big-data 29 | - hacktoberfest 30 | features: 31 | wiki: false 32 | issues: true 33 | projects: true 34 | 35 | -------------------------------------------------------------------------------- /.github/workflows/maven.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | # 19 | 20 | # This workflow will build a Java project with Maven 21 | # See also: 22 | # https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 23 | 24 | name: CI 25 | 26 | on: 27 | push: 28 | branches: [ '*' ] 29 | pull_request: 30 | branches: [ '*' ] 31 | 32 | jobs: 33 | mvn: 34 | timeout-minutes: 60 35 | runs-on: ubuntu-latest 36 | steps: 37 | - uses: actions/checkout@v4 38 | - name: Set up JDK 11 39 | uses: actions/setup-java@v4 40 | with: 41 | distribution: temurin 42 | java-version: 11 43 | cache: 'maven' 44 | - name: Build with Maven 45 | run: mvn -B -V -e -ntp "-Dstyle.color=always" verify javadoc:jar -DskipITs -Dfindbugs.skip # TODO fix/enable findbugs/ITs 46 | env: 47 | MAVEN_OPTS: -Djansi.force=true 48 | - name: Upload unit test results 49 | if: ${{ failure() }} 50 | uses: actions/upload-artifact@v4 51 | with: 52 | name: surefire-reports 53 | path: ./**/target/surefire-reports/ 54 | if-no-files-found: ignore 55 | - name: Upload integration test results 56 | if: ${{ failure() }} 57 | uses: actions/upload-artifact@v4 58 | with: 59 | name: failsafe-reports 60 | path: ./**/target/failsafe-reports/ 61 | if-no-files-found: ignore 62 | 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | 3 | .settings/ 4 | .classpath 5 | .project 6 | .idea/ 7 | *.iml 8 | 9 | .DS_Store 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 17 | 18 | # Examples for Apache Fluo 19 | 20 | [![Build Status][ti]][tl] [![Apache License][li]][ll] 21 | 22 | [ti]: https://github.com/apache/fluo-examples/workflows/CI/badge.svg 23 | [tl]: https://github.com/apache/fluo-examples/actions 24 | [li]: http://img.shields.io/badge/license-ASL-blue.svg 25 | [ll]: 
https://github.com/apache/fluo-examples/blob/main/LICENSE 26 | 27 | -------------------------------------------------------------------------------- /phrasecount/.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk8 4 | script: mvn verify 5 | notifications: 6 | irc: 7 | channels: 8 | - "chat.freenode.net#fluo" 9 | on_success: always 10 | on_failure: always 11 | use_notice: true 12 | skip_join: true 13 | -------------------------------------------------------------------------------- /phrasecount/bin/copy-jars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | #This script will copy the phrase count jar and its dependencies to the Fluo 19 | #application lib dir 20 | 21 | 22 | if [ "$#" -ne 2 ]; then 23 | echo "Usage : $0 FLUO_HOME PHRASECOUNT_HOME" 24 | exit 1 25 | fi 26 | 27 | FLUO_HOME=$1 28 | PC_HOME=$2 29 | 30 | PC_JAR=$PC_HOME/target/phrasecount-0.0.1-SNAPSHOT.jar 31 | 32 | #build and copy phrasecount jar; stop if the directory change or the build fails 33 | (cd "$PC_HOME" && mvn package -DskipTests) || exit 1 34 | 35 | FLUO_APP_LIB=$FLUO_HOME/apps/phrasecount/lib/ 36 | 37 | cp "$PC_JAR" "$FLUO_APP_LIB" || exit 1 38 | (cd "$PC_HOME" && mvn dependency:copy-dependencies "-DoutputDirectory=$FLUO_APP_LIB") 39 | 40 | -------------------------------------------------------------------------------- /phrasecount/bin/load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | mvn exec:java -Dexec.mainClass=phrasecount.cmd.Load -Dexec.args="${*:1}" -Dexec.classpathScope=test # run phrasecount.cmd.Load through Maven, passing all script arguments as a single exec.args string 19 | -------------------------------------------------------------------------------- /phrasecount/bin/mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | mvn exec:java -Dexec.mainClass=phrasecount.cmd.Mini -Dexec.args="${*:1}" -Dexec.classpathScope=test &>mini.log & # run phrasecount.cmd.Mini in the background; stdout and stderr both go to mini.log 19 | echo "Started Mini in background. Writing output to mini.log." 20 | -------------------------------------------------------------------------------- /phrasecount/bin/print.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | mvn exec:java -Dexec.mainClass=phrasecount.cmd.Print -Dexec.args="${*:1}" -Dexec.classpathScope=test 19 | 20 | -------------------------------------------------------------------------------- /phrasecount/bin/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 19 | PC_HOME=$( cd "$( dirname "$BIN_DIR" )" && pwd ) 20 | 21 | # stop if any command fails 22 | set -e 23 | 24 | if [ "$#" -ne 1 ]; then 25 | echo "Usage : $0 TXT_FILES_DIR" 26 | exit 1 27 | fi 28 | 29 | #set the following to a directory containing text files 30 | TXT_DIR=$1 31 | if [ ! 
-d "$TXT_DIR" ]; then 32 | echo "Document directory $TXT_DIR does not exist" 33 | exit 1 34 | fi 35 | 36 | #ensure $FLUO_HOME is set 37 | if [ -z "$FLUO_HOME" ]; then 38 | echo '$FLUO_HOME must be set!' 39 | exit 1 40 | fi 41 | 42 | #Set application name. $FLUO_APP_NAME is set by fluo-dev and zetten 43 | APP=${FLUO_APP_NAME:-phrasecount} 44 | 45 | #derived variables 46 | APP_PROPS=$FLUO_HOME/apps/$APP/conf/fluo.properties 47 | 48 | if [ ! -f "$FLUO_HOME/conf/fluo.properties" ]; then 49 | echo "Fluo is not configured, exiting." 50 | exit 1 51 | fi 52 | 53 | #remove application if it exists 54 | if [ -d "$FLUO_HOME/apps/$APP" ]; then 55 | echo "Restarting '$APP' application. Errors may be printed if it's not running..." 56 | "$FLUO_HOME/bin/fluo" kill "$APP" || true 57 | rm -rf "$FLUO_HOME/apps/$APP" 58 | fi 59 | 60 | #create new application dir 61 | "$FLUO_HOME/bin/fluo" new "$APP" 62 | 63 | #copy phrasecount jars to Fluo application lib dir 64 | "$PC_HOME/bin/copy-jars.sh" "$FLUO_HOME" "$PC_HOME" 65 | 66 | #Create export table and output Fluo configuration 67 | "$FLUO_HOME/bin/fluo" exec "$APP" phrasecount.cmd.Setup "$APP_PROPS" pcExport >> "$APP_PROPS" 68 | 69 | "$FLUO_HOME/bin/fluo" init "$APP" -f 70 | "$FLUO_HOME/bin/fluo" exec "$APP" org.apache.fluo.recipes.accumulo.cmds.OptimizeTable 71 | "$FLUO_HOME/bin/fluo" start "$APP" 72 | "$FLUO_HOME/bin/fluo" info "$APP" 73 | 74 | #Load data 75 | "$FLUO_HOME/bin/fluo" exec "$APP" phrasecount.cmd.Load "$APP_PROPS" "$TXT_DIR" 76 | 77 | #wait for all notifications to be processed. 
78 | "$FLUO_HOME/bin/fluo" wait "$APP" 79 | 80 | #print phrase counts 81 | "$FLUO_HOME/bin/fluo" exec "$APP" phrasecount.cmd.Print "$APP_PROPS" pcExport 82 | 83 | "$FLUO_HOME/bin/fluo" stop "$APP" 84 | 85 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/Application.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package phrasecount; 19 | 20 | import org.apache.fluo.api.config.FluoConfiguration; 21 | import org.apache.fluo.api.config.ObserverSpecification; 22 | import org.apache.fluo.recipes.accumulo.export.AccumuloExporter; 23 | import org.apache.fluo.recipes.core.export.ExportQueue; 24 | import org.apache.fluo.recipes.core.map.CollisionFreeMap; 25 | import org.apache.fluo.recipes.kryo.KryoSimplerSerializer; 26 | import phrasecount.pojos.Counts; 27 | import phrasecount.pojos.PcKryoFactory; 28 | 29 | import static phrasecount.Constants.EXPORT_QUEUE_ID; 30 | import static phrasecount.Constants.PCM_ID; 31 | 32 | public class Application { 33 | 34 | public static class Options { 35 | public Options(int pcmBuckets, int eqBuckets, String instance, String zooKeepers, String user, 36 | String password, String eTable) { 37 | this.phraseCountMapBuckets = pcmBuckets; 38 | this.exportQueueBuckets = eqBuckets; 39 | this.instance = instance; 40 | this.zookeepers = zooKeepers; 41 | this.user = user; 42 | this.password = password; 43 | this.exportTable = eTable; 44 | 45 | } 46 | 47 | public int phraseCountMapBuckets; 48 | public int exportQueueBuckets; 49 | 50 | public String instance; 51 | public String zookeepers; 52 | public String user; 53 | public String password; 54 | public String exportTable; 55 | } 56 | 57 | /** 58 | * Sets Fluo configuration needed to run the phrase count application 59 | * 60 | * @param fluoConfig configuration that the observer, Kryo factory, collision free map and export queue settings are added to 61 | * @param opts bucket counts plus the Accumulo instance, zookeepers, user, password and export table to configure 62 | */ 63 | public static void configure(FluoConfiguration fluoConfig, Options opts) { 64 | // set up an observer that watches the reference counts of documents. When a document is 65 | // referenced or dereferenced, it will add or subtract phrase counts from a collision free map. 
66 | fluoConfig.addObserver(new ObserverSpecification(DocumentObserver.class.getName())); 67 | 68 | // configure which KryoFactory recipes should use 69 | KryoSimplerSerializer.setKryoFactory(fluoConfig, PcKryoFactory.class); 70 | 71 | // set up a collision free map to combine phrase counts 72 | CollisionFreeMap.configure(fluoConfig, 73 | new CollisionFreeMap.Options(PCM_ID, PhraseMap.PcmCombiner.class, 74 | PhraseMap.PcmUpdateObserver.class, String.class, Counts.class, 75 | opts.phraseCountMapBuckets)); 76 | 77 | AccumuloExporter.Configuration accumuloConfig = new AccumuloExporter.Configuration( 78 | opts.instance, opts.zookeepers, opts.user, opts.password, opts.exportTable); 79 | 80 | // set up an Accumulo export queue to send phrase count updates to an Accumulo table 81 | ExportQueue.Options exportQueueOpts = 82 | new ExportQueue.Options(EXPORT_QUEUE_ID, PhraseExporter.class.getName(), 83 | String.class.getName(), Counts.class.getName(), opts.exportQueueBuckets) 84 | .setExporterConfiguration(accumuloConfig); 85 | ExportQueue.configure(fluoConfig, exportQueueOpts); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/Constants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package phrasecount; 19 | 20 | import org.apache.fluo.api.data.Column; 21 | import org.apache.fluo.recipes.core.types.StringEncoder; 22 | import org.apache.fluo.recipes.core.types.TypeLayer; 23 | 24 | public class Constants { 25 | 26 | // set the encoder to use in one place 27 | public static final TypeLayer TYPEL = new TypeLayer(new StringEncoder()); 28 | 29 | public static final Column INDEX_CHECK_COL = TYPEL.bc().fam("index").qual("check").vis(); 30 | public static final Column INDEX_STATUS_COL = TYPEL.bc().fam("index").qual("status").vis(); 31 | public static final Column DOC_CONTENT_COL = TYPEL.bc().fam("doc").qual("content").vis(); 32 | public static final Column DOC_HASH_COL = TYPEL.bc().fam("doc").qual("hash").vis(); 33 | public static final Column DOC_REF_COUNT_COL = TYPEL.bc().fam("doc").qual("refCount").vis(); 34 | 35 | public static final String EXPORT_QUEUE_ID = "aeq"; 36 | // id of the phrase count map 37 | public static final String PCM_ID = "pcm"; 38 | } 39 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/DocumentLoader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package phrasecount; 19 | 20 | import org.apache.fluo.api.client.Loader; 21 | import org.apache.fluo.api.client.TransactionBase; 22 | import org.apache.fluo.recipes.core.types.TypedTransactionBase; 23 | import phrasecount.pojos.Document; 24 | 25 | import static phrasecount.Constants.DOC_CONTENT_COL; 26 | import static phrasecount.Constants.DOC_HASH_COL; 27 | import static phrasecount.Constants.DOC_REF_COUNT_COL; 28 | import static phrasecount.Constants.INDEX_CHECK_COL; 29 | import static phrasecount.Constants.TYPEL; 30 | 31 | /** 32 | * Executes document load transactions which dedupe and reference count documents. If needed, the 33 | * observer that updates phrase counts is triggered. 34 | */ 35 | public class DocumentLoader implements Loader { 36 | 37 | private Document document; 38 | 39 | public DocumentLoader(Document doc) { 40 | this.document = doc; 41 | } 42 | 43 | @Override 44 | public void load(TransactionBase tx, Context context) throws Exception { 45 | 46 | // TODO Need a strategy for dealing w/ large documents. If a worker processes many large 47 | // documents concurrently, it could cause memory exhaustion. Could break up large documents 48 | // into pieces. However, not sure if the example should be complicated with this. 
49 | 50 | TypedTransactionBase ttx = TYPEL.wrap(tx); 51 | String storedHash = ttx.get().row("uri:" + document.getURI()).col(DOC_HASH_COL).toString(); 52 | 53 | if (storedHash == null || !storedHash.equals(document.getHash())) { 54 | 55 | ttx.mutate().row("uri:" + document.getURI()).col(DOC_HASH_COL).set(document.getHash()); 56 | 57 | Integer refCount = 58 | ttx.get().row("doc:" + document.getHash()).col(DOC_REF_COUNT_COL).toInteger(); 59 | if (refCount == null) { 60 | // this document was never seen before 61 | addNewDocument(ttx, document); 62 | } else { 63 | setRefCount(ttx, document.getHash(), refCount, refCount + 1); 64 | } 65 | 66 | if (storedHash != null) { 67 | decrementRefCount(ttx, storedHash); 68 | } 69 | } 70 | } 71 | 72 | private void setRefCount(TypedTransactionBase tx, String hash, Integer prevRc, int rc) { 73 | tx.mutate().row("doc:" + hash).col(DOC_REF_COUNT_COL).set(rc); 74 | 75 | if (rc == 0 || (rc == 1 && (prevRc == null || prevRc == 0))) { 76 | // setting this triggers DocumentObserver 77 | tx.mutate().row("doc:" + hash).col(INDEX_CHECK_COL).set(); 78 | } 79 | } 80 | 81 | private void decrementRefCount(TypedTransactionBase tx, String hash) { 82 | int rc = tx.get().row("doc:" + hash).col(DOC_REF_COUNT_COL).toInteger(); 83 | setRefCount(tx, hash, rc, rc - 1); // prevRc must be this document's own count, not the newly referenced document's 84 | } 85 | 86 | private void addNewDocument(TypedTransactionBase tx, Document doc) { 87 | setRefCount(tx, doc.getHash(), null, 1); 88 | tx.mutate().row("doc:" + doc.getHash()).col(DOC_CONTENT_COL).set(doc.getContent()); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/PhraseExporter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package phrasecount; 19 | 20 | import java.util.function.Consumer; 21 | 22 | import org.apache.accumulo.core.data.Mutation; 23 | import org.apache.fluo.recipes.accumulo.export.AccumuloExporter; 24 | import org.apache.fluo.recipes.core.export.SequencedExport; 25 | import phrasecount.pojos.Counts; 26 | import phrasecount.query.PhraseCountTable; 27 | 28 | /** 29 | * Export code that converts {@link Counts} objects from the export queue to Mutations that are 30 | * written to Accumulo. The export key is the phrase and the export sequence number is passed on to the mutation. 31 | */ 32 | public class PhraseExporter extends AccumuloExporter<String, Counts> { 33 | 34 | @Override 35 | protected void translate(SequencedExport<String, Counts> export, Consumer<Mutation> consumer) { 36 | String phrase = export.getKey(); 37 | long seq = export.getSequence(); 38 | Counts counts = export.getValue(); 39 | consumer.accept(PhraseCountTable.createMutation(phrase, seq, counts)); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/PhraseMap.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package phrasecount; 19 | 20 | import java.util.Iterator; 21 | import java.util.Optional; 22 | 23 | import com.google.common.collect.Iterators; 24 | import org.apache.fluo.api.client.TransactionBase; 25 | import org.apache.fluo.api.observer.Observer.Context; 26 | import org.apache.fluo.recipes.core.export.Export; 27 | import org.apache.fluo.recipes.core.export.ExportQueue; 28 | import org.apache.fluo.recipes.core.map.CollisionFreeMap; 29 | import org.apache.fluo.recipes.core.map.Combiner; 30 | import org.apache.fluo.recipes.core.map.Update; 31 | import org.apache.fluo.recipes.core.map.UpdateObserver; 32 | import phrasecount.pojos.Counts; 33 | 34 | import static phrasecount.Constants.EXPORT_QUEUE_ID; 35 | 36 | /** 37 | * This class contains all of the code related to the {@link CollisionFreeMap} that keeps track of 38 | * phrase counts. 39 | */ 40 | public class PhraseMap { 41 | 42 | /** 43 | * A combiner for the {@link CollisionFreeMap} that stores phrase counts. The 44 | * {@link CollisionFreeMap} calls this combiner when it lazily updates the counts for a phrase. 
45 | */ 46 | public static class PcmCombiner implements Combiner { 47 | 48 | @Override 49 | public Optional combine(String key, Iterator updates) { 50 | Counts sum = new Counts(0, 0); 51 | while (updates.hasNext()) { 52 | sum = sum.add(updates.next()); 53 | } 54 | return Optional.of(sum); 55 | } 56 | } 57 | 58 | /** 59 | * This class is notified when the {@link CollisionFreeMap} used to store phrase counts updates a 60 | * phrase count. Updates are placed on an Accumulo export queue to be exported to the table storing 61 | * phrase counts for query. 62 | */ 63 | public static class PcmUpdateObserver extends UpdateObserver { 64 | 65 | private ExportQueue pcEq; 66 | 67 | @Override 68 | public void init(String mapId, Context observerContext) throws Exception { 69 | pcEq = ExportQueue.getInstance(EXPORT_QUEUE_ID, observerContext.getAppConfiguration()); 70 | } 71 | 72 | @Override 73 | public void updatingValues(TransactionBase tx, Iterator> updates) { 74 | Iterator> exports = 75 | Iterators.transform(updates, u -> new Export<>(u.getKey(), u.getNewValue().get())); 76 | pcEq.addAll(tx, exports); 77 | } 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/cmd/Load.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package phrasecount.cmd; 19 | 20 | import java.io.File; 21 | import java.nio.charset.StandardCharsets; 22 | 23 | import com.google.common.io.Files; 24 | import org.apache.fluo.api.client.FluoClient; 25 | import org.apache.fluo.api.client.FluoFactory; 26 | import org.apache.fluo.api.client.LoaderExecutor; 27 | import org.apache.fluo.api.config.FluoConfiguration; 28 | import phrasecount.DocumentLoader; 29 | import phrasecount.pojos.Document; 30 | 31 | public class Load { 32 | 33 | public static void main(String[] args) throws Exception { 34 | 35 | if (args.length != 2) { 36 | System.err.println("Usage : " + Load.class.getName() + " "); 37 | System.exit(-1); 38 | } 39 | 40 | FluoConfiguration config = new FluoConfiguration(new File(args[0])); 41 | config.setLoaderThreads(20); 42 | config.setLoaderQueueSize(40); 43 | 44 | try (FluoClient fluoClient = FluoFactory.newClient(config); 45 | LoaderExecutor le = fluoClient.newLoaderExecutor()) { 46 | File[] files = new File(args[1]).listFiles(); 47 | 48 | if (files == null) { 49 | System.out.println("Text file dir does not exist: " + args[1]); 50 | } else { 51 | for (File txtFile : files) { 52 | if (txtFile.getName().endsWith(".txt")) { 53 | String uri = txtFile.toURI().toString(); 54 | String content = Files.toString(txtFile, StandardCharsets.UTF_8); 55 | 56 | System.out.println("Processing : " + txtFile.toURI()); 57 | le.execute(new DocumentLoader(new Document(uri, content))); 58 | } else { 59 | System.out.println("Ignoring : " + txtFile.toURI()); 60 | } 61 | } 62 | } 63 
| } 64 | 65 | // TODO figure what threads are hanging around 66 | System.exit(0); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/cmd/Print.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package phrasecount.cmd; 19 | 20 | import java.io.File; 21 | 22 | import com.google.common.collect.Iterables; 23 | import org.apache.fluo.api.client.FluoClient; 24 | import org.apache.fluo.api.client.FluoFactory; 25 | import org.apache.fluo.api.client.Snapshot; 26 | import org.apache.fluo.api.config.FluoConfiguration; 27 | import org.apache.fluo.api.data.Column; 28 | import org.apache.fluo.api.data.Span; 29 | import phrasecount.Constants; 30 | import phrasecount.pojos.PhraseAndCounts; 31 | import phrasecount.query.PhraseCountTable; 32 | 33 | public class Print { 34 | 35 | public static void main(String[] args) throws Exception { 36 | if (args.length != 2) { 37 | System.err 38 | .println("Usage : " + Print.class.getName() + " "); 39 | System.exit(-1); 40 | } 41 | 42 | FluoConfiguration fluoConfig = new FluoConfiguration(new File(args[0])); 43 | 44 | PhraseCountTable pcTable = new PhraseCountTable(fluoConfig, args[1]); 45 | for (PhraseAndCounts phraseCount : pcTable) { 46 | System.out.printf("%7d %7d '%s'\n", phraseCount.docPhraseCount, phraseCount.totalPhraseCount, 47 | phraseCount.phrase); 48 | } 49 | 50 | try (FluoClient fluoClient = FluoFactory.newClient(fluoConfig); 51 | Snapshot snap = fluoClient.newSnapshot()) { 52 | 53 | // TODO could precompute this using observers 54 | int uriCount = count(snap, "uri:", Constants.DOC_HASH_COL); 55 | int documentCount = count(snap, "doc:", Constants.DOC_REF_COUNT_COL); 56 | int numIndexedDocs = count(snap, "doc:", Constants.INDEX_STATUS_COL); 57 | 58 | System.out.println(); 59 | System.out.printf("# uris : %,d\n", uriCount); 60 | System.out.printf("# unique documents : %,d\n", documentCount); 61 | System.out.printf("# processed documents : %,d\n", numIndexedDocs); 62 | System.out.println(); 63 | } 64 | 65 | // TODO figure what threads are hanging around 66 | System.exit(0); 67 | } 68 | 69 | private static int count(Snapshot snap, String prefix, Column col) { 70 | return 
Iterables.size(snap.scanner().over(Span.prefix(prefix)).fetch(col).byRow().build()); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/cmd/Setup.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package phrasecount.cmd; 19 | 20 | import java.io.File; 21 | 22 | import org.apache.accumulo.core.client.Connector; 23 | import org.apache.accumulo.core.client.TableNotFoundException; 24 | import org.apache.accumulo.core.client.ZooKeeperInstance; 25 | import org.apache.accumulo.core.client.security.tokens.PasswordToken; 26 | import org.apache.fluo.api.config.FluoConfiguration; 27 | import phrasecount.Application; 28 | import phrasecount.Application.Options; 29 | 30 | /** 31 | * Deletes and recreates the Accumulo export table named by the second argument, then writes the 32 | * configuration produced by {@code Application.configure} to standard output. 33 | */ 34 | public class Setup { 35 | 36 | public static void main(String[] args) throws Exception { 37 | if (args.length < 2) { 38 | System.err.println("Usage : " + Setup.class.getName() + " <fluo props file> <export table name>"); 39 | System.exit(-1); 40 | } 41 | 42 | FluoConfiguration config = new FluoConfiguration(new File(args[0])); 43 | 44 | String exportTable = args[1]; 45 | 46 | // Log in with the credentials from the Fluo properties file (the same ones handed to Options 47 | // below) instead of a hard-coded user and password. 48 | Connector conn = 49 | new ZooKeeperInstance(config.getAccumuloInstance(), config.getAccumuloZookeepers()) 50 | .getConnector(config.getAccumuloUser(), new PasswordToken(config.getAccumuloPassword())); 51 | try { 52 | conn.tableOperations().delete(exportTable); 53 | } catch (TableNotFoundException e) { 54 | // ignore if table not found 55 | } 56 | 57 | conn.tableOperations().create(exportTable); 58 | 59 | Options opts = 60 | new Options(103, 103, config.getAccumuloInstance(), config.getAccumuloZookeepers(), 61 | config.getAccumuloUser(), config.getAccumuloPassword(), exportTable); 62 | 63 | FluoConfiguration observerConfig = new FluoConfiguration(); 64 | Application.configure(observerConfig, opts); 65 | observerConfig.save(System.out); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/cmd/Split.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package phrasecount.cmd; 19 | 20 | import java.io.File; 21 | import java.util.SortedSet; 22 | import java.util.TreeSet; 23 | 24 | import org.apache.accumulo.core.client.Connector; 25 | import org.apache.accumulo.core.client.ZooKeeperInstance; 26 | import org.apache.accumulo.core.client.security.tokens.PasswordToken; 27 | import org.apache.fluo.api.config.FluoConfiguration; 28 | import org.apache.hadoop.io.Text; 29 | 30 | /** 31 | * Utility to add splits to the Accumulo table used by Fluo. 
32 | */ 33 | public class Split { 34 | public static void main(String[] args) throws Exception { 35 | if (args.length != 2) { 36 | System.err.println("Usage : " + Split.class.getName() + " "); 37 | System.exit(-1); 38 | } 39 | 40 | FluoConfiguration fluoConfig = new FluoConfiguration(new File(args[0])); 41 | ZooKeeperInstance zki = 42 | new ZooKeeperInstance(fluoConfig.getAccumuloInstance(), fluoConfig.getAccumuloZookeepers()); 43 | Connector conn = zki.getConnector(fluoConfig.getAccumuloUser(), 44 | new PasswordToken(fluoConfig.getAccumuloPassword())); 45 | 46 | SortedSet splits = new TreeSet<>(); 47 | // one split point per letter, "phrase:b" through "phrase:y" (the loop bound excludes 'z') 48 | for (char c = 'b'; c < 'z'; c++) { 49 | splits.add(new Text("phrase:" + c)); 50 | } 51 | 52 | conn.tableOperations().addSplits(args[1], splits); 53 | 54 | // TODO figure out what threads are hanging around 55 | System.exit(0); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/pojos/Counts.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package phrasecount.pojos; 19 | 20 | import com.google.common.base.Objects; 21 | 22 | public class Counts { 23 | // number of documents a phrase was seen in 24 | public final long docPhraseCount; 25 | // total times a phrase was seen in all documents 26 | public final long totalPhraseCount; 27 | 28 | public Counts() { 29 | docPhraseCount = 0; 30 | totalPhraseCount = 0; 31 | } 32 | 33 | public Counts(long docPhraseCount, long totalPhraseCount) { 34 | this.docPhraseCount = docPhraseCount; 35 | this.totalPhraseCount = totalPhraseCount; 36 | } 37 | 38 | public Counts add(Counts other) { 39 | return new Counts(this.docPhraseCount + other.docPhraseCount, 40 | this.totalPhraseCount + other.totalPhraseCount); 41 | } 42 | 43 | @Override 44 | public boolean equals(Object o) { 45 | if (o instanceof Counts) { 46 | Counts opc = (Counts) o; 47 | return opc.docPhraseCount == docPhraseCount && opc.totalPhraseCount == totalPhraseCount; 48 | } 49 | 50 | return false; 51 | } 52 | 53 | @Override 54 | public int hashCode() { 55 | return (int) (993 * totalPhraseCount + 17 * docPhraseCount); 56 | } 57 | 58 | @Override 59 | public String toString() { 60 | return Objects.toStringHelper(this).add("documents", docPhraseCount) 61 | .add("total", totalPhraseCount).toString(); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/pojos/Document.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package phrasecount.pojos; 19 | 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | 23 | import com.google.common.hash.Hasher; 24 | import com.google.common.hash.Hashing; 25 | 26 | public class Document { 27 | // the location where the document came from. This is needed in order to detect when a document 28 | // changes. 29 | private String uri; 30 | 31 | // the text of a document. 32 | private String content; 33 | // computed on the first call to getHash() and reused afterwards 34 | private String hash = null; 35 | 36 | public Document(String uri, String content) { 37 | this.content = content; 38 | this.uri = uri; 39 | } 40 | 41 | public String getURI() { 42 | return uri; 43 | } 44 | 45 | public String getHash() { 46 | if (hash != null) { 47 | return hash; 48 | } 49 | // SHA-1 over the lower-cased content, split into tokens on runs of non-alphanumeric characters 50 | Hasher hasher = Hashing.sha1().newHasher(); 51 | String[] tokens = content.toLowerCase().split("[^\\p{Alnum}]+"); 52 | 53 | for (String token : tokens) { 54 | hasher.putString(token); 55 | } 56 | 57 | return hash = hasher.hash().toString(); 58 | } 59 | 60 | public Map getPhrases() { 61 | String[] tokens = content.toLowerCase().split("[^\\p{Alnum}]+"); 62 | // every run of four consecutive tokens is one phrase; the map value is its occurrence count 63 | Map phrases = new HashMap<>(); 64 | for (int i = 3; i < tokens.length; i++) { 65 | String phrase = tokens[i - 3] + " " + tokens[i - 2] + " " + tokens[i - 1] + " " + tokens[i]; 66 | Integer old = phrases.put(phrase, 1); 67 | if (old != null) { 68 | phrases.put(phrase, 1 + old); 69 | } 70 | } 71 | 72 | return phrases; 73 | } 74 | 75 | public String getContent() { 76 | return content; 77 | } 78 | } 79 | 
-------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/pojos/PcKryoFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package phrasecount.pojos; 19 | 20 | import com.esotericsoftware.kryo.Kryo; 21 | import com.esotericsoftware.kryo.pool.KryoFactory; 22 | 23 | public class PcKryoFactory implements KryoFactory { 24 | @Override 25 | public Kryo create() { 26 | Kryo kryo = new Kryo(); 27 | kryo.register(Counts.class, 9); 28 | return kryo; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/pojos/PhraseAndCounts.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package phrasecount.pojos; 19 | 20 | public class PhraseAndCounts extends Counts { 21 | public String phrase; 22 | 23 | public PhraseAndCounts(String phrase, int docPhraseCount, int totalPhraseCount) { 24 | super(docPhraseCount, totalPhraseCount); 25 | this.phrase = phrase; 26 | } 27 | 28 | @Override 29 | public boolean equals(Object o) { 30 | if (o instanceof PhraseAndCounts) { 31 | PhraseAndCounts op = (PhraseAndCounts) o; 32 | return phrase.equals(op.phrase) && super.equals(op); 33 | } 34 | return false; 35 | } 36 | 37 | @Override 38 | public int hashCode() { 39 | return super.hashCode() + 31 * phrase.hashCode(); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /phrasecount/src/main/java/phrasecount/query/RowTransform.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package phrasecount.query; 19 | 20 | import java.util.Iterator; 21 | import java.util.Map.Entry; 22 | import java.util.function.Function; 23 | 24 | import org.apache.accumulo.core.data.Key; 25 | import org.apache.accumulo.core.data.Value; 26 | import phrasecount.pojos.PhraseAndCounts; 27 | 28 | public class RowTransform implements Function>, PhraseAndCounts> { 29 | @Override 30 | public PhraseAndCounts apply(Iterator> input) { 31 | String phrase = null; 32 | 33 | int totalPhraseCount = 0; 34 | int docPhraseCount = 0; 35 | 36 | while (input.hasNext()) { 37 | Entry colEntry = input.next(); 38 | String cq = colEntry.getKey().getColumnQualifierData().toString(); 39 | 40 | if (cq.equals(PhraseCountTable.TOTAL_PC_CQ)) { 41 | totalPhraseCount = Integer.parseInt(colEntry.getValue().toString()); 42 | } else { 43 | docPhraseCount = Integer.parseInt(colEntry.getValue().toString()); 44 | } 45 | 46 | if (phrase == null) { 47 | phrase = colEntry.getKey().getRowData().toString(); 48 | } 49 | } 50 | 51 | return new PhraseAndCounts(phrase, docPhraseCount, totalPhraseCount); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /phrasecount/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 
4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | log4j.rootLogger=INFO, CA 17 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 18 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 19 | log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c{2}] %-5p: %m%n 20 | 21 | #Uncomment to see debugging output for Fluo. 22 | #log4j.logger.org.apache.fluo=DEBUG 23 | 24 | #uncomment the following to see all transaction activity 25 | #log4j.logger.fluo.tx=TRACE 26 | 27 | log4j.logger.org.apache.zookeeper.ClientCnxn=FATAL 28 | log4j.logger.org.apache.zookeeper.ZooKeeper=WARN 29 | log4j.logger.org.apache.curator=WARN 30 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | org.apache.fluo 22 | fluo-parent 23 | 3 24 | 25 | org.apache.fluo 26 | fluo-examples 27 | 1-SNAPSHOT 28 | pom 29 | Fluo Examples 30 | This repo contains several example applications for Apache Fluo 31 | https://fluo.apache.org 32 | 33 | stresso 34 | phrasecount 35 | webindex 36 | 37 | 38 | 8 39 | 1.8 40 | 1.8 41 | UTF-8 42 | 43 | 44 | 45 | 46 | 47 | org.apache.maven.plugins 48 | maven-compiler-plugin 49 | 3.8.1 50 | 51 | true 52 | UTF-8 53 | 54 | 55 | 56 | org.apache.maven.plugins 57 | maven-enforcer-plugin 58 | 3.0.0-M3 59 | 60 | 61 | org.apache.maven.plugins 62 | 
maven-javadoc-plugin 63 | 3.2.0 64 | 65 | true 66 | -J-Xmx512m 67 | all,-missing 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /stresso/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | lib/ 3 | -------------------------------------------------------------------------------- /stresso/.travis.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Stresso authors (see AUTHORS) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | language: java 15 | jdk: 16 | - oraclejdk8 17 | script: mvn verify 18 | notifications: 19 | irc: 20 | channels: 21 | - "chat.freenode.net#fluo" 22 | on_success: always 23 | on_failure: always 24 | use_notice: true 25 | skip_join: true 26 | -------------------------------------------------------------------------------- /stresso/AUTHORS: -------------------------------------------------------------------------------- 1 | AUTHORS 2 | ------- 3 | 4 | Keith Turner - Peterson Technologies 5 | Mike Walch - Peterson Technologies 6 | -------------------------------------------------------------------------------- /stresso/bin/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 19 | SKIP_JAR_CHECKS="true" 20 | 21 | . $BIN_DIR/load-env.sh 22 | 23 | unset SKIP_JAR_CHECKS 24 | 25 | cd $BIN_DIR/.. 
26 | 27 | # build Stresso using the versions of Fluo and Accumulo running on the system 28 | mvn clean package -Dfluo.version=$FLUO_VERSION -Daccumulo.version=$ACCUMULO_VERSION -DskipTests 29 | 30 | mkdir -p lib 31 | 32 | # populate lib dir used by fluo init 33 | rm -f lib/* 34 | cp target/stresso-0.0.1-SNAPSHOT.jar ./lib/ 35 | mvn dependency:copy-dependencies -DincludeArtifactIds=fluo-recipes-core -DoutputDirectory=./lib 36 | -------------------------------------------------------------------------------- /stresso/bin/bulk_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 19 | . 
$BIN_DIR/load-env.sh 20 | 21 | if [ "$#" -ne 3 ]; then 22 | echo "Usage : $0 " 23 | exit 1 24 | fi 25 | 26 | yarn jar $STRESSO_SHADED_JAR stresso.trie.Init -Dmapreduce.job.reduces=$3 $FLUO_CONN $FLUO_APP_NAME $1 $2 27 | -------------------------------------------------------------------------------- /stresso/bin/compact-ll.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 19 | . $BIN_DIR/load-env.sh 20 | 21 | $FLUO_CMD exec $FLUO_APP_NAME stresso.trie.CompactLL $FLUO_CONN $FLUO_APP_NAME $@ 22 | -------------------------------------------------------------------------------- /stresso/bin/diff.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 
6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 19 | . $BIN_DIR/load-env.sh 20 | 21 | $FLUO_CMD exec $FLUO_APP_NAME stresso.trie.Diff $FLUO_CONN $FLUO_APP_NAME $@ 22 | -------------------------------------------------------------------------------- /stresso/bin/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 19 | . 
$BIN_DIR/load-env.sh

# Quoted so arguments containing spaces or glob characters arrive intact.
yarn jar "$STRESSO_SHADED_JAR" stresso.trie.Generate "$@"
--------------------------------------------------------------------------------
/stresso/bin/load-env.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sourced by the other scripts in this directory after they set BIN_DIR.
# Loads conf/env.sh (or conf/env.sh.example when env.sh does not exist),
# checks the Fluo settings and the stresso jars, and defines FLUO_CMD,
# STRESSO_JAR, STRESSO_SHADED_JAR and SED. Because this file is sourced, the
# "exit 1" calls below terminate the calling script.

# Paths are quoted so a checkout directory containing spaces still works.
if [ ! -f "$BIN_DIR/../conf/env.sh" ]
then
  . "$BIN_DIR/../conf/env.sh.example"
else
  . "$BIN_DIR/../conf/env.sh"
fi

# verify fluo configuration
if [ ! -d "$FLUO_HOME" ]; then
  echo "Problem with FLUO_HOME : $FLUO_HOME"
  exit 1
fi
FLUO_CMD=$FLUO_HOME/bin/fluo
if [ -z "$FLUO_APP_NAME" ]; then
  echo "FLUO_APP_NAME is not set!"
  exit 1
fi

if [ ! -f "$FLUO_CONN" ]; then
  echo "Fluo conn properties file not found : $FLUO_CONN"
  exit 1
fi

STRESSO_VERSION=0.0.1-SNAPSHOT
STRESSO_JAR=$BIN_DIR/../target/stresso-$STRESSO_VERSION.jar
STRESSO_SHADED_JAR=$BIN_DIR/../target/stresso-$STRESSO_VERSION-shaded.jar
# Setting SKIP_JAR_CHECKS to any non-empty value bypasses both jar checks.
if [ ! -f "$STRESSO_JAR" ] && [ -z "$SKIP_JAR_CHECKS" ]; then
  echo "Stresso jar not found : $STRESSO_JAR"
  exit 1;
fi
if [ ! -f "$STRESSO_SHADED_JAR" ] && [ -z "$SKIP_JAR_CHECKS" ]; then
  echo "Stresso shaded jar not found : $STRESSO_SHADED_JAR"
  exit 1;
fi

command -v yarn >/dev/null 2>&1 || { echo >&2 "I require yarn but it's not installed. Aborting."; exit 1; }
command -v hadoop >/dev/null 2>&1 || { echo >&2 "I require hadoop but it's not installed. Aborting."; exit 1; }

# On macOS (OSTYPE darwin*) sed -i is given an explicit backup suffix.
if [[ "$OSTYPE" == "darwin"* ]]; then
  export SED="sed -i .bak"
else
  export SED="sed -i"
fi
--------------------------------------------------------------------------------
/stresso/bin/load.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
. 
$BIN_DIR/load-env.sh

# Quoted so arguments containing spaces or glob characters arrive intact.
yarn jar "$STRESSO_SHADED_JAR" stresso.trie.Load "$FLUO_CONN" "$FLUO_APP_NAME" "$@"
--------------------------------------------------------------------------------
/stresso/bin/print.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs stresso.trie.Print through the fluo command, passing the connection
# properties and application name first and then every argument given to this
# script.

BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
. "$BIN_DIR/load-env.sh"

# Quoted so arguments containing spaces or glob characters arrive intact.
"$FLUO_CMD" exec "$FLUO_APP_NAME" stresso.trie.Print "$FLUO_CONN" "$FLUO_APP_NAME" "$@"
--------------------------------------------------------------------------------
/stresso/bin/run-test.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.
You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# End-to-end stress test driver. Presplits the table, optionally bulk loads an
# initial data set, loads ITERATIONS incremental data sets, and finally checks
# that the unique count computed by MapReduce equals the total printed by Fluo.
# Exits 0 when the two totals match and 1 otherwise.

BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )

. "$BIN_DIR/load-env.sh"

# stop if any command fails
set -e

# The command substitution must be quoted: unquoted, an empty or multi-word
# status makes `[` fail with a syntax error, the condition is then simply
# false, and the test would carry on against an application that is not running.
if [ "$("$FLUO_CMD" status -a "$FLUO_APP_NAME")" != "RUNNING" ]; then
  echo "Fluo app $FLUO_APP_NAME is not running"
  exit 1
fi

mkdir -p "$LOG_DIR"

hadoop fs -rm -r -f /stresso/

# add splits to Fluo table
echo "*****Presplitting table*****"
"$BIN_DIR/split.sh" "$SPLITS" >"$LOG_DIR/split.out" 2>"$LOG_DIR/split.err"

if (( GEN_INIT > 0 )); then
  # generate and load initial data using map reduce writing directly to table
  echo "*****Generating and loading initial data set*****"
  "$BIN_DIR/generate.sh" "$MAPS" $((GEN_INIT / MAPS)) "$MAX" /stresso/init >"$LOG_DIR/generate_0.out" 2>"$LOG_DIR/generate_0.err"
  "$BIN_DIR/bulk_load.sh" /stresso/init /stresso/initTmp "$REDUCES" >"$LOG_DIR/init.out" 2>"$LOG_DIR/init.err"
  hadoop fs -rm -r /stresso/initTmp
fi

# load data incrementally
for i in $(seq 1 "$ITERATIONS"); do
  echo "*****Generating and loading incremental data set $i*****"
  "$BIN_DIR/generate.sh" "$MAPS" $((GEN_INCR / MAPS)) "$MAX" "/stresso/$i" >"$LOG_DIR/generate_$i.out" 2>"$LOG_DIR/generate_$i.err"
  "$BIN_DIR/load.sh" "/stresso/$i" >"$LOG_DIR/load_$i.out" 2>"$LOG_DIR/load_$i.err"
  # TODO could reload the same dataset sometimes, maybe when i%5 == 0 or something
  "$BIN_DIR/compact-ll.sh" "$MAX" "$COMPACT_CUTOFF" >"$LOG_DIR/compact-ll_$i.out" 2>"$LOG_DIR/compact-ll_$i.err"
  # (( expr )) succeeds when expr is non-zero, so the negation is true on every
  # WAIT_PERIOD-th iteration; other iterations just sleep.
  if ! ((i % WAIT_PERIOD)); then
    "$FLUO_CMD" wait -a "$FLUO_APP_NAME" >"$LOG_DIR/wait_$i.out" 2>"$LOG_DIR/wait_$i.err"
  else
    sleep "$SLEEP"
  fi
done

# print unique counts
echo "*****Calculating # of unique integers using MapReduce*****"
# The glob is quoted so the local shell never expands it against the local
# filesystem; it is passed through literally, as the unquoted form did whenever
# no local /stresso directory existed.
"$BIN_DIR/unique.sh" "$REDUCES" '/stresso/*' >"$LOG_DIR/unique.out" 2>"$LOG_DIR/unique.err"
grep UNIQUE "$LOG_DIR/unique.err"

echo "*****Wait for Fluo to finish processing*****"
"$FLUO_CMD" wait -a "$FLUO_APP_NAME"

echo "*****Printing # of unique integers calculated by Fluo*****"
"$BIN_DIR/print.sh" >"$LOG_DIR/print.out" 2>"$LOG_DIR/print.err"
cat "$LOG_DIR/print.out"

echo "*****Verifying Fluo & MapReduce results match*****"
MAPR_TOTAL=$(grep UNIQUE "$LOG_DIR/unique.err" | cut -d = -f 2)
FLUO_TOTAL=$(grep "Total at root" "$LOG_DIR/print.out" | cut -d ' ' -f 5)
# If either total is missing the numeric comparison fails and the mismatch
# branch reports it.
if [ "$MAPR_TOTAL" -eq "$FLUO_TOTAL" ]; then
  echo "Success! Fluo & MapReduce both calculated $FLUO_TOTAL unique integers"
  exit 0
else
  echo "ERROR - Results do not match. Fluo calculated $FLUO_TOTAL unique integers while MapReduce calculated $MAPR_TOTAL integers"
  exit 1
fi
--------------------------------------------------------------------------------
/stresso/bin/split.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.
You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs stresso.trie.Split through the fluo command. TABLE_PROPS is passed as a
# single argument after the connection properties and application name,
# followed by every argument given to this script.

BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
. "$BIN_DIR/load-env.sh"

# Quoted so arguments containing spaces or glob characters arrive intact.
"$FLUO_CMD" exec "$FLUO_APP_NAME" stresso.trie.Split "$FLUO_CONN" "$FLUO_APP_NAME" "$TABLE_PROPS" "$@"
--------------------------------------------------------------------------------
/stresso/bin/unique.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
. 
$BIN_DIR/load-env.sh

# The first argument is the number of reduce tasks; every remaining argument is
# handed to stresso.trie.Unique.
if [ "$#" -lt 2 ]; then
  echo "Usage : $0 <num reducers> <input dir>{ <input dir>}"
  exit 1
fi

# "${@:2}" is quoted so arguments (including glob patterns meant for the job)
# are forwarded unchanged instead of being re-split or expanded locally.
yarn jar "$STRESSO_JAR" stresso.trie.Unique -Dmapreduce.job.reduces="$1" "${@:2}"
--------------------------------------------------------------------------------
/stresso/conf/.gitignore:
--------------------------------------------------------------------------------
env.sh
fluo-app.properties
--------------------------------------------------------------------------------
/stresso/conf/env.sh.example:
--------------------------------------------------------------------------------

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

###############################
# configuration for all scripts
###############################
# Fluo Home
test -z "$FLUO_HOME" && FLUO_HOME=/path/to/accumulo
# Fluo connection properties
FLUO_CONN=$FLUO_HOME/conf/fluo-conn.properties
# Fluo application name
FLUO_APP_NAME=stresso
# Set this to avoid Hadoop's old version of guava. This will make Hadoop's
# yarn command use a classloader when running code.
This classloader isolates 28 | # stresso runtime code from Hadoop's dependencies. 29 | export HADOOP_USE_CLIENT_CLASSLOADER=true 30 | 31 | ############################### 32 | # configuration for run-test.sh 33 | ############################### 34 | # Place where logs from test are placed 35 | LOG_DIR=$BIN_DIR/../logs 36 | # Maximum number to generate 37 | MAX=$((10**9)) 38 | # The number of splits to create in the table 39 | SPLITS=17 40 | # Number of mappers to run for data generation, which determines how many files 41 | # generation outputs. The number of files determines how many mappers loading 42 | # data will run. 43 | MAPS=17 44 | # Number of reduce tasks 45 | REDUCES=17 46 | # Number of random numbers to generate initially 47 | GEN_INIT=$((10**6)) 48 | # Number of random numbers to generate for each incremental step. 49 | GEN_INCR=$((10**3)) 50 | # Number of incremental steps. 51 | ITERATIONS=3 52 | # Seconds to sleep between incremental steps. 53 | SLEEP=30 54 | # Compact levels with less than the following possible nodes after loads 55 | COMPACT_CUTOFF=$((256**3 + 1)) 56 | # The fluo wait command is executed after this many incremental load steps. 57 | WAIT_PERIOD=10 58 | # To run map reduce jobs, a shaded jar is built. The following properties 59 | # determine what versions of Fluo and Accumulo client libs end up in the shaded 60 | # jar. 
61 | FLUO_VERSION=$($FLUO_HOME/bin/fluo version) 62 | ACCUMULO_VERSION=$(accumulo version) 63 | 64 | # The following Accumulo table properties will be set 65 | read -r -d '' TABLE_PROPS << EOM 66 | table.compaction.major.ratio=1.5 67 | table.file.compress.blocksize=8K 68 | table.file.compress.blocksize.index=32K 69 | table.file.compress.type=snappy 70 | EOM 71 | -------------------------------------------------------------------------------- /stresso/conf/fluo-app.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more contributor license 2 | # agreements. See the NOTICE file distributed with this work for additional information regarding 3 | # copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 4 | # (the "License"); you may not use this file except in compliance with the License. You may 5 | # obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software distributed under the License 10 | # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | # or implied. See the License for the specific language governing permissions and limitations under 12 | # the License. 13 | 14 | fluo.observer.provider=stresso.trie.StressoObserverProvider 15 | 16 | fluo.app.trie.nodeSize=8 17 | # For a max ~10^9 set to 6. For a max ~10^12 set to 5. If more than 10^12 set to 4. 
18 | fluo.app.trie.stopLevel=6 19 | 20 | # The following assumes that fluo init is run in the stresso dir 21 | fluo.observer.init.dir=./lib 22 | 23 | fluo.accumulo.table=${fluo.connection.application.name} 24 | 25 | # You may need to edit the following to match your Hadoop and Accumulo settings 26 | fluo.dfs.root=hdfs://localhost:8020/fluo 27 | fluo.accumulo.instance=uno 28 | fluo.accumulo.user=root 29 | fluo.accumulo.password=secret 30 | fluo.accumulo.zookeepers=localhost 31 | 32 | # Performance related properties 33 | fluo.worker.num.threads=128 34 | fluo.loader.num.threads=64 35 | fluo.loader.queue.size=256 36 | 37 | -------------------------------------------------------------------------------- /stresso/conf/fluo-app.properties.example: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more contributor license 2 | # agreements. See the NOTICE file distributed with this work for additional information regarding 3 | # copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 4 | # (the "License"); you may not use this file except in compliance with the License. You may 5 | # obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software distributed under the License 10 | # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | # or implied. See the License for the specific language governing permissions and limitations under 12 | # the License. 13 | 14 | fluo.observer.provider=stresso.trie.StressoObserverProvider 15 | 16 | fluo.app.trie.nodeSize=8 17 | # For a max ~10^9 set to 6. For a max ~10^12 set to 5. If more than 10^12 set to 4. 
18 | fluo.app.trie.stopLevel=6 19 | 20 | # The following assumes that fluo init is run in the stresso dir 21 | fluo.observer.init.dir=./lib 22 | 23 | fluo.accumulo.table=${fluo.connection.application.name} 24 | 25 | # You may need to edit the following to match your Hadoop and Accumulo settings 26 | fluo.dfs.root=hdfs://localhost:8020/fluo 27 | fluo.accumulo.instance=uno 28 | fluo.accumulo.user=root 29 | fluo.accumulo.password=secret 30 | fluo.accumulo.zookeepers=localhost 31 | 32 | # Performance related properties 33 | fluo.worker.num.threads=128 34 | fluo.loader.num.threads=64 35 | fluo.loader.queue.size=256 36 | 37 | -------------------------------------------------------------------------------- /stresso/conf/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /stresso/src/main/java/stresso/trie/AccumuloUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package stresso.trie; 19 | 20 | import org.apache.accumulo.core.client.Accumulo; 21 | import org.apache.accumulo.core.client.AccumuloClient; 22 | import org.apache.accumulo.core.client.admin.TableOperations; 23 | import org.apache.fluo.api.client.FluoAdmin; 24 | import org.apache.fluo.api.client.FluoFactory; 25 | import org.apache.fluo.api.config.FluoConfiguration; 26 | 27 | public class AccumuloUtil { 28 | 29 | public interface TableOp { 30 | T run(TableOperations tableOps, String tableName) throws Exception; 31 | } 32 | 33 | public interface VoidTableOp { 34 | void run(TableOperations tableOps, String tableName) throws Exception; 35 | } 36 | 37 | public static void doTableOp(FluoConfiguration fc, VoidTableOp tableOp) { 38 | getTableOp(fc, (to, tn) -> { 39 | tableOp.run(to, tn); 40 | return null; 41 | }); 42 | } 43 | 44 | public static T getTableOp(FluoConfiguration fc, TableOp tableOp) { 45 | try (FluoAdmin fadmin = FluoFactory.newAdmin(fc)) { 46 | FluoConfiguration appCfg = new FluoConfiguration(fadmin.getApplicationConfig()); 47 | appCfg.setApplicationName(fc.getApplicationName()); 48 | AccumuloClient client = 49 | Accumulo.newClient().to(appCfg.getAccumuloInstance(), appCfg.getAccumuloZookeepers()) 50 | .as(appCfg.getAccumuloUser(), appCfg.getAccumuloPassword()).build(); 51 | return tableOp.run(client.tableOperations(), appCfg.getAccumuloTable()); 52 | } catch (Exception e) { 53 | throw new RuntimeException(e); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /stresso/src/main/java/stresso/trie/CompactLL.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package stresso.trie; 19 | 20 | import java.io.File; 21 | 22 | import org.apache.accumulo.core.client.Connector; 23 | import org.apache.fluo.api.config.FluoConfiguration; 24 | import org.apache.hadoop.io.Text; 25 | 26 | /** 27 | * Compact the lower levels of the tree. The lower levels of the tree contain a small of nodes that 28 | * are frequently updated. Compacting these lower levels is a quick operation that cause the Fluo GC 29 | * iterator to cleanup past transactions. 
30 | */ 31 | 32 | public class CompactLL { 33 | public static void main(String[] args) throws Exception { 34 | 35 | if (args.length != 4) { 36 | System.err.println( 37 | "Usage: " + Split.class.getSimpleName() + " "); 38 | System.exit(-1); 39 | } 40 | 41 | FluoConfiguration config = new FluoConfiguration(new File(args[0])); 42 | config.setApplicationName(args[1]); 43 | 44 | long max = Long.parseLong(args[2]); 45 | 46 | // compact levels that can contain less nodes than this 47 | int cutoff = Integer.parseInt(args[3]); 48 | 49 | StressoConfig sconf = StressoConfig.retrieve(config); 50 | 51 | int level = 64 / sconf.nodeSize; 52 | 53 | while (level >= sconf.stopLevel) { 54 | if (max < cutoff) { 55 | break; 56 | } 57 | 58 | max = max >> 8; 59 | level--; 60 | } 61 | 62 | String start = String.format("%02d", sconf.stopLevel); 63 | String end = String.format("%02d:~", (level)); 64 | 65 | System.out.println("Compacting " + start + " to " + end); 66 | AccumuloUtil.doTableOp(config, 67 | (tableOps, table) -> tableOps.compact(table, new Text(start), new Text(end), true, false)); 68 | System.exit(0); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /stresso/src/main/java/stresso/trie/Constants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package stresso.trie; 19 | 20 | import org.apache.fluo.api.data.Column; 21 | import org.apache.fluo.recipes.core.types.StringEncoder; 22 | import org.apache.fluo.recipes.core.types.TypeLayer; 23 | 24 | /** 25 | * 26 | */ 27 | public class Constants { 28 | 29 | public static final TypeLayer TYPEL = new TypeLayer(new StringEncoder()); 30 | 31 | public static final Column COUNT_SEEN_COL = new Column("count", "seen"); 32 | public static final Column COUNT_WAIT_COL = new Column("count", "wait"); 33 | 34 | public static final String NODE_SIZE_PROP = "trie.nodeSize"; 35 | public static final String STOP_LEVEL_PROP = "trie.stopLevel"; 36 | } 37 | -------------------------------------------------------------------------------- /stresso/src/main/java/stresso/trie/Load.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package stresso.trie; 19 | 20 | import java.io.File; 21 | import java.io.IOException; 22 | 23 | import org.apache.fluo.api.client.Loader; 24 | import org.apache.fluo.api.config.FluoConfiguration; 25 | import org.apache.fluo.mapreduce.FluoOutputFormat; 26 | import org.apache.hadoop.conf.Configured; 27 | import org.apache.hadoop.fs.Path; 28 | import org.apache.hadoop.io.LongWritable; 29 | import org.apache.hadoop.io.NullWritable; 30 | import org.apache.hadoop.mapreduce.Job; 31 | import org.apache.hadoop.mapreduce.Mapper; 32 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 33 | import org.apache.hadoop.util.Tool; 34 | import org.apache.hadoop.util.ToolRunner; 35 | import org.slf4j.Logger; 36 | import org.slf4j.LoggerFactory; 37 | 38 | public class Load extends Configured implements Tool { 39 | 40 | private static final Logger log = LoggerFactory.getLogger(Load.class); 41 | 42 | public static class LoadMapper extends Mapper { 43 | 44 | @Override 45 | protected void map(LongWritable key, NullWritable val, Context context) 46 | throws IOException, InterruptedException { 47 | context.write(new NumberLoader(key.get()), val); 48 | } 49 | } 50 | 51 | @Override 52 | public int run(String[] args) throws Exception { 53 | 54 | if (args.length != 3) { 55 | log.error( 56 | "Usage: " + this.getClass().getSimpleName() + " "); 57 | System.exit(-1); 58 | } 59 | 60 | FluoConfiguration props = new FluoConfiguration(new File(args[0])); 61 | props.setApplicationName(args[1]); 62 | final Path input = new 
Path(args[2]); 63 | 64 | Job job = Job.getInstance(getConf()); 65 | 66 | job.setJobName(Load.class.getName()); 67 | 68 | job.setJarByClass(Load.class); 69 | 70 | job.setInputFormatClass(SequenceFileInputFormat.class); 71 | SequenceFileInputFormat.addInputPath(job, input); 72 | 73 | job.setMapperClass(LoadMapper.class); 74 | 75 | job.setNumReduceTasks(0); 76 | 77 | job.setOutputFormatClass(FluoOutputFormat.class); 78 | FluoOutputFormat.configure(job, props); 79 | 80 | job.getConfiguration().setBoolean("mapreduce.map.speculative", false); 81 | job.getConfiguration().set("mapreduce.job.classloader", "true"); 82 | 83 | boolean success = job.waitForCompletion(true); 84 | return success ? 0 : 1; 85 | } 86 | 87 | public static void main(String[] args) throws Exception { 88 | int ret = ToolRunner.run(new Load(), args); 89 | System.exit(ret); 90 | } 91 | 92 | } 93 | -------------------------------------------------------------------------------- /stresso/src/main/java/stresso/trie/NodeObserver.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package stresso.trie; 19 | 20 | import java.util.Map; 21 | 22 | import org.apache.fluo.api.client.TransactionBase; 23 | import org.apache.fluo.api.data.Bytes; 24 | import org.apache.fluo.api.data.Column; 25 | import org.apache.fluo.api.observer.Observer; 26 | import org.apache.fluo.recipes.core.types.TypedSnapshotBase.Value; 27 | import org.apache.fluo.recipes.core.types.TypedTransactionBase; 28 | 29 | /** 30 | * Observer that looks for count:wait for nodes. If found, it increments count:seen and increments 31 | * count:wait of parent node in trie 32 | */ 33 | public class NodeObserver implements Observer { 34 | 35 | private final int stopLevel; 36 | 37 | public NodeObserver(int stopLevel) { 38 | this.stopLevel = stopLevel; 39 | } 40 | 41 | @Override 42 | public void process(TransactionBase tx, Bytes row, Column col) throws Exception { 43 | 44 | final TypedTransactionBase ttx = Constants.TYPEL.wrap(tx); 45 | 46 | Map colVals = 47 | ttx.get().row(row).columns(Constants.COUNT_SEEN_COL, Constants.COUNT_WAIT_COL); 48 | 49 | final Integer childWait = colVals.get(Constants.COUNT_WAIT_COL).toInteger(0); 50 | 51 | if (childWait > 0) { 52 | Integer childSeen = colVals.get(Constants.COUNT_SEEN_COL).toInteger(0); 53 | 54 | ttx.mutate().row(row).col(Constants.COUNT_SEEN_COL).set(childSeen + childWait); 55 | ttx.mutate().row(row).col(Constants.COUNT_WAIT_COL).delete(); 56 | 57 | Node node = new Node(row.toString()); 58 | if (node.getLevel() > stopLevel) { 59 | Node parent = node.getParent(); 60 | Integer parentWait = 61 | ttx.get().row(parent.getRowId()).col(Constants.COUNT_WAIT_COL).toInteger(0); 62 | ttx.mutate().row(parent.getRowId()).col(Constants.COUNT_WAIT_COL) 63 | .set(parentWait + childWait); 64 | } 65 | 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /stresso/src/main/java/stresso/trie/NumberLoader.java: -------------------------------------------------------------------------------- 1 | /* 2 
| * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package stresso.trie; 19 | 20 | import java.util.Map; 21 | 22 | import org.apache.fluo.api.client.Loader; 23 | import org.apache.fluo.api.client.TransactionBase; 24 | import org.apache.fluo.api.data.Column; 25 | import org.apache.fluo.recipes.core.types.TypedSnapshotBase.Value; 26 | import org.apache.fluo.recipes.core.types.TypedTransactionBase; 27 | 28 | import static com.google.common.base.Preconditions.checkArgument; 29 | 30 | /** 31 | * Executes load transactions of numbers into trie at leaf node level 32 | */ 33 | public class NumberLoader implements Loader { 34 | 35 | private final Number number; 36 | private Integer nodeSize = null; 37 | 38 | public NumberLoader(Integer num, int nodeSize) { 39 | checkArgument(num >= 0, "Only positive numbers accepted"); 40 | checkArgument((nodeSize <= 32) && ((32 % nodeSize) == 0), "nodeSize must be divisor of 32"); 41 | this.number = num; 42 | this.nodeSize = nodeSize; 43 | } 44 | 45 | public NumberLoader(Long num) { 46 | checkArgument(num >= 0, "Only positive numbers accepted"); 47 | this.number = num; 48 | } 49 | 50 | @Override 51 | public void load(TransactionBase tx, Context context) 
throws Exception { 52 | 53 | if (nodeSize == null) { 54 | nodeSize = context.getAppConfiguration().getInt(Constants.NODE_SIZE_PROP); 55 | checkArgument((nodeSize <= 64) && ((64 % nodeSize) == 0), "nodeSize must be divisor of 64"); 56 | } 57 | int level = 64 / nodeSize; 58 | 59 | TypedTransactionBase ttx = Constants.TYPEL.wrap(tx); 60 | 61 | String rowId = new Node(number, level, nodeSize).getRowId(); 62 | 63 | Map colVals = 64 | ttx.get().row(rowId).columns(Constants.COUNT_SEEN_COL, Constants.COUNT_WAIT_COL); 65 | 66 | Integer seen = colVals.get(Constants.COUNT_SEEN_COL).toInteger(0); 67 | if (seen == 0) { 68 | Integer wait = colVals.get(Constants.COUNT_WAIT_COL).toInteger(0); 69 | if (wait == 0) { 70 | ttx.mutate().row(rowId).col(Constants.COUNT_WAIT_COL).set(1); 71 | } 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /stresso/src/main/java/stresso/trie/StressoConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package stresso.trie; 19 | 20 | import org.apache.fluo.api.client.FluoClient; 21 | import org.apache.fluo.api.client.FluoFactory; 22 | import org.apache.fluo.api.config.FluoConfiguration; 23 | import org.apache.fluo.api.config.SimpleConfiguration; 24 | 25 | public class StressoConfig { 26 | public final int nodeSize; 27 | public final int stopLevel; 28 | 29 | public StressoConfig(int nodeSize, int stopLevel) { 30 | this.nodeSize = nodeSize; 31 | this.stopLevel = stopLevel; 32 | } 33 | 34 | public static StressoConfig retrieve(FluoConfiguration fc) { 35 | try (FluoClient client = FluoFactory.newClient(fc)) { 36 | return retrieve(client); 37 | } 38 | } 39 | 40 | public static StressoConfig retrieve(FluoClient client) { 41 | SimpleConfiguration ac = client.getAppConfiguration(); 42 | return new StressoConfig(ac.getInt(Constants.NODE_SIZE_PROP), 43 | ac.getInt(Constants.STOP_LEVEL_PROP)); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /stresso/src/main/java/stresso/trie/StressoObserverProvider.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package stresso.trie; 19 | 20 | import org.apache.fluo.api.observer.ObserverProvider; 21 | 22 | import static org.apache.fluo.api.observer.Observer.NotificationType.STRONG; 23 | import static stresso.trie.Constants.COUNT_WAIT_COL; 24 | 25 | public class StressoObserverProvider implements ObserverProvider { 26 | @Override 27 | public void provide(Registry registry, Context ctx) { 28 | int stopLevel = ctx.getAppConfiguration().getInt(Constants.STOP_LEVEL_PROP); 29 | registry.forColumn(COUNT_WAIT_COL, STRONG).useObserver(new NodeObserver(stopLevel)); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /stresso/src/test/java/stresso/TrieStopLevelIT.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package stresso; 19 | 20 | import org.apache.fluo.api.client.Snapshot; 21 | import org.apache.fluo.api.config.FluoConfiguration; 22 | import org.apache.fluo.api.config.ObserverSpecification; 23 | import org.apache.fluo.api.config.SimpleConfiguration; 24 | import org.apache.fluo.api.data.Bytes; 25 | import org.junit.Assert; 26 | import org.junit.Test; 27 | import stresso.trie.Constants; 28 | import stresso.trie.Node; 29 | import stresso.trie.NodeObserver; 30 | 31 | public class TrieStopLevelIT extends TrieMapRedIT { 32 | 33 | @Override 34 | protected void preInit(FluoConfiguration conf) { 35 | conf.addObserver(new ObserverSpecification(NodeObserver.class.getName())); 36 | 37 | SimpleConfiguration appCfg = conf.getAppConfiguration(); 38 | appCfg.setProperty(Constants.STOP_LEVEL_PROP, 7); 39 | appCfg.setProperty(Constants.NODE_SIZE_PROP, 8); 40 | } 41 | 42 | @Test 43 | public void testEndToEnd() throws Exception { 44 | super.testEndToEnd(); 45 | try (Snapshot snap = client.newSnapshot()) { 46 | Bytes row = Bytes.of(Node.generateRootId(8)); 47 | Assert.assertNull(snap.get(row, Constants.COUNT_SEEN_COL)); 48 | Assert.assertNull(snap.get(row, Constants.COUNT_WAIT_COL)); 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /stresso/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Stresso authors (see AUTHORS) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | log4j.rootLogger=INFO, CA 16 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 17 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 18 | log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c{2}] %-5p: %m%n 19 | 20 | log4j.logger.org.apache.curator=ERROR 21 | log4j.logger.org.apache.accumulo=WARN 22 | log4j.logger.org.apache.commons.vfs2.impl.DefaultFileSystemManager=WARN 23 | log4j.logger.org.apache.fluo=WARN 24 | log4j.logger.org.apache.hadoop=WARN 25 | log4j.logger.org.apache.hadoop.conf=ERROR 26 | log4j.logger.org.apache.hadoop.mapred=ERROR 27 | log4j.logger.org.apache.hadoop.mapreduce=ERROR 28 | log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR 29 | log4j.logger.org.apache.zookeeper.ClientCnxn=FATAL 30 | log4j.logger.org.apache.zookeeper.ZooKeeper=WARN 31 | log4j.logger.stresso=WARN 32 | -------------------------------------------------------------------------------- /webindex/.gitignore: -------------------------------------------------------------------------------- 1 | /logs/ 2 | /data/ 3 | -------------------------------------------------------------------------------- /webindex/.travis.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Webindex authors (see AUTHORS) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# Shared environment checks and build step used by the webindex scripts.
# Expects WI_HOME, WI_CONFIG, WI_VERSION and SPARK_HOME to be set by the caller.

: ${WI_HOME?"WI_HOME must be set"}
: ${WI_CONFIG?"WI_CONFIG must be set"}
: ${SPARK_HOME?"SPARK_HOME must be set"}
# WI_VERSION is interpolated into the jar names below; fail early instead of
# looking for "webindex-data-.jar".
: ${WI_VERSION?"WI_VERSION must be set"}

# get_prop KEY
# Prints the value of the top-level "KEY: value" entry in $WI_CONFIG.
# Only lines whose first field is exactly "KEY:" match, so comment lines that
# merely mention the key are ignored, and only the first match is printed.
function get_prop {
  awk -v key="$1:" '$1 == key { print $2; exit }' "$WI_CONFIG"
}

: ${HADOOP_CONF_DIR?"HADOOP_CONF_DIR must be set in bash env or conf/webindex-env.sh"}
if [ ! -d "$HADOOP_CONF_DIR" ]; then
  echo "HADOOP_CONF_DIR=$HADOOP_CONF_DIR does not exist"
  exit 1
fi
: ${FLUO_HOME?"FLUO_HOME must be set in bash env or conf/webindex-env.sh"}
if [ ! -d "$FLUO_HOME" ]; then
  echo "FLUO_HOME=$FLUO_HOME does not exist"
  exit 1
fi

: ${WI_EXECUTOR_INSTANCES?"WI_EXECUTOR_INSTANCES must be set in bash env or conf/webindex-env.sh"}
: ${WI_EXECUTOR_MEMORY?"WI_EXECUTOR_MEMORY must be set in bash env or conf/webindex-env.sh"}
# Deliberately a single string: callers expand it unquoted so it splits into options.
export COMMON_SPARK_OPTS="--master yarn-client --num-executors $WI_EXECUTOR_INSTANCES --executor-memory $WI_EXECUTOR_MEMORY"

export SPARK_SUBMIT=$SPARK_HOME/bin/spark-submit
if [ ! -f "$SPARK_SUBMIT" ]; then
  echo "The spark-submit command cannot be found in SPARK_HOME=$SPARK_HOME.  Please set SPARK_HOME in conf/webindex-env.sh"
  exit 1
fi

hash mvn 2>/dev/null || { echo >&2 "Maven must be installed & mvn command must be on path.  Aborting."; exit 1; }

# Stop if any command after this fails
set -e

export WI_DATA_JAR=$WI_HOME/modules/data/target/webindex-data-$WI_VERSION.jar
export WI_DATA_DEP_JAR=$WI_HOME/modules/data/target/webindex-data-$WI_VERSION-shaded.jar
# Build the shaded jar only when it is not already present.
if [ ! -f "$WI_DATA_DEP_JAR" ]; then
  echo "Building $WI_DATA_DEP_JAR"
  cd "$WI_HOME"

  : ${ACCUMULO_VERSION?"ACCUMULO_VERSION must be set in bash env or conf/webindex-env.sh"}
  : ${FLUO_VERSION?"FLUO_VERSION must be set in bash env or conf/webindex-env.sh"}
  : ${THRIFT_VERSION?"THRIFT_VERSION must be set in bash env or conf/webindex-env.sh"}
  mvn clean package -Pcreate-shade-jar -DskipTests -Dfluo.version="$FLUO_VERSION" -Daccumulo.version="$ACCUMULO_VERSION" -Dthrift.version="$THRIFT_VERSION"
fi
# Initializes the webindex Fluo application: stages jars, prepares the Fluo
# application properties, runs `fluo init`, then submits the Spark Init job.
# $1 is passed through unchanged as the last argument of the Spark job.

: "${WI_HOME?"WI_HOME must be set"}"

# Brings in get_prop, SPARK_SUBMIT, WI_DATA_JAR, WI_DATA_DEP_JAR and COMMON_SPARK_OPTS.
. "$WI_HOME/bin/impl/base.sh"

# On darwin a backup suffix is passed to `sed -i`. SED is expanded unquoted
# below so that it splits into the command and its options.
if [[ "$OSTYPE" == "darwin"* ]]; then
  export SED="sed -i .bak"
else
  export SED="sed -i"
fi

# stop if any command fails
set -e

: "${SPARK_SUBMIT?"SPARK_SUBMIT must be set"}"
: "${WI_DATA_JAR?"WI_DATA_JAR must be set"}"
: "${WI_DATA_DEP_JAR?"WI_DATA_DEP_JAR must be set"}"

# Name of the Fluo application, read from the webindex config file.
fluo_app=$(get_prop fluoApp)
fluo_cmd=$FLUO_HOME/bin/fluo
if [ ! -f "$fluo_cmd" ]; then
  echo "Fluo command script does not exist at $fluo_cmd"
  exit 1
fi

# Collect the data jar, the copied dependencies and the core jar in one directory.
app_lib=$WI_HOME/target/lib
mkdir -p "$app_lib"
cp "$WI_DATA_JAR" "$app_lib"
# NOTE(review): mvn runs in the current directory; base.sh only changes into
# WI_HOME when it has to build the shaded jar — confirm callers start there.
mvn package -Pcopy-dependencies -DskipTests -DoutputDirectory="$app_lib"
# Add webindex core and its dependencies
cp "$WI_HOME/modules/core/target/webindex-core-$WI_VERSION.jar" "$app_lib"

# Start from Fluo's stock application properties and point the observer init
# directory at the staged library directory.
app_props=$WI_HOME/target/fluo-app.properties
cp "$FLUO_HOME/conf/fluo-app.properties" "$app_props"
$SED "s#^.*fluo.observer.init.dir=[^ ]*#fluo.observer.init.dir=${app_lib}#" "$app_props"

# Runs webindex.data.Configure with the webindex config and the properties file.
java -cp "$app_lib/*:$("$fluo_cmd" classpath)" webindex.data.Configure "$WI_CONFIG" "$app_props"

"$fluo_cmd" init -a "$fluo_app" -p "$app_props" --force

# COMMON_SPARK_OPTS is intentionally unquoted so it splits into separate options.
"$SPARK_SUBMIT" --class webindex.data.Init $COMMON_SPARK_OPTS \
  --conf spark.shuffle.service.enabled=true \
  --conf spark.executor.extraJavaOptions=-XX:+UseCompressedOops \
  $WI_DATA_DEP_JAR $1

echo "Webindex init has completed successfully."
14 | 15 | log4j.rootLogger=INFO, CA 16 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 17 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 18 | log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c] %-5p: %m%n 19 | 20 | log4j.logger.org.apache.accumulo=WARN 21 | log4j.logger.org.apache.curator=ERROR 22 | log4j.logger.org.apache.fluo=WARN 23 | log4j.logger.org.apache.hadoop=WARN 24 | log4j.logger.org.apache.hadoop.mapreduce=ERROR 25 | log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR 26 | log4j.logger.org.apache.zookeeper=ERROR 27 | log4j.logger.org.eclipse.jetty=WARN 28 | log4j.logger.org.spark-project=WARN 29 | log4j.logger.webindex=INFO 30 | -------------------------------------------------------------------------------- /webindex/conf/examples/webindex-env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Webindex authors (see AUTHORS) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Set environment variables if they are not already set. Please modify the 16 | # export statement to use the correct directory. Remove the test statement 17 | # to override any previously set environment. 
## Installation directories
test -z "$HADOOP_PREFIX" && export HADOOP_PREFIX=/path/to/hadoop
test -z "$HADOOP_CONF_DIR" && export HADOOP_CONF_DIR=/path/to/hadoop/etc/hadoop
test -z "$FLUO_HOME" && export FLUO_HOME=/path/to/fluo
test -z "$SPARK_HOME" && export SPARK_HOME=/path/to/spark

## Accumulo and Fluo versions that should be included in the shaded jar created for Spark.
export FLUO_VERSION=`$FLUO_HOME/bin/fluo version`
export ACCUMULO_VERSION=`accumulo version`

## Accumulo client will likely not work without correct thrift version
# Compare major/minor numerically. A plain string comparison orders "1.10"
# before "1.8" and would select the thrift version meant for pre-1.8 releases.
if [[ $ACCUMULO_VERSION =~ ^([0-9]+)\.([0-9]+) ]]; then
  # 10# forces base 10 so a component such as "08" is not read as octal.
  wi_acc_major=$((10#${BASH_REMATCH[1]}))
  wi_acc_minor=$((10#${BASH_REMATCH[2]}))
  if (( wi_acc_major < 1 || (wi_acc_major == 1 && wi_acc_minor < 8) )); then
    THRIFT_VERSION="0.9.1"
  elif (( wi_acc_major < 2 )); then
    THRIFT_VERSION="0.9.3"
  else
    THRIFT_VERSION="0.10.0"
  fi
  unset wi_acc_major wi_acc_minor
# Version string not in MAJOR.MINOR form: keep the previous string comparison.
elif [[ $ACCUMULO_VERSION < "1.8" ]]; then
  THRIFT_VERSION="0.9.1"
elif [[ $ACCUMULO_VERSION < "2.0" ]]; then
  THRIFT_VERSION="0.9.3"
else
  THRIFT_VERSION="0.10.0"
fi
# Exported like the other version variables so child processes see it too.
export THRIFT_VERSION

## Spark
# Number of Spark executor instances
export WI_EXECUTOR_INSTANCES=2
# Amount of memory given to each Spark executor
export WI_EXECUTOR_MEMORY=512m
14 | 15 | # Accumulo table where indexes are exported for search 16 | accumuloIndexTable: webindex_search 17 | # Fluo Application Name 18 | fluoApp: webindex 19 | # Webindex builds multiple data sets for its computation. Each of these data 20 | # sets needs to be spread across the cluster. The setting below determines how 21 | # much each dataset will be split up. Ideally this would be a small multiple of 22 | # the number of Accumulo tablet servers. 23 | numTablets: 20 24 | # Number of buckets for collision free maps and export queue. This setting is 25 | # used during initialization of the Fluo table and when Spark loads the initial 26 | # data. The value of numBuckets must be the same for these two tasks. 27 | numBuckets: 100 28 | #This determines how fast each Spark load task will load documents. Set to 0 29 | #for no limit. Setting this to 50 and running 10 concurrent load tasks would 30 | #limit the load rate to 500 documents/sec. 31 | loadRateLimit: 0 32 | # HDFS temporary directory 33 | hdfsTempDir: /cc/temp 34 | -------------------------------------------------------------------------------- /webindex/contrib/webindex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/examples/a155274150ec5fdf74341f340294e60af7d48fed/webindex/contrib/webindex.png -------------------------------------------------------------------------------- /webindex/docs/webindex_graphic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/examples/a155274150ec5fdf74341f340294e60af7d48fed/webindex/docs/webindex_graphic.png -------------------------------------------------------------------------------- /webindex/modules/core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | org.apache.fluo 22 | webindex-parent 23 | 0.0.1-SNAPSHOT 24 | ../../pom.xml 25 | 26 | 
webindex-core 27 | WebIndex Core 28 | 29 | 30 | com.esotericsoftware.yamlbeans 31 | yamlbeans 32 | 33 | 34 | com.google.code.gson 35 | gson 36 | 37 | 38 | com.google.guava 39 | guava 40 | 41 | 42 | commons-lang 43 | commons-lang 44 | 45 | 46 | commons-validator 47 | commons-validator 48 | 1.4.1 49 | 50 | 51 | org.apache.accumulo 52 | accumulo-core 53 | 54 | 55 | org.apache.fluo 56 | fluo-api 57 | 58 | 59 | org.apache.fluo 60 | fluo-recipes-accumulo 61 | 62 | 63 | org.slf4j 64 | slf4j-api 65 | 66 | 67 | 68 | junit 69 | junit 70 | test 71 | 72 | 73 | org.slf4j 74 | slf4j-log4j12 75 | test 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /webindex/modules/core/src/main/java/webindex/core/Constants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
/**
 * String names for column families and qualifiers, the {@link Column} objects built from them,
 * and the shared {@link TypeLayer} instance.
 */
public class Constants {

  // Column Families
  // for page
  public static final String PAGE = "page";
  public static final String INLINKS = "inlinks";
  // for domains
  public static final String DOMAIN = "domain";
  public static final String PAGES = "pages";
  public static final String RANK = "rank";

  // Column Qualifiers
  // for page
  public static final String INCOUNT = "incount";
  public static final String NEW = "new";
  public static final String CUR = "cur";
  // for domains
  public static final String PAGECOUNT = "pagecount";

  // Columns
  // page:new
  public static final Column PAGE_NEW_COL = new Column(PAGE, NEW);
  // page:cur
  public static final Column PAGE_CUR_COL = new Column(PAGE, CUR);
  // page:incount
  public static final Column PAGE_INCOUNT_COL = new Column(PAGE, INCOUNT);
  // domain:pagecount
  public static final Column PAGECOUNT_COL = new Column(DOMAIN, PAGECOUNT);

  // Type layer constructed with a StringEncoder.
  public static final TypeLayer TYPEL = new TypeLayer(new StringEncoder());
}
/**
 * Holds a domain name together with a mutable total that starts at zero.
 */
public class DomainStats {

  private String domain;
  private Long total = 0L;

  /**
   * @param domain name returned by {@link #getDomain()}
   */
  public DomainStats(String domain) {
    this.domain = domain;
  }

  /** @return the domain given at construction */
  public String getDomain() {
    return this.domain;
  }

  /** @return the current total; zero until {@link #setTotal(Long)} is called */
  public Long getTotal() {
    return this.total;
  }

  /** @param total replaces the current total as-is */
  public void setTotal(Long total) {
    this.total = total;
  }
}
16 | */ 17 | 18 | package webindex.core.models; 19 | 20 | import java.io.Serializable; 21 | import java.util.Objects; 22 | 23 | public class Link implements Serializable, Comparable { 24 | 25 | private static final long serialVersionUID = 1L; 26 | 27 | private String url; 28 | private String uri; 29 | private String anchorText; 30 | 31 | public Link() {} 32 | 33 | public Link(String uri, String anchorText) { 34 | Objects.requireNonNull(uri); 35 | Objects.requireNonNull(anchorText); 36 | this.url = URL.fromUri(uri).toString(); 37 | this.uri = uri; 38 | this.anchorText = anchorText; 39 | } 40 | 41 | public String getUrl() { 42 | return url; 43 | } 44 | 45 | public String getUri() { 46 | return uri; 47 | } 48 | 49 | public String getAnchorText() { 50 | return anchorText; 51 | } 52 | 53 | 54 | public static Link of(String uri, String anchorText) { 55 | return new Link(uri, anchorText); 56 | } 57 | 58 | public static Link of(String uri) { 59 | return new Link(uri, ""); 60 | } 61 | 62 | public static Link of(URL url, String anchorText) { 63 | return new Link(url.toUri(), anchorText); 64 | } 65 | 66 | public static Link of(URL url) { 67 | return new Link(url.toUri(), ""); 68 | } 69 | 70 | @Override 71 | public boolean equals(Object o) { 72 | if (o instanceof Link) { 73 | Link other = (Link) o; 74 | return url.equals(other.url) && uri.equals(other.uri); 75 | } 76 | return false; 77 | } 78 | 79 | @Override 80 | public int hashCode() { 81 | int result = url.hashCode(); 82 | result = 31 * result + uri.hashCode(); 83 | return result; 84 | } 85 | 86 | @Override 87 | public int compareTo(Link o) { 88 | int c = uri.compareTo(o.uri); 89 | if (c == 0) { 90 | c = url.compareTo(o.url); 91 | } 92 | 93 | return c; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /webindex/modules/core/src/main/java/webindex/core/models/Links.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to 
the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.core.models; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | public class Links { 24 | 25 | private String url; 26 | private String linkType; 27 | private String next = ""; 28 | private Integer pageNum; 29 | private Long total; 30 | private List links = new ArrayList<>(); 31 | 32 | public Links() { 33 | // Jackson deserialization 34 | } 35 | 36 | public Links(String url, String linkType, Integer pageNum) { 37 | this.url = url; 38 | this.linkType = linkType; 39 | this.pageNum = pageNum; 40 | } 41 | 42 | public Long getTotal() { 43 | return total; 44 | } 45 | 46 | public void setTotal(Long total) { 47 | this.total = total; 48 | } 49 | 50 | public String getUrl() { 51 | return url; 52 | } 53 | 54 | public List getLinks() { 55 | return links; 56 | } 57 | 58 | public void addLink(Link link) { 59 | links.add(link); 60 | } 61 | 62 | public String getLinkType() { 63 | return linkType; 64 | } 65 | 66 | public Integer getPageNum() { 67 | return pageNum; 68 | } 69 | 70 | public String getNext() { 71 | return next; 72 | } 73 | 74 | public void setNext(String next) { 75 | this.next = next; 76 | } 77 | } 78 | 
-------------------------------------------------------------------------------- /webindex/modules/core/src/main/java/webindex/core/models/Pages.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
/**
 * One page of scored page URLs for a domain, with paging information ({@code pageNum},
 * {@code next}, {@code total}).
 */
public class Pages {

  private String domain;
  private String next = "";
  private Integer pageNum;
  private Long total;
  private List<PageScore> pages = new ArrayList<>();

  public Pages() {
    // Jackson deserialization
  }

  public Pages(String domain, Integer pageNum) {
    this.domain = domain;
    this.pageNum = pageNum;
  }

  public Long getTotal() {
    return total;
  }

  public void setTotal(Long total) {
    this.total = total;
  }

  public String getDomain() {
    return domain;
  }

  /** Returns the live backing list of entries added through the {@code addPage} methods. */
  public List<PageScore> getPages() {
    return pages;
  }

  /** Returns the next-page marker; the empty string until {@link #setNext(String)} is called. */
  public String getNext() {
    return next;
  }

  public void setNext(String next) {
    this.next = next;
  }

  public Integer getPageNum() {
    return pageNum;
  }

  public void addPage(PageScore pc) {
    pages.add(pc);
  }

  /** Convenience overload that wraps {@code url} and {@code score} in a {@link PageScore}. */
  public void addPage(String url, Long score) {
    pages.add(new PageScore(url, score));
  }

  /** A page URL paired with its score. */
  public class PageScore {

    private String url;
    private Long score;

    public PageScore(String url, Long score) {
      this.url = url;
      this.score = score;
    }

    public String getUrl() {
      return url;
    }

    public Long getScore() {
      return score;
    }
  }
}
/**
 * One page of key/value results, with paging information ({@code pageNum}, {@code next}).
 */
public class TopResults {

  private String next = "";
  private Integer pageNum;
  private List<Result> results = new ArrayList<>();

  public Integer getPageNum() {
    return pageNum;
  }

  public void setPageNum(Integer pageNum) {
    this.pageNum = pageNum;
  }

  /** Returns the next-page marker; the empty string until {@link #setNext(String)} is called. */
  public String getNext() {
    return next;
  }

  /** Appends a new {@link Result} built from {@code key} and {@code value}. */
  public void addResult(String key, Long value) {
    results.add(new Result(key, value));
  }

  /** Returns the live backing list of results, in insertion order. */
  public List<Result> getResults() {
    return results;
  }

  public void setNext(String next) {
    this.next = next;
  }

  /** A single key with its associated value. */
  public class Result {

    private String key;
    private Long value;

    Result(String key, Long value) {
      this.key = key;
      this.value = value;
    }

    public String getKey() {
      return key;
    }

    public Long getValue() {
      return value;
    }
  }
}
license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.core.models; 19 | 20 | import java.io.Serializable; 21 | import java.util.Optional; 22 | 23 | import com.google.common.base.Preconditions; 24 | 25 | /** 26 | * Used by URI collision free map 27 | */ 28 | public class UriInfo implements Serializable { 29 | 30 | private static final long serialVersionUID = 1L; 31 | 32 | public static final UriInfo ZERO = new UriInfo(0, 0); 33 | 34 | // the numbers of documents that link to this URI 35 | public long linksTo; 36 | 37 | // the number of documents with this URI. 
Should be 0 or 1 38 | public int docs; 39 | 40 | public UriInfo() {} 41 | 42 | public UriInfo(long linksTo, int docs) { 43 | this.linksTo = linksTo; 44 | this.docs = docs; 45 | } 46 | 47 | public void add(UriInfo other) { 48 | Preconditions.checkArgument(this != ZERO); 49 | this.linksTo += other.linksTo; 50 | this.docs += other.docs; 51 | } 52 | 53 | @Override 54 | public String toString() { 55 | return linksTo + " " + docs; 56 | } 57 | 58 | @Override 59 | public boolean equals(Object o) { 60 | if (o instanceof UriInfo) { 61 | UriInfo oui = (UriInfo) o; 62 | return linksTo == oui.linksTo && docs == oui.docs; 63 | } 64 | return false; 65 | } 66 | 67 | @Override 68 | public int hashCode() { 69 | return docs + (int) linksTo; 70 | } 71 | 72 | public static UriInfo merge(UriInfo u1, UriInfo u2) { 73 | UriInfo total = new UriInfo(0, 0); 74 | total.add(u1); 75 | total.add(u2); 76 | return total; 77 | } 78 | 79 | public static Optional reduce(Iterable uriInfos) { 80 | UriInfo sum = new UriInfo(); 81 | for (UriInfo uriInfo : uriInfos) { 82 | sum.add(uriInfo); 83 | } 84 | return sum.equals(ZERO) ? Optional.empty() : Optional.of(sum); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /webindex/modules/core/src/main/java/webindex/core/models/export/DomainUpdate.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.core.models.export; 19 | 20 | /** 21 | * Represents index updates for domain 22 | */ 23 | public class DomainUpdate implements IndexUpdate { 24 | 25 | private String domain; 26 | private Long oldPageCount; 27 | private Long newPageCount; 28 | 29 | public DomainUpdate() {} // For serialization 30 | 31 | public DomainUpdate(String domain, Long oldPageCount, Long newPageCount) { 32 | this.domain = domain; 33 | this.oldPageCount = oldPageCount; 34 | this.newPageCount = newPageCount; 35 | } 36 | 37 | public String getDomain() { 38 | return domain; 39 | } 40 | 41 | public Long getOldPageCount() { 42 | return oldPageCount; 43 | } 44 | 45 | public Long getNewPageCount() { 46 | return newPageCount; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /webindex/modules/core/src/main/java/webindex/core/models/export/IndexUpdate.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
/**
 * Marker interface for index updates. It declares no methods; it gives the concrete update types
 * (such as the domain, page and URI updates in this package) a common type.
 */
public interface IndexUpdate {

}
16 | */ 17 | 18 | package webindex.core.models.export; 19 | 20 | import java.util.List; 21 | 22 | import webindex.core.models.Link; 23 | 24 | /** 25 | * Represents index updates for pages 26 | */ 27 | public class PageUpdate implements IndexUpdate { 28 | 29 | private String uri; 30 | private String json; 31 | private List addedLinks; 32 | private List deletedLinks; 33 | 34 | public PageUpdate() {} // For serialization 35 | 36 | public PageUpdate(String uri, String json, List addedLinks, List deletedLinks) { 37 | this.uri = uri; 38 | this.json = json; 39 | this.addedLinks = addedLinks; 40 | this.deletedLinks = deletedLinks; 41 | } 42 | 43 | public String getUri() { 44 | return uri; 45 | } 46 | 47 | public String getJson() { 48 | return json; 49 | } 50 | 51 | public List getAddedLinks() { 52 | return addedLinks; 53 | } 54 | 55 | public List getDeletedLinks() { 56 | return deletedLinks; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /webindex/modules/core/src/main/java/webindex/core/models/export/UriUpdate.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.core.models.export; 19 | 20 | import webindex.core.models.UriInfo; 21 | 22 | /** 23 | * Represents index updates for URIs 24 | */ 25 | public class UriUpdate implements IndexUpdate { 26 | 27 | private String uri; 28 | private UriInfo oldInfo; 29 | private UriInfo newInfo; 30 | 31 | public UriUpdate() {} // For serialization 32 | 33 | public UriUpdate(String uri, UriInfo oldInfo, UriInfo newInfo) { 34 | this.uri = uri; 35 | this.oldInfo = oldInfo; 36 | this.newInfo = newInfo; 37 | } 38 | 39 | public String getUri() { 40 | return uri; 41 | } 42 | 43 | public UriInfo getOldInfo() { 44 | return oldInfo; 45 | } 46 | 47 | public UriInfo getNewInfo() { 48 | return newInfo; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /webindex/modules/core/src/main/java/webindex/core/util/Pager.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.core.util; 19 | 20 | import java.util.Iterator; 21 | import java.util.Map; 22 | import java.util.concurrent.atomic.AtomicBoolean; 23 | import java.util.function.Consumer; 24 | 25 | import org.apache.accumulo.core.client.Scanner; 26 | import org.apache.accumulo.core.data.Key; 27 | import org.apache.accumulo.core.data.Range; 28 | import org.apache.accumulo.core.data.Value; 29 | 30 | public class Pager { 31 | 32 | private Scanner scanner; 33 | private int pageSize; 34 | private Range pageRange; 35 | private Consumer entryHandler; 36 | private AtomicBoolean pageRead = new AtomicBoolean(false); 37 | 38 | public class PageEntry { 39 | 40 | private Key key; 41 | private Value value; 42 | private boolean isNext; 43 | 44 | public PageEntry(Key key, Value value, boolean isNext) { 45 | this.key = key; 46 | this.value = value; 47 | this.isNext = isNext; 48 | } 49 | 50 | public Key getKey() { 51 | return key; 52 | } 53 | 54 | public Value getValue() { 55 | return value; 56 | } 57 | 58 | public boolean isNext() { 59 | return isNext; 60 | } 61 | } 62 | 63 | private Pager(Scanner scanner, Range pageRange, int pageSize, Consumer entryHandler) { 64 | this.scanner = scanner; 65 | this.pageRange = pageRange; 66 | this.pageSize = pageSize; 67 | this.entryHandler = entryHandler; 68 | } 69 | 70 | public void read(Key startKey) { 71 | if (pageRead.get() == true) { 72 | throw new IllegalStateException("Pager.read() cannot be called twice"); 73 | } 74 | scanner.setRange(new Range(startKey, pageRange.getEndKey())); 75 | handleStart(scanner.iterator()); 76 | } 77 | 78 | public void read(int pageNum) { 79 | if (pageRead.get() == true) { 80 | throw new IllegalStateException("Pager.read() cannot be called twice"); 81 | } 82 | scanner.setRange(pageRange); 83 | Iterator> iterator = scanner.iterator(); 84 | if (pageNum > 0) { 85 | long skip = 0; 86 | while (skip < (pageNum * pageSize)) { 87 | iterator.next(); 88 | skip++; 89 | } 90 | } 91 | handleStart(iterator); 
92 | } 93 | 94 | private void handleStart(Iterator> iterator) { 95 | long num = 0; 96 | while (iterator.hasNext() && (num < (pageSize + 1))) { 97 | Map.Entry entry = iterator.next(); 98 | entryHandler.accept(new PageEntry(entry.getKey(), entry.getValue(), num == pageSize)); 99 | num++; 100 | } 101 | } 102 | 103 | public static Pager build(Scanner scanner, Range pageRange, int pageSize, 104 | Consumer entryHandler) { 105 | return new Pager(scanner, pageRange, pageSize, entryHandler); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /webindex/modules/core/src/test/java/webindex/core/WebIndexConfigTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.core; 19 | 20 | import org.junit.Assert; 21 | import org.junit.Test; 22 | 23 | public class WebIndexConfigTest { 24 | 25 | @Test 26 | public void testBasic() throws Exception { 27 | WebIndexConfig config = WebIndexConfig.load("../../conf/examples/webindex.yml", false); 28 | Assert.assertEquals("webindex_search", config.accumuloIndexTable); 29 | Assert.assertEquals("webindex", config.fluoApp); 30 | Assert.assertEquals("/cc/temp", config.hdfsTempDir); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /webindex/modules/core/src/test/java/webindex/core/models/LinkTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.core.models; 19 | 20 | import org.junit.Assert; 21 | import org.junit.Test; 22 | 23 | public class LinkTest { 24 | 25 | @Test 26 | public void testBasic() { 27 | Link link1 = Link.of("com.a>>o>/", "anchor text"); 28 | Assert.assertEquals("http://a.com/", link1.getUrl()); 29 | Assert.assertEquals("com.a>>o>/", link1.getUri()); 30 | Assert.assertEquals("anchor text", link1.getAnchorText()); 31 | 32 | Link link2 = Link.of("com.a>>o>/", "other text"); 33 | Assert.assertEquals(link1, link2); 34 | 35 | Link link3 = Link.of(URLTest.from("http://a.com"), "more other text"); 36 | Assert.assertEquals("com.a>>o>/", link3.getUri()); 37 | Assert.assertEquals(link1, link3); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /webindex/modules/core/src/test/java/webindex/core/models/PageTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.core.models; 19 | 20 | import com.google.gson.Gson; 21 | import org.junit.Assert; 22 | import org.junit.Test; 23 | 24 | public class PageTest { 25 | 26 | @Test 27 | public void testBasic() { 28 | 29 | Page page = new Page(URLTest.from("http://example.com").toUri()); 30 | Assert.assertEquals("http://example.com/", page.getUrl()); 31 | Assert.assertEquals("com.example>>o>/", page.getUri()); 32 | Assert.assertEquals(Long.valueOf(0), page.getNumOutbound()); 33 | Assert.assertTrue(page.addOutbound(Link.of(URLTest.from("http://test1.com"), "test1"))); 34 | Assert.assertEquals(Long.valueOf(1), page.getNumOutbound()); 35 | Assert.assertTrue(page.addOutbound(Link.of(URLTest.from("http://test2.com"), "test2"))); 36 | Assert.assertEquals(Long.valueOf(2), page.getNumOutbound()); 37 | Assert.assertFalse(page.addOutbound(Link.of(URLTest.from("http://test2.com"), "test1234"))); 38 | Assert.assertEquals(Long.valueOf(2), page.getNumOutbound()); 39 | 40 | Gson gson = new Gson(); 41 | String json = gson.toJson(page); 42 | Assert.assertNotNull(json); 43 | Assert.assertFalse(json.isEmpty()); 44 | 45 | Page after = gson.fromJson(json, Page.class); 46 | Assert.assertEquals(page.getUrl(), after.getUrl()); 47 | Assert.assertEquals(page.getOutboundLinks().size(), after.getOutboundLinks().size()); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /webindex/modules/core/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Webindex authors (see AUTHORS) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | log4j.rootLogger=INFO, CA 16 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 17 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 18 | log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c] %-5p: %m%n 19 | 20 | log4j.logger.webindex=WARN 21 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/data/CalcSplits.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.data; 19 | 20 | import java.util.SortedSet; 21 | 22 | import org.apache.fluo.api.data.Bytes; 23 | import org.apache.fluo.api.data.RowColumn; 24 | import org.apache.hadoop.conf.Configuration; 25 | import org.apache.hadoop.io.Text; 26 | import org.apache.spark.SparkConf; 27 | import org.apache.spark.api.java.JavaPairRDD; 28 | import org.apache.spark.api.java.JavaRDD; 29 | import org.apache.spark.api.java.JavaSparkContext; 30 | import org.archive.io.ArchiveReader; 31 | import org.slf4j.Logger; 32 | import org.slf4j.LoggerFactory; 33 | import webindex.core.models.Page; 34 | import webindex.core.models.UriInfo; 35 | import webindex.data.spark.IndexEnv; 36 | import webindex.data.spark.IndexStats; 37 | import webindex.data.spark.IndexUtil; 38 | import webindex.data.util.WARCFileInputFormat; 39 | 40 | public class CalcSplits { 41 | 42 | private static final Logger log = LoggerFactory.getLogger(CalcSplits.class); 43 | 44 | public static void main(String[] args) { 45 | if (args.length != 1) { 46 | log.error("Usage: CalcSplits "); 47 | System.exit(1); 48 | } 49 | final String dataDir = args[0]; 50 | IndexEnv.validateDataDir(dataDir); 51 | 52 | SparkConf sparkConf = new SparkConf().setAppName("webindex-calcsplits"); 53 | try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) { 54 | 55 | IndexStats stats = new IndexStats(ctx); 56 | 57 | final JavaPairRDD archives = ctx.newAPIHadoopFile(dataDir, 58 | WARCFileInputFormat.class, Text.class, ArchiveReader.class, new Configuration()); 59 | 60 | JavaRDD pages = IndexUtil.createPages(archives); 61 | 62 | JavaPairRDD uriMap = IndexUtil.createUriMap(pages); 63 | JavaPairRDD domainMap = IndexUtil.createDomainMap(uriMap); 64 | JavaPairRDD accumuloIndex = 65 | IndexUtil.createAccumuloIndex(stats, pages, uriMap, domainMap); 66 | SortedSet splits = IndexUtil.calculateSplits(accumuloIndex, 100); 67 | log.info("Accumulo splits:"); 68 | splits.forEach(System.out::println); 69 | } 70 | } 71 | } 72 | 
-------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/data/Configure.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.data; 19 | 20 | import java.io.BufferedWriter; 21 | import java.io.File; 22 | import java.io.FileWriter; 23 | import java.io.PrintWriter; 24 | import java.util.Iterator; 25 | 26 | import com.google.common.base.Preconditions; 27 | import org.apache.fluo.api.config.FluoConfiguration; 28 | import org.slf4j.Logger; 29 | import org.slf4j.LoggerFactory; 30 | import webindex.core.WebIndexConfig; 31 | import webindex.data.spark.IndexEnv; 32 | 33 | public class Configure { 34 | 35 | private static final Logger log = LoggerFactory.getLogger(Configure.class); 36 | 37 | public static void main(String[] args) throws Exception { 38 | 39 | if (args.length != 2) { 40 | log.error("Usage: Configure "); 41 | System.exit(1); 42 | } 43 | WebIndexConfig webIndexConfig = WebIndexConfig.load(args[0]); 44 | String appPropsPath = args[1]; 45 | Preconditions.checkArgument(new File(appPropsPath).exists(), 46 | "File does not exist: " + appPropsPath); 47 | 48 | FluoConfiguration fluoConfig = 49 | new FluoConfiguration(new File(webIndexConfig.getConnPropsPath())); 50 | fluoConfig.load(new File(appPropsPath)); 51 | 52 | 53 | IndexEnv env = new IndexEnv(webIndexConfig, fluoConfig); 54 | env.initAccumuloIndexTable(); 55 | 56 | FluoConfiguration appConfig = new FluoConfiguration(); 57 | env.configureApplication(fluoConfig, appConfig); 58 | Iterator iter = appConfig.getKeys(); 59 | try ( 60 | PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(appPropsPath, true)))) { 61 | while (iter.hasNext()) { 62 | String key = iter.next(); 63 | out.println(key + " = " + appConfig.getRawString(key)); 64 | } 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/data/FluoApp.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license 
/**
 * Writes the webindex Fluo application settings into a {@link FluoConfiguration}.
 */
public class FluoApp {

  /** Identifier shared by the export queue and the Accumulo exporter configured below. */
  public static final String EXPORT_QUEUE_ID = "eq";

  /**
   * Populates {@code appConfig} with the observer provider, Kryo factory, combine queues, export
   * queue, Accumulo exporter and page row hasher settings.
   *
   * @param connectionConfig source of the Accumulo instance, zookeepers, user and password given
   *        to the exporter
   * @param appConfig configuration that every setting is saved into
   * @param exportTable table name given to the Accumulo exporter
   * @param numBuckets bucket count passed to the combine queues and the export queue
   * @param numTablets tablet count passed to the combine queues and the row hasher; also the
   *        divisor used to derive the export queue's buckets per tablet
   */
  public static void configureApplication(FluoConfiguration connectionConfig,
      FluoConfiguration appConfig, String exportTable, int numBuckets, int numTablets) {

    appConfig.setObserverProvider(WebindexObservers.class);

    KryoSimplerSerializer.setKryoFactory(appConfig, WebindexKryoFactory.class);

    UriCombineQ.configure(appConfig, numBuckets, numTablets);
    DomainCombineQ.configure(appConfig, numBuckets, numTablets);

    // numBuckets / numTablets is integer division: the result is truncated, and a numTablets of 0
    // throws ArithmeticException.
    ExportQueue.configure(EXPORT_QUEUE_ID).keyType(String.class).valueType(IndexUpdate.class)
        .buckets(numBuckets).bucketsPerTablet(numBuckets / numTablets).save(appConfig);

    AccumuloExporter.configure(EXPORT_QUEUE_ID)
        .instance(connectionConfig.getAccumuloInstance(), connectionConfig.getAccumuloZookeepers())
        .credentials(connectionConfig.getAccumuloUser(), connectionConfig.getAccumuloPassword())
        .table(exportTable).save(appConfig);

    RowHasher.configure(appConfig, PageObserver.getPageRowHasher().getPrefix(), numTablets);
  }
}
16 | */ 17 | 18 | package webindex.data; 19 | 20 | import org.apache.hadoop.conf.Configuration; 21 | import org.apache.hadoop.io.Text; 22 | import org.apache.spark.SparkConf; 23 | import org.apache.spark.api.java.JavaPairRDD; 24 | import org.apache.spark.api.java.JavaRDD; 25 | import org.apache.spark.api.java.JavaSparkContext; 26 | import org.archive.io.ArchiveReader; 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | import webindex.core.WebIndexConfig; 30 | import webindex.core.models.Page; 31 | import webindex.data.spark.IndexEnv; 32 | import webindex.data.spark.IndexStats; 33 | import webindex.data.spark.IndexUtil; 34 | import webindex.data.util.WARCFileInputFormat; 35 | 36 | public class Init { 37 | 38 | private static final Logger log = LoggerFactory.getLogger(Init.class); 39 | 40 | public static void main(String[] args) throws Exception { 41 | 42 | if (args.length > 1) { 43 | log.error("Usage: Init []"); 44 | System.exit(1); 45 | } 46 | WebIndexConfig webIndexConfig = WebIndexConfig.load(); 47 | 48 | IndexEnv env = new IndexEnv(webIndexConfig); 49 | env.setFluoTableSplits(); 50 | log.info("Initialized Fluo table splits"); 51 | 52 | if (args.length == 1) { 53 | final String dataDir = args[0]; 54 | IndexEnv.validateDataDir(dataDir); 55 | 56 | SparkConf sparkConf = new SparkConf().setAppName("webindex-init"); 57 | try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) { 58 | IndexStats stats = new IndexStats(ctx); 59 | 60 | final JavaPairRDD archives = ctx.newAPIHadoopFile(dataDir, 61 | WARCFileInputFormat.class, Text.class, ArchiveReader.class, new Configuration()); 62 | 63 | JavaRDD pages = IndexUtil.createPages(archives); 64 | 65 | env.initializeIndexes(ctx, pages, stats); 66 | 67 | stats.print(); 68 | } 69 | } else { 70 | log.info("An init data dir was not specified"); 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- 
/webindex/modules/data/src/main/java/webindex/data/TestParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.data; 19 | 20 | import java.net.URL; 21 | import java.util.List; 22 | 23 | import org.apache.spark.SparkConf; 24 | import org.apache.spark.api.java.JavaRDD; 25 | import org.apache.spark.api.java.JavaSparkContext; 26 | import org.archive.io.ArchiveReader; 27 | import org.archive.io.ArchiveRecord; 28 | import org.archive.io.warc.WARCReaderFactory; 29 | import org.slf4j.Logger; 30 | import org.slf4j.LoggerFactory; 31 | import webindex.core.WebIndexConfig; 32 | import webindex.data.spark.IndexEnv; 33 | import webindex.data.util.ArchiveUtil; 34 | 35 | public class TestParser { 36 | 37 | private static final Logger log = LoggerFactory.getLogger(TestParser.class); 38 | 39 | public static void main(String[] args) throws Exception { 40 | 41 | if (args.length != 2) { 42 | log.error("Usage: TestParser "); 43 | System.exit(1); 44 | } 45 | final List loadList = IndexEnv.getPathsRange(args[0], args[1]); 46 | if (loadList.isEmpty()) { 47 | log.error("No files to load 
given {} {}", args[0], args[1]); 48 | System.exit(1); 49 | } 50 | 51 | WebIndexConfig.load(); 52 | 53 | SparkConf sparkConf = new SparkConf().setAppName("webindex-test-parser"); 54 | try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) { 55 | 56 | log.info("Parsing {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1], 57 | args[0]); 58 | 59 | JavaRDD loadRDD = ctx.parallelize(loadList, loadList.size()); 60 | 61 | final String prefix = WebIndexConfig.CC_URL_PREFIX; 62 | 63 | loadRDD.foreachPartition(iter -> iter.forEachRemaining(path -> { 64 | String urlToCopy = prefix + path; 65 | log.info("Parsing {}", urlToCopy); 66 | try { 67 | ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0); 68 | for (ArchiveRecord record : reader) { 69 | ArchiveUtil.buildPageIgnoreErrors(record); 70 | } 71 | } catch (Exception e) { 72 | log.error("Exception while processing {}", path, e); 73 | } 74 | })); 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/data/fluo/DomainCombineQ.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.data.fluo; 19 | 20 | import org.apache.fluo.api.client.TransactionBase; 21 | import org.apache.fluo.api.config.FluoConfiguration; 22 | import org.apache.fluo.api.metrics.Meter; 23 | import org.apache.fluo.api.metrics.MetricsReporter; 24 | import org.apache.fluo.recipes.core.combine.ChangeObserver; 25 | import org.apache.fluo.recipes.core.combine.CombineQueue; 26 | import org.apache.fluo.recipes.core.export.ExportQueue; 27 | import webindex.core.models.export.DomainUpdate; 28 | import webindex.core.models.export.IndexUpdate; 29 | 30 | public class DomainCombineQ { 31 | 32 | public static final String DOMAIN_COMBINE_Q_ID = "dm"; 33 | 34 | /** 35 | * Observes domain map updates and adds those updates to an export queue. 36 | */ 37 | public static class DomainUpdateObserver implements ChangeObserver { 38 | 39 | private ExportQueue exportQ; 40 | private Meter domainsNew; 41 | private Meter domainsChanged; 42 | 43 | DomainUpdateObserver(ExportQueue exportQ, MetricsReporter reporter) { 44 | this.exportQ = exportQ; 45 | domainsNew = reporter.meter("webindex_domains_new"); 46 | domainsChanged = reporter.meter("webindex_domains_changed"); 47 | } 48 | 49 | @Override 50 | public void process(TransactionBase tx, Iterable> updates) { 51 | for (Change update : updates) { 52 | String domain = update.getKey(); 53 | Long oldVal = update.getOldValue().orElse(0L); 54 | Long newVal = update.getNewValue().orElse(0L); 55 | if (oldVal == 0L && newVal > 0L) { 56 | domainsNew.mark(); 57 | } 58 | exportQ.add(tx, domain, new DomainUpdate(domain, oldVal, newVal)); 59 | domainsChanged.mark(); 60 | } 61 | } 62 | } 63 | 64 | /** 65 | * A helper method for configuring the domain map before initializing Fluo. 
66 | */ 67 | public static void configure(FluoConfiguration config, int numBuckets, int numTablets) { 68 | CombineQueue.configure(DOMAIN_COMBINE_Q_ID).keyType(String.class).valueType(Long.class) 69 | .buckets(numBuckets).bucketsPerTablet(numBuckets / numTablets).save(config); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/data/fluo/IndexUpdateTranslator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.data.fluo; 19 | 20 | import java.util.function.Consumer; 21 | 22 | import org.apache.accumulo.core.data.Mutation; 23 | import org.apache.fluo.api.metrics.Meter; 24 | import org.apache.fluo.api.metrics.MetricsReporter; 25 | import org.apache.fluo.recipes.accumulo.export.function.AccumuloTranslator; 26 | import org.apache.fluo.recipes.core.export.SequencedExport; 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | import webindex.core.IndexClient; 30 | import webindex.core.models.export.DomainUpdate; 31 | import webindex.core.models.export.IndexUpdate; 32 | import webindex.core.models.export.PageUpdate; 33 | import webindex.core.models.export.UriUpdate; 34 | 35 | public class IndexUpdateTranslator implements AccumuloTranslator { 36 | 37 | private static final Logger log = LoggerFactory.getLogger(IndexUpdateTranslator.class); 38 | 39 | private Meter pagesExported; 40 | private Meter linksExported; 41 | private Meter domainsExported; 42 | 43 | public IndexUpdateTranslator(MetricsReporter reporter) { 44 | pagesExported = reporter.meter("webindex_pages_exported"); 45 | linksExported = reporter.meter("webindex_links_exported"); 46 | domainsExported = reporter.meter("webindex_domains_exported"); 47 | } 48 | 49 | @Override 50 | public void translate(SequencedExport export, Consumer consumer) { 51 | if (export.getValue() instanceof DomainUpdate) { 52 | domainsExported.mark(); 53 | IndexClient.genDomainMutations((DomainUpdate) export.getValue(), export.getSequence(), 54 | consumer); 55 | } else if (export.getValue() instanceof PageUpdate) { 56 | pagesExported.mark(); 57 | IndexClient.genPageMutations((PageUpdate) export.getValue(), export.getSequence(), consumer); 58 | } else if (export.getValue() instanceof UriUpdate) { 59 | linksExported.mark(); 60 | IndexClient.genUriMutations((UriUpdate) export.getValue(), export.getSequence(), consumer); 61 | } else { 62 | String msg = "An object with an IndexUpdate class (" + 
export.getValue().getClass().toString() 63 | + ") was placed on the export queue"; 64 | log.error(msg); 65 | throw new IllegalStateException(msg); 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/data/fluo/PageLoader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.data.fluo; 19 | 20 | import java.net.MalformedURLException; 21 | import java.util.Objects; 22 | 23 | import com.google.common.base.Preconditions; 24 | import com.google.gson.Gson; 25 | import org.apache.fluo.api.client.Loader; 26 | import org.apache.fluo.api.client.TransactionBase; 27 | import org.apache.fluo.recipes.core.data.RowHasher; 28 | import org.apache.fluo.recipes.core.types.TypedTransactionBase; 29 | import org.slf4j.Logger; 30 | import org.slf4j.LoggerFactory; 31 | import webindex.core.Constants; 32 | import webindex.core.models.Page; 33 | import webindex.core.models.URL; 34 | 35 | public class PageLoader implements Loader { 36 | 37 | private static final Logger log = LoggerFactory.getLogger(PageLoader.class); 38 | private Action action; 39 | private Page page; 40 | private URL delUrl; 41 | 42 | private PageLoader() {} 43 | 44 | public static PageLoader updatePage(Page page) { 45 | Preconditions.checkArgument(!page.isEmpty(), "Page cannot be empty"); 46 | PageLoader update = new PageLoader(); 47 | update.action = Action.UPDATE; 48 | update.page = page; 49 | return update; 50 | } 51 | 52 | public static PageLoader deletePage(URL url) throws MalformedURLException { 53 | Objects.requireNonNull(url, "Url cannot be null"); 54 | PageLoader update = new PageLoader(); 55 | update.action = Action.DELETE; 56 | update.delUrl = url; 57 | return update; 58 | } 59 | 60 | @Override 61 | public void load(TransactionBase tx, Context context) throws Exception { 62 | 63 | TypedTransactionBase ttx = Constants.TYPEL.wrap(tx); 64 | 65 | Gson gson = new Gson(); 66 | RowHasher rowHasher = PageObserver.getPageRowHasher(); 67 | 68 | switch (action) { 69 | case DELETE: 70 | ttx.mutate().row(rowHasher.addHash(delUrl.toUri())).col(Constants.PAGE_NEW_COL) 71 | .set(Page.DELETE_JSON); 72 | break; 73 | case UPDATE: 74 | String newJson = gson.toJson(page); 75 | ttx.mutate().row(rowHasher.addHash(page.getUri())).col(Constants.PAGE_NEW_COL).set(newJson); 
76 | break; 77 | default: 78 | log.error("PageUpdate called with no action"); 79 | } 80 | } 81 | 82 | private enum Action { 83 | UPDATE, DELETE, 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/data/fluo/WebindexObservers.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.data.fluo; 19 | 20 | import org.apache.fluo.api.config.SimpleConfiguration; 21 | import org.apache.fluo.api.metrics.MetricsReporter; 22 | import org.apache.fluo.api.observer.Observer.NotificationType; 23 | import org.apache.fluo.api.observer.ObserverProvider; 24 | import org.apache.fluo.recipes.accumulo.export.function.AccumuloExporter; 25 | import org.apache.fluo.recipes.core.combine.CombineQueue; 26 | import org.apache.fluo.recipes.core.combine.SummingCombiner; 27 | import org.apache.fluo.recipes.core.export.ExportQueue; 28 | import webindex.core.Constants; 29 | import webindex.core.models.UriInfo; 30 | import webindex.core.models.export.IndexUpdate; 31 | import webindex.data.FluoApp; 32 | 33 | /** 34 | * Provides all of the observers needed for this application. 35 | */ 36 | public class WebindexObservers implements ObserverProvider { 37 | 38 | @Override 39 | public void provide(Registry obsRegistry, Context ctx) { 40 | SimpleConfiguration appCfg = ctx.getAppConfiguration(); 41 | MetricsReporter reporter = ctx.getMetricsReporter(); 42 | 43 | // Create an export queue that handles all updates to the query table. 44 | ExportQueue exportQ = 45 | ExportQueue.getInstance(FluoApp.EXPORT_QUEUE_ID, appCfg); 46 | 47 | // Create a combineQ that tracks the number of pages linking to a URI. 48 | CombineQueue uriQ = 49 | CombineQueue.getInstance(UriCombineQ.URI_COMBINE_Q_ID, appCfg); 50 | 51 | // Create a combineQ that tracks the number of unique URIs observed per domain. 52 | CombineQueue domainQ = 53 | CombineQueue.getInstance(DomainCombineQ.DOMAIN_COMBINE_Q_ID, appCfg); 54 | 55 | // Register an observer that handles changes to pages content. 56 | obsRegistry.forColumn(Constants.PAGE_NEW_COL, NotificationType.STRONG).withId("PageObserver") 57 | .useObserver(new PageObserver(uriQ, exportQ, reporter)); 58 | 59 | // Register an observer to processes queued export data. 
60 | exportQ.registerObserver(obsRegistry, new AccumuloExporter<>(FluoApp.EXPORT_QUEUE_ID, appCfg, 61 | new IndexUpdateTranslator(reporter))); 62 | 63 | // Register an observer to process updates to the URI map. 64 | uriQ.registerObserver(obsRegistry, UriInfo::reduce, 65 | new UriCombineQ.UriUpdateObserver(exportQ, domainQ, reporter)); 66 | 67 | // Register an observer to process updates to the domain map. 68 | domainQ.registerObserver(obsRegistry, new SummingCombiner<>(), 69 | new DomainCombineQ.DomainUpdateObserver(exportQ, reporter)); 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/data/spark/IndexStats.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.data.spark; 19 | 20 | import java.io.Serializable; 21 | 22 | import org.apache.spark.Accumulator; 23 | import org.apache.spark.api.java.JavaSparkContext; 24 | import org.slf4j.Logger; 25 | import org.slf4j.LoggerFactory; 26 | 27 | public class IndexStats implements Serializable { 28 | 29 | private static final long serialVersionUID = 1L; 30 | 31 | private static final Logger log = LoggerFactory.getLogger(IndexUtil.class); 32 | 33 | private Accumulator numPages; 34 | private Accumulator numEmpty; 35 | private Accumulator numExternalLinks; 36 | 37 | public IndexStats(JavaSparkContext ctx) { 38 | numPages = ctx.accumulator(0); 39 | numEmpty = ctx.accumulator(0); 40 | numExternalLinks = ctx.accumulator(0); 41 | } 42 | 43 | public void addPage(Integer num) { 44 | numPages.add(num); 45 | } 46 | 47 | public void addEmpty(Integer num) { 48 | numEmpty.add(num); 49 | } 50 | 51 | public void addExternalLinks(Integer num) { 52 | numExternalLinks.add(num); 53 | } 54 | 55 | public void print() { 56 | log.info("Num empty = {}", numEmpty.value()); 57 | log.info("Num pages = {}", numPages.value()); 58 | log.info("Num external links = {}", numExternalLinks.value()); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/data/util/WARCFileInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.data.util; 19 | 20 | import java.io.IOException; 21 | 22 | import org.apache.hadoop.fs.Path; 23 | import org.apache.hadoop.io.Text; 24 | import org.apache.hadoop.mapreduce.InputSplit; 25 | import org.apache.hadoop.mapreduce.JobContext; 26 | import org.apache.hadoop.mapreduce.RecordReader; 27 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 28 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 29 | import org.archive.io.ArchiveReader; 30 | 31 | /** 32 | * Minimal implementation of FileInputFormat for WARC files. Hadoop is told that splitting these 33 | * compressed files is not possible. 34 | * 35 | * @author Stephen Merity (Smerity) 36 | */ 37 | public class WARCFileInputFormat extends FileInputFormat { 38 | 39 | @Override 40 | public RecordReader createRecordReader(InputSplit split, 41 | TaskAttemptContext context) throws IOException, InterruptedException { 42 | return new WARCFileRecordReader(); 43 | } 44 | 45 | @Override 46 | protected boolean isSplitable(JobContext context, Path filename) { 47 | // As these are compressed files, they cannot be (sanely) split 48 | return false; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/data/util/WARCFileRecordReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.data.util; 19 | 20 | import java.io.IOException; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.fs.FSDataInputStream; 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.io.Text; 27 | import org.apache.hadoop.mapreduce.InputSplit; 28 | import org.apache.hadoop.mapreduce.RecordReader; 29 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 30 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 31 | import org.archive.io.ArchiveReader; 32 | import org.archive.io.warc.WARCReaderFactory; 33 | 34 | /** 35 | * The WARC File Record Reader processes a single compressed input. The Record Reader returns a 36 | * single WARC ArchiveReader that can contain numerous individual documents, each document handled 37 | * in a single mapper. 
38 | * 39 | * @author Stephen Merity (Smerity) 40 | */ 41 | public class WARCFileRecordReader extends RecordReader { 42 | 43 | private String arPath; 44 | private ArchiveReader ar; 45 | private FSDataInputStream fsin; 46 | private boolean hasBeenRead = false; 47 | 48 | @Override 49 | public void initialize(InputSplit inputSplit, TaskAttemptContext context) 50 | throws IOException, InterruptedException { 51 | FileSplit split = (FileSplit) inputSplit; 52 | Configuration conf = context.getConfiguration(); 53 | Path path = split.getPath(); 54 | FileSystem fs = path.getFileSystem(conf); 55 | fsin = fs.open(path); 56 | arPath = path.getName(); 57 | ar = WARCReaderFactory.get(path.getName(), fsin, true); 58 | } 59 | 60 | @Override 61 | public void close() throws IOException { 62 | fsin.close(); 63 | ar.close(); 64 | } 65 | 66 | @Override 67 | public Text getCurrentKey() throws IOException, InterruptedException { 68 | // Provide the path used for the compressed file as the key 69 | return new Text(arPath); 70 | } 71 | 72 | @Override 73 | public ArchiveReader getCurrentValue() throws IOException, InterruptedException { 74 | // We only ever have one value to give -- the output of the compressed file 75 | return ar; 76 | } 77 | 78 | @Override 79 | public float getProgress() throws IOException, InterruptedException { 80 | // Progress of reader through the data as a float 81 | // As each file only produces one ArchiveReader, this will be one immediately 82 | return hasBeenRead ? 
1 : 0; 83 | } 84 | 85 | @Override 86 | public boolean nextKeyValue() throws IOException, InterruptedException { 87 | // As each file only produces one ArchiveReader, if it has been read, there are no more 88 | if (hasBeenRead) { 89 | return false; 90 | } 91 | hasBeenRead = true; 92 | return true; 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/java/webindex/serialization/WebindexKryoFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.serialization; 19 | 20 | import java.io.Serializable; 21 | import java.util.ArrayList; 22 | 23 | import com.esotericsoftware.kryo.Kryo; 24 | import com.esotericsoftware.kryo.pool.KryoFactory; 25 | import webindex.core.models.Link; 26 | import webindex.core.models.UriInfo; 27 | import webindex.core.models.export.DomainUpdate; 28 | import webindex.core.models.export.IndexUpdate; 29 | import webindex.core.models.export.PageUpdate; 30 | import webindex.core.models.export.UriUpdate; 31 | 32 | public class WebindexKryoFactory implements KryoFactory, Serializable { 33 | 34 | private static final long serialVersionUID = 1L; 35 | 36 | @Override 37 | public Kryo create() { 38 | Kryo kryo = new Kryo(); 39 | 40 | // Explicitly set class ids when registering. Did not set ids (because thought if registered in 41 | // same order it would be ok) and ran into issue where Spark and Fluo code were using different 42 | // ids for some reason. 43 | kryo.register(UriInfo.class, 9); 44 | kryo.register(IndexUpdate.class, 10); 45 | kryo.register(DomainUpdate.class, 11); 46 | kryo.register(PageUpdate.class, 12); 47 | kryo.register(UriUpdate.class, 13); 48 | kryo.register(ArrayList.class, 14); 49 | kryo.register(Link.class, 15); 50 | 51 | kryo.setRegistrationRequired(true); 52 | 53 | return kryo; 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /webindex/modules/data/src/main/resources/splits/accumulo-default.txt: -------------------------------------------------------------------------------- 1 | d:com.blogg 2 | d:com.dd 3 | d:com.fe 4 | d:com.hg 5 | d:com.mar 6 | d:com.p 7 | d:com.sh 8 | d:com.tu 9 | d:com.y 10 | d:j 11 | d:org.h 12 | d:us.i 13 | p:ca.h 14 | p:com.af 15 | p:com.applec 16 | p:com.beaut 17 | p:com.blogger.www/delete-comment.g?blogID=24 18 | p:com.blogger.www/profile/067 19 | p:com.blogspot.ben 20 | p:com.blogspot.in 21 | p:com.blogspot.sm 22 | p:com.buf 23 | p:com.chick 24 | p:com.cru 
25 | p:com.detroitnews.www/article/2014 26 | p:com.ebe 27 | p:com.facebook.www/a 28 | p:com.facebook.www:s/c 29 | p:com.fir 30 | p:com.gee 31 | p:com.google.plus:s/+N 32 | p:com.gotethnicfoods.secure:s/Indian%20Foods%20Company/Store/Login.cfm?Logout=&cfid=18 33 | p:com.homet 34 | p:com.inm 35 | p:com.kay 36 | p:com.linkedin.www: 37 | p:com.mel 38 | p:com.moms 39 | p:com.neimanmarcus.www/p 40 | p:com.ohio/ 41 | p:com.pie 42 | p:com.pro 43 | p:com.rivals.o 44 | p:com.sho 45 | p:com.sportsf 46 | p:com.stun 47 | p:com.thecl 48 | p:com.toy 49 | p:com.twitter/E 50 | p:com.twitter:s/B 51 | p:com.uni 52 | p:com.w 53 | p:com.wordpress.du 54 | p:com.y 55 | p:com.youtube/ 56 | p:edu.p 57 | p:gov.ni 58 | p:jp.n 59 | p:net.doubleclick.g.pubads/gampad/j 60 | p:net.to 61 | p:org.cro 62 | p:org.li 63 | p:org.scp 64 | p:pl.z 65 | p:uk.co.r 66 | t:fefdfaff:o 67 | t:fefdfefdff:com.am 68 | t:fefdfefdff:com.blogger.www/profile/16 69 | t:fefdfefdff:com.facebook.www/s 70 | t:fefdfefdff:com.in 71 | t:fefdfefdff:com.ph 72 | t:fefdfefdff:com.tumblr.www: 73 | t:fefdfefdff:e 74 | t:fefdfefdff:org.s 75 | t:fefeff:com.g 76 | t:fefeff:d 77 | -------------------------------------------------------------------------------- /webindex/modules/data/src/test/java/webindex/data/SparkTestUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.data; 19 | 20 | import org.apache.spark.SparkConf; 21 | import org.apache.spark.api.java.JavaSparkContext; 22 | 23 | public class SparkTestUtil { 24 | 25 | public static JavaSparkContext getSparkContext(String appName) { 26 | SparkConf sparkConf = new SparkConf(); 27 | sparkConf.setMaster("local"); 28 | sparkConf.setAppName(appName); 29 | sparkConf.set("spark.app.id", appName); 30 | sparkConf.set("spark.ui.port", "4444"); 31 | return new JavaSparkContext(sparkConf); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /webindex/modules/data/src/test/java/webindex/data/spark/Hex.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.data.spark; 19 | 20 | import java.io.ByteArrayOutputStream; 21 | 22 | import org.apache.fluo.api.data.Bytes; 23 | import org.apache.fluo.api.data.Column; 24 | import org.apache.fluo.api.data.RowColumn; 25 | import scala.Tuple2; 26 | 27 | public class Hex { 28 | public static void encNonAscii(StringBuilder sb, Bytes bytes) { 29 | for (int i = 0; i < bytes.length(); i++) { 30 | byte b = bytes.byteAt(i); 31 | if (b >= 32 && b <= 126 && b != '\\') { 32 | sb.append((char) b); 33 | } else { 34 | sb.append(String.format("\\x%02x", b & 0xff)); 35 | } 36 | } 37 | } 38 | 39 | public static String encNonAscii(Bytes bytes) { 40 | StringBuilder sb = new StringBuilder(); 41 | encNonAscii(sb, bytes); 42 | return sb.toString(); 43 | } 44 | 45 | public static void encNonAscii(StringBuilder sb, Column c, String sep) { 46 | encNonAscii(sb, c.getFamily()); 47 | sb.append(sep); 48 | encNonAscii(sb, c.getQualifier()); 49 | } 50 | 51 | public static void encNonAscii(StringBuilder sb, RowColumn rc, String sep) { 52 | encNonAscii(sb, rc.getRow()); 53 | sb.append(sep); 54 | encNonAscii(sb, rc.getColumn(), sep); 55 | } 56 | 57 | public static String encNonAscii(Tuple2 t, String sep) { 58 | StringBuilder sb = new StringBuilder(); 59 | encNonAscii(sb, t._1(), sep); 60 | sb.append(sep); 61 | encNonAscii(sb, t._2()); 62 | return sb.toString(); 63 | } 64 | 65 | static byte[] decode(String s) { 66 | 67 | // the next best thing to a StringBuilder for bytes 68 | ByteArrayOutputStream baos = new ByteArrayOutputStream(s.length()); 69 | 70 | for (int i = 0; i < s.length(); i++) { 71 | byte b; 72 | 73 | if (s.charAt(i) == '\\') { 74 | if (s.charAt(i + 1) != 'x') { 75 | throw new IllegalArgumentException(); 76 | } 77 | 78 | String num = "" + s.charAt(i + 2) + s.charAt(i + 3); 79 | b = (byte) (0xff & Integer.parseInt(num, 16)); 80 | i += 3; 81 | } else { 82 | 
char c = s.charAt(i); 83 | if (c < 32 || c > 126) { 84 | throw new IllegalArgumentException(); 85 | } 86 | 87 | b = (byte) (0xff & c); 88 | } 89 | 90 | baos.write(b); 91 | } 92 | 93 | return baos.toByteArray(); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /webindex/modules/data/src/test/java/webindex/data/spark/IndexEnvTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.data.spark; 19 | 20 | import java.util.SortedSet; 21 | 22 | import org.apache.hadoop.io.Text; 23 | import org.junit.Assert; 24 | import org.junit.Test; 25 | 26 | public class IndexEnvTest { 27 | 28 | @Test 29 | public void testGetSplits() throws Exception { 30 | SortedSet splits = IndexEnv.getAccumuloDefaultSplits(); 31 | 32 | Assert.assertEquals(76, splits.size()); 33 | Assert.assertEquals(new Text("d:com.blogg"), splits.first()); 34 | Assert.assertEquals(new Text("t:fefeff:d"), splits.last()); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /webindex/modules/data/src/test/java/webindex/data/util/ArchiveUtilTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.data.util; 19 | 20 | import java.io.File; 21 | import java.io.IOException; 22 | import java.text.ParseException; 23 | import java.util.Iterator; 24 | 25 | import org.archive.io.ArchiveReader; 26 | import org.archive.io.ArchiveRecord; 27 | import org.archive.io.warc.WARCReaderFactory; 28 | import org.junit.Assert; 29 | import org.junit.Test; 30 | import webindex.core.models.Page; 31 | 32 | public class ArchiveUtilTest { 33 | 34 | @Test 35 | public void testBasic() throws IOException, ParseException { 36 | 37 | ArchiveReader archiveReader = WARCReaderFactory.get(new File("src/test/resources/wat.warc")); 38 | Page page = ArchiveUtil.buildPage(archiveReader.get()); 39 | Assert.assertNotNull(page); 40 | Assert.assertFalse(page.isEmpty()); 41 | 42 | Assert.assertEquals( 43 | "http://1079ishot.com/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/", 44 | page.getUrl()); 45 | Assert.assertEquals( 46 | "com.1079ishot>>o>/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/", 47 | page.getUri()); 48 | 49 | Assert.assertEquals("2015-04-18T03:35:13Z", page.getCrawlDate()); 50 | Assert.assertEquals("nginx/1.6.2", page.getServer()); 51 | Assert.assertEquals( 52 | "Presale Password – Trey Songz & Young Jeezy Pre-Christmas Bash Screen shot 2011-10-27 at ", 53 | page.getTitle()); 54 | Assert.assertEquals(0, page.getOutboundLinks().size()); 55 | 56 | ArchiveReader ar2 = WARCReaderFactory.get(new File("src/test/resources/wat-18.warc")); 57 | 58 | int valid = 0; 59 | int invalid = 0; 60 | Iterator records = ar2.iterator(); 61 | while (records.hasNext()) { 62 | try { 63 | ArchiveRecord r = records.next(); 64 | ArchiveUtil.buildPage(r); 65 | valid++; 66 | } catch (ParseException e) { 67 | invalid++; 68 | } 69 | } 70 | Assert.assertEquals(18, valid); 71 | Assert.assertEquals(0, invalid); 72 | } 73 | } 74 | 
-------------------------------------------------------------------------------- /webindex/modules/data/src/test/resources/data/set1/accumulo-data.txt: -------------------------------------------------------------------------------- 1 | d:com.a|domain|pagecount|1 2 | d:com.a:fefeff:com.a>>o>/1|rank||0 3 | d:com.b|domain|pagecount|4 4 | d:com.b:fefdfdff:com.b>>o>/3|rank||2 5 | d:com.b:fefdfefdff:com.b>>o>/1|rank||1 6 | d:com.b:fefdfefdff:com.b>>o>/2|rank||1 7 | d:com.b:fefeff:com.b>>o>/|rank||0 8 | d:com.c|domain|pagecount|1 9 | d:com.c:fefdfdff:com.c>>o>/1|rank||2 10 | p:com.a>>o>/1|page|cur|{"url":"http://a.com/1","uri":"com.a\x5cu003e\x5cu003eo\x5cu003e/1","numOutbound":3,"outboundLinks":[{"url":"http://b.com/1","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"b1"},{"url":"http://b.com/3","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/3","anchorText":"b3"},{"url":"http://c.com/1","uri":"com.c\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"c1"}]} 11 | p:com.a>>o>/1|page|incount|0 12 | p:com.b>>o>/|page|cur|{"url":"http://b.com/","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/","numOutbound":3,"outboundLinks":[{"url":"http://b.com/2","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/2","anchorText":"b2"},{"url":"http://b.com/3","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/3","anchorText":"b3"},{"url":"http://c.com/1","uri":"com.c\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"c1"}]} 13 | p:com.b>>o>/|page|incount|0 14 | p:com.b>>o>/1|inlinks|com.a>>o>/1|b1 15 | p:com.b>>o>/1|page|incount|1 16 | p:com.b>>o>/2|inlinks|com.b>>o>/|b2 17 | p:com.b>>o>/2|page|incount|1 18 | p:com.b>>o>/3|inlinks|com.a>>o>/1|b3 19 | p:com.b>>o>/3|inlinks|com.b>>o>/|b3 20 | p:com.b>>o>/3|page|incount|2 21 | p:com.c>>o>/1|inlinks|com.a>>o>/1|c1 22 | p:com.c>>o>/1|inlinks|com.b>>o>/|c1 23 | p:com.c>>o>/1|page|incount|2 24 | t:fefdfdff:com.b>>o>/3|||2 25 | t:fefdfdff:com.c>>o>/1|||2 26 | t:fefdfefdff:com.b>>o>/1|||1 27 | t:fefdfefdff:com.b>>o>/2|||1 28 | t:fefeff:com.a>>o>/1|||0 29 | t:fefeff:com.b>>o>/|||0 30 | 
-------------------------------------------------------------------------------- /webindex/modules/data/src/test/resources/data/set1/fluo-data.txt: -------------------------------------------------------------------------------- 1 | dm:d:28:\x03\x01com.\xe3|data|current|\x09\x02 2 | dm:d:57:\x03\x01com.\xe1|data|current|\x09\x02 3 | dm:d:5a:\x03\x01com.\xe2|data|current|\x09\x08 4 | p:saxb:com.a>>o>/1|page|cur|{"url":"http://a.com/1","uri":"com.a\x5cu003e\x5cu003eo\x5cu003e/1","numOutbound":3,"outboundLinks":[{"url":"http://b.com/1","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"b1"},{"url":"http://b.com/3","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/3","anchorText":"b3"},{"url":"http://c.com/1","uri":"com.c\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"c1"}]} 5 | p:xdjd:com.b>>o>/|page|cur|{"url":"http://b.com/","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/","numOutbound":3,"outboundLinks":[{"url":"http://b.com/2","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/2","anchorText":"b2"},{"url":"http://b.com/3","uri":"com.b\x5cu003e\x5cu003eo\x5cu003e/3","anchorText":"b3"},{"url":"http://c.com/1","uri":"com.c\x5cu003e\x5cu003eo\x5cu003e/1","anchorText":"c1"}]} 6 | um:d:06:\x03\x01com.b>>o>/\xb3|data|current|\x0b\x01\x00\x04 7 | um:d:2d:\x03\x01com.a>>o>/\xb1|data|current|\x0b\x01\x02\x00 8 | um:d:3c:\x03\x01com.c>>o>/\xb1|data|current|\x0b\x01\x00\x04 9 | um:d:43:\x03\x01com.b>>o>\xaf|data|current|\x0b\x01\x02\x00 10 | um:d:59:\x03\x01com.b>>o>/\xb1|data|current|\x0b\x01\x00\x02 11 | um:d:76:\x03\x01com.b>>o>/\xb2|data|current|\x0b\x01\x00\x02 12 | -------------------------------------------------------------------------------- /webindex/modules/data/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Webindex authors (see AUTHORS) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | log4j.rootLogger=INFO, CA 16 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 17 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 18 | log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c] %-5p: %m%n 19 | 20 | log4j.logger.akka=WARN 21 | log4j.logger.org.apache.accumulo=WARN 22 | log4j.logger.org.apache.curator=ERROR 23 | log4j.logger.org.apache.fluo=WARN 24 | log4j.logger.org.apache.hadoop=WARN 25 | log4j.logger.org.apache.hadoop.mapreduce=ERROR 26 | log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR 27 | log4j.logger.org.apache.spark=WARN 28 | log4j.logger.org.apache.zookeeper=WARN 29 | log4j.logger.org.apache.zookeeper.ClientCnxn=ERROR 30 | log4j.logger.org.spark-project=WARN 31 | log4j.logger.webindex=WARN 32 | log4j.logger.Remoting=WARN 33 | -------------------------------------------------------------------------------- /webindex/modules/integration/src/main/java/webindex/integration/DevServerOpts.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
/**
 * Command-line options holder whose fields are bound through JCommander {@code @Parameter}
 * annotations. The initializer on each field is the value used when the flag is not supplied.
 */
public class DevServerOpts {

  // --metrics / -m : when set, enables sending metrics to localhost:3000; off by default
  @Parameter(names = {"--metrics", "-m"}, description = "Enables sending metrics to localhost:3000")
  boolean metrics = false;

  // --pages / -p : number of pages to load; defaults to 1000
  @Parameter(names = {"--pages", "-p"}, description = "Number of pages to load")
  int numPages = 1000;

  // --templateDir / -t : template directory; the default is a relative path
  // NOTE(review): presumably resolved against the working directory - confirm against the caller
  @Parameter(names = {"--templateDir", "-t"}, description = "Specifies template directory")
  String templateDir = "modules/ui/src/main/resources/spark/template/freemarker";

  // --help / -h : declared as the JCommander help option (help = true)
  @Parameter(names = {"--help", "-h"}, description = "Prints usage", help = true)
  boolean help;
}
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package webindex.integration; 19 | 20 | import java.io.BufferedWriter; 21 | import java.net.URL; 22 | import java.nio.file.Files; 23 | import java.nio.file.Path; 24 | 25 | import com.google.gson.Gson; 26 | import org.archive.io.ArchiveReader; 27 | import org.archive.io.ArchiveRecord; 28 | import org.archive.io.warc.WARCReaderFactory; 29 | import org.slf4j.Logger; 30 | import org.slf4j.LoggerFactory; 31 | import webindex.core.models.Page; 32 | import webindex.data.util.ArchiveUtil; 33 | 34 | public class SampleData { 35 | 36 | private static final Logger log = LoggerFactory.getLogger(SampleData.class); 37 | 38 | private static final String sourceURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/" 39 | + "CC-MAIN-2015-32/segments/1438042981460.12/wat/" 40 | + "CC-MAIN-20150728002301-00043-ip-10-236-191-2.ec2.internal.warc.wat.gz"; 41 | 42 | public static void generate(Path path, int numPages) throws Exception { 43 | 44 | Gson gson = new Gson(); 45 | long count = 0; 46 | try (BufferedWriter writer = Files.newBufferedWriter(path)) { 47 | ArchiveReader ar = WARCReaderFactory.get(new URL(sourceURL), 0); 48 | for (ArchiveRecord r : ar) { 49 | Page p = ArchiveUtil.buildPage(r); 50 | if (p.isEmpty() || p.getOutboundLinks().isEmpty()) { 51 | log.debug("Skipping {}", p.getUrl()); 52 | continue; 53 | } 54 | log.debug("Found {} {}", p.getUrl(), p.getNumOutbound()); 55 | String json = gson.toJson(p); 56 | writer.write(json); 57 | writer.newLine(); 58 | count++; 59 | if (count == numPages) { 60 | break; 61 | } else if 
((count % 1000) == 0) { 62 | log.info("Wrote {} of {} pages to {}", count, numPages, path); 63 | } 64 | } 65 | } 66 | log.info("Wrote {} pages to {}", numPages, path); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /webindex/modules/integration/src/test/java/webindex/integration/DevServerIT.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package webindex.integration; 19 | 20 | import java.io.IOException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | import java.nio.file.Paths; 24 | 25 | import org.apache.commons.io.FileUtils; 26 | import org.jsoup.Jsoup; 27 | import org.jsoup.nodes.Document; 28 | import org.junit.AfterClass; 29 | import org.junit.Assert; 30 | import org.junit.BeforeClass; 31 | import org.junit.Test; 32 | import webindex.core.IndexClient; 33 | import webindex.core.models.Pages; 34 | 35 | public class DevServerIT { 36 | 37 | static DevServer devServer; 38 | static Path tempPath; 39 | 40 | @BeforeClass 41 | public static void init() throws Exception { 42 | tempPath = Files.createTempDirectory(Paths.get("target/"), "webindex-dev-"); 43 | Path dataPath = Paths.get("src/test/resources/5-pages.txt"); 44 | devServer = new DevServer(dataPath, 24567, null, tempPath, false); 45 | devServer.start(); 46 | } 47 | 48 | @Test 49 | public void basic() throws Exception { 50 | Document doc = Jsoup.connect("http://localhost:24567/").get(); 51 | Assert.assertTrue(doc.text().contains("Enter a domain to view known webpages in that domain")); 52 | 53 | IndexClient client = devServer.getIndexClient(); 54 | Pages pages = client.getPages("stackoverflow.com", "", 0); 55 | Assert.assertEquals(4, pages.getTotal().intValue()); 56 | 57 | Pages.PageScore pageScore = pages.getPages().get(0); 58 | Assert.assertEquals("http://blog.stackoverflow.com/2009/06/attribution-required/", 59 | pageScore.getUrl()); 60 | Assert.assertEquals(4, pageScore.getScore().intValue()); 61 | } 62 | 63 | @AfterClass 64 | public static void destroy() throws IOException { 65 | devServer.stop(); 66 | FileUtils.deleteDirectory(tempPath.toFile()); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /webindex/modules/integration/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 
Copyright 2014 Webindex authors (see AUTHORS) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | log4j.rootLogger=INFO, CA 16 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 17 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 18 | log4j.appender.CA.layout.ConversionPattern=%d{ISO8601} [%c] %-5p: %m%n 19 | 20 | log4j.logger.org.apache.accumulo=WARN 21 | log4j.logger.org.apache.curator=ERROR 22 | log4j.logger.org.apache.fluo=WARN 23 | log4j.logger.org.apache.hadoop=WARN 24 | log4j.logger.org.apache.hadoop.mapreduce=ERROR 25 | log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR 26 | log4j.logger.org.apache.spark=WARN 27 | log4j.logger.org.apache.zookeeper=ERROR 28 | log4j.logger.org.eclipse.jetty=WARN 29 | log4j.logger.org.spark-project=WARN 30 | log4j.logger.webindex=WARN 31 | log4j.logger.spark=WARN 32 | -------------------------------------------------------------------------------- /webindex/modules/ui/.gitignore: -------------------------------------------------------------------------------- 1 | dependency-reduced-pom.xml 2 | -------------------------------------------------------------------------------- /webindex/modules/ui/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | org.apache.fluo 22 | webindex-parent 23 | 0.0.1-SNAPSHOT 24 | ../../pom.xml 25 | 26 | webindex-ui 27 | WebIndex UI 28 | 29 | 30 | com.sparkjava 31 | 
spark-core 32 | 33 | 34 | com.sparkjava 35 | spark-template-freemarker 36 | 37 | 38 | org.apache.accumulo 39 | accumulo-core 40 | 41 | 42 | org.apache.fluo 43 | fluo-api 44 | 45 | 46 | org.apache.fluo 47 | fluo-core 48 | 49 | 50 | org.apache.fluo 51 | webindex-core 52 | 53 | 54 | org.slf4j 55 | slf4j-api 56 | 57 | 58 | org.slf4j 59 | slf4j-log4j12 60 | 61 | 62 | 63 | 64 | webindex-web-server 65 | 66 | 67 | 68 | org.codehaus.mojo 69 | exec-maven-plugin 70 | 71 | 72 | 73 | java 74 | 75 | compile 76 | 77 | webindex.ui.WebServer 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /webindex/modules/ui/src/main/resources/assets/img/webindex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/examples/a155274150ec5fdf74341f340294e60af7d48fed/webindex/modules/ui/src/main/resources/assets/img/webindex.png -------------------------------------------------------------------------------- /webindex/modules/ui/src/main/resources/spark/template/freemarker/404.ftl: -------------------------------------------------------------------------------- 1 | 2 | <#include "common/head.ftl"> 3 | 4 |
5 |
6 |
7 |

404: Page not found

8 |
9 |
10 | <#include "common/footer.ftl"> 11 | -------------------------------------------------------------------------------- /webindex/modules/ui/src/main/resources/spark/template/freemarker/common/footer.ftl: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | -------------------------------------------------------------------------------- /webindex/modules/ui/src/main/resources/spark/template/freemarker/common/head.ftl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /webindex/modules/ui/src/main/resources/spark/template/freemarker/common/header.ftl: -------------------------------------------------------------------------------- 1 | <#setting url_escaping_charset='ISO-8859-1'> 2 | 3 | <#include "head.ftl"> 4 | 5 |
6 |
7 |
8 | WebIndex Home 9 |
10 |
11 | -------------------------------------------------------------------------------- /webindex/modules/ui/src/main/resources/spark/template/freemarker/home.ftl: -------------------------------------------------------------------------------- 1 | 2 | <#include "common/head.ftl"> 3 | 4 |
5 |
6 |
7 | WebIndex 8 |
9 |

Enter a domain to view known webpages in that domain:

10 |
11 |
12 |
13 | 14 | 15 | 16 | 17 |
18 | 19 |
20 |
21 |
22 |
23 |

Or view the webpages with the most inbound links for all processed data.

24 |
25 |
26 | <#include "common/footer.ftl"> 27 | -------------------------------------------------------------------------------- /webindex/modules/ui/src/main/resources/spark/template/freemarker/links.ftl: -------------------------------------------------------------------------------- 1 | <#include "common/header.ftl"> 2 | <#if links.links?has_content> 3 |
4 |
5 | <#if links.linkType == "in"> 6 |

Webpages that link to ${links.url?html}

7 | <#else> 8 |

Outbound links from ${links.url?html}

9 | 10 |
11 |
12 |
13 |
14 |

Page ${links.pageNum+1} of ${links.total} results

15 |
16 |
17 | <#if (links.next?length > 0)> 18 | Next 19 | 20 | <#if (links.pageNum - 1 >= 0)> 21 | Previous 22 | 23 |
24 |
25 |
26 |
27 |
28 | 29 | <#list links.links as link> 30 | 31 | 32 | 33 | 34 | 35 |
URLAnchor Text
${link.url?html}${link.anchorText?html}
36 | 37 | 38 | <#else> 39 |
40 |
41 |

No ${links.linkType?cap_first}bound links to page: ${links.url?html}

42 |
43 |
44 | 45 | <#include "common/footer.ftl"> 46 | -------------------------------------------------------------------------------- /webindex/modules/ui/src/main/resources/spark/template/freemarker/page.ftl: -------------------------------------------------------------------------------- 1 | <#include "common/header.ftl"> 2 |
4 |

Page Info

5 | 6 | <#if page.crawlDate??> 7 | 8 | 9 | 10 | 11 | 12 | <#if page.crawlDate??> 13 | 14 | 15 | 16 | 17 |
Title${page.title!''?html}
URL${page.url?html}  -  Go to page
Domain${page.domain?html}
Inbound links${page.numInbound}
Outbound links${page.numOutbound}
Server${page.server!''?html}
Last Crawled${page.crawlDate!''?html}
18 |
19 | 20 | <#include "common/footer.ftl"> 21 | -------------------------------------------------------------------------------- /webindex/modules/ui/src/main/resources/spark/template/freemarker/pages.ftl: -------------------------------------------------------------------------------- 1 | <#include "common/header.ftl"> 2 | <#if pages.pages?has_content> 3 |
4 |
5 |

Webpages in ${pages.domain?html} domain (ordered by number of inbound links)

6 |
7 |
8 |
9 |
10 |

Page ${pages.pageNum+1} of ${pages.total} results

11 |
12 |
13 | <#if (pages.next?length > 0)> 14 | Next 15 | 16 | <#if (pages.pageNum - 1 >= 0)> 17 | Previous 18 | 19 |
20 |
21 |
22 |
23 | 24 | 25 | <#list pages.pages as page> 26 | 27 | 28 | 29 | 30 | 31 |
Inbound LinksURL
${page.score?html}${page.url?html}
32 |
33 |
34 | <#else> 35 |
36 |
37 |

No results for ${pages.domain?html}

38 |
39 |
40 | 41 | <#include "common/footer.ftl"> 42 | -------------------------------------------------------------------------------- /webindex/modules/ui/src/main/resources/spark/template/freemarker/top.ftl: -------------------------------------------------------------------------------- 1 | <#include "common/header.ftl"> 2 | <#if top.results?has_content> 3 |
4 |
5 |

Webpages with the most inbound links for all processed data

6 |
7 |
8 |
9 |
10 |

Page ${top.pageNum+1}

11 |
12 |
13 | <#if top.next??> 14 | Next 15 | 16 | <#if (top.pageNum - 1 >= 0)> 17 | Previous 18 | 19 |
20 |
21 |
22 |
23 | 24 | 25 | <#list top.results as result> 26 | 27 | 28 | 29 | 30 | 31 |
Inbound LinksURL
${result.value?html}${result.key?html}
32 |
33 |
34 | <#else> 35 |
36 |
37 |

No results found

38 |
39 |
40 | 41 | <#include "common/footer.ftl"> 42 | --------------------------------------------------------------------------------