├── .gitignore ├── LICENSE ├── README.md ├── docs ├── Makefile ├── architecture.rst ├── batch_system.rst ├── conf.py ├── filters.rst ├── hacking.rst ├── index.rst ├── install ├── installation.rst ├── introduction.rst ├── queries.rst ├── rest_api.rst ├── todo.rst ├── transforms.rst └── usage.rst ├── filters └── language │ ├── lib │ └── language-profiles │ │ ├── af │ │ ├── ar │ │ ├── bg │ │ ├── bn │ │ ├── cs │ │ ├── da │ │ ├── de │ │ ├── el │ │ ├── en │ │ ├── es │ │ ├── fa │ │ ├── fi │ │ ├── fr │ │ ├── gu │ │ ├── he │ │ ├── hi │ │ ├── hr │ │ ├── hu │ │ ├── id │ │ ├── it │ │ ├── ja │ │ ├── kn │ │ ├── ko │ │ ├── mk │ │ ├── ml │ │ ├── mr │ │ ├── ne │ │ ├── nl │ │ ├── no │ │ ├── pa │ │ ├── pl │ │ ├── pt │ │ ├── ro │ │ ├── ru │ │ ├── sk │ │ ├── so │ │ ├── sq │ │ ├── sv │ │ ├── sw │ │ ├── ta │ │ ├── te │ │ ├── th │ │ ├── tl │ │ ├── tr │ │ ├── uk │ │ ├── ur │ │ ├── vi │ │ ├── zh-cn │ │ └── zh-tw │ └── src │ └── main │ └── java │ └── com │ └── mozilla │ └── grouperfish │ └── text │ ├── Dictionary.java │ └── filter │ └── LanguageFilter.java ├── install ├── integration-test ├── .gitignore ├── config │ ├── elasticsearch.yml │ └── hazelcast.xml ├── install ├── pom.xml └── src │ └── test │ ├── java │ └── com │ │ └── mozilla │ │ └── grouperfish │ │ └── integration │ │ ├── IntegrationTestHelper.java │ │ ├── batch │ │ └── RunResourceTest.java │ │ └── rest │ │ ├── ConfigurationsResourceTest.java │ │ ├── DocumentLoaderTest.java │ │ ├── DocumentsResourceTest.java │ │ └── QueriesResourceTest.java │ └── resources │ └── ng_integration.xml ├── project ├── VERSION └── pom.xml ├── service ├── .gitignore ├── bin │ ├── create_hbase_tables │ ├── grouperfish │ └── littlefish ├── conf │ ├── elasticsearch.yml │ ├── elasticsearch_hc.yml │ ├── grouperfish.properties │ ├── hazelcast.xml │ └── hazelcast_hbase.xml ├── install ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── mozilla │ │ │ └── grouperfish │ │ │ ├── base │ │ │ ├── ArrayTool.java │ │ │ ├── Assert.java │ │ │ ├── Box.java │ │ 
│ ├── Configuration.java │ │ │ ├── ImmutableTools.java │ │ │ ├── PropertiesTool.java │ │ │ ├── SlugTool.java │ │ │ ├── StreamTool.java │ │ │ └── json │ │ │ │ ├── JsonValidator.java │ │ │ │ ├── MapStreamer.java │ │ │ │ └── TsvJsonWriter.java │ │ │ ├── batch │ │ │ ├── api │ │ │ │ ├── BatchService.java │ │ │ │ └── guice │ │ │ │ │ └── BatchSystem.java │ │ │ ├── handlers │ │ │ │ ├── CleanupHandler.java │ │ │ │ ├── FetchHandler.java │ │ │ │ ├── PutHandler.java │ │ │ │ ├── RunHandler.java │ │ │ │ ├── SequentialHandler.java │ │ │ │ └── TaskHandler.java │ │ │ ├── scheduling │ │ │ │ ├── AbstractBatchService.java │ │ │ │ ├── Helpers.java │ │ │ │ ├── PipeliningBatchService.java │ │ │ │ ├── SingleQueueBatchService.java │ │ │ │ ├── SynchronousBatchService.java │ │ │ │ └── Worker.java │ │ │ └── transforms │ │ │ │ ├── ExecutableTransform.java │ │ │ │ ├── HadoopTransform.java │ │ │ │ ├── LocalTransform.java │ │ │ │ ├── Transform.java │ │ │ │ └── TransformProvider.java │ │ │ ├── bootstrap │ │ │ └── Grouperfish.java │ │ │ ├── model │ │ │ ├── Access.java │ │ │ ├── Document.java │ │ │ ├── Fail.java │ │ │ ├── NamedSource.java │ │ │ ├── Query.java │ │ │ ├── Task.java │ │ │ ├── TransformConfig.java │ │ │ └── Type.java │ │ │ ├── naming │ │ │ ├── Namespace.java │ │ │ └── Scope.java │ │ │ ├── rest │ │ │ ├── api │ │ │ │ └── RestService.java │ │ │ ├── jaxrs │ │ │ │ ├── ConfigurationsResource.java │ │ │ │ ├── DocumentsResource.java │ │ │ │ ├── HttpAccess.java │ │ │ │ ├── QueriesResource.java │ │ │ │ ├── ResourceBase.java │ │ │ │ ├── RestHelper.java │ │ │ │ ├── ResultsResource.java │ │ │ │ └── RunResource.java │ │ │ └── jersey │ │ │ │ ├── JerseyGuiceRestService.java │ │ │ │ └── ResourceConfig.java │ │ │ ├── services │ │ │ ├── api │ │ │ │ ├── FileSystem.java │ │ │ │ ├── Grid.java │ │ │ │ ├── Index.java │ │ │ │ ├── IndexProvider.java │ │ │ │ └── guice │ │ │ │ │ ├── Local.java │ │ │ │ │ ├── Services.java │ │ │ │ │ └── Shared.java │ │ │ ├── elasticsearch │ │ │ │ ├── ElasticSearchIndex.java │ │ │ │ 
└── ElasticSearchIndexProvider.java │ │ │ ├── hadoop │ │ │ │ └── HadoopFileSystem.java │ │ │ ├── hazelcast │ │ │ │ └── HazelcastGrid.java │ │ │ ├── local │ │ │ │ └── LocalFileSystem.java │ │ │ └── mock │ │ │ │ ├── MockFs.java │ │ │ │ ├── MockGrid.java │ │ │ │ └── MockIndex.java │ │ │ └── util │ │ │ ├── loader │ │ │ ├── DocumentLoader.java │ │ │ └── Loader.java │ │ │ └── logback │ │ │ └── AnsiColorConverter.java │ └── resources │ │ ├── logback-stdout.xml │ │ └── logback.xml │ └── test │ ├── java │ └── com │ │ └── mozilla │ │ └── grouperfish │ │ ├── base │ │ ├── AssertTest.java │ │ ├── SlugToolTest.java │ │ ├── StreamToolTest.java │ │ └── json │ │ │ ├── JsonValidatorTest.java │ │ │ └── MapStreamerTest.java │ │ ├── model │ │ ├── DocumentTest.java │ │ └── DummyAccess.java │ │ ├── naming │ │ └── ScopeTest.java │ │ ├── rest │ │ └── jaxrs │ │ │ └── RestHelperTest.java │ │ └── unit │ │ └── UnitTestHelper.java │ └── resources │ ├── config │ └── hazelcast.xml │ └── ng_unit.xml ├── tools ├── display │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── mozilla │ │ └── grouperfish │ │ └── mahout │ │ └── clustering │ │ └── display │ │ ├── kmeans │ │ ├── DisplayKMeansBase.java │ │ ├── OriginalText.java │ │ └── WordCloud.java │ │ └── lda │ │ ├── DisplayLDABase.java │ │ ├── DisplayLDATopics.java │ │ └── OriginalText.java ├── firefox_input │ ├── .gitignore │ ├── README.md │ ├── install │ ├── load_opinions │ ├── pom.xml │ └── src │ │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── mozilla │ │ │ └── grouperfish │ │ │ └── tools │ │ │ └── firefox_input │ │ │ ├── OpinionLoader.java │ │ │ ├── OpinionStream.java │ │ │ ├── TsvJsonFromInputTsv.java │ │ │ └── TsvReader.java │ │ └── test │ │ ├── java │ │ └── com │ │ │ └── mozilla │ │ │ └── grouperfish │ │ │ └── tools │ │ │ └── firefox_input │ │ │ ├── OpinionStreamTest.java │ │ │ └── TsvReaderTest.java │ │ └── resources │ │ └── ng_unit.xml └── webui │ ├── public │ ├── css │ │ └── topics.css │ └── js │ │ ├── d3.js │ │ ├── jquery.isotope.min.js 
│ │ ├── jquery.js │ │ └── toy_topics.js │ └── topics.html └── transforms ├── coclustering ├── INSTALL.MD ├── coclustering ├── install ├── pom.xml └── src │ ├── assembly │ └── job.xml │ └── main │ ├── java │ └── com │ │ └── mozilla │ │ └── grouperfish │ │ └── transforms │ │ └── coclustering │ │ ├── display │ │ ├── CoCluster.java │ │ └── WriteCoClusteringOutput.java │ │ ├── lucene │ │ └── analysis │ │ │ └── en │ │ │ ├── EnglishAnalyzer.java │ │ │ ├── NGramEnglishAnalyzer.java │ │ │ └── ShingleAllStopFilter.java │ │ ├── pig │ │ ├── eval │ │ │ ├── mahout │ │ │ │ └── Vectorizer.java │ │ │ └── text │ │ │ │ ├── ConvertDocumentIDToID.java │ │ │ │ ├── ConvertFeatureToID.java │ │ │ │ ├── NGramTokenize.java │ │ │ │ ├── TermFrequency.java │ │ │ │ ├── Tokenize.java │ │ │ │ └── UnigramExtractor.java │ │ └── storage │ │ │ ├── KMeansOutputLoader.java │ │ │ └── MahoutVectorStorage.java │ │ └── text │ │ └── Dictionary.java │ ├── json_sample_files │ ├── parameters.json │ ├── results.json │ └── tags.json │ ├── pig │ ├── co_cluster_Z_generator.pig │ ├── co_cluster_generate_tags.pig │ ├── co_cluster_normalized_matrix_generator.pig │ └── co_cluster_preprocessor.pig │ └── python │ └── cocluster.py ├── commons ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── mozilla │ │ └── grouperfish │ │ ├── lucene │ │ └── analysis │ │ │ └── en │ │ │ ├── EnglishAnalyzer.java │ │ │ ├── NGramEnglishAnalyzer.java │ │ │ └── ShingleAllStopFilter.java │ │ ├── pig │ │ ├── eval │ │ │ ├── ml │ │ │ │ ├── TFIDFVectorizer.java │ │ │ │ ├── TFVectorizer.java │ │ │ │ └── Vectorizer.java │ │ │ └── text │ │ │ │ ├── NGramTokenize.java │ │ │ │ ├── TermFrequency.java │ │ │ │ └── Tokenize.java │ │ └── storage │ │ │ ├── LDACStorage.java │ │ │ ├── MahoutVectorStorage.java │ │ │ └── VWStorage.java │ │ └── text │ │ └── Dictionary.java │ └── pig │ ├── generate_document_vectors.pig │ ├── generate_feature_index.pig │ ├── generate_ngram_feature_index.pig │ ├── generate_sequence_files.pig │ ├── 
generate_tf_document_vectors.pig │ └── generate_tfidf_document_vectors.pig ├── count └── count ├── lda_gensim ├── src │ └── python │ │ ├── filter.py │ │ └── lda.py └── stopwords-en.txt ├── lda_r └── src │ ├── R │ └── lda.r │ └── python │ └── convert_r_to_grouperfish.py ├── lda_vw ├── src │ └── main │ │ └── python │ │ ├── vowpalwabbit.py │ │ └── vw-printtopics.py └── vw-lda.sh └── textcluster ├── .gitignore ├── install ├── run.py ├── tests ├── small │ ├── input.json.tsv │ ├── parameters.json │ └── results.expected.json └── standard │ ├── input.json.tsv │ ├── parameters.json │ └── results.expected.json └── textcluster /.gitignore: -------------------------------------------------------------------------------- 1 | grouperfish-*.tar.gz 2 | conf/grouperfish.json 3 | .project 4 | .classpath 5 | .settings/ 6 | target/ 7 | data/ 8 | docs/_build 9 | build 10 | *.jar 11 | *.pyc 12 | .DS_Store 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ***** BEGIN LICENSE BLOCK ***** 2 | Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | 4 | The contents of this file are subject to the Mozilla Public License Version 5 | 1.1 (the "License"); you may not use this file except in compliance with 6 | the License. You may obtain a copy of the License at 7 | http://www.mozilla.org/MPL/ 8 | 9 | Software distributed under the License is distributed on an "AS IS" basis, 10 | WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | for the specific language governing rights and limitations under the 12 | License. 13 | 14 | The Original Code is Mozilla Grouperfish. 15 | 16 | The Initial Developer of the Original Code is Mozilla. 17 | Portions created by the Initial Developer are Copyright (C) 2011 18 | the Initial Developer. All Rights Reserved. 
19 | 20 | Contributor(s): 21 | Michael Kurze 22 | 23 | Alternatively, the contents of this file may be used under the terms of 24 | either the GNU General Public License Version 2 or later (the "GPL"), or 25 | the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | in which case the provisions of the GPL or the LGPL are applicable instead 27 | of those above. If you wish to allow use of your version of this file only 28 | under the terms of either the GPL or the LGPL, and not to allow others to 29 | use your version of this file under the terms of the MPL, indicate your 30 | decision by deleting the provisions above and replace them with the notice 31 | and other provisions required by the GPL or the LGPL. If you do not delete 32 | the provisions above, a recipient may use your version of this file under 33 | the terms of any one of the MPL, the GPL or the LGPL. 34 | 35 | ***** END LICENSE BLOCK ***** 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grouperfish 2 | 3 | ## A Document Transformation Engine 4 | 5 | The nascent Grouperfish project aims to provide a simple, online, scalable text 6 | clustering solution as a REST/JSON service. Initially this service is needed to 7 | drive sites and themes for [Firefox Input](http://input.mozilla.com), as 8 | described in 9 | [mozilla bug 629019](https://bugzilla.mozilla.org/show_bug.cgi?id=629019). 10 | 11 | The main service is written in Java. Individual algorithms can use varying 12 | technologies and platform. 13 | 14 | For more extensive documentation, 15 | [read the docs](http://grouperfish.readthedocs.org) 16 | -------------------------------------------------------------------------------- /docs/filters.rst: -------------------------------------------------------------------------------- 1 | .. 
_filters: 2 | 3 | ======= 4 | Filters 5 | ======= 6 | 7 | As of Grouperfish 0.1, filters are not yet available. 8 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ########### 2 | Grouperfish 3 | ########### 4 | 5 | .. note:: 6 | This documentation serves as a specification. 7 | It describes a system that has not reached a usable state yet. 8 | 9 | 10 | Contents: 11 | 12 | .. toctree:: 13 | :maxdepth: 2 14 | 15 | introduction 16 | architecture 17 | installation 18 | rest_api 19 | usage 20 | filters 21 | batch_system 22 | transforms 23 | queries 24 | todo 25 | hacking 26 | 27 | * :ref:`search` 28 | 29 | -------------------------------------------------------------------------------- /docs/install: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | cmd="--build" 4 | if [[ "${#}" -eq "1" ]]; then 5 | if [[ "${1}" == --* ]]; then 6 | cmd=$1 7 | fi 8 | fi 9 | 10 | case "${cmd}" in 11 | --build|--package) 12 | make html || exit 1 13 | mkdir -p ../build/docs 14 | cp -rf _build/html ../build/docs 15 | ;; 16 | --clean) 17 | make clean 18 | rm -rf ../build/docs 19 | rm -rf _build 20 | ;; 21 | --help) 22 | "Usage: ${0} [--build|--clean]" 23 | ;; 24 | esac 25 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Prerequisites 9 | ------------- 10 | 11 | These are the requirements to run Grouperfish. 12 | For development, see :ref:`hacking`. 13 | 14 | * A machine running a **Unix-style OS** (such as *Linux*). 15 | 16 | Support for windows currently not planned (and probably not easy to add). 
17 | 18 | So far, we have been using Red Hat 5.2 19 | and -- for development -- Mac OS X 10.6+. 20 | 21 | * **JRE 6** or higher 22 | 23 | * **Python 2.6** or higher (*not* tested with 3.x) 24 | 25 | * **ElasticSearch 0.17.6** 26 | 27 | The ElasticSearch cluster does not need to be running on the same machines as 28 | Grouperfish. For Hadoop/HBase you will need to make sure that the 29 | configuration is on your classpath (easiest with a local installation). 30 | 31 | 32 | Prepare your installation 33 | ------------------------- 34 | 35 | * Obtain a grouperfish tarball [#]_ and unpack it into a directory of your choice. 36 | 37 | :: 38 | 39 | > tar xzf grouperfish-0.1.tar 40 | 41 | > cd grouperfish-0.1 42 | 43 | * Under ``config``, modify the ``elasticsearch.yml`` and 44 | ``elasticsearch_hc.yml`` so that Grouperfish will be able to discover your 45 | cluster. 46 | **Advanced:** You can modify the ``elasticsearch.yml`` to make 47 | each Grouperfish instance run its own ElasticSearch data node. By default, 48 | Grouperfish depends on joining an existing cluster though. Refer to the 49 | `ElasticSearch configuration documentation`_ for details. 50 | 51 | .. _`ElasticSearch configuration documentation`: 52 | http://www.elasticsearch.org/guide/reference/setup/configuration.html 53 | 54 | * In the ``hazelcast.xml``, have a look at ```` section. 55 | If your network does not support multicast based discovery, make changes 56 | as described in the `Hazelcast documentation`_. 57 | 58 | .. _`Hazelcast documentation`: 59 | http://www.hazelcast.com/docs/1.9.4/manual/multi_html/ch09.html 60 | 61 | .. [#] right now, the only way is to build it from source. See :ref:`hacking`. 62 | 63 | 64 | Launch the daemon 65 | ----------------- 66 | 67 | To run grouperfish (currently, no service wrapper is available): 68 | 69 | :: 70 | 71 | grouperfish-0.1> ./bin/grouperfish -f 72 | 73 | Grouperfish will be listening on port 61732 74 | (mnemonic: ``FISH = 0xF124 = 61732``). 
75 | 76 | You can safely ignore the logback warning (which will only appear with ``-f`` 77 | given). It is due to an `error`_ in logback. 78 | 79 | .. _error: http://jira.qos.ch/browse/LBCORE-198 80 | 81 | Omit the ``-f`` to run grouperfish as a background process, detached from your 82 | shell. You can use ``jps`` to determine the process id. 83 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | Grouperfish is built to perform text clustering for `Firefox Input`_. 5 | Due to its generic nature, it also serves as a testbed to prototype machine 6 | learning algorithms. 7 | 8 | .. _Firefox Input: http://input.mozilla.com 9 | 10 | How does it work? 11 | ----------------- 12 | 13 | Grouperfish is a *document transformation system*, for high throughput 14 | applications. 15 | 16 | Roughly summarized: 17 | 18 | * users put *documents* into Grouperfish using a REST interface 19 | 20 | * *transformations* are performed on one or several subsets of these documents. 21 | 22 | * *results* can be retrieved by users over the REST interface 23 | 24 | * all components are distributed for high volume applications 25 | 26 | 27 | What can be done? 28 | """"""""""""""""" 29 | 30 | Assume a scenario where a steady stream of documents is generated. 31 | For example: 32 | 33 | * user feedback 34 | * software crash reports 35 | * twitter messages 36 | 37 | Now, these documents can be processed to make them more useful. 38 | For example: 39 | 40 | * clustering (grouping related documents together, detecting common topics) 41 | * classification (associating documents with predefined categories including 42 | spam) 43 | * trending (identifying new topics over time). 
44 | 45 | 46 | Vocabulary 47 | ---------- 48 | 49 | Grouperfish users can assume one of three roles (or any combination thereof): 50 | 51 | Document Producer 52 | Some user (usually another piece of software) that will 53 | put documents into the System. 54 | 55 | Result Consumer 56 | Some user/software that gets the generated results. 57 | 58 | Admin 59 | A user who configures which subsets of documents to transform, but also 60 | how and when to do that. 61 | 62 | 63 | -------------------------------------------------------------------------------- /docs/queries.rst: -------------------------------------------------------------------------------- 1 | .. _queries: 2 | 3 | ======= 4 | Queries 5 | ======= 6 | 7 | Concrete Queries 8 | ---------------- 9 | 10 | A concrete query is just a regular ElasticSearch query, e.g.: 11 | 12 | :: 13 | 14 | { 15 | "query": { 16 | "bool": { 17 | "must": [ 18 | {"field": {"os": "Android"}}, 19 | {"field": {"platform": "ARM"}}, 20 | ] 21 | } 22 | } 23 | } 24 | 25 | All documents matching this query will be processed together in a batch run. 26 | 27 | .. note:: 28 | Find the full `Query DSL documentation`_ on the ElasticSearch Website. 29 | 30 | .. _`Query DSL documentation`: 31 | http://www.elasticsearch.org/guide/reference/query-dsl/ 32 | 33 | 34 | .. _template-queries: 35 | 36 | 37 | Template Queries 38 | ---------------- 39 | 40 | A template query will generate a bunch of concrete queries every time it is 41 | evaluated. It is different in that it has an additional top-level field 42 | "facet_by", which is a list of field names. 
43 | 44 | Let us assume we have these documents in our namespace: 45 | 46 | :: 47 | 48 | {"id": 1, "desc": "Why do you crash?", "os": "win7", "platform": "x64"}, 49 | {"id": 2, "desc": "Don't crash plz", "os": "xp", "platform": "x86"}, 50 | {"id": 3, "desc": "It doesn't crash!", "os": "win7", "platform": "x86"}, 51 | {"id": 3, "desc": "Over 9000!", "os": "linux", "platform": "x86"}, 52 | 53 | 54 | And this template query: 55 | 56 | :: 57 | 58 | { 59 | "query": {"text": {"desc": "crash"}}, 60 | "facet_by": ["platform", "os"] 61 | } 62 | 63 | 64 | This will generate the following set of queries: 65 | 66 | :: 67 | 68 | {"query": {"filtered": 69 | {"query": {"text": {"desc": "crash"}}, "filter": {"and": [ 70 | {"field": {"os": "win7"}}, 71 | {"field": {"platform": "x64"}}, 72 | ]}}}} 73 | {"query": {"filtered": 74 | {"query": {"text": {"desc": "crash"}}, "filter": {"and": [ 75 | {"field": {"os": "win7"}}, 76 | {"field": {"platform": "x86"}}, 77 | ]}}}} 78 | {"query": {"filtered": 79 | {"query": {"text": {"desc": "crash"}}, "filter": {"and": [ 80 | {"field": {"os": "xp"}}, 81 | {"field": {"platform": "x86"}}, 82 | ]}}}} 83 | 84 | Note that no query for ``os=linux`` is generated in this case, because the 85 | query for ``crash`` does not match any document with that ``os`` in the first 86 | place. 87 | -------------------------------------------------------------------------------- /docs/todo.rst: -------------------------------------------------------------------------------- 1 | .. 
_todo: 2 | 3 | ===== 4 | To Do 5 | ===== 6 | 7 | These components are not necessarily listed in the order they need to be 8 | implemented: 9 | 10 | * Filtering functionality (:ref:`filters`) 11 | 12 | * Language detection filter 13 | 14 | * Allow clients to extract sub-results from a result doc (using JSON paths) 15 | 16 | * Add template Queries 17 | 18 | * Add tagging of ElasticSearch documents based on transform results 19 | 20 | * :ref:`Transforms` 21 | 22 | * Co-Clustering 23 | 24 | * LDA 25 | 26 | * Validate configuration pieces based on a schema, specific to 27 | each filter/transform 28 | 29 | * JS client library (possibly hook in with ``pyes``) 30 | E.g. to be used by the admin interface. 31 | 32 | * Admin interface 33 | 34 | * Python client library (possibly hook in with ``pyes``) 35 | 36 | * Online service for ad-hoc requests 37 | 38 | * Define online API (Client/server? JVM using Jython etc.?) 39 | 40 | * Integrate a fast clustering algorithm for this 41 | 42 | -------------------------------------------------------------------------------- /docs/transforms.rst: -------------------------------------------------------------------------------- 1 | .. _transforms: 2 | 3 | ========== 4 | Transforms 5 | ========== 6 | 7 | Transforms are the heart of Grouperfish. They generate the results that will 8 | actually be interesting to consumers. 9 | 10 | Note: The minimal transform interface is defined by the :ref:`batch_system` 11 | 12 | 13 | Transform Configuration 14 | ----------------------- 15 | 16 | The same transform (e.g. a clustering algorithm) might be used with different 17 | parameters to generate different results. For this reason, the system 18 | contains a *transform configurations* for each result that should be 19 | generated. 20 | 21 | Primarily, a transform configuration parameterizes its transform (e.g. for 22 | clustering, it might specify the desired number of clusters). 
It can also be 23 | used to tell the Grouperfish batch system how to interact with a transform. 24 | 25 | Currently, a transform configuration is a JSON document with two fields: The 26 | *transform* determines which piece of software to use, and *parameters* tells 27 | that software what to do. 28 | Example configuration for the *textcluster* transform: 29 | 30 | :: 31 | 32 | { 33 | "transform": "textcluster", 34 | "parameters": { 35 | "fields": {"id": "id", "text": "text"}, 36 | "limits": {"clusters": 10,"top_documents": 10} 37 | } 38 | } 39 | 40 | 41 | Result Types 42 | ------------ 43 | 44 | Topics (or Clusters) 45 | ^^^^^^^^^^^^^^^^^^^^ 46 | 47 | Clustering transforms try to extract the main topics from a set of documents. 48 | As of Grouperfish version 0.1, the only available transform is a clustering 49 | transform named textcluster. The results of clustering transform are topics, 50 | the structure of the result is as follows: 51 | 52 | :: 53 | 54 | { 55 | "clusters": [ 56 | { 57 | "top_documents": [{...}, {...}, ..., {...}], 58 | "top_terms": ["Something", "Else", ..., "Another"] 59 | }, 60 | ... 61 | ] 62 | } 63 | 64 | Depending on the actually configured transform, only top documents *or* top 65 | terms might be generated for a topic. Also, any given transform might add 66 | other top-level fields than just *clusters*. 67 | 68 | 69 | Available Transforms 70 | -------------------- 71 | 72 | textcluster 73 | ^^^^^^^^^^^ 74 | 75 | Textcluster is a relatively simple clustering algorithm written in Python by 76 | Dave Dash for Firefox Input. It is very fast for small input sets, but 77 | requires a lot of memory, especially when processing more than 10,000 78 | documents at a time. Textcluster is `available on github`__. 79 | 80 | .. __: https://github.com/davedash/textcluster 81 | 82 | In Grouperfish, you can select how many topics you want textcluster to 83 | extract, and how many documents to include in the results for each topic. 
84 | 85 | * Parameters 86 | 87 | :: 88 | 89 | { 90 | "fields": { 91 | "id": "id", 92 | "text": "text" 93 | }, 94 | "limits": { 95 | "clusters": 10, 96 | "top_documents": 10 97 | } 98 | } 99 | 100 | These are the default parameters (top 10 topics/clusters, 101 | with 10 documents each). 102 | 103 | 104 | * Results 105 | 106 | Textcluster uses the standard clustering result format (see above), but does 107 | not inclue top terms, only documents. 108 | 109 | -------------------------------------------------------------------------------- /install: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | self="${0}" 4 | usage() { 5 | echo "Usage: ${self} [--help | --package | (--build|--clean) [PATH]]" 6 | echo " --help Show this message." 7 | echo " --clean Discard all build results and intermediate files." 8 | echo " --build Install components to ./build (default)" 9 | echo " --package Create a complete grouperfish tarball." 10 | echo " Cleans and rebuilds every component first." 11 | echo 12 | echo " PATH The component to build. Examples:" 13 | echo " ./service" 14 | echo " ./transforms/coclustering" 15 | echo " If omitted, everything is built." 16 | echo 17 | echo "This script must be called from the root directory of the project." 18 | echo 19 | } 20 | 21 | cmd="--build" 22 | all="YES" 23 | what="project docs service transforms/* tools/* filters/* integration-test" 24 | 25 | if [[ "${#}" -gt "2" ]]; then 26 | usage 27 | exit 1 28 | fi 29 | 30 | if [[ "${#}" -eq "2" ]]; then 31 | cmd=$1 32 | all="NO" 33 | what=$2 34 | fi 35 | 36 | if [[ "${#}" -eq "1" ]]; then 37 | if [[ "${1}" == -* ]]; then 38 | cmd=$1 39 | else 40 | all="NO" 41 | what=$1 42 | fi 43 | fi 44 | 45 | fail() { 46 | echo "Build aborted: ${1}" 47 | exit 1 48 | } 49 | 50 | clean() { 51 | what=$1 52 | for component in $what; do 53 | if [[ -x "$component/install" ]]; then 54 | echo $'\n\n'"Cleaning $component ..." 
55 | ( cd "${component}" 56 | ./install --clean || fail "Clean of '${component}' failed." ) 57 | fi 58 | rm -rf "./build/${component}" 59 | done 60 | if [[ "YES" = "${all}" ]]; then 61 | rm -rf ./build 62 | version="$(cat ./project/VERSION)" 63 | rm -f "./grouperfish-${version}.tar.gz" 64 | cd ./project && mvn clean ; cd .. 65 | fi 66 | } 67 | 68 | build() { 69 | # either --build or --release 70 | mode=$1 71 | what=$2 72 | if [[ "YES" = "${all}" ]]; then 73 | cd ./project && mvn install ; cd .. 74 | fi 75 | for component in $what; do 76 | if [[ -x "$component/install" ]]; then 77 | echo $'\n\n'"Installing $component ..." 78 | ( cd "${component}" 79 | ./install $mode || fail "Installation of '${component}' failed." ) 80 | else 81 | echo "Copying $component ..." 82 | mkdir -p "build/${component}/" 83 | cp -r "${component}"/* "build/${component}/" 84 | fi 85 | done 86 | } 87 | 88 | package() { 89 | what=$1 90 | version="$(cat ./project/VERSION)" 91 | clean "${what}" 92 | build --package "${what}" 93 | mv ./build "./grouperfish-${version}" 94 | tar czf "grouperfish-${version}.tar.gz" "./grouperfish-${version}" 95 | } 96 | 97 | case "${cmd}" in 98 | --help) 99 | usage 100 | ;; 101 | --build) 102 | build --build "${what}" 103 | ;; 104 | --clean) 105 | clean "${what}" 106 | ;; 107 | --package) 108 | package "${what}" 109 | ;; 110 | *) 111 | usage 112 | exit 1 113 | ;; 114 | esac 115 | -------------------------------------------------------------------------------- /integration-test/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | data 3 | -------------------------------------------------------------------------------- /integration-test/config/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | # Cluster Settings 2 | cluster: 3 | name: gfintegration 4 | 5 | path: 6 | data: data/elasticsearch 7 | 
-------------------------------------------------------------------------------- /integration-test/install: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | cmd="--build" 4 | if [[ "${#}" -eq "1" ]]; then 5 | if [[ "${1}" == --* ]]; then 6 | cmd=$1 7 | fi 8 | fi 9 | 10 | case "${cmd}" in 11 | --build|--package) 12 | mvn test || exit 1 13 | ;; 14 | --clean) 15 | mvn clean 16 | rm -rf target 17 | ;; 18 | --help) 19 | "Usage: ${0} [--build|--clean]" 20 | ;; 21 | esac 22 | -------------------------------------------------------------------------------- /integration-test/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | grouperfish-integration-test 6 | ${grouperfishVersion} 7 | 8 | com.mozilla 9 | grouperfish-parent 10 | ../project 11 | FIXED 12 | 13 | 14 | https://github.com/mozilla-metrics/grouperfish 15 | 16 | 17 | UTF-8 18 | false 19 | 20 | 21 | 22 | 23 | 24 | com.mozilla 25 | grouperfish-service 26 | ${grouperfishVersion} 27 | 28 | 29 | 30 | com.jayway.restassured 31 | rest-assured 32 | 1.2.2 33 | test 34 | 35 | 36 | 37 | 38 | 39 | 40 | grouperfish-service 41 | 42 | 43 | 44 | 45 | org.apache.maven.plugins 46 | maven-compiler-plugin 47 | 2.3.2 48 | 49 | 1.6 50 | 1.6 51 | 52 | 53 | 54 | 55 | 56 | org.apache.maven.plugins 57 | maven-surefire-plugin 58 | 2.5 59 | 60 | ../build/ 61 | ${skip.tests.unit} 62 | -Xms128m -Xmx1024m -XX:PermSize=128m -XX:MaxPermSize=512m 63 | methods 64 | 1 65 | 66 | src/test/resources/ng_integration.xml 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /integration-test/src/test/java/com/mozilla/grouperfish/integration/IntegrationTestHelper.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.integration; 2 | 3 | //import org.apache.hadoop.conf.Configuration; 4 | //import 
org.apache.hadoop.hbase.HBaseConfiguration; 5 | //import org.apache.hadoop.hbase.LocalHBaseCluster; 6 | import org.testng.annotations.AfterGroups; 7 | import org.testng.annotations.BeforeGroups; 8 | import org.testng.annotations.BeforeTest; 9 | import org.testng.annotations.Test; 10 | 11 | import groovyx.net.http.ContentType; 12 | 13 | import com.mozilla.grouperfish.bootstrap.Grouperfish; 14 | 15 | import com.hazelcast.core.Hazelcast; 16 | import com.jayway.restassured.RestAssured; 17 | import com.mozilla.grouperfish.base.Assert; 18 | import com.mozilla.grouperfish.rest.jersey.JerseyGuiceRestService; 19 | 20 | 21 | @Test(groups="integration") 22 | public class IntegrationTestHelper { 23 | 24 | public static final int port = Grouperfish.DEFAULT_PORT + 100; 25 | static { 26 | setUpRestAssured(); 27 | } 28 | 29 | public static String NS = "integration"; 30 | 31 | // private LocalHBaseCluster hbase; 32 | 33 | private final Thread grouperfish = new Thread() { 34 | @Override 35 | public void run() { 36 | System.setProperty("hazelcast.config", "config/hazelcast.xml"); 37 | System.setProperty(JerseyGuiceRestService.PROPERTY_PORT, String.valueOf(port)); 38 | try { 39 | Grouperfish.main(new String[]{}); 40 | } 41 | catch (InterruptedException interrupt) { 42 | Hazelcast.getMap("documents_" + NS).destroy(); 43 | Thread.currentThread().interrupt(); 44 | } 45 | catch (Exception e) { 46 | Assert.unreachable(null, e); 47 | } 48 | } 49 | }; 50 | 51 | 52 | @BeforeGroups(groups="integration") 53 | void setUp() throws Exception { 54 | 55 | // Local HBaseCluster to use. 
56 | // hbase = new LocalHBaseCluster(HBaseConfiguration.create(new Configuration())); 57 | // hbase.startup(); 58 | // Thread.sleep(3000); 59 | 60 | // Set required bagheera configuration: 61 | 62 | // Give some time for Grouperfish (and especially HazelCast) to come up: 63 | grouperfish.start(); 64 | Thread.sleep(10000); 65 | 66 | setUpRestAssured(); 67 | } 68 | 69 | 70 | @AfterGroups(groups="integration") 71 | void tearDown() throws InterruptedException { 72 | grouperfish.interrupt(); 73 | Thread.sleep(2000); 74 | //hbase.shutdown(); 75 | //hbase.join(); 76 | } 77 | 78 | 79 | @BeforeTest(groups="integration") 80 | public static void setUpRestAssured() { 81 | RestAssured.baseURI = "http://127.0.0.1"; 82 | RestAssured.port = port; 83 | RestAssured.basePath = ""; 84 | RestAssured.requestContentType(ContentType.JSON); 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /integration-test/src/test/java/com/mozilla/grouperfish/integration/batch/RunResourceTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.integration.batch; 2 | 3 | public class RunResourceTest { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /integration-test/src/test/java/com/mozilla/grouperfish/integration/rest/QueriesResourceTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.integration.rest; 2 | 3 | import static com.jayway.restassured.RestAssured.expect; 4 | import static com.jayway.restassured.RestAssured.given; 5 | import static java.lang.String.format; 6 | 7 | import org.json.simple.JSONObject; 8 | import org.testng.annotations.Test; 9 | 10 | import com.mozilla.grouperfish.integration.IntegrationTestHelper; 11 | 12 | 13 | @Test(groups="integration") 14 | @SuppressWarnings({ "unchecked", "serial" }) 15 | public class QueriesResourceTest { 16 | 
    // Instantiating the helper ensures the shared server fixture class is
    // loaded (its static initializer configures REST-assured).
    final IntegrationTestHelper helper = new IntegrationTestHelper();
    final String NS = IntegrationTestHelper.NS;

    // ElasticSearch-style "match_all" query, serialized once for reuse.
    private static final String QUERY_ALL = (new JSONObject() {{
        put("query", new JSONObject(){{
            put("match_all", new JSONObject());
        }});
    }}).toJSONString();

    // Storing a well-formed query responds 201 Created.
    public void testPutQuery() {
        given().body(QUERY_ALL).
        expect().statusCode(201).
        when().put(format("/queries/%s/ALL", NS));
    }

    // An empty request body is rejected as a bad request.
    public void testPutTooEmpty() {
        given().body("").
        expect().statusCode(400).
        when().put(format("/queries/%s/Z", NS));
    }

    // // These tests cannot work yet (we first need to verify queries using ES).
    // public void testPutInvalidQuery() {
    //     ...
    // }
    //
    // public void testPutEmptyQuery() {
    //     given().body("{}").
    //     expect().statusCode(400).
    //     when().put(format("/queries/%s/MYBAD", NS));
    // }


    // Deleting an existing query responds 204 No Content.
    public void testDeleteQuery() {
        testPutQuery();
        expect().
        statusCode(204).
        when().delete(format("/queries/%s/ALL", NS));
    }

    // DELETE is idempotent: a repeated delete also responds 204.
    public void testRepeatDeleteQuery() {
        testPutQuery();
        expect().
        statusCode(204).
        when().delete(format("/queries/%s/ALL", NS));
        expect().
        statusCode(204).
        when().delete(format("/queries/%s/ALL", NS));
    }

    // A stored query can be fetched back.
    public void testGetQuery() {
        testPutQuery();
        expect().
        statusCode(200).
        when().get(format("/queries/%s/ALL", NS));
    }

    // Unknown query names respond 404.
    public void testNotFound() {
        expect().
        statusCode(404).
77 | when().get(format("/queries/%s/Yeti", NS)); 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /integration-test/src/test/resources/ng_integration.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /project/VERSION: -------------------------------------------------------------------------------- 1 | 0.1-SNAPSHOT 2 | -------------------------------------------------------------------------------- /service/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | data 3 | grouperfish.pid 4 | test-output 5 | -------------------------------------------------------------------------------- /service/bin/create_hbase_tables: -------------------------------------------------------------------------------- 1 | #!hbase shell 2 | 3 | # Alternative Usage: 4 | # cat create_hbase_tables | hbase shell 5 | 6 | create 'documents', {NAME => 'data', VERSIONS => '1', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'} 7 | create 'queries', {NAME => 'data', VERSIONS => '1', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'} 8 | create 'results', {NAME => 'data', VERSIONS => '1', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'} 9 | create 'configurations', {NAME => 'data', VERSIONS => '1', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'} 10 | -------------------------------------------------------------------------------- /service/bin/littlefish: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Starts up a grouperfish instance in the foreground without creating 4 | # a jar first. 5 | # Allows to quickly test changes without running mvn install. 

# Prints usage for this development launcher.
function usage() {
    echo "Usage: $0 [-h] [hazelcast-config-path]"
    echo
    echo "You need to fully build the project once first:"
    echo "Work directory is ../build"
    echo
}

bin=`dirname "$0"`
bin=`cd $bin; pwd`

# Generate the runtime classpath once (maven writes it to target/classpath).
if [ ! -f "${bin}/../target/classpath" ]; then
    if [ ! -d "${bin}/../target/" ]; then
        mvn compile
    fi
    mvn dependency:build-classpath
fi



build=$bin/../../build
build=`cd $build; pwd`

#### Process options

# FISH = 0xF124 = 61732
SERVER_PORT=61732

while getopts ":fhp:" optname ; do
    case "$optname" in
        "h")
            usage
            exit 0
            ;;
        "?")
            echo "Unknown option ${OPTARG}"
            usage
            exit 1
            ;;
        *)
            usage
            exit 1
            ;;
    esac
done
shift $(($OPTIND - 1))


#### Process arguments

HAZELCAST_CONF=$bin/../conf/hazelcast.xml
if [[ $# -gt 1 ]] ; then usage; exit 1; fi
if [[ $# -gt 0 ]]; then HAZELCAST_CONF=$1; fi

# Fix: resolve defaults BEFORE they are used. Previously GROUPERFISH_USER,
# HADOOP_CONF and HBASE_CONF were assigned only after CLASSPATH had already
# been composed, so the classpath always ended with two empty entries.
if [ "$GROUPERFISH_USER" = "" ]; then GROUPERFISH_USER="$USER"; fi
if [ "$HADOOP_CONF" = "" ]; then HADOOP_CONF="/etc/hadoop/conf"; fi
if [ "$HBASE_CONF" = "" ]; then HBASE_CONF="/etc/hbase/conf"; fi

# NOTE(review): `cat target/classpath` is relative to the work directory,
# while the existence check above uses ${bin}/../target/classpath — confirm
# the script is always started from the module root.
CLASSPATH="$bin/../conf":"../service/target/classes":$(cat target/classpath)
CLASSPATH="$CLASSPATH":"$HADOOP_CONF":"$HBASE_CONF"
SERVER_CLASS_NAME="com.mozilla.grouperfish.bootstrap.Grouperfish"
JAVA_OPTS="-Xmx1g -XX:+UseParNewGC -XX:+UseConcMarkSweepGC"

HAZELCAST_OPTS="-Dhazelcast.logging.type=slf4j -Dhazelcast.config=${HAZELCAST_CONF}"
BAGHEERA_OPTS="-Dbagheera.log.dir=${bin}/../logs"
GROUPERFISH_OPTS="-Dgrouperfish.rest.port=${SERVER_PORT}"

mkdir -p "${build}/../logs"


cd "${build}"
echo "Work directory: $(pwd) / ${build}"
echo "Using classpath: " $CLASSPATH

java -Dlogback.configurationFile=logback-stdout.xml \
    $GROUPERFISH_OPTS
$BAGHEERA_OPTS $HAZELCAST_OPTS $JAVA_OPTS \ 82 | -cp $CLASSPATH \ 83 | $SERVER_CLASS_NAME 84 | 85 | RETVAL=$? 86 | exit $RETVAL 87 | -------------------------------------------------------------------------------- /service/conf/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | # Cluster Settings 2 | cluster.name: grouperfish 3 | 4 | node.data: false 5 | node.client: true 6 | http.enabled: false 7 | transport.tcp.port: 9301 8 | -------------------------------------------------------------------------------- /service/conf/elasticsearch_hc.yml: -------------------------------------------------------------------------------- 1 | # Cluster Settings 2 | cluster.name: grouperfish 3 | 4 | node.data: false 5 | node.client: true 6 | http.enabled: false 7 | transport.tcp.port: 9302 8 | -------------------------------------------------------------------------------- /service/conf/grouperfish.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-metrics/grouperfish/71f6074c1b08626437242509126c6f3732d7b036/service/conf/grouperfish.properties -------------------------------------------------------------------------------- /service/install: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # normalize work directory 4 | wd=`dirname "$0"` 5 | wd=`cd "$wd"; pwd` 6 | 7 | 8 | cmd="--build" 9 | if [[ "${#}" -eq "1" ]]; then 10 | if [[ "${1}" == --* ]]; then 11 | cmd=$1 12 | fi 13 | fi 14 | 15 | build() { 16 | mvn install || exit 1 17 | mkdir -p ../build/lib ../build/conf ../build/bin 18 | cp target/grouperfish-service-*.jar ../build/lib/ 19 | cp conf/* ../build/conf/ 20 | cp bin/grouperfish ../build/bin/ 21 | } 22 | 23 | package() { 24 | mvn dependency:copy-dependencies || exit 1 25 | cp target/lib/* ../build/lib 26 | } 27 | 28 | 29 | case "${cmd}" in 30 | --build) 31 | build 32 | ;; 33 | --package) 34 | build 35 | package 36 | ;; 37 | --clean) 38 | mvn clean 39 | rm -f ../build/lib/grouperfish-service-* 40 | rm -f ../build/bin/grouperfish 41 | ;; 42 | --help) 43 | "Usage: ${0} [--build|--clean]" 44 | ;; 45 | esac 46 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/ArrayTool.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | public class ArrayTool { 4 | 5 | public static byte[] concat(final byte[] a, final byte[] b) { 6 | final byte[] c = new byte[a.length + b.length]; 7 | System.arraycopy(a, 0, c, 0, a.length); 8 | System.arraycopy(b, 0, c, a.length, b.length); 9 | return c; 10 | } 11 | 12 | } 13 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/Assert.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | 4 | public class Assert { 5 | 6 | private static final String PREFIX = "[ASSERTION FAILED]"; 7 | 8 | public static void nonNull(Object... 
values) { 9 | int i = 0; 10 | for (Object value : values) { 11 | ++i; 12 | if (value == null) { 13 | String message = String.format("%s Value %d/%d is null.", PREFIX, i, values.length); 14 | throw new IllegalArgumentException(message); 15 | } 16 | } 17 | } 18 | 19 | public static void check(boolean... values) { 20 | int i = 0; 21 | for (boolean value : values) { 22 | ++i; 23 | if (!value) { 24 | String msg = String.format("%s Check %d/%d failed!", PREFIX, i, values.length); 25 | throw new IllegalArgumentException(msg); 26 | } 27 | } 28 | } 29 | 30 | public static void unreachable() { 31 | String message = String.format("%s Code should be unreachable!\n", PREFIX); 32 | throw new IllegalStateException(message); 33 | } 34 | 35 | public static void unreachable(String message, Object... objects) { 36 | String msg = String.format("%s Code should be unreachable: %s\n", PREFIX, String.format(message, objects)); 37 | throw new IllegalStateException(msg); 38 | } 39 | 40 | /** Use this where java wants a return type T. Silly, really... */ 41 | public static T unreachable(Class returnType) { 42 | String msg = String.format("%s Code should be unreachable!\n", PREFIX); 43 | throw new IllegalStateException(msg); 44 | } 45 | 46 | /** @see #unreachable(Class) */ 47 | public static T unreachable(Class returnType, String message, Object... 
objects) { 48 | String msg = String.format("%s Code should be unreachable: %s\n", PREFIX, String.format(message, objects)); 49 | throw new IllegalStateException(msg); 50 | } 51 | 52 | /** @see #unreachable(Class) */ 53 | public static T unreachable(Class returnType, Exception problem) { 54 | String msg = String.format("%s Code should be unreachable\n", PREFIX); 55 | throw new IllegalStateException(msg, problem); 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/Box.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.util.Iterator; 4 | import java.util.NoSuchElementException; 5 | 6 | /** 7 | * Can be used as an out-param or optional return value. 8 | * Either use has+get or iterate over the results. 9 | */ 10 | public class Box implements Iterable { 11 | 12 | private T value; 13 | 14 | public Box put(final T value) { 15 | this.value = value; 16 | return this; 17 | } 18 | 19 | public boolean empty() { 20 | return value == null; 21 | } 22 | 23 | public T get() { 24 | return value; 25 | } 26 | 27 | /** Iterates 0 or 1 times. 
*/ 28 | public Iterator iterator() { 29 | return new Iterator() { 30 | private boolean taken = false; 31 | 32 | @Override public boolean hasNext() { 33 | return !taken && !empty(); 34 | } 35 | 36 | @Override public T next() { 37 | if (empty() || taken) throw new NoSuchElementException(); 38 | taken = true; 39 | return value; 40 | } 41 | 42 | @Override public void remove() { 43 | if (empty() || taken) throw new NoSuchElementException(); 44 | taken = true; 45 | value = null; 46 | } 47 | }; 48 | 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/Configuration.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.util.Properties; 4 | 5 | public interface Configuration { 6 | 7 | Properties properties(); 8 | 9 | } 10 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/ImmutableTools.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | import com.google.common.collect.ImmutableList; 7 | import com.google.common.collect.ImmutableMap; 8 | 9 | /** Inefficient "functional" maps. 
*/ 10 | public class ImmutableTools { 11 | 12 | public static Map immutable(final Map in) { 13 | return new ImmutableMap.Builder().putAll(in).build(); 14 | } 15 | 16 | public static List immutable(final List in) { 17 | return new ImmutableList.Builder().addAll(in).build(); 18 | } 19 | 20 | public static Map put(final Map in, final K key, final V value) { 21 | return new ImmutableMap.Builder().putAll(in).put(key, value).build(); 22 | } 23 | 24 | public static Map putAll(final Map a, final Map b) { 25 | return new ImmutableMap.Builder().putAll(a).putAll(b).build(); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/PropertiesTool.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.net.URL; 7 | import java.util.Properties; 8 | 9 | 10 | public class PropertiesTool { 11 | 12 | public static Properties load(final Class context, final String resourceName) { 13 | final Properties properties = new Properties(); 14 | URL source = context.getResource(resourceName); 15 | if (source == null) return properties; 16 | 17 | InputStream stream = null; 18 | try { 19 | stream = source.openStream(); 20 | properties.load(new InputStreamReader(stream, StreamTool.UTF8)); 21 | } 22 | catch (IOException e) { 23 | throw new RuntimeException(String.format("Failed to load properties from '%s'...", resourceName), e); 24 | } 25 | finally { 26 | if (stream == null) return properties; 27 | try { stream.close(); } 28 | catch (IOException e) { throw new RuntimeException(e); } 29 | } 30 | return properties; 31 | } 32 | 33 | 34 | } 35 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/SlugTool.java: 
-------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.text.Normalizer; 4 | import java.text.Normalizer.Form; 5 | import java.util.Locale; 6 | import java.util.regex.Pattern; 7 | 8 | 9 | public class SlugTool { 10 | 11 | /** 12 | * http://stackoverflow.com/questions/1657193/ 13 | * @param input 14 | * @return A representation of the input string, containing only non-whitespace, latin characters. 15 | */ 16 | public static String toSlug(String input) { 17 | String nowhitespace = WHITESPACE.matcher(input).replaceAll("-"); 18 | String normalized = Normalizer.normalize(nowhitespace, Form.NFD); 19 | String slug = NONLATIN.matcher(normalized).replaceAll(""); 20 | return slug.toLowerCase(Locale.ENGLISH); 21 | } 22 | 23 | private static final Pattern NONLATIN = Pattern.compile("[^\\w-]"); 24 | private static final Pattern WHITESPACE = Pattern.compile("[\\s]"); 25 | 26 | } 27 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/StreamTool.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.io.Reader; 7 | import java.nio.charset.Charset; 8 | 9 | 10 | public class StreamTool { 11 | 12 | public static Charset UTF8 = Charset.forName("UTF-8"); 13 | 14 | /** 15 | * @param stream The character source. 16 | * @param encoding An encoding, e.g. #UTF8 17 | */ 18 | public static String consume(final InputStream stream, final Charset encoding) 19 | throws IOException { 20 | return maybeConsume(stream, encoding, 0); 21 | } 22 | 23 | /** 24 | * Consume everything from this reader into a string. 25 | * Close the reader when done. 
     */
    public static String consume(final Reader in)
            throws IOException {
        Assert.nonNull(in);
        // Limit 0 means "no limit".
        return consume(in, 0);
    }

    /**
     * Consume everything up to limit from this reader into a string.
     * If the stream has more characters than the given limit.
     *
     * @param A reader, will be closed when done.
     * @param limit If limit is reached while consuming the stream,
     *              null is returned.
     *              Set to 0 for no limit.
     * @return The contents, or null if the limit was exceeded.
     */
    public static String consume(final Reader in, final int limit)
            throws IOException {
        Assert.nonNull(in);
        final char[] buffer = new char[8192];
        final StringBuilder out = new StringBuilder();
        int size = 0;

        int read;
        do {
            read = in.read(buffer, 0, buffer.length);
            // At EOF read == -1, so size drops by one; harmless, because the
            // limit was already checked right after the read that could have
            // exceeded it.
            size += read;
            if (limit != 0 && size > limit) {
                in.close();
                return null;
            }
            if (read>0) out.append(buffer, 0, read);
        } while (read>=0);

        in.close();
        return out.toString();
    }

    /**
     * @param stream The character source.
     * @param encoding An encoding, e.g. #UTF8
     * @param limit If limit is reached while consuming the stream,
     *              null is returned.
     *              Set to 0 for no limit.
     */
    public static String maybeConsume(final InputStream stream, final Charset encoding, final int limit)
            throws IOException {
        Assert.nonNull(stream, encoding);
        // consume(Reader, int) closes the reader (and thus the stream).
        return consume(new InputStreamReader(stream, encoding), limit);
    }


}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/base/json/JsonValidator.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.base.json;

import java.io.IOException;

import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.JsonParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/** Validates JSON syntactically by streaming it through a Jackson parser. */
public class JsonValidator {

    static Logger log = LoggerFactory.getLogger(JsonValidator.class);

    private final JsonFactory jsonFactory = new JsonFactory();

    /**
     * @return true if the string parses as JSON; false for empty or
     *         malformed input (parse errors are logged, not thrown).
     * @throws IOException on underlying read failures.
     */
    public boolean isValid(String json) throws IOException {
        if (json.length() == 0) {
            return false;
        }

        try {
            // Pull all tokens; a parse error anywhere invalidates the input.
            JsonParser parser = jsonFactory.createJsonParser(json);
            while (parser.nextToken() != null) { }
        } catch (JsonParseException e) {
            log.error("Error parsing JSON", e);
            return false;
        }

        return true;
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/base/json/MapStreamer.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.base.json;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;

import org.json.simple.JSONValue;

import com.mozilla.grouperfish.base.StreamTool;


/**
* Takes String keys and JSON values and streams them out as one JSON map, 17 | * without composing everything in memory. 18 | */ 19 | public class MapStreamer { 20 | 21 | private final Map map; 22 | 23 | public MapStreamer(final Map map) { 24 | this.map = map; 25 | } 26 | 27 | public void write(OutputStream out) throws IOException { 28 | final Writer writer = new BufferedWriter(new OutputStreamWriter(out, StreamTool.UTF8)); 29 | boolean first = true; 30 | 31 | writer.write('{'); 32 | for (final Map.Entry items : map.entrySet()) { 33 | if (first) { 34 | first = false; 35 | } 36 | else { 37 | writer.append(','); 38 | writer.append('\n'); 39 | } 40 | 41 | writer 42 | .append('"') 43 | .append(JSONValue.escape(items.getKey())) 44 | .append('"') 45 | .append(':') 46 | .append(' ') 47 | .write(items.getValue()); 48 | } 49 | writer.write('}'); 50 | writer.flush(); 51 | } 52 | 53 | } 54 | 55 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/json/TsvJsonWriter.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base.json; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.IOException; 5 | import java.io.Writer; 6 | 7 | import com.mozilla.grouperfish.model.Document; 8 | 9 | 10 | /** If using a buffered writer, make sure to {@link #flush()} when you are done. 
 */
public class TsvJsonWriter {

    private final Writer writer;

    public TsvJsonWriter(final Writer writer) {
        this.writer = new BufferedWriter(writer);
    }

    /**
     * Writes one key/JSON pair as a single TSV line.
     * Tabs and newlines in the key are escaped; newlines in the JSON source
     * are stripped so each record stays on one line.
     */
    public void write(final String key, final String source) throws IOException {
        writer.write(key.replace("\t", "\\t").replace("\n", "\\n"));
        writer.write("\t");
        writer.write(source.replace("\n", ""));
        writer.write("\n");
    }

    /** Writes a document as: id TAB json-source. */
    public void write(final Document document) throws IOException {
        write(document.id(), document.source());
    }

    public void flush() throws IOException {
        writer.flush();
    }

    public void close() throws IOException {
        writer.close();
    }
}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/batch/api/BatchService.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.batch.api;

import com.mozilla.grouperfish.model.Query;
import com.mozilla.grouperfish.model.Task;
import com.mozilla.grouperfish.model.TransformConfig;
import com.mozilla.grouperfish.naming.Scope;


/**
 * The batch system component as documented at:
 * http://grouperfish.readthedocs.org/en/latest/batch_system.html
 */
public interface BatchService {

    /** Run this specific task. */
    void schedule(Task task);

    /** Run the configured transform over the query results. */
    void schedule(Scope ns, Query query, TransformConfig transform);

    /** Run all configured transforms over the query results. */
    void schedule(Scope ns, Query query);

    /**
     * Run all transforms configurations of this
     * namespace over the results of all queries.
     */
    void schedule(Scope ns);

    /** Start execution of tasks. */
    void start();

    /**
     * Stop execution of new tasks.
35 | * Should be called before shutting down the node. 36 | * 37 | * :TODO: Next: 38 | * We probably need some sort of lifecycle events so 39 | * services can manage this transparently. 40 | */ 41 | void stop(); 42 | 43 | } 44 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/api/guice/BatchSystem.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.api.guice; 2 | 3 | import java.util.Properties; 4 | 5 | import com.google.common.collect.ImmutableMap; 6 | import com.google.inject.AbstractModule; 7 | import com.google.inject.Inject; 8 | import com.mozilla.grouperfish.batch.api.BatchService; 9 | import com.mozilla.grouperfish.batch.scheduling.SynchronousBatchService; 10 | import com.mozilla.grouperfish.batch.transforms.HadoopTransform; 11 | import com.mozilla.grouperfish.batch.transforms.LocalTransform; 12 | import com.mozilla.grouperfish.batch.transforms.Transform; 13 | import com.mozilla.grouperfish.batch.transforms.TransformProvider; 14 | import com.mozilla.grouperfish.services.api.FileSystem; 15 | import com.mozilla.grouperfish.services.api.guice.Local; 16 | import com.mozilla.grouperfish.services.api.guice.Services; 17 | import com.mozilla.grouperfish.services.api.guice.Shared; 18 | 19 | public class BatchSystem extends AbstractModule { 20 | 21 | @Override 22 | protected void configure() { 23 | bind(BatchService.class).to(SynchronousBatchService.class).asEagerSingleton(); 24 | bind(TransformProvider.class).to(StaticTransformProvider.class).asEagerSingleton(); 25 | } 26 | 27 | static class StaticTransformProvider implements TransformProvider { 28 | 29 | private final ImmutableMap transformsByName; 30 | 31 | @Inject 32 | public StaticTransformProvider( 33 | final Properties properties, 34 | final @Shared FileSystem dfs, 35 | final @Local FileSystem localFs) { 36 | 37 | final ImmutableMap.Builder builder = 
                    new ImmutableMap.Builder();

            builder.put("count", new LocalTransform("count", dfs, localFs));
            builder.put("textcluster", new LocalTransform("textcluster", dfs, localFs));
            // :TODO: Next: autodiscover available transforms

            // Hadoop-backed transforms are only registered when a Hadoop
            // installation is configured in the service properties.
            if (Services.hasHadoop(properties)) {
                builder.put("coclustering", new HadoopTransform("coclustering", dfs));
            }

            transformsByName = builder.build();
        }

        @Override
        public Transform get(final String name) {
            return transformsByName.get(name);
        }

    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/batch/handlers/CleanupHandler.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.batch.handlers;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mozilla.grouperfish.batch.scheduling.Helpers;
import com.mozilla.grouperfish.model.Fail;
import com.mozilla.grouperfish.model.Task;
import com.mozilla.grouperfish.services.api.FileSystem;
import com.mozilla.grouperfish.services.api.FileSystem.Denied;
import com.mozilla.grouperfish.services.api.FileSystem.NotFound;


/** Removes a task's working directory after a run. */
public class CleanupHandler implements TaskHandler {

    private static final Logger log = LoggerFactory.getLogger(CleanupHandler.class);


    private final FileSystem fs;

    public CleanupHandler(FileSystem fs) {
        this.fs = fs;
    }

    @Override
    public Task handle(final Task task) throws Fail {
        try {
            fs.removeRecursively(Helpers.taskDirectory(task));
        }
        catch (final Denied denied) {
            // No permission to clean up is treated as a hard failure.
            throw Fail.hard(task, "Could not cleanup task directory.", denied);
        }
        catch (final NotFound e) {
            // ok, ignore
            log.debug("Missing task directory during cleanup, this can indicate problems.
Task: %s", task);
            // NOTE(review): slf4j uses "{}" placeholders, not "%s" — the task
            // is likely not interpolated into the message above; confirm.
        }

        return task;
    }
}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/batch/handlers/FetchHandler.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.batch.handlers;

import static com.mozilla.grouperfish.batch.scheduling.Helpers.inputFilename;
import static com.mozilla.grouperfish.batch.scheduling.Helpers.parametersFilename;
import static com.mozilla.grouperfish.batch.scheduling.Helpers.writer;

import java.io.Writer;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mozilla.grouperfish.base.Assert;
import com.mozilla.grouperfish.base.json.TsvJsonWriter;
import com.mozilla.grouperfish.batch.scheduling.Helpers;
import com.mozilla.grouperfish.model.Document;
import com.mozilla.grouperfish.model.Fail;
import com.mozilla.grouperfish.model.Task;
import com.mozilla.grouperfish.model.Type;
import com.mozilla.grouperfish.services.api.FileSystem;
import com.mozilla.grouperfish.services.api.Index;
import com.mozilla.grouperfish.services.api.IndexProvider;

/**
 * Stages a task's input on the filesystem: writes all documents matching
 * the task's query (as TSV of id + JSON source) plus the transform's
 * parameters file.
 */
public class FetchHandler implements TaskHandler {

    private static final Logger log = LoggerFactory.getLogger(FetchHandler.class);

    private final IndexProvider indexes;
    private final FileSystem fs;

    public FetchHandler(final FileSystem fs, final IndexProvider index) {
        this.fs = fs;
        this.indexes = index;
    }

    @Override
    public Task handle(final Task task) throws Fail {
        // Documents are looked up in the namespace's document bucket.
        Index index = indexes.index(task.namespace().bucket(Type.DOCUMENT));
        Assert.nonNull(task);
        try {
            // One TSV line per matching document.
            final TsvJsonWriter tsvWriter = new TsvJsonWriter(writer(fs, task, inputFilename(task)));
            for (final Document doc : index.find(task.query())) tsvWriter.write(doc);
            tsvWriter.close();

            // Transform parameters go into a separate JSON file.
            final Writer parametersWriter = writer(fs, task, parametersFilename(task));
            parametersWriter.write(task.transform().parametersJson());
            parametersWriter.close();
        }
        catch (final Exception e) {
            final String message = String.format(
                    "Failed writing doc to %s", Helpers.inputFilename(task));
            log.error("Exception", e);
            throw Fail.hard(task, message, e);
        }
        return task;
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/batch/handlers/PutHandler.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.batch.handlers;

import static com.mozilla.grouperfish.batch.scheduling.Helpers.resultsFilename;

import java.io.Reader;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mozilla.grouperfish.base.StreamTool;
import com.mozilla.grouperfish.model.Fail;
import com.mozilla.grouperfish.model.Task;
import com.mozilla.grouperfish.naming.Scope;
import com.mozilla.grouperfish.rest.jaxrs.ResultsResource;
import com.mozilla.grouperfish.services.api.FileSystem;
import com.mozilla.grouperfish.services.api.Grid;


/**
 * Put run results into results storage.
22 | */ 23 | public class PutHandler implements TaskHandler { 24 | 25 | private static final Logger log = LoggerFactory.getLogger(PutHandler.class); 26 | 27 | private final FileSystem fs; 28 | private final Grid grid; 29 | 30 | public PutHandler(final Grid grid, final FileSystem fs) { 31 | this.grid = grid; 32 | this.fs = fs; 33 | } 34 | 35 | @Override 36 | public Task handle(final Task task) throws Fail { 37 | 38 | if (!task.isOk()) { 39 | log.debug("Not putting result for failed task %s", task); 40 | } 41 | 42 | final String key = ResultsResource.key(task.transform().name(), task.query().name()); 43 | final Map results = new Scope(task.namespace(), grid).results(); 44 | 45 | try { 46 | final Reader reader = fs.reader(resultsFilename(task)); 47 | results.put(key, StreamTool.consume(reader)); 48 | } 49 | catch (final Exception e) { 50 | throw Fail.hard(task, "Could not read results from filesystem.", e); 51 | } 52 | return task; 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/handlers/RunHandler.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.handlers; 2 | 3 | import java.io.IOException; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import com.mozilla.grouperfish.base.Assert; 9 | import com.mozilla.grouperfish.base.StreamTool; 10 | import com.mozilla.grouperfish.batch.scheduling.Helpers; 11 | import com.mozilla.grouperfish.batch.transforms.Transform; 12 | import com.mozilla.grouperfish.batch.transforms.Transform.TransformResult; 13 | import com.mozilla.grouperfish.batch.transforms.TransformProvider; 14 | import com.mozilla.grouperfish.model.Fail; 15 | import com.mozilla.grouperfish.model.Task; 16 | import com.mozilla.grouperfish.model.TransformConfig; 17 | import com.mozilla.grouperfish.services.api.FileSystem; 18 | import 
com.mozilla.grouperfish.services.api.FileSystem.Denied; 19 | import com.mozilla.grouperfish.services.api.FileSystem.NotFound; 20 | 21 | 22 | /** Perform the actual running of the transform. */ 23 | public class RunHandler implements TaskHandler { 24 | 25 | private static final Logger log = LoggerFactory.getLogger(RunHandler.class); 26 | 27 | private final FileSystem fs; 28 | private final TransformProvider transforms; 29 | 30 | public RunHandler(final FileSystem fs, final TransformProvider transforms) { 31 | this.fs = fs; 32 | this.transforms = transforms; 33 | } 34 | 35 | @Override 36 | public Task handle(final Task task) throws Fail { 37 | final String inputDirectory; 38 | try { 39 | inputDirectory = fs.uri(Helpers.taskDirectory(task)); 40 | } 41 | catch (final NotFound e) { 42 | throw Fail.hard(task, "Task input not found...", e); 43 | } 44 | 45 | try { 46 | fs.makeDirectory(Helpers.outputDirectory(task)); 47 | } catch (final Denied e) { 48 | throw Fail.hard(task, "Cannot create output directory.", e); 49 | } 50 | 51 | final TransformConfig config = task.transform(); 52 | final Transform transform = transforms.get(config.transform()); 53 | Assert.nonNull(transform); 54 | log.info(String.format("Launching transform '%s' with input directory '%s'", transform, inputDirectory)); 55 | 56 | try { 57 | final TransformResult result = transform.run(task); 58 | if (result.success()) { 59 | log.info("Transform {} for task {} was run successfully.", transform, task); 60 | } 61 | else { 62 | final String message = String.format("Failed to run transform: %s (task %s)", transform, task); 63 | log.warn(message); 64 | log.warn("STDERR: {}", StreamTool.consume(result.stderr(), StreamTool.UTF8)); 65 | throw Fail.hard(task, message, null); 66 | } 67 | } 68 | catch (final InterruptedException e) { 69 | throw Fail.soft(task, "Interrupted during run.", e); 70 | } 71 | catch (final IOException e) { 72 | throw Fail.hard(task, "Received IO error reading from task STDERR", e); 73 | } 74 | 
75 | return task; 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/handlers/SequentialHandler.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.handlers; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import com.mozilla.grouperfish.model.Fail; 7 | import com.mozilla.grouperfish.model.Task; 8 | 9 | 10 | /** 11 | * Composite handler. 12 | * 13 | * Applies all sub-handlers synchronously, in order. 14 | * Can be helpful to simplify things for development/testing 15 | * (compared to pipelining). 16 | */ 17 | public class SequentialHandler implements TaskHandler { 18 | 19 | private static final Logger log = LoggerFactory.getLogger(SequentialHandler.class); 20 | 21 | private final TaskHandler[] handlers; 22 | 23 | public SequentialHandler(final TaskHandler... handlers) { 24 | this.handlers = handlers; 25 | } 26 | 27 | @Override 28 | public Task handle(Task task) throws Fail { 29 | for (final TaskHandler handler : handlers) { 30 | log.debug("Task {}: starting handler: {}", task, handler.getClass().getSimpleName()); 31 | task = handler.handle(task); 32 | } 33 | return task; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/handlers/TaskHandler.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.handlers; 2 | 3 | import com.mozilla.grouperfish.model.Fail; 4 | import com.mozilla.grouperfish.model.Task; 5 | 6 | public interface TaskHandler { 7 | 8 | /** 9 | * Carry out some processing on this task. 10 | * @return The same task, or some modified version with more information. 
11 | */ 12 | Task handle(Task task) throws Fail; 13 | 14 | } 15 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/scheduling/AbstractBatchService.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.scheduling; 2 | 3 | import java.util.Map; 4 | 5 | import com.mozilla.grouperfish.base.Assert; 6 | import com.mozilla.grouperfish.batch.api.BatchService; 7 | import com.mozilla.grouperfish.model.Type; 8 | import com.mozilla.grouperfish.model.Query; 9 | import com.mozilla.grouperfish.model.Task; 10 | import com.mozilla.grouperfish.model.TransformConfig; 11 | import com.mozilla.grouperfish.naming.Scope; 12 | import com.mozilla.grouperfish.services.api.Index; 13 | import com.mozilla.grouperfish.services.api.IndexProvider; 14 | 15 | abstract class AbstractBatchService implements BatchService { 16 | 17 | private final IndexProvider indexes; 18 | 19 | public AbstractBatchService(final IndexProvider indexes) { 20 | this.indexes = indexes; 21 | } 22 | 23 | /** Run the configured transform over the query results. */ 24 | public void schedule(final Scope ns, final Query query, final TransformConfig transform) { 25 | Assert.nonNull(query, transform); 26 | final Index index = indexes.index(ns.bucket(Type.DOCUMENT)); 27 | for (final Query concreteQuery : index.resolve(query)) { 28 | schedule(new Task(ns, concreteQuery, transform)); 29 | } 30 | } 31 | 32 | /** Run all configured transforms over the query results. */ 33 | public void schedule(final Scope ns, final Query query) { 34 | final Map transforms = ns.map(Type.CONFIGURATION_TRANSFORM); 35 | for (final Map.Entry item : transforms.entrySet()) { 36 | schedule(ns, query, new TransformConfig(item.getKey(), item.getValue())); 37 | } 38 | } 39 | 40 | /** Run all transforms configurations of this namespace over the results of all queries. 
*/ 41 | public void schedule(final Scope ns) { 42 | final Map queries = ns.queries(); 43 | final Map transforms = ns.map(Type.CONFIGURATION_TRANSFORM); 44 | for (final Map.Entry queryEntry : queries.entrySet()) { 45 | final Query query = new Query(queryEntry.getKey(), queryEntry.getValue()); 46 | for (final Map.Entry item : transforms.entrySet()) { 47 | schedule(ns, query, new TransformConfig(item.getKey(), item.getValue())); 48 | } 49 | } 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/scheduling/SingleQueueBatchService.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.scheduling; 2 | 3 | import java.util.concurrent.BlockingQueue; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import com.google.inject.Inject; 9 | import com.mozilla.grouperfish.batch.transforms.TransformProvider; 10 | import com.mozilla.grouperfish.model.Task; 11 | import com.mozilla.grouperfish.services.api.FileSystem; 12 | import com.mozilla.grouperfish.services.api.Grid; 13 | import com.mozilla.grouperfish.services.api.IndexProvider; 14 | 15 | /** 16 | * Run everything using one queue and a single worker. 17 | * Mostly useful to test the worker. 
18 | */ 19 | public class SingleQueueBatchService extends AbstractBatchService { 20 | 21 | private static final Logger log = LoggerFactory.getLogger(SingleQueueBatchService.class); 22 | 23 | private final Worker worker; 24 | private final BlockingQueue inQueue; 25 | private final BlockingQueue failQueue; 26 | 27 | @Override 28 | public void schedule(Task task) { 29 | inQueue.add(task); 30 | } 31 | 32 | @Inject 33 | public SingleQueueBatchService( 34 | final Grid grid, 35 | final IndexProvider indexes, 36 | final FileSystem fs, 37 | final TransformProvider transforms) { 38 | 39 | super(indexes); 40 | inQueue = grid.queue("grouperfish_in"); 41 | failQueue = grid.queue("grouperfish_fail"); 42 | 43 | worker = new Worker(failQueue, inQueue, null, Helpers.sequentialHandler(grid, fs, indexes, transforms)); 44 | 45 | log.info("Instantiated service: {}", getClass().getSimpleName()); 46 | } 47 | 48 | public void start() { 49 | worker.start(); 50 | } 51 | 52 | public void stop() { 53 | worker.cancel(); 54 | } 55 | 56 | 57 | } 58 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/scheduling/SynchronousBatchService.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.scheduling; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import com.google.inject.Inject; 7 | import com.mozilla.grouperfish.batch.handlers.TaskHandler; 8 | import com.mozilla.grouperfish.batch.transforms.TransformProvider; 9 | import com.mozilla.grouperfish.model.Fail; 10 | import com.mozilla.grouperfish.model.Task; 11 | import com.mozilla.grouperfish.services.api.FileSystem; 12 | import com.mozilla.grouperfish.services.api.Grid; 13 | import com.mozilla.grouperfish.services.api.IndexProvider; 14 | 15 | 16 | /** 17 | * Braindead fully synchronous "batch" service. 18 | * 19 | * It has no queue, no multithreading. 
20 | * It just executes everything right away, while you wait for results. 21 | * 22 | * Can be useful in testing/development. 23 | */ 24 | public class SynchronousBatchService extends AbstractBatchService { 25 | 26 | private static final Logger log = LoggerFactory.getLogger(SynchronousBatchService.class); 27 | 28 | private final TaskHandler handler; 29 | 30 | @Inject 31 | public SynchronousBatchService( 32 | final Grid grid, 33 | final IndexProvider indexes, 34 | final FileSystem fs, 35 | final TransformProvider transforms) { 36 | super(indexes); 37 | handler = Helpers.sequentialHandler(grid, fs, indexes, transforms); 38 | 39 | log.info("Instantiated service: {}", getClass().getSimpleName()); 40 | } 41 | 42 | @Override 43 | public void schedule(final Task task) { 44 | try { 45 | handler.handle(task); 46 | } 47 | catch (Fail e) { 48 | throw new RuntimeException(e); 49 | } 50 | } 51 | 52 | @Override 53 | public void start() { } 54 | 55 | @Override 56 | public void stop() { } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/scheduling/Worker.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.scheduling; 2 | 3 | import java.util.concurrent.BlockingQueue; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import com.mozilla.grouperfish.batch.handlers.TaskHandler; 9 | import com.mozilla.grouperfish.model.Fail; 10 | import com.mozilla.grouperfish.model.Task; 11 | 12 | class Worker extends Thread { 13 | 14 | private static Logger log = LoggerFactory.getLogger(Worker.class); 15 | private static final int NUM_TRIES = 3; 16 | 17 | private final BlockingQueue inQueue; 18 | private final BlockingQueue outQueue; 19 | private final BlockingQueue failQueue; 20 | private final TaskHandler handler; 21 | private final String name; 22 | 23 | public Worker(final BlockingQueue 
failQueue, 24 | final BlockingQueue inQueue, 25 | final BlockingQueue outQueue, 26 | final TaskHandler actor) { 27 | this.inQueue = inQueue; 28 | this.outQueue = outQueue; 29 | this.failQueue = failQueue; 30 | this.handler = actor; 31 | this.name = String.format("[Worker for %s]", actor.getClass().getSimpleName()); 32 | } 33 | 34 | public String toString() { 35 | return name; 36 | } 37 | 38 | public void run() { 39 | Task task = null; 40 | try { 41 | while (!Thread.currentThread().isInterrupted()) { 42 | task = inQueue.take(); 43 | try { 44 | // :TODO: NEXT: 45 | // If power fails, tasks can go MIA here. 46 | // We should maintain a global map of tasks, check it periodically, and restart tasks that went MIA. 47 | // Task update their status there, and clients could check the status using a GET /run/... call. 48 | task = handler.handle(task); 49 | } 50 | catch (final Fail e) { 51 | log.warn(String.format("%s %s: failed with message '%s'", name, task, e.getMessage())); 52 | if (task.failures().size() >= NUM_TRIES) { 53 | log.error(String.format("%s %s: Error details:", name, task), e); 54 | log.error(String.format("%s %s: Retries exhausted. 
Failing.", name, task)); 55 | failQueue.put(task); 56 | } 57 | else { 58 | log.warn(String.format("%s %s: recording failure & requeuing...", name, task)); 59 | inQueue.put(task.fail(e.getMessage())); 60 | } 61 | continue; 62 | } 63 | catch (final Exception e) { 64 | log.error(String.format("%s %s: Exception while handling.", name, task)); 65 | log.error(String.format("%s %s: Error details:", name, task), e); 66 | failQueue.put(task.fail(e.getMessage())); 67 | continue; 68 | } 69 | 70 | if (outQueue != null) outQueue.put(task); 71 | task = null; 72 | } 73 | } 74 | catch (InterruptedException ex) { 75 | Thread.currentThread().interrupt(); 76 | } 77 | } 78 | 79 | public void cancel() { 80 | interrupt(); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/transforms/HadoopTransform.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.transforms; 2 | 3 | import com.mozilla.grouperfish.batch.scheduling.Helpers; 4 | import com.mozilla.grouperfish.model.Task; 5 | import com.mozilla.grouperfish.services.api.FileSystem; 6 | import com.mozilla.grouperfish.services.api.FileSystem.FsError; 7 | 8 | 9 | /** Transform that relies on a distributed fs for processing. 
*/ 10 | public class HadoopTransform extends ExecutableTransform { 11 | 12 | public HadoopTransform(final String name, final FileSystem dfs) { 13 | super(name, dfs); 14 | } 15 | 16 | @Override 17 | protected String taskDirectoryUri(final Task task) throws FsError { 18 | return dataFs().uri(Helpers.taskDirectory(task)); 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/transforms/LocalTransform.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.transforms; 2 | 3 | import com.mozilla.grouperfish.base.Assert; 4 | import com.mozilla.grouperfish.batch.scheduling.Helpers; 5 | import com.mozilla.grouperfish.model.Fail; 6 | import com.mozilla.grouperfish.model.Task; 7 | import com.mozilla.grouperfish.services.api.FileSystem; 8 | import com.mozilla.grouperfish.services.api.FileSystem.FsError; 9 | 10 | 11 | /** 12 | * A transform can be implemented as a local executable that does not 13 | * know about hadoop or how to talk to HDFS, and instead uses a 14 | * (temporary) local work directory. 15 | * 16 | * Such transforms are made available through the LocalTransform 17 | * wrapper which will copy inputs from HDFS to the local file system, 18 | * and results back to HDFS. 19 | * 20 | * The actual executable will receive a local directory (as an absolute 21 | * path) instead of an HDFS uri. 22 | */ 23 | public class LocalTransform extends ExecutableTransform { 24 | 25 | private final FileSystem localFs; 26 | private final boolean needsToCopy; 27 | 28 | /** 29 | * A local transform in a distributed environment: 30 | * Task input data is copied from the dfs to the local fs before 31 | * running, and results are copied back afterwards. 32 | * 33 | * @param name The transform executable. It should take the location of the input data 34 | * as its single argument. 
35 | * @param dfs The distributed filesystem used by grouperfish (e.g. HDFS). 36 | * @param localFs The local filesystem where working directories for local processes can be created. 37 | */ 38 | public LocalTransform( 39 | final String name, 40 | final FileSystem dfs, 41 | final FileSystem localFs) { 42 | super(name, dfs); 43 | Assert.nonNull(localFs); 44 | this.localFs = localFs; 45 | this.needsToCopy = !dfs.equals(localFs); 46 | } 47 | 48 | @Override 49 | protected String taskDirectoryUri(final Task task) throws FsError { 50 | return localFs.uri(Helpers.taskDirectory(task)).substring("file://".length()); 51 | } 52 | 53 | @Override 54 | public TransformResult run(Task task) throws Fail, InterruptedException { 55 | if (needsToCopy) { 56 | try { 57 | Helpers.copy(Helpers.inputFilename(task), dataFs(), localFs); 58 | Helpers.copy(Helpers.parametersFilename(task), dataFs(), localFs); 59 | } 60 | catch (final Exception e) { 61 | throw Fail.hard(task, "Could not copy data to local fs.", e); 62 | } 63 | } 64 | 65 | final TransformResult result = super.run(task); 66 | 67 | if (needsToCopy) { 68 | try { 69 | Helpers.copy(Helpers.resultsFilename(task), localFs, dataFs()); 70 | } 71 | catch (final Exception e) { 72 | throw Fail.hard(task, "Could not copy results back to distributed fs.", e); 73 | } 74 | } 75 | 76 | return result; 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/transforms/Transform.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.transforms; 2 | 3 | import java.io.InputStream; 4 | 5 | import com.mozilla.grouperfish.model.Fail; 6 | import com.mozilla.grouperfish.model.Task; 7 | 8 | 9 | /** 10 | * Proxy to the real transform implementation (which can be a java class, a local executable, a RPC call...). 
11 | */ 12 | public interface Transform { 13 | 14 | public interface TransformResult { 15 | InputStream stderr(); 16 | boolean success(); 17 | } 18 | 19 | public TransformResult run(Task task) throws Fail, InterruptedException; 20 | 21 | } 22 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/transforms/TransformProvider.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.transforms; 2 | 3 | public interface TransformProvider { 4 | 5 | Transform get(String name); 6 | 7 | } 8 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/bootstrap/Grouperfish.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.bootstrap; 2 | 3 | import java.util.Properties; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.slf4j.bridge.SLF4JBridgeHandler; 8 | 9 | import com.google.inject.AbstractModule; 10 | import com.google.inject.Guice; 11 | import com.google.inject.Injector; 12 | import com.google.inject.Module; 13 | import com.google.inject.Provider; 14 | import com.mozilla.grouperfish.base.PropertiesTool; 15 | import com.mozilla.grouperfish.batch.api.guice.BatchSystem; 16 | import com.mozilla.grouperfish.rest.api.RestService; 17 | import com.mozilla.grouperfish.rest.jersey.JerseyGuiceRestService; 18 | import com.mozilla.grouperfish.rest.jersey.ResourceConfig; 19 | import com.mozilla.grouperfish.services.api.guice.Services; 20 | 21 | 22 | /** Entry class to set up the Grouperfish service. */ 23 | public class Grouperfish { 24 | 25 | public static final int DEFAULT_PORT = 0xF124; 26 | 27 | static final Logger log = LoggerFactory.getLogger(Grouperfish.class); 28 | 29 | /** 30 | * Starts the Grouperfish engine. 
31 | * REST resources will be autodiscovered by Jersey (JAX-RS). 32 | * 33 | * @param arguments not used 34 | * @throws Exception 35 | */ 36 | public static void main(final String[] arguments) throws Exception { 37 | final Properties properties = 38 | PropertiesTool.load(Grouperfish.class, "grouperfish.properties"); 39 | new Grouperfish( 40 | new Services(properties), 41 | new BatchSystem(), 42 | new AbstractModule() { 43 | @Override protected void configure() { 44 | bind(Properties.class).toProvider(new Provider() { 45 | @Override public Properties get() { return properties; } 46 | }).asEagerSingleton(); 47 | } 48 | } 49 | ); 50 | } 51 | 52 | public Grouperfish(final Module... modules) { 53 | SLF4JBridgeHandler.install(); 54 | final Injector injector = Guice.createInjector(modules); 55 | final RestService rest = new JerseyGuiceRestService(injector, ResourceConfig.class); 56 | rest.start(); 57 | log.info("Grouperfish started."); 58 | log.debug("Configured port: {}, default: {}", 59 | System.getProperty(JerseyGuiceRestService.PROPERTY_PORT), DEFAULT_PORT); 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Access.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | public interface Access { 4 | 5 | enum Operation {CREATE, READ, RUN, DELETE, LIST}; 6 | 7 | String origin(); 8 | 9 | Operation type(); 10 | 11 | } 12 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Document.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import java.util.Map; 4 | 5 | import com.mozilla.grouperfish.base.Assert; 6 | 7 | 8 | /** Simple multi-field text document. Each document has at least id and (full) text. 
*/ 9 | public class Document extends NamedSource { 10 | 11 | public Document(final String id, final String source) { 12 | super(id, source); 13 | } 14 | 15 | public Document(final String id, final Map fields) { 16 | super(id, fields); 17 | } 18 | 19 | public Document(final Map fields) { 20 | super(String.valueOf(fields.get("id")), fields); 21 | Assert.nonNull(fields.get("id")); 22 | } 23 | 24 | /** 25 | * For documents this is the same as name. 26 | */ 27 | public String id() { 28 | return name(); 29 | } 30 | 31 | private static final long serialVersionUID = 0; 32 | 33 | } 34 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Fail.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | 4 | /** 5 | * Describes how a task failed. Generated by handlers. 6 | * This information should be made available somehow under the run resource. 7 | */ 8 | public abstract class Fail extends Exception { 9 | 10 | private static final long serialVersionUID = 0; 11 | 12 | private final Task task; 13 | 14 | public static Fail hard(final Task task, final String message, final Throwable maybeCause) { 15 | if (maybeCause == null) return new HardFail(task, message); 16 | return new HardFail(task, message, maybeCause); 17 | } 18 | 19 | public static Fail soft(final Task task, final String message, final Throwable maybeCause) { 20 | if (maybeCause == null) return new SoftFail(task, message); 21 | return new SoftFail(task, message, maybeCause); 22 | } 23 | 24 | public Fail(final Task task, final String message) { 25 | super(String.format("Task %s failed. 
%s", task, message)); 26 | this.task = task; 27 | } 28 | 29 | public Fail(final Task task, final String message, final Throwable cause) { 30 | super(message, cause); 31 | this.task = task; 32 | } 33 | 34 | public Task task() { 35 | return task; 36 | } 37 | 38 | /** 39 | * Handlers can throw a hard failure if they are fairly certain that 40 | * retrying will not help. 41 | */ 42 | public static final class HardFail extends Fail { 43 | HardFail(final Task task, final String message, final Throwable cause) { 44 | super(task, message, cause); 45 | } 46 | 47 | HardFail(final Task task, final String message) { 48 | super(task, message); 49 | } 50 | 51 | private static final long serialVersionUID = 1L; 52 | } 53 | 54 | 55 | /** 56 | * Handlers can throw a soft failure if they think that 57 | * retrying might help, e.g. if they were interrupted during execution. 58 | */ 59 | public static final class SoftFail extends Fail { 60 | SoftFail(final Task task, final String message, final Throwable cause) { 61 | super(task, message, cause); 62 | } 63 | 64 | SoftFail(final Task task, final String message) { 65 | super(task, message); 66 | } 67 | 68 | private static final long serialVersionUID = 1L; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/NamedSource.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import static com.mozilla.grouperfish.base.ImmutableTools.immutable; 4 | 5 | import java.io.Serializable; 6 | import java.util.Map; 7 | 8 | import org.json.simple.JSONObject; 9 | import org.json.simple.parser.JSONParser; 10 | 11 | import com.mozilla.grouperfish.base.Assert; 12 | 13 | 14 | public abstract class NamedSource implements Serializable { 15 | 16 | private final String name; 17 | private final String source; 18 | private transient Map fields; 19 | 20 | NamedSource(final String 
name, final String source) { 21 | Assert.nonNull(name, source); 22 | Assert.check(!name.isEmpty(), !source.isEmpty()); 23 | this.name = name; 24 | this.source = source; 25 | } 26 | 27 | /** @param fields Must be directly mappable to a JSONObject. 28 | * That means, a java.util.Map with string keys and mappable values. 29 | * http://code.google.com/p/json-simple/wiki/MappingBetweenJSONAndJavaEntities 30 | */ 31 | NamedSource(final String name, final Map fields) { 32 | Assert.nonNull(name, fields); 33 | Assert.check(!name.isEmpty()); 34 | this.name = name; 35 | this.fields = fields; 36 | this.source = JSONObject.toJSONString(fields); 37 | } 38 | 39 | public String toString() { 40 | return String.format("[%s %s, source.length=%s]", getClass().getSimpleName(), name(), source().length()); 41 | } 42 | 43 | 44 | public String name() { 45 | return name; 46 | } 47 | 48 | public String source() { 49 | return source; 50 | } 51 | 52 | @SuppressWarnings("unchecked") 53 | public Map fields() { 54 | if (fields != null) return fields; 55 | try { 56 | fields = immutable((Map) new JSONParser().parse(source())); 57 | } catch (Exception e) { 58 | String message = String.format("Failed to parse source for %s with id='%s'", 59 | getClass().getSimpleName(), name); 60 | Assert.unreachable(message, e); 61 | } 62 | Assert.check(fields instanceof Map); 63 | return fields; 64 | } 65 | 66 | private static final long serialVersionUID = 0; 67 | 68 | } 69 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Query.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import com.mozilla.grouperfish.base.Assert; 4 | 5 | 6 | /** Simple query+name wrapper. 
*/ 7 | public class Query extends NamedSource { 8 | 9 | public Query(final String name, final String json) { 10 | super(name, json); 11 | Assert.check(!name.isEmpty(), !json.isEmpty()); 12 | } 13 | 14 | public boolean isTemplate() { 15 | // :TODO: NEXT: 16 | // Implement templates 17 | return false; 18 | } 19 | 20 | private static final long serialVersionUID = 0; 21 | 22 | } 23 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Task.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | 6 | import org.joda.time.Instant; 7 | 8 | import com.google.common.collect.ImmutableList; 9 | import com.mozilla.grouperfish.base.Assert; 10 | import com.mozilla.grouperfish.naming.Namespace; 11 | 12 | 13 | /** Immutable task description. */ 14 | public class Task implements Serializable { 15 | 16 | // We do not want to serialize the namespace Object itself, but rather its name. 
17 | private final String namespace; 18 | private final Query query; 19 | private final TransformConfig transform; 20 | private final Instant created; 21 | private final List failures; 22 | 23 | public Task(final Namespace ns, final Query query, final TransformConfig transform) { 24 | Assert.nonNull(ns, query, transform); 25 | this.namespace = ns.raw(); 26 | this.query = query; 27 | this.transform = transform; 28 | created = Instant.now(); 29 | failures = ImmutableList.of(); 30 | } 31 | 32 | private Task(final Task task, final String failure) { 33 | Assert.nonNull(task, failure); 34 | Assert.check(!failure.isEmpty()); 35 | this.namespace = task.namespace; 36 | this.query = task.query; 37 | this.transform = task.transform; 38 | this.created = task.created(); 39 | this.failures = new ImmutableList.Builder().addAll(task.failures).add(failure).build(); 40 | } 41 | 42 | public boolean isOk() { 43 | return failures.isEmpty(); 44 | } 45 | 46 | public Namespace namespace() { 47 | return new Namespace(namespace); 48 | } 49 | 50 | public Query query() { 51 | return query; 52 | } 53 | 54 | public String toString() { 55 | final String faildesc = (failures.size() == 0) ? 
"" : String.format(" (%s failed attempts)", failures.size()); 56 | return String.format("[Task @%s, T:%s, Q:%s%s]", created(), transform.name(), query.name(), faildesc); 57 | } 58 | 59 | public TransformConfig transform() { 60 | return transform; 61 | } 62 | 63 | public Task fail(final String failureMessage) { 64 | return new Task(this, failureMessage); 65 | } 66 | 67 | public List failures() { 68 | return failures; 69 | } 70 | 71 | public Instant created() { 72 | return created; 73 | } 74 | 75 | private static final long serialVersionUID = 0; 76 | 77 | } 78 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/TransformConfig.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import java.util.Map; 4 | 5 | import org.json.simple.JSONObject; 6 | 7 | import com.mozilla.grouperfish.base.Assert; 8 | 9 | 10 | /** Simple config+name wrapper. 
*/ 11 | public class TransformConfig extends NamedSource { 12 | 13 | public TransformConfig(final String name, final String source) { 14 | super(name, source); 15 | Assert.nonNull(name, source); 16 | } 17 | 18 | private static final long serialVersionUID = 0; 19 | 20 | @SuppressWarnings("rawtypes") 21 | public String parametersJson() { 22 | return JSONObject.toJSONString((Map) fields().get("parameters")); 23 | } 24 | 25 | public String transform() { 26 | return (String) fields().get("transform"); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Type.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | public enum Type { 4 | DOCUMENT, 5 | QUERY, 6 | CONFIGURATION_FILTER, 7 | CONFIGURATION_TRANSFORM, 8 | RESULT 9 | } -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/naming/Namespace.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.naming; 2 | 3 | import java.util.EnumMap; 4 | 5 | import com.mozilla.grouperfish.base.Assert; 6 | import com.mozilla.grouperfish.model.Type; 7 | 8 | 9 | /** 10 | * Scopes resource access to a namespace. 
11 | */ 12 | public class Namespace { 13 | 14 | protected final String namespace; 15 | 16 | public Namespace(final String namespace) { 17 | if (namespace.indexOf('.') != -1) { 18 | IllegalStateException e =new IllegalStateException("Illegal namespace: " + namespace); 19 | e.printStackTrace(); 20 | throw e; 21 | } 22 | this.namespace = namespace; 23 | } 24 | 25 | @SuppressWarnings("serial") 26 | private static final EnumMap prefixes = new EnumMap(Type.class) {{ 27 | for (Type t : Type.values()) { 28 | switch (t) { 29 | case DOCUMENT: put(t, "documents_"); break; 30 | case QUERY: put(t, "queries_"); break; 31 | case CONFIGURATION_FILTER: put(t, "configurations_filters_"); break; 32 | case CONFIGURATION_TRANSFORM: put(t, "configurations_transforms_"); break; 33 | case RESULT: put(t, "results_"); break; 34 | default: Assert.unreachable(); 35 | } 36 | } 37 | }}; 38 | 39 | /** Buckets are used to name maps on the grid and indexes. */ 40 | public final String bucket(final Type type) { 41 | return prefixes.get(type) + namespace; 42 | } 43 | 44 | public String raw() { 45 | return namespace; 46 | } 47 | 48 | public String toString() { 49 | return String.format("[Namespace %s]", raw()); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/naming/Scope.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.naming; 2 | 3 | import java.util.Map; 4 | 5 | import com.mozilla.grouperfish.base.Assert; 6 | import com.mozilla.grouperfish.base.json.JsonValidator; 7 | import com.mozilla.grouperfish.model.Access; 8 | import com.mozilla.grouperfish.model.Type; 9 | import com.mozilla.grouperfish.rest.jaxrs.DocumentsResource; 10 | import com.mozilla.grouperfish.rest.jaxrs.QueriesResource; 11 | import com.mozilla.grouperfish.rest.jaxrs.ResultsResource; 12 | import 
com.mozilla.grouperfish.rest.jaxrs.ConfigurationsResource.FilterConfigsResource;
import com.mozilla.grouperfish.rest.jaxrs.ConfigurationsResource.TransformConfigsResource;
import com.mozilla.grouperfish.services.api.Grid;


/**
 * Helps to consistently associate resource access to a namespace.
 * Gatekeeper for each access, allows to implement permissions
 * (in {@link #allows(Class, Access)}).
 */
public class Scope extends Namespace {

    private final Grid grid;

    // NOTE(review): 512 MiB seems very generous for a single document --
    // confirm this limit is intentional.
    private final int maxDocumentLength = 512 * 1024 * 1024;

    public Scope(final String namespace, final Grid grid) {
        super(namespace);
        this.grid = grid;
    }

    /** Re-scopes an existing namespace onto the given grid. */
    public Scope(final Namespace ns, final Grid grid) {
        super(ns.raw());
        this.grid = grid;
    }

    /** @return The grid map holding this namespace's documents. */
    public Map documents() {
        return grid.map(bucket(Type.DOCUMENT));
    }

    /** @return The grid map holding this namespace's queries. */
    public Map queries() {
        return grid.map(bucket(Type.QUERY));
    }

    /** @return The grid map holding this namespace's transform results. */
    public Map results() {
        return grid.map(bucket(Type.RESULT));
    }

    /** @return The grid map for the given resource type. */
    public Map map(final Type type) {
        Assert.nonNull(type);
        return grid.map(bucket(type));
    }

    /**
     * Maps a JAX-RS resource class to the grid map backing it.
     * Unreachable-asserts for resource types not listed here.
     */
    public Map resourceMap(final Class resourceType) {
        Assert.nonNull(resourceType);
        if (resourceType == ResultsResource.class) return results();
        if (resourceType == DocumentsResource.class) return documents();
        if (resourceType == QueriesResource.class) return queries();
        if (resourceType == TransformConfigsResource.class) return map(Type.CONFIGURATION_TRANSFORM);
        if (resourceType == FilterConfigsResource.class) return map(Type.CONFIGURATION_FILTER);
        Assert.unreachable("Unhandled resource type: %s", resourceType.getName());
        return null;
    }

    /** Maximum payload size in bytes (currently the same for all resources). */
    public int maxLength(final Class resourceType, final Access access) {
        return maxDocumentLength;
    }

    /** Permission hook: currently everything is allowed. */
    public boolean allows(final Class resourceType, final Access access) {
        return true;
    }

    /** Validator hook: currently plain JSON well-formedness only. */
    public JsonValidator validator(final Class resourceType) {
        return new JsonValidator();
    }

    public String toString() {
        return String.format("[Scope %s]", raw());
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/api/RestService.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.api;


public interface RestService {

    /** Starts the service and returns a handle to wait on. */
    public Daemon start();

    public static interface Daemon {
        /** Wait for shutdown of the daemon. Intercept the interrupt to clean up your resources. */
        void join() throws InterruptedException;
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jaxrs/DocumentsResource.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jaxrs;

import java.io.IOException;

import javax.servlet.http.HttpServletRequest;
import javax.ws.rs.Consumes;
import javax.ws.rs.DELETE;
import javax.ws.rs.GET;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

import com.google.inject.Inject;
import com.mozilla.grouperfish.services.api.Grid;


/** REST resource for individual documents within a namespace. */
@Path("/documents/{namespace}/{id}")
public class DocumentsResource extends ResourceBase {

    @Inject
    public DocumentsResource(final Grid grid) { super(grid); }

    @GET
    @Produces(MediaType.APPLICATION_JSON)
    public Response getDocument(@PathParam("namespace") String namespace,
                                @PathParam("id") String id,
                                @Context HttpServletRequest request) {
        return RestHelper.getAny(getClass(), scope(namespace), id, request);
    }

    @PUT
    @Consumes(MediaType.APPLICATION_JSON)
    public Response putDocument(@PathParam("namespace") String namespace,
                                @PathParam("id") String id,
                                @Context HttpServletRequest request) throws IOException {
        return RestHelper.putAny(getClass(), scope(namespace), id, request);
    }

    @DELETE
    public Response deleteDocument(@PathParam("namespace") String namespace,
                                   @PathParam("id") String id,
                                   @Context HttpServletRequest request) throws IOException {
        return RestHelper.deleteAny(getClass(), scope(namespace), id, request);
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jaxrs/HttpAccess.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jaxrs;

import java.util.HashMap;
import java.util.Map;

import javax.servlet.http.HttpServletRequest;

import com.mozilla.grouperfish.base.Assert;
import com.mozilla.grouperfish.model.Access;


/** Adapts an incoming HTTP servlet request to the {@link Access} abstraction. */
public class HttpAccess implements Access {

    private final Access.Operation type;
    private final HttpServletRequest request;

    // Default mapping from HTTP method to operation. Methods not listed here
    // (e.g. HEAD, OPTIONS) map to null, which the constructor rejects via
    // Assert.nonNull.
    @SuppressWarnings("serial")
    private static final Map defaultType = new HashMap() {{
        put("PUT", Access.Operation.CREATE);
        put("GET", Access.Operation.READ);
        put("POST", Access.Operation.RUN);
        put("DELETE", Access.Operation.DELETE);
    }};

    /** Derives the operation from the request's HTTP method. */
    public HttpAccess(final HttpServletRequest request) {
        this(defaultType.get(request.getMethod()), request);
    }

    public HttpAccess(final Operation type,
                      final HttpServletRequest request) {
        Assert.nonNull(type);
        this.type = type;
        this.request = request;
    }

    /** @return The remote host the request originated from. */
    @Override
    public String origin() {
        return request.getRemoteHost();
    }

    @Override
    public Operation type() {
        return type;
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jaxrs/QueriesResource.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jaxrs;

import java.io.IOException;

import javax.servlet.http.HttpServletRequest;
import javax.ws.rs.Consumes;
import javax.ws.rs.DELETE;
import javax.ws.rs.GET;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

import com.google.inject.Inject;
import com.mozilla.grouperfish.services.api.Grid;


/** REST resource for (saved) queries within a namespace. */
@Path("/queries/{namespace}")
public class QueriesResource extends ResourceBase {

    @Inject
    public QueriesResource(final Grid grid) { super(grid); }

    /** Lists all queries of the namespace. */
    @GET
    @Produces(MediaType.APPLICATION_JSON)
    public Response list(@PathParam("namespace") String namespace,
                         @Context HttpServletRequest request) {
        return RestHelper.listAny(getClass(), scope(namespace), request);
    }

    @GET
    @Path("/{queryName}")
    @Produces(MediaType.APPLICATION_JSON)
    public Response getQuery(@PathParam("namespace") String namespace,
                             @PathParam("queryName") String queryName,
                             @Context HttpServletRequest request) {
        return RestHelper.getAny(getClass(), scope(namespace), queryName, request);
    }

    @PUT
    @Path("/{queryName}")
    @Consumes(MediaType.APPLICATION_JSON)
    public Response putQuery(@PathParam("namespace") String namespace,
                             @PathParam("queryName") String queryName,
                             @Context HttpServletRequest request) throws IOException {
        return RestHelper.putAny(getClass(),
                scope(namespace), queryName, request);
    }


    @DELETE
    @Path("/{queryName}")
    public Response deleteQuery(@PathParam("namespace") String namespace,
                                @PathParam("queryName") String queryName,
                                @Context HttpServletRequest request) throws IOException {
        return RestHelper.deleteAny(getClass(), scope(namespace), queryName, request);
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jaxrs/ResourceBase.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jaxrs;


import com.mozilla.grouperfish.naming.Scope;
import com.mozilla.grouperfish.services.api.Grid;

/** Common base for JAX-RS resources: binds requests to a namespace scope. */
public class ResourceBase {

    private final Grid grid;

    public ResourceBase(final Grid grid) {
        this.grid = grid;
    }

    /** @return A scope for the given namespace, backed by this resource's grid. */
    protected Scope scope(final String namespace) {
        return new Scope(namespace, grid);
    }


}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jaxrs/ResultsResource.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jaxrs;

import java.io.IOException;

import javax.servlet.http.HttpServletRequest;
import javax.ws.rs.Consumes;
import javax.ws.rs.DELETE;
import javax.ws.rs.GET;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

import com.google.inject.Inject;
import com.mozilla.grouperfish.services.api.Grid;


//:TODO: v0.1
// Integrate facet query parameters
/** REST resource for transform results, keyed by (transform, query). */
@Path("/results/{namespace}/{transform}/{query}")
public class ResultsResource extends ResourceBase {

    @Inject
    public ResultsResource(final Grid grid) { super(grid); }

    @GET
    @Produces(MediaType.APPLICATION_JSON)
    public Response getResult(@PathParam("namespace") String namespace,
                              @PathParam("transform") String transformName,
                              @PathParam("query") String queryName,
                              @Context HttpServletRequest request) {
        return RestHelper.getAny(getClass(), scope(namespace), key(transformName, queryName), request);
    }

    @PUT
    @Consumes(MediaType.APPLICATION_JSON)
    public Response putResult(@PathParam("namespace") String namespace,
                              @PathParam("transform") String transformName,
                              @PathParam("query") String queryName,
                              @Context HttpServletRequest request) throws IOException {
        return RestHelper.putAny(getClass(), scope(namespace), key(transformName, queryName), request);
    }

    @DELETE
    public Response deleteResult(@PathParam("namespace") String namespace,
                                 @PathParam("transform") String transformName,
                                 @PathParam("query") String queryName,
                                 @Context HttpServletRequest request) throws IOException {
        return RestHelper.deleteAny(getClass(), scope(namespace), key(transformName, queryName), request);
    }

    /** Storage key for a result: transform name and query name, joined by '_'. */
    public static String key(final String transformName, final String queryName) {
        return String.format("%s_%s", transformName, queryName);
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jersey/JerseyGuiceRestService.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jersey;

import java.util.HashMap;
import java.util.Map;

import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.DefaultServlet;
import 
org.eclipse.jetty.servlet.ServletContextHandler;

import com.google.inject.Injector;
import com.google.inject.servlet.GuiceFilter;
import com.google.inject.servlet.GuiceServletContextListener;
import com.mozilla.grouperfish.base.Assert;
import com.mozilla.grouperfish.bootstrap.Grouperfish;
import com.mozilla.grouperfish.rest.api.RestService;
import com.sun.jersey.api.core.PackagesResourceConfig;
import com.sun.jersey.api.core.ResourceConfig;
import com.sun.jersey.guice.JerseyServletModule;
import com.sun.jersey.guice.spi.container.servlet.GuiceContainer;


/** Jetty-hosted REST service wiring Jersey (JAX-RS) into Guice. */
public class JerseyGuiceRestService implements RestService {

    public static final String PROPERTY_PORT = "grouperfish.rest.port";
    public static final String PROPERTY_PORT_DEFAULT = String.valueOf(Grouperfish.DEFAULT_PORT);

    private final Server server;

    /**
     * Initializes a Jersey based JAX-RS service using the given resource configuration.
     * The provided configuration class must not be anonymous.
     */
    public JerseyGuiceRestService(final Injector parentInjector,
                                  final Class resourceConfigClass) {

        Assert.nonNull(parentInjector);
        // Anonymous classes have no canonical name, hence these checks.
        Assert.nonNull(resourceConfigClass, resourceConfigClass.getCanonicalName());
        Assert.check(!resourceConfigClass.getCanonicalName().isEmpty());

        final int port = Integer.parseInt(System.getProperty(PROPERTY_PORT, PROPERTY_PORT_DEFAULT));

        server = new Server(port);
        final ServletContextHandler root =
                new ServletContextHandler(server, "/", ServletContextHandler.NO_SESSIONS);

        root.addEventListener(new GuiceServletContextListener() {

            @Override
            protected Injector getInjector() {
                return parentInjector.createChildInjector(new JerseyServletModule() {
                    protected void configureServlets() {
                        final Map params = new HashMap();
                        // NOTE(review): PROPERTY_PACKAGES is set to "jetty",
                        // which does not look like a resource package name --
                        // presumably the actual packages come from the
                        // resourceConfigClass below. Confirm this is intended.
                        params.put(PackagesResourceConfig.PROPERTY_PACKAGES,
                                "jetty");
                        params.put("com.sun.jersey.config.property.resourceConfigClass",
                                resourceConfigClass.getCanonicalName());
                        serve("/*").with(GuiceContainer.class, params);
                    }
                });
            }
        });

        root.addFilter(GuiceFilter.class, "/*", null);
        root.addServlet(DefaultServlet.class, "/");

        // Do not advertise server/version details; shut down with the JVM.
        server.setSendServerVersion(false);
        server.setSendDateHeader(false);
        server.setStopAtShutdown(true);

    }

    /** Starts the embedded Jetty server and returns a join-able handle. */
    @Override
    public Daemon start() {
        try {
            server.start();
        }
        catch(final Exception e) {
            throw new RuntimeException(e);
        }

        return new Daemon() {
            @Override
            public void join() throws InterruptedException {
                server.join();
            }
        };
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jersey/ResourceConfig.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jersey;

import 
com.sun.jersey.api.core.PackagesResourceConfig;

/** Tells Jersey which package to scan for JAX-RS resource classes. */
public class ResourceConfig extends PackagesResourceConfig {

    public ResourceConfig() {
        super("com.mozilla.grouperfish.rest.jaxrs");
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/FileSystem.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api;

import java.io.Reader;
import java.io.Writer;


/**
 * A (virtual) file system root with the essential primitives for
 * Grouperfish batch operation:
 * - create/remove directories
 * - read/write textual data (tsv, json)
 * - obtain global uris that can be passed to external components.
 *
 * All operations except {@link #uri(String)} work with relative
 * (virtual) paths that are only meaningful to this filesystem
 * instance (in-memory fs, temp fs) or instances created with the same
 * parameters (local fs, distributed fs).
 */
public interface FileSystem {

    /** Result: The absolute path that was removed. */
    String removeRecursively(String relativePath) throws Denied, NotFound;

    /**
     * Creates the given directory if it does not exist already.
     * Fails if path exists but is not a directory.
     * @return The uri of the directory that was created.
     */
    String makeDirectory(String relativePath) throws Denied;

    /**
     * Opens a file for writing (creates the file if not present).
     * @return A suitable writer for string data.
     */
    Writer writer(String path) throws Denied;

    /**
     * Opens a file for reading.
     * @param path The filesystem local path.
     * @return A suitable reader for string data.
     */
    Reader reader(String path) throws Denied, NotFound;

    /**
     * Generate a url that can be used to reference this relative path externally.
     * Ensures that the referee actually exists (at least currently).
     */
    String uri(String path) throws NotFound;

    /** Base class of all filesystem errors. */
    public static class FsError extends Exception {
        public FsError(final String message) { super(message); }
        public FsError(final String message, final Exception reason) { super(message, reason); }
        private static final long serialVersionUID = 1L;
    };

    /** Access was denied by the underlying filesystem. */
    public static class Denied extends FsError {
        public Denied(final String more) { super("Denied: " + more); }
        public Denied(final String more, final Exception reason) { super("Denied: " + more, reason); }
        private static final long serialVersionUID = 1L;
    };

    /** The given path does not exist. */
    public static class NotFound extends FsError {
        public NotFound(final String uri) { super("Not found: " + uri); }
        public NotFound(final String uri, final Exception reason) { super("Not found: " + uri, reason); }
        private static final long serialVersionUID = 1L;
    };

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/Grid.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api;

import java.util.Map;
import java.util.concurrent.BlockingQueue;

/** Named shared maps and queues (e.g. backed by Hazelcast, or in-memory mocks). */
public interface Grid {

    Map map(String name);

    BlockingQueue queue(String name);

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/Index.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api;

import com.mozilla.grouperfish.model.Document;
import 
com.mozilla.grouperfish.model.Query;


/** Full text index over documents and stored queries. */
public interface Index {

    /** @return The documents matching the given query. */
    Iterable find(Query query);

    /** @return The stored queries matching the given query. */
    Iterable resolve(Query query);

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/IndexProvider.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api;

/** Factory for named indexes. */
public interface IndexProvider {

    Index index(String name);

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/guice/Local.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api.guice;

import static java.lang.annotation.ElementType.PARAMETER;
import static java.lang.annotation.RetentionPolicy.RUNTIME;

import java.lang.annotation.Retention;
import java.lang.annotation.Target;

import com.google.inject.BindingAnnotation;

/**
 * When a local resource is requested as a parameter, the client
 * will not try to share it. Examples: The local FileSystem, a
 * local memory grid instead of a HazelCast grid.
 *
 * Whether injectors can still pass shared resources depends on
 * the resource: A local FileSystem needs to be local, because
 * other local processes need to read from it. A 'local' grid
 * might simply be an optimization over a shared grid.
 */
@BindingAnnotation @Target(PARAMETER) @Retention(RUNTIME)
public @interface Local { }
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/guice/Shared.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api.guice;

import static java.lang.annotation.ElementType.PARAMETER;
import static java.lang.annotation.RetentionPolicy.RUNTIME;

import java.lang.annotation.Retention;
import java.lang.annotation.Target;

import com.google.inject.BindingAnnotation;


/**
 * When a shared resource is requested as a parameter, the client
 * intends to share the service with other cluster members. Modules
 * should provide appropriate services (Hadoop FS, HazelCast map...).
 *
 * Shared is the default.
 *
 * Injection might still pass a local resource, but should do so
 * only if the modus operandi is guaranteed to be standalone: Here
 * local/shared makes no difference. Examples for this are
 * testing/development setups.
 */
@BindingAnnotation @Target(PARAMETER) @Retention(RUNTIME)
public @interface Shared { }
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/elasticsearch/ElasticSearchIndexProvider.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.elasticsearch;

import java.util.Properties;

import org.elasticsearch.client.Client;
import org.elasticsearch.node.Node;
import org.elasticsearch.node.NodeBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mozilla.grouperfish.services.api.Index;
import com.mozilla.grouperfish.services.api.IndexProvider;

/** Provides ElasticSearch-backed indexes by joining the cluster as a client-only node. */
public class ElasticSearchIndexProvider implements IndexProvider {

    private static final Logger log = LoggerFactory.getLogger(ElasticSearchIndexProvider.class);

    public static final String PROPERTY_CLUSTER = "grouperfish.services.elasticsearch.cluster";
    public static final String PROPERTY_CLUSTER_DEFAULT = "grouperfish";

    public static final String PROPERTY_TYPE = "grouperfish.services.elasticsearch.type";
    // :TODO: Hack... to simplify, we should use 1 index for all HC maps, and differentiate solely using type.
    public static final String PROPERTY_TYPE_DEFAULT = "documents";

    private final String type;
    private final Client client;

    // NOTE(review): the passed-in Properties are never read; configuration
    // comes from System properties instead. Confirm whether that is intended.
    public ElasticSearchIndexProvider(final Properties properties) {
        type = System.getProperty(PROPERTY_TYPE, PROPERTY_TYPE_DEFAULT);
        final String clusterName = System.getProperty(PROPERTY_CLUSTER, PROPERTY_CLUSTER_DEFAULT);
        // client(true)/data(false): this node holds no data, it only routes
        // requests into the cluster.
        final Node node = NodeBuilder.nodeBuilder().loadConfigSettings(false).client(true).data(false).clusterName(clusterName).build();
        node.start();
        client = node.client();

        log.info(String.format("Instantiated index provider: %s (cluster.name=%s)",
                getClass().getSimpleName(), clusterName));
    }

    @Override
    public Index index(final String name) {
        return new ElasticSearchIndex(client, name, type);
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/hazelcast/HazelcastGrid.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.hazelcast;

import java.util.Map;
import java.util.concurrent.BlockingQueue;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hazelcast.config.Config;
import com.hazelcast.config.MapConfig;
import com.hazelcast.core.Hazelcast;
import com.mozilla.grouperfish.services.api.Grid;


/** Hazelcast-backed implementation of the {@link Grid} service. */
public class HazelcastGrid implements Grid {

    private static final Logger log = LoggerFactory.getLogger(HazelcastGrid.class);

    public HazelcastGrid() {
        // Initialize some of Hazelcast now rather than waiting for the first request
        Hazelcast.getDefaultInstance();
        final Config config = Hazelcast.getConfig();
        final StringBuilder sb = new StringBuilder();
        for (final Map.Entry entry : config.getMapConfigs().entrySet()) {
sb.append(entry.getKey()).append(", "); 26 | } 27 | final int numMembers = Hazelcast.getCluster().getMembers().size(); 28 | 29 | // Force initialization of index. 30 | // :TODO: make less hacky... 31 | log.info("Initializing HC ES node..."); 32 | Hazelcast.getMap("documents_grouperfish").get("unused"); 33 | 34 | log.info(String.format("Instantiated service: %s (maps=%smembers=%s)", 35 | getClass().getSimpleName(), sb.toString(), numMembers)); 36 | } 37 | 38 | @Override 39 | public Map map(final String name) { 40 | return Hazelcast.getMap(name); 41 | } 42 | 43 | @Override 44 | public BlockingQueue queue(final String name) { 45 | return Hazelcast.getQueue(name); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/services/mock/MockFs.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.services.mock; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.ByteArrayOutputStream; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.io.OutputStreamWriter; 8 | import java.io.Reader; 9 | import java.io.Writer; 10 | import java.util.ArrayList; 11 | import java.util.Hashtable; 12 | import java.util.List; 13 | import java.util.Map; 14 | 15 | import com.mozilla.grouperfish.base.ArrayTool; 16 | import com.mozilla.grouperfish.base.Assert; 17 | import com.mozilla.grouperfish.services.api.FileSystem; 18 | 19 | 20 | public class MockFs implements FileSystem { 21 | 22 | private final String root; 23 | 24 | private final Map files = new Hashtable(); 25 | 26 | public MockFs(final String root) { 27 | this.root = root; 28 | } 29 | 30 | @Override 31 | public synchronized String removeRecursively(final String relativePath) throws Denied, NotFound { 32 | Assert.nonNull(relativePath); 33 | Assert.check(!relativePath.isEmpty()); 34 | 35 | final List toRemove = new ArrayList(); 36 | 
for (final String key : files.keySet()) { 37 | if (!key.startsWith(relativePath)) continue; 38 | final String rest = key.substring(key.length()); 39 | if (rest.startsWith("/") || relativePath.endsWith("/") || rest.isEmpty()) { 40 | toRemove.add(key); 41 | } 42 | } 43 | for (final String key : toRemove) files.remove(key); 44 | return uncheckedUri(relativePath); 45 | } 46 | 47 | @Override 48 | public synchronized String makeDirectory(final String relativePath) throws Denied { 49 | Assert.nonNull(relativePath); 50 | Assert.check(!relativePath.isEmpty()); 51 | if (files.containsKey(relativePath)) throw new Denied("used as file: " + uncheckedUri(relativePath)); 52 | return uncheckedUri(relativePath); 53 | } 54 | 55 | @Override 56 | public synchronized Writer writer(final String path) throws Denied { 57 | return new OutputStreamWriter(new ByteArrayOutputStream() { 58 | @Override 59 | public void close() throws IOException { 60 | if (files.containsKey(path)) { 61 | files.put(path, ArrayTool.concat(files.get(path), toByteArray())); 62 | } 63 | else { 64 | files.put(path, toByteArray()); 65 | } 66 | } 67 | }); 68 | } 69 | 70 | @Override 71 | public synchronized Reader reader(final String path) throws Denied, NotFound { 72 | if (!files.containsKey(path)) throw new NotFound(uri(path)); 73 | return new InputStreamReader(new ByteArrayInputStream(files.get(path))); 74 | } 75 | 76 | @Override 77 | public String uri(final String relativePath) throws NotFound { 78 | if (!files.containsKey(relativePath)) throw new NotFound(relativePath); 79 | return uncheckedUri(relativePath); 80 | } 81 | 82 | private String uncheckedUri(final String relativePath) { 83 | return "mockfs://" + root + (relativePath.startsWith("/") ? 
"" : "/" ) + relativePath; 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/services/mock/MockGrid.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.services.mock; 2 | 3 | import java.util.Hashtable; 4 | import java.util.Map; 5 | import java.util.concurrent.ArrayBlockingQueue; 6 | import java.util.concurrent.BlockingQueue; 7 | 8 | import com.mozilla.grouperfish.services.api.Grid; 9 | 10 | 11 | /** 12 | * In memory grid service, usable for some mocking. 13 | * 14 | * This cannot be used as an actual replacement for Hazelcast 15 | * because it lacks the persistence/indexing provided by Bagheera. 16 | * 17 | * Make sure to instantiate this as a singleton (e.g. using Guice). 18 | */ 19 | public class MockGrid implements Grid { 20 | 21 | // We want a concurrent map, like Hazelcast provides. 22 | private final Map> maps = 23 | new Hashtable>(); 24 | 25 | private final int queueCapacity = 1000; 26 | 27 | private final Map> queues = 28 | new Hashtable>(); 29 | 30 | @Override 31 | public synchronized Map map(final String name) { 32 | if (!maps.containsKey(name)) { 33 | maps.put(name, new Hashtable()); 34 | } 35 | return maps.get(name); 36 | } 37 | 38 | @SuppressWarnings("unchecked") 39 | @Override 40 | public synchronized BlockingQueue queue(final String name) { 41 | 42 | if (!queues.containsKey(name)) { 43 | queues.put(name, new ArrayBlockingQueue(queueCapacity)); 44 | } 45 | 46 | return (BlockingQueue) queues.get(name); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/services/mock/MockIndex.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.services.mock; 2 | 3 | import java.util.List; 4 | 5 | import 
org.elasticsearch.common.collect.ImmutableList;

import com.mozilla.grouperfish.model.Document;
import com.mozilla.grouperfish.model.Query;
import com.mozilla.grouperfish.services.api.Index;

/** Canned {@link Index}: returns fixed documents/queries regardless of input. */
public class MockIndex implements Index {

    // Chosen by fair dice roll.
    private final List randomDocuments =
            new ImmutableList.Builder().
            add(new Document("A", "{\"id\": \"A\", \"text\": \"Some random text.\"}")).
            add(new Document("B", "{\"id\": \"B\", \"text\": \"Another text which is completely random.\"}")).
            add(new Document("C", "{\"id\": \"C\", \"text\": \"Only an ape with typewriter could think of this.\"}")).
            build();

    private final List randomQueries =
            new ImmutableList.Builder().
            add(new Query("A", "{\"query\": {\"field\": {\"x\": \"some\"}}}")).
            add(new Query("B", "{\"query\": {\"field\": {\"x\": \"thing\"}}}")).
            build();

    /** Ignores the query and returns the fixed document list. */
    @Override
    public Iterable find(final Query query) {
        return randomDocuments;
    }

    /** Ignores the query and returns the fixed query list. */
    @Override
    public Iterable resolve(Query query) {
        return randomQueries;
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/util/loader/DocumentLoader.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.util.loader;

import com.mozilla.grouperfish.model.Document;


/** Loader preconfigured for the /documents/{namespace} REST endpoint. */
public class DocumentLoader extends Loader {

    public DocumentLoader(final String baseUrl, final String namespace) {
        super(baseUrl + "/documents/" + namespace);
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/util/logback/AnsiColorConverter.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.util.logback;

import ch.qos.logback.classic.Level;
import ch.qos.logback.classic.pattern.ClassicConverter;
import ch.qos.logback.classic.spi.ILoggingEvent;

/** Logback converter that wraps the log level in ANSI color escape codes. */
@SuppressWarnings("unused")
public class AnsiColorConverter extends ClassicConverter {

    private static final int NORMAL = 0;
    private static final int BRIGHT = 1;
    private static final int FOREGROUND_BLACK = 30;
    private static final int FOREGROUND_RED = 31;
    private static final int FOREGROUND_GREEN = 32;
    private static final int FOREGROUND_YELLOW = 33;
    private static final int FOREGROUND_BLUE = 34;
    private static final int FOREGROUND_MAGENTA = 35;
    private static final int FOREGROUND_CYAN = 36;
    private static final int FOREGROUND_WHITE = 37;

    // ANSI escape sequence building blocks: ESC [ <attributes> m
    private static final String PREFIX = "\u001b[";
    private static final String SUFFIX = "m";
    private static final char SEPARATOR = ';';
    private static final String END_COLOR = PREFIX + SUFFIX;

    private static final String ERROR_COLOR = PREFIX + BRIGHT + SEPARATOR + FOREGROUND_RED + SUFFIX;
    private static final String WARN_COLOR = PREFIX + NORMAL + SEPARATOR + FOREGROUND_YELLOW + SUFFIX;
    private static final String INFO_COLOR = PREFIX + NORMAL + SEPARATOR + FOREGROUND_GREEN + SUFFIX;
    private static final String DEBUG_COLOR = PREFIX + NORMAL + SEPARATOR + FOREGROUND_CYAN + SUFFIX;
    private static final String TRACE_COLOR = PREFIX + NORMAL + SEPARATOR + FOREGROUND_BLUE + SUFFIX;

    /** Renders the event's level, colorized according to severity. */
    @Override
    public String convert(final ILoggingEvent event) {
        final StringBuilder sb = new StringBuilder();
        sb.append(getColor(event.getLevel()));
        sb.append(event.getLevel());
        sb.append(END_COLOR);
        return sb.toString();
    }

    /**
     * Returns the appropriate characters to change the color for the specified
     * logging level.
     */
    private String getColor(final Level level) {
        switch (level.toInt()) {
            case Level.ERROR_INT: return ERROR_COLOR;
            case Level.WARN_INT: return WARN_COLOR;
            case Level.INFO_INT: return INFO_COLOR;
            case Level.DEBUG_INT: return DEBUG_COLOR;
            case Level.TRACE_INT: return TRACE_COLOR;
            default:
                return "";
        }
    }
}
--------------------------------------------------------------------------------
/service/src/main/resources/logback-stdout.xml:
--------------------------------------------------------------------------------
%d{HH:mm:ss} [%thread] %ansiLevel %logger{35} - %msg %ex{full} %n
--------------------------------------------------------------------------------
/service/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
%d{HH:mm:ss.SSS} [%thread] %-5level %logger{35} - %msg %ex{full} %n
logs/grouperfish.log
logs/archive/grouperfish.%d{yyyy/MM}.log.gz
365
--------------------------------------------------------------------------------
/service/src/test/java/com/mozilla/grouperfish/base/AssertTest.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.base;

import org.testng.annotations.Test;

import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.fail;


@Test(groups="unit")
public class AssertTest {

    public void testNonNullPass() {
        Assert.nonNull(new int[100]);
        Assert.nonNull("a", 123, new Object());
    }

    @Test(expectedExceptions=IllegalArgumentException.class)
    public void testNonNullFailSingle() {
        final String nothing = null;
        Assert.nonNull(nothing);
    }

@Test(expectedExceptions=IllegalArgumentException.class) 24 | public void testNonNullFailMulti() { 25 | Assert.nonNull("a", 123, new Object(), null); 26 | } 27 | 28 | public void testCheckPass() { 29 | Assert.check(true); 30 | Assert.check(true, true); 31 | Assert.check(true, true, true); 32 | } 33 | 34 | @Test(expectedExceptions=IllegalArgumentException.class) 35 | public void testCheckFailSingle() { 36 | Assert.check(false); 37 | } 38 | 39 | @Test(expectedExceptions=IllegalArgumentException.class) 40 | public void testCheckFailMulti() { 41 | Assert.check(true, true, false); 42 | } 43 | 44 | @Test(expectedExceptions=IllegalStateException.class) 45 | public void testUnreachable() { 46 | Assert.unreachable(); 47 | } 48 | 49 | @Test(expectedExceptions = IllegalStateException.class) 50 | public void testUnreachableType() { 51 | String bogus = Assert.unreachable(String.class); 52 | fail(bogus); 53 | } 54 | 55 | public void testUnreachableWrap() { 56 | Exception inner = new RuntimeException(); 57 | 58 | try { 59 | String neverAssigned = Assert.unreachable(String.class, inner); 60 | fail(neverAssigned); 61 | } 62 | catch (IllegalStateException e) { 63 | assertEquals(inner, e.getCause()); 64 | } 65 | } 66 | 67 | public void testUnreachableArgs() { 68 | try { 69 | Assert.unreachable("Arrrgh"); 70 | fail(); 71 | } 72 | catch (IllegalStateException e) { 73 | assertEquals( 74 | "[ASSERTION FAILED] Code should be unreachable: Arrrgh\n", 75 | e.getMessage()); 76 | } 77 | 78 | try { 79 | Assert.unreachable("Wut: %s %s???", "Over", 9000); 80 | fail(); 81 | } 82 | catch (IllegalStateException e) { 83 | assertEquals( 84 | "[ASSERTION FAILED] Code should be unreachable: Wut: Over 9000???\n", 85 | e.getMessage()); 86 | } 87 | 88 | try { 89 | String neverAssigned = 90 | Assert.unreachable(String.class, "Wut: %s %s???", "Over", 9000); 91 | fail(neverAssigned); 92 | } 93 | catch (IllegalStateException e) { 94 | assertEquals( 95 | "[ASSERTION FAILED] Code should be unreachable: Wut: Over 
9000???\n", 96 | e.getMessage()); 97 | } 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/base/SlugToolTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import org.testng.annotations.Test; 4 | 5 | import static org.testng.AssertJUnit.assertEquals; 6 | 7 | 8 | @Test(groups="unit") 9 | public class SlugToolTest { 10 | 11 | public void testToSlug() { 12 | 13 | assertEquals("my-name-is-joe", SlugTool.toSlug("My Name is Joe")); 14 | assertEquals("wut-over-9000", SlugTool.toSlug("Wut, over 9000?!?")); 15 | assertEquals("space-----madness", SlugTool.toSlug("Space Madness")); 16 | 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/base/StreamToolTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.nio.charset.Charset; 7 | 8 | import org.testng.annotations.Test; 9 | 10 | import static org.testng.AssertJUnit.assertEquals; 11 | import static org.testng.AssertJUnit.fail; 12 | 13 | 14 | @Test(groups="unit") 15 | public class StreamToolTest { 16 | 17 | Charset UTF8 = Charset.forName("UTF-8"); 18 | 19 | private String[] fixtures() { 20 | final String empty = ""; 21 | final String single = "A"; 22 | final String shortish = 23 | "The Mozilla project is a global community of people who believe that openness, " 24 | + "innovation, and opportunity are key to the continued health of the Internet. " 25 | + "We have worked together since 1998 to ensure that the Internet is developed " 26 | + "in a way that benefits everyone. 
As a result of the community's efforts, we " 27 | + "have distilled a set of principles that we believe are critical for the " 28 | + "Internet to continue to benefit the public good. These principles are " 29 | + "contained in the Mozilla Manifesto."; 30 | 31 | final String longish = shortish + shortish + shortish + shortish + shortish; 32 | final String longer = longish + longish + longish + longish + longish; 33 | final String reallyLong = longer + longer + longer + longer + longer; 34 | 35 | final String unicode = "Internet se stává důležitou součástí našich životů."; 36 | 37 | return new String[]{empty, single, shortish, 38 | longish, longer, reallyLong, unicode}; 39 | } 40 | 41 | public void testConsumeInputStreamCharset() { 42 | for (String fixture : fixtures()) { 43 | InputStream stream = new ByteArrayInputStream(fixture.getBytes(UTF8)); 44 | try { 45 | assertEquals(fixture, StreamTool.consume(stream, UTF8)); 46 | } catch (IOException e) { 47 | fail(e.getMessage()); 48 | } 49 | } 50 | } 51 | 52 | public void testConsumeInputStreamCharsetLimit() { 53 | for (String fixture : fixtures()) { 54 | try { 55 | InputStream stream = new ByteArrayInputStream(fixture.getBytes(UTF8)); 56 | assertEquals(fixture, StreamTool.maybeConsume(stream, UTF8, fixture.length())); 57 | 58 | if (fixture.length() <= 1) continue; 59 | stream = new ByteArrayInputStream(fixture.getBytes(UTF8)); 60 | assertEquals(null, StreamTool.maybeConsume(stream, UTF8, fixture.length() - 1)); 61 | } catch (IOException e) { 62 | fail(e.getMessage()); 63 | } 64 | } 65 | } 66 | 67 | @Test(expectedExceptions = IllegalArgumentException.class) 68 | public void testMissingStream() throws IOException { 69 | StreamTool.maybeConsume(null, UTF8, 0); 70 | } 71 | 72 | @Test(expectedExceptions = IllegalArgumentException.class) 73 | public void testMissingEncoding() throws IOException { 74 | StreamTool.consume(new ByteArrayInputStream("lolwut".getBytes(UTF8)), null); 75 | } 76 | } 77 | 
-------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/base/json/JsonValidatorTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base.json; 2 | 3 | import java.io.IOException; 4 | 5 | import org.testng.annotations.Test; 6 | 7 | import com.mozilla.grouperfish.base.json.JsonValidator; 8 | 9 | import static org.testng.AssertJUnit.assertTrue; 10 | import static org.testng.AssertJUnit.assertFalse; 11 | 12 | 13 | @Test(groups="unit") 14 | public class JsonValidatorTest { 15 | 16 | public void testInvalidDocument() throws IOException { 17 | assertFalse(new JsonValidator().isValid("Your mom is valit!!!!")); 18 | assertFalse(new JsonValidator().isValid("{{{}}")); 19 | } 20 | 21 | public void testTooEmptyDocument() throws IOException { 22 | assertFalse(new JsonValidator().isValid("")); 23 | } 24 | 25 | public void testValidDocument() throws IOException { 26 | assertTrue(new JsonValidator().isValid("{}")); 27 | assertTrue(new JsonValidator().isValid("{\"a\": 1}")); 28 | assertTrue(new JsonValidator().isValid("{\"a\": 1, \"b\": 2}")); 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/base/json/MapStreamerTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base.json; 2 | 3 | import java.io.ByteArrayOutputStream; 4 | import java.io.IOException; 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | import java.util.TreeMap; 8 | 9 | import org.testng.annotations.Test; 10 | 11 | import com.mozilla.grouperfish.base.json.MapStreamer; 12 | 13 | import static org.testng.AssertJUnit.assertEquals; 14 | 15 | 16 | @Test(groups="unit") 17 | public class MapStreamerTest { 18 | 19 | @SuppressWarnings("serial") 20 | enum Fixture { 21 | EMPTY( 22 | new HashMap(), 23 
| "{}"), 24 | ONE_ENTRY( 25 | new HashMap() {{ 26 | put("item", "{\"something\": 123}"); 27 | }}, 28 | "{\"item\": {\"something\": 123}}"), 29 | MULTIPLE( 30 | new TreeMap() {{ 31 | put("A", "{\"x\": 123}"); 32 | put("B", "{\"y\": [45, 67]}"); 33 | put("C", "{\"z\": 89}"); 34 | }}, 35 | "{\"A\": {\"x\": 123},\n\"B\": {\"y\": [45, 67]},\n\"C\": {\"z\": 89}}"); 36 | 37 | 38 | Map in; 39 | String expected; 40 | 41 | Fixture(Map in, String out) { 42 | this.in = in; 43 | this.expected = out; 44 | } 45 | } 46 | 47 | private void check(Map in, String expected) throws IOException { 48 | MapStreamer streamer = new MapStreamer(in); 49 | final ByteArrayOutputStream out = new ByteArrayOutputStream(); 50 | streamer.write(out); 51 | assertEquals(expected, out.toString("UTF-8")); 52 | } 53 | 54 | public void testEmpty() throws IOException { 55 | check(Fixture.EMPTY.in, Fixture.EMPTY.expected); 56 | } 57 | 58 | public void testOneEntry() throws IOException { 59 | check(Fixture.ONE_ENTRY.in, Fixture.ONE_ENTRY.expected); 60 | } 61 | 62 | public void testMultiple() throws IOException { 63 | check(Fixture.MULTIPLE.in, Fixture.MULTIPLE.expected); 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/model/DocumentTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import java.util.Collections; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import org.testng.annotations.Test; 8 | 9 | import static org.testng.AssertJUnit.assertEquals; 10 | 11 | 12 | @Test(groups="unit") 13 | @SuppressWarnings("serial") 14 | public class DocumentTest { 15 | 16 | @Test(expectedExceptions=IllegalArgumentException.class) 17 | public void testEmptyDocument() { 18 | final Map empty = Collections.emptyMap(); 19 | new Document(empty).source(); 20 | } 21 | 22 | public void testVerySimpleDocument() { 23 | final 
Map fields = new HashMap() {{ 24 | put("id", 1323); 25 | }}; 26 | Document doc = new Document(fields); 27 | assertEquals("{\"id\":1323}", doc.source()); 28 | assertEquals("1323", doc.name()); 29 | assertEquals("1323", doc.id()); 30 | } 31 | 32 | public void testSimpleDocument() { 33 | final Map fields = new HashMap() {{ 34 | put("id", 1323); 35 | put("something", "else"); 36 | }}; 37 | Document doc = new Document(fields); 38 | assertEquals("1323", doc.id()); 39 | assertEquals("else", doc.fields().get("something")); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/model/DummyAccess.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | 4 | public class DummyAccess implements Access { 5 | 6 | private final String origin; 7 | private final Operation type; 8 | 9 | public DummyAccess(Operation type, String origin) { 10 | this.origin = origin; 11 | this.type = type; 12 | } 13 | 14 | @Override 15 | public String origin() { 16 | return origin; 17 | } 18 | 19 | @Override 20 | public Operation type() { 21 | return type; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/naming/ScopeTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.naming; 2 | 3 | import static org.testng.AssertJUnit.assertEquals; 4 | import static org.testng.AssertJUnit.assertNotNull; 5 | import static org.testng.AssertJUnit.assertTrue; 6 | 7 | import org.testng.annotations.Test; 8 | 9 | import com.mozilla.grouperfish.model.Access; 10 | import com.mozilla.grouperfish.model.Type; 11 | import com.mozilla.grouperfish.model.DummyAccess; 12 | import com.mozilla.grouperfish.model.Access.Operation; 13 | import com.mozilla.grouperfish.naming.Scope; 14 | 
import com.mozilla.grouperfish.rest.jaxrs.ConfigurationsResource; 15 | import com.mozilla.grouperfish.rest.jaxrs.DocumentsResource; 16 | import com.mozilla.grouperfish.rest.jaxrs.QueriesResource; 17 | import com.mozilla.grouperfish.rest.jaxrs.ResultsResource; 18 | import com.mozilla.grouperfish.services.api.Grid; 19 | import com.mozilla.grouperfish.services.mock.MockGrid; 20 | 21 | 22 | @Test(groups="unit") 23 | public class ScopeTest { 24 | 25 | private final String NS = "unit-test"; 26 | private final Grid grid = new MockGrid(); 27 | private final Access DUMMY_ACCESS = new DummyAccess(Operation.CREATE, "dummy.example.com"); 28 | 29 | public void testAllows() { 30 | assertTrue(scope(NS).allows(DocumentsResource.class, DUMMY_ACCESS)); 31 | } 32 | 33 | public void testExistingConfigurations() { 34 | for (final Type type : Type.values()) { 35 | assertNotNull(scope(NS).map(type)); 36 | } 37 | } 38 | 39 | @Test(expectedExceptions=IllegalArgumentException.class) 40 | public void testInvalidConfigurations() { 41 | scope(NS).map(null); 42 | } 43 | 44 | public void testDocuments() { 45 | assertNotNull(scope(NS).documents()); 46 | } 47 | 48 | public void testMaxLength() { 49 | Access access = new DummyAccess(Operation.CREATE, "dummy.example.com"); 50 | assertTrue(0 < scope(NS).maxLength(DocumentsResource.class, access)); 51 | } 52 | 53 | public void testQueries() { 54 | assertNotNull(scope(NS).queries()); 55 | } 56 | 57 | public void testResourceMap() { 58 | Scope ns = scope(NS); 59 | assertEquals( 60 | ns.documents(), ns.resourceMap(DocumentsResource.class)); 61 | assertEquals( 62 | ns.queries(), ns.resourceMap(QueriesResource.class)); 63 | assertEquals( 64 | ns.results(), ns.resourceMap(ResultsResource.class)); 65 | assertEquals( 66 | ns.map(Type.CONFIGURATION_FILTER), 67 | ns.resourceMap(ConfigurationsResource.FilterConfigsResource.class)); 68 | assertEquals( 69 | ns.map(Type.CONFIGURATION_TRANSFORM), 70 | 
ns.resourceMap(ConfigurationsResource.TransformConfigsResource.class)); 71 | } 72 | 73 | @Test(expectedExceptions=IllegalStateException.class) 74 | public void testInvalidResourceMap() { 75 | final Scope ns = scope(NS); 76 | ns.resourceMap(Object.class); 77 | } 78 | 79 | public void testResults() { 80 | assertNotNull(scope(NS).results()); 81 | } 82 | 83 | public void testToString() { 84 | assertEquals(NS, scope(NS).raw()); 85 | } 86 | 87 | public void testValidator() { 88 | assertNotNull(scope(NS).validator(DocumentsResource.class)); 89 | } 90 | 91 | private Scope scope(String namespace) { 92 | return new Scope(namespace, grid); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/rest/jaxrs/RestHelperTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.rest.jaxrs; 2 | 3 | import static org.mockito.Mockito.mock; 4 | import static org.mockito.Mockito.when; 5 | import static org.testng.AssertJUnit.assertEquals; 6 | import static org.testng.AssertJUnit.assertNotNull; 7 | 8 | import java.io.ByteArrayInputStream; 9 | import java.io.IOException; 10 | 11 | import javax.servlet.ServletInputStream; 12 | import javax.servlet.http.HttpServletRequest; 13 | import javax.ws.rs.core.Response; 14 | 15 | import org.testng.annotations.Test; 16 | 17 | import com.mozilla.grouperfish.naming.Scope; 18 | import com.mozilla.grouperfish.rest.jaxrs.DocumentsResource; 19 | import com.mozilla.grouperfish.rest.jaxrs.RestHelper; 20 | import com.mozilla.grouperfish.rest.jaxrs.ConfigurationsResource.TransformConfigsResource; 21 | import com.mozilla.grouperfish.services.api.Grid; 22 | import com.mozilla.grouperfish.services.mock.MockGrid; 23 | 24 | 25 | @Test(groups="unit") 26 | public class RestHelperTest { 27 | 28 | private final Grid grid = new MockGrid(); 29 | private final Scope NS = new Scope("unit-test", grid); 30 | 31 | 
public void testPutAny() throws IOException { 32 | final HttpServletRequest mock = mock(HttpServletRequest.class); 33 | 34 | final String body = "{\"id\": \"mydoc\"}"; 35 | when(mock.getMethod()).thenReturn("PUT"); 36 | when(mock.getContentLength()).thenReturn(body.length()); 37 | when(mock.getInputStream()).thenReturn(new ServletInputStream() { 38 | final ByteArrayInputStream byteStream = new ByteArrayInputStream(body.getBytes()); 39 | @Override 40 | public int read() throws IOException { 41 | return byteStream.read(); 42 | } 43 | }); 44 | 45 | final Response response = RestHelper.putAny(DocumentsResource.class, NS, "mydoc", mock); 46 | assertNotNull(response); 47 | assertEquals(201, response.getStatus()); 48 | } 49 | 50 | 51 | public void testDeleteAny() throws IOException { 52 | final HttpServletRequest mock = mock(HttpServletRequest.class); 53 | when(mock.getMethod()).thenReturn("DELETE"); 54 | 55 | final Response response = RestHelper.deleteAny(DocumentsResource.class, NS, "somedoc", mock); 56 | assertNotNull(response); 57 | assertEquals(204, response.getStatus()); 58 | } 59 | 60 | 61 | public void testGetAny() { 62 | // Put stuff in, to get afterwards: 63 | NS.documents().put("myGetDoc", "{\"id\": \"myGetDoc\"}"); 64 | 65 | final HttpServletRequest mock = mock(HttpServletRequest.class); 66 | final Response response = RestHelper.getAny(DocumentsResource.class, NS, "myGetDoc", mock); 67 | assertNotNull(response); 68 | assertEquals(200, response.getStatus()); 69 | 70 | final Response response404 = RestHelper.getAny(DocumentsResource.class, NS, "no such doc", mock); 71 | assertNotNull(response404); 72 | assertEquals(404, response404.getStatus()); 73 | } 74 | 75 | 76 | public void testListAny() { 77 | final HttpServletRequest mock = mock(HttpServletRequest.class); 78 | final Response response = RestHelper.listAny(TransformConfigsResource.class, NS, mock); 79 | assertNotNull(response); 80 | assertEquals(200, response.getStatus()); 81 | } 82 | 83 | } 84 | 
-------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/unit/UnitTestHelper.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.unit; 2 | 3 | import org.testng.annotations.BeforeGroups; 4 | import org.testng.annotations.Test; 5 | 6 | import com.mozilla.grouperfish.bootstrap.Grouperfish; 7 | 8 | 9 | @Test(groups="unit") 10 | public class UnitTestHelper { 11 | 12 | private final int port = Grouperfish.DEFAULT_PORT + 10; 13 | 14 | @BeforeGroups(groups="unit") 15 | void setUp() throws Exception { 16 | System.setProperty("hazelcast.config", "config/hazelcast.xml"); 17 | System.setProperty("server.port", String.valueOf(port)); 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /service/src/test/resources/config/hazelcast.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 8 | 9 | grouperfish-unit 10 | grouperfish-unit 11 | 12 | 13 | 14 | 5701 15 | 16 | 17 | 18 | 127.0.0.1 19 | 20 | 21 | 22 | * 23 | 24 | 25 | 26 | 27 | 16 28 | 64 29 | 60 30 | 31 | 32 | 33 | 35 | 36 | 0 37 | LRU 38 | 10000 39 | 25 40 | 41 | 42 | 43 | 44 | 45 | 0 46 | 0 47 | LRU 48 | 5000 49 | 25 50 | 51 | 52 | 53 | 54 | 55 | 0 56 | 0 57 | LRU 58 | 5000 59 | 25 60 | 61 | 62 | 63 | 64 | 65 | 0 66 | 0 67 | LRU 68 | 5000 69 | 25 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /service/src/test/resources/ng_unit.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tools/display/src/main/java/com/mozilla/grouperfish/mahout/clustering/display/kmeans/DisplayKMeansBase.java: 
-------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.mahout.clustering.display.kmeans; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.log4j.Logger; 11 | import org.apache.mahout.clustering.WeightedVectorWritable; 12 | import org.apache.mahout.clustering.kmeans.Cluster; 13 | import org.apache.mahout.common.Pair; 14 | import org.apache.mahout.math.Vector; 15 | 16 | import com.mozilla.hadoop.fs.SequenceFileDirectoryReader; 17 | 18 | public class DisplayKMeansBase { 19 | 20 | private static final Logger LOG = Logger.getLogger(DisplayKMeansBase.class); 21 | 22 | public List> readClusteredPoints(Path clusteredPointsPath) { 23 | List> clusteredPoints = new ArrayList>(); 24 | SequenceFileDirectoryReader pointsReader = null; 25 | try { 26 | IntWritable k = new IntWritable(); 27 | WeightedVectorWritable wvw = new WeightedVectorWritable(); 28 | pointsReader = new SequenceFileDirectoryReader(clusteredPointsPath); 29 | while (pointsReader.next(k, wvw)) { 30 | clusteredPoints.add(new Pair(k.get(), wvw.getVector())); 31 | } 32 | } catch (IOException e) { 33 | LOG.error("IOException caught while reading clustered points", e); 34 | } finally { 35 | if (pointsReader != null) { 36 | pointsReader.close(); 37 | } 38 | } 39 | 40 | return clusteredPoints; 41 | } 42 | 43 | public List readClustersIteration(Path clusterIterationPath) { 44 | List clusters = new ArrayList(); 45 | SequenceFileDirectoryReader iterationReader = null; 46 | try { 47 | Text k = new Text(); 48 | Cluster c = new Cluster(); 49 | iterationReader = new SequenceFileDirectoryReader(clusterIterationPath); 50 | while (iterationReader.next(k, c)) { 51 | clusters.add(c); 52 | } 53 | } catch (IOException e) { 54 | LOG.error("IOException caught while reading clustered 
points", e); 55 | } finally { 56 | if (iterationReader != null) { 57 | iterationReader.close(); 58 | } 59 | } 60 | 61 | return clusters; 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /tools/firefox_input/.gitignore: -------------------------------------------------------------------------------- 1 | opinions.* 2 | *.tsv 3 | logs 4 | -------------------------------------------------------------------------------- /tools/firefox_input/README.md: -------------------------------------------------------------------------------- 1 | # Firefox Input support 2 | 3 | This tool loads opinion data, as exposed by Firefox Input, into grouperfish. 4 | 5 | [Firefox Input](https://input.mozilla.com) 6 | 7 | [Data Format](https://wiki.mozilla.org/Firefox/Input/Data) 8 | -------------------------------------------------------------------------------- /tools/firefox_input/install: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # normalize work directory 4 | wd=`dirname "$0"` 5 | wd=`cd "$wd"; pwd` 6 | 7 | cmd="--build" 8 | if [[ "${#}" -eq "1" ]]; then 9 | if [[ "${1}" == --* ]]; then cmd=$1; fi 10 | fi 11 | 12 | dest=../../build/tools/firefox_input/ 13 | case "${cmd}" in 14 | --build|--package) 15 | mvn install || exit 1 16 | mkdir -p "${dest}/lib" 17 | cp target/grouperfish-*.jar "${dest}/lib/" 18 | cp ./load_opinions "${dest}/" 19 | ;; 20 | --clean) 21 | mvn clean 22 | rm -rf "${dest}" 23 | ;; 24 | --help) 25 | "Usage: ${0} [--build|--clean]" 26 | ;; 27 | esac 28 | -------------------------------------------------------------------------------- /tools/firefox_input/load_opinions: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # normalize work directory 4 | wd=`dirname "$0"` 5 | wd=`cd -P "$wd"; pwd` 6 | 7 | 8 | # Load Firefox Input opinions into Grouperfish. 
9 | 10 | self="${0}" 11 | usage() { 12 | echo "Usage: ${self} [http://grouperfish:port] namespace" 13 | echo 14 | echo "Reads opinion data from standard input and loads it into the" 15 | echo "given namespace." 16 | echo 17 | } 18 | 19 | cp_add_dir() { 20 | d="${1}" 21 | for lib in `find "${d}" -type f -name '*.jar'`; do 22 | CLASSPATH=${CLASSPATH}:"${lib}" 23 | done 24 | } 25 | 26 | 27 | tool=`dirname "$self"` 28 | 29 | LOGBACK_OPTS=-Dlogback.configurationFile=logback-stdout.xml 30 | 31 | load() { 32 | if [ -d $tool/target ]; then 33 | # we are on source tree 34 | grouperfish_home="${tool}/../../build" 35 | cp_add_dir "${tool}/target" 36 | else 37 | # we are on build tree 38 | grouperfish_home="${tool}/../.." 39 | cp_add_dir "${tool}/lib" 40 | fi 41 | 42 | cp_add_dir "${grouperfish_home}/lib" 43 | echo "CP $CLASSPATH" 44 | 45 | main=com.mozilla.grouperfish.tools.firefox_input.OpinionLoader 46 | java -cp $CLASSPATH $LOGBACK_OPTS ${main} $@ 47 | } 48 | 49 | case "${1}" in 50 | --help) 51 | usage 52 | ;; 53 | *) 54 | load $@ 55 | esac 56 | -------------------------------------------------------------------------------- /tools/firefox_input/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | grouperfish-tools-fxinput 6 | ${grouperfishVersion} 7 | 8 | com.mozilla 9 | grouperfish-parent 10 | ../../project 11 | FIXED 12 | 13 | 14 | https://github.com/mozilla-metrics/grouperfish 15 | 16 | jar 17 | 18 | 19 | UTF-8 20 | false 21 | 22 | 23 | 24 | 25 | com.mozilla 26 | grouperfish-service 27 | ${grouperfishVersion} 28 | 29 | 30 | 31 | 32 | grouperfish-service 33 | 34 | 35 | 36 | 37 | org.apache.maven.plugins 38 | maven-surefire-plugin 39 | 2.5 40 | 41 | ${skip.tests.unit} 42 | -Xms128m -Xmx768m -XX:PermSize=128m -XX:MaxPermSize=512m 43 | methods 44 | 1 45 | ${project.build.directory}/test-classes 46 | 47 | src/test/resources/ng_unit.xml 48 | 49 | 50 | 51 | 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-jar-plugin 
56 | 2.3.1 57 | 58 | ${project.name}-${project.version} 59 | 60 | 61 | true 62 | ${settings.localRepository} 63 | repository 64 | com.mozilla.grouperfish.tools.firefox_input.OpinionLoader 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /tools/firefox_input/src/main/java/com/mozilla/grouperfish/tools/firefox_input/OpinionLoader.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.tools.firefox_input; 2 | 3 | import com.mozilla.grouperfish.bootstrap.Grouperfish; 4 | import com.mozilla.grouperfish.util.loader.DocumentLoader; 5 | 6 | public class OpinionLoader { 7 | 8 | public static void main(final String[] arguments) { 9 | 10 | if (arguments.length > 2 || (arguments.length >= 1 && "--help".equals(arguments[0]))) { 11 | System.err.println("arguments: [BASE_URL] NAMESPACE"); 12 | System.exit(1); 13 | } 14 | 15 | final String baseUrl; 16 | final String namespace; 17 | if (arguments.length == 2) { 18 | baseUrl = arguments[0]; 19 | namespace = arguments[1]; 20 | } 21 | else { 22 | baseUrl = "http://localhost:" + Grouperfish.DEFAULT_PORT; 23 | namespace = arguments[0]; 24 | } 25 | 26 | new DocumentLoader(baseUrl, namespace).load(new OpinionStream(System.in)); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /tools/firefox_input/src/main/java/com/mozilla/grouperfish/tools/firefox_input/OpinionStream.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.tools.firefox_input; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Arrays; 6 | import java.util.HashMap; 7 | import java.util.Iterator; 8 | 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import com.mozilla.grouperfish.model.Document; 13 | 14 | 15 | public class OpinionStream 
implements Iterable { 16 | 17 | public OpinionStream(final InputStream in) { 18 | in_ = in; 19 | } 20 | 21 | static enum Field { 22 | ID(0), TIMESTAMP(1), TYPE(2), PRODUCT(3), VERSION(4), PLATFORM(5), LOCALE(6), 23 | MANUFACTURER(7), DEVICE(8), URL(9), TEXT(10); 24 | public int i; 25 | 26 | Field(int c) { 27 | i = c; 28 | } 29 | } 30 | 31 | @Override 32 | public Iterator iterator() { 33 | return new OpinionsIterator(new TsvReader(in_)); 34 | } 35 | 36 | private class OpinionsIterator implements Iterator { 37 | 38 | final TsvReader reader_; 39 | int i_ = 0; 40 | String[] row_; 41 | 42 | public OpinionsIterator(TsvReader reader) { 43 | reader_ = reader; 44 | } 45 | 46 | @Override 47 | public Document next() { 48 | @SuppressWarnings("serial") 49 | Document doc = new Document( 50 | row_[Field.ID.i], 51 | new HashMap() {{ 52 | for (Field f : Field.values()) 53 | put(f.name().toLowerCase(), row_[f.i]); 54 | }}); 55 | row_ = null; 56 | return doc; 57 | } 58 | 59 | @Override 60 | public boolean hasNext() { 61 | if (row_ != null) 62 | return true; 63 | try { 64 | row_ = reader_.nextRow(); 65 | if (row_ == null) 66 | return false; 67 | if (row_.length != Field.values().length) { 68 | log.warn( 69 | "L{} skipping record (wrong number of columns) {}\n", 70 | i_, Arrays.toString(row_)); 71 | ++i_; 72 | row_ = null; 73 | return hasNext(); 74 | } 75 | ++i_; 76 | } catch (IOException e) { 77 | e.printStackTrace(); 78 | throw new RuntimeException(e); 79 | } 80 | return true; 81 | } 82 | 83 | @Override 84 | public void remove() { 85 | throw new UnsupportedOperationException(); 86 | } 87 | 88 | } 89 | 90 | private static final Logger log = LoggerFactory.getLogger(OpinionStream.class); 91 | 92 | private final InputStream in_; 93 | 94 | } 95 | -------------------------------------------------------------------------------- /tools/firefox_input/src/main/java/com/mozilla/grouperfish/tools/firefox_input/TsvJsonFromInputTsv.java: 
-------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.tools.firefox_input; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.IOException; 5 | import java.io.OutputStreamWriter; 6 | 7 | import com.mozilla.grouperfish.base.StreamTool; 8 | import com.mozilla.grouperfish.base.json.TsvJsonWriter; 9 | import com.mozilla.grouperfish.model.Document; 10 | 11 | 12 | /** 13 | * Produces a TSV/JSON (our algorithm format) directly from input data, 14 | * without need for a running grouperfish instance. 15 | */ 16 | public class TsvJsonFromInputTsv { 17 | 18 | public static void main(String[] args) throws IOException { 19 | 20 | TsvJsonWriter writer = 21 | new TsvJsonWriter( 22 | new BufferedWriter( 23 | new OutputStreamWriter(System.out, StreamTool.UTF8))); 24 | 25 | 26 | for (final Document doc : new OpinionStream(System.in)) { 27 | writer.write(doc); 28 | }; 29 | 30 | writer.flush(); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /tools/firefox_input/src/main/java/com/mozilla/grouperfish/tools/firefox_input/TsvReader.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.tools.firefox_input; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.nio.charset.Charset; 8 | import java.util.LinkedList; 9 | import java.util.List; 10 | 11 | 12 | /** 13 | * TSV reading state machine. The opencsv lib does not support the 14 | * input.mozilla.com export format (escape without quotes). 
15 | * 16 | * More information: https://wiki.mozilla.org/Firefox/Input/Data 17 | */ 18 | public class TsvReader { 19 | 20 | private static final int EOF = -1; 21 | private static final Charset UTF8 = Charset.forName("UTF-8"); 22 | private static final int BUF_SIZE = 32768 * 32; 23 | 24 | private boolean escaped = false; 25 | private boolean done = false; 26 | private final StringBuilder builder = new StringBuilder(); 27 | private final BufferedReader reader; 28 | 29 | public TsvReader(final InputStream in) { 30 | reader = new BufferedReader(new InputStreamReader(in, UTF8), BUF_SIZE); 31 | } 32 | 33 | public String[] nextRow() throws IOException { 34 | final List row = new LinkedList(); 35 | char c; 36 | while (true) { 37 | if (done) { 38 | return null; 39 | } 40 | int i = reader.read(); 41 | if (i == EOF) { 42 | done = true; 43 | if (builder.length() == 0) 44 | return null; 45 | row.add(builder.toString()); 46 | return row.toArray(new String[row.size()]); 47 | } 48 | 49 | c = (char) i; 50 | if (!escaped) { 51 | switch (c) { 52 | case '\\': 53 | escaped = true; 54 | continue; 55 | case '\t': 56 | row.add(builder.toString()); 57 | builder.setLength(0); 58 | continue; 59 | case '\n': 60 | row.add(builder.toString()); 61 | builder.setLength(0); 62 | return row.toArray(new String[row.size()]); 63 | } 64 | } 65 | builder.append(c); 66 | escaped = false; 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /tools/firefox_input/src/test/resources/ng_unit.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /tools/webui/public/css/topics.css: -------------------------------------------------------------------------------- 1 | a:link, a, a.visited { text-decoration:none; } 2 | body,div,dl,dt,dd,ul,ol,li,h1,h2,h3,h4,h5,h6,pre,form, 
fieldset,input,textarea,blockquote,th,td,p { margin:0; padding:0; } 3 | h1 a, h2 a, h3 a, h4 a, h5 a, h6 a, h1 a:hover, h2 a:hover, h3 a:hover, h4 a:hover, h5 a:hover, h6 a:hover { text-decoration:none; } 4 | table { border-collapse:collapse; border-spacing:0; } 5 | fieldset,img { border:0; } 6 | address,caption,cite,code,dfn,em,strong,th,var { font-style:normal; font-weight:normal; } 7 | ul { list-style:none; } 8 | caption,th { text-align:left; } 9 | q:before,q:after { content:''; } 10 | abbr,acronym { border:0; } 11 | 12 | body { 13 | font-family: Gill Sans, sans-serif; 14 | margin-left:2em; 15 | margin-top:2em; 16 | } 17 | 18 | td { 19 | padding-right:.5em; 20 | padding-left:.5em; 21 | padding-bottom:.25em; 22 | padding-top:.25em; 23 | } 24 | 25 | #words { 26 | margin-bottom:.25em; 27 | border-bottom:1px solid gray; 28 | } 29 | 30 | .clicked { 31 | background-color:black; 32 | color:white; 33 | } 34 | 35 | .hovered { 36 | background-color:gray; 37 | color:white; 38 | } 39 | 40 | #docs { 41 | margin-top:1em; 42 | width:800px; 43 | } 44 | 45 | #docs p { 46 | margin-top:.5em; 47 | } -------------------------------------------------------------------------------- /tools/webui/topics.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Topic Prototype 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |

Topics

13 |
14 |
15 |
16 | 79 | 80 | -------------------------------------------------------------------------------- /transforms/coclustering/INSTALL.MD: -------------------------------------------------------------------------------- 1 | 1. Copy all pig scripts to a directory of your choice. This directory will also 2 | be used as local directory for the scripts. 3 | 2. Copy src/main/python/cocluster.py to this directory. 4 | 3. Copy the bash script ``cocluster`` to this directory. 5 | 4. Ensure you have Python 2.6 with ``python26 `` the command to invoke python 6 | 2.6. 7 | 5. Do mvn package and mvn assembly:assembly in the transforms/coclustering to 8 | generate two JARS in the target folder. 9 | 6. Create a lib directory in your directory of choice that contains 10 | A. lucene-analyzers-3.1.0.jar 11 | B. lucene-core-3.1.0.jar 12 | C. akela-0.2-SNAPSHOT.jar 13 | D. grouperfish-transforms-coclustering-0.3-SNAPSHOT.jar 14 | E. mahout-collections-1.0.jar (Patched for CDH3 u0) 15 | F. mahout-core-0.5.jar (Patched for CDH3 u0) 16 | G. mahout-examples-0.5-job.jar (Patched for CDH3 u0) 17 | H. mahout-math-0.5.jar (Patched for CDH3 u0) 18 | 7. Copy grouperfish-transforms-coclustering-0.3-SNAPSHOT-job.jar to top level of 19 | the directory of your choice. 20 | 8. Create an HDFS directory that contains: 21 | A. input.TSV (Refer readthedocs for format ) 22 | B. parameters.json (Refer this directory for example file) 23 | 9. Do ./cocluster 24 | 10. You will find results.json and tags.json inside the HDFS directory. 25 | 26 | Contact evijayakumar@mozilla.com for any questions. 27 | 28 | -------------------------------------------------------------------------------- /transforms/coclustering/coclustering: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Call python script that performs co-clustering. 
3 | exec python cocluster.py $1 4 | -------------------------------------------------------------------------------- /transforms/coclustering/install: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # normalize work directory 4 | wd=`dirname "$0"` 5 | wd=`cd "$wd"; pwd` 6 | 7 | 8 | cmd="--build" 9 | if [[ "${#}" -eq "1" ]]; then 10 | if [[ "${1}" == --* ]]; then 11 | cmd=$1 12 | fi 13 | fi 14 | 15 | 16 | dest=../../build/transforms/coclustering 17 | case "${cmd}" in 18 | --build|--package) 19 | mvn assembly:assembly || exit 1 20 | mkdir -p $dest 21 | cp target/grouperfish-*.jar $dest/ 22 | cp ./src/main/pig/* $dest/ 23 | cp ./coclustering $dest/ 24 | cp ./src/main/python/cocluster.py $dest/ 25 | # TODO: move the (job) jars into the right place 26 | ;; 27 | --clean) 28 | mvn clean 29 | rm -rf "${dest}" 30 | ;; 31 | --help) 32 | "Usage: ${0} [--build|--clean]" 33 | ;; 34 | *) 35 | "Usage: ${0} [--build|--clean]" 36 | exit 1 37 | ;; 38 | esac 39 | -------------------------------------------------------------------------------- /transforms/coclustering/src/assembly/job.xml: -------------------------------------------------------------------------------- 1 | 3 | job 4 | 5 | jar 6 | 7 | false 8 | 9 | 10 | false 11 | runtime 12 | lib 13 | 14 | ${artifact.groupId}:${artifact.artifactId} 15 | 16 | 17 | 18 | false 19 | system 20 | lib 21 | 22 | ${artifact.groupId}:${artifact.artifactId} 23 | 24 | 25 | 26 | 27 | 28 | ${basedir}/target/classes 29 | / 30 | 31 | *.jar 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /transforms/coclustering/src/main/java/com/mozilla/grouperfish/transforms/coclustering/pig/eval/mahout/Vectorizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license 
agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | package com.mozilla.grouperfish.transforms.coclustering.pig.eval.mahout; 21 | 22 | import java.io.IOException; 23 | 24 | import org.apache.pig.EvalFunc; 25 | import org.apache.pig.data.DataBag; 26 | import org.apache.pig.data.Tuple; 27 | import org.apache.pig.data.TupleFactory; 28 | 29 | public class Vectorizer extends EvalFunc { 30 | 31 | private static final TupleFactory tupleFactory = TupleFactory.getInstance(); 32 | 33 | public Tuple exec(Tuple input) throws IOException { 34 | if (input == null) { 35 | return null; 36 | } 37 | 38 | if (input.size() != 1) { 39 | throw new IOException("Vectorizer requires exactly 1 parameter"); 40 | } 41 | Tuple output = tupleFactory.newTuple(); 42 | DataBag db = (DataBag) input.get(0); 43 | for (Tuple t : db) { 44 | if (t.size() == 2) { 45 | Integer rowId = (Integer) t.get(0); 46 | if (rowId != null) { 47 | Tuple subt = tupleFactory.newTuple(2); 48 | subt.set(0, rowId); 49 | subt.set(1, t.get(1)); 50 | output.append(subt); 51 | } 52 | } 53 | } 54 | return output; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- 
/transforms/coclustering/src/main/java/com/mozilla/grouperfish/transforms/coclustering/pig/eval/text/ConvertDocumentIDToID.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package com.mozilla.grouperfish.transforms.coclustering.pig.eval.text; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.IOException; 24 | import java.io.InputStreamReader; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | import org.apache.hadoop.conf.Configuration; 29 | import org.apache.hadoop.fs.FileStatus; 30 | import org.apache.hadoop.fs.FileSystem; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.pig.EvalFunc; 33 | import org.apache.pig.data.Tuple; 34 | 35 | public class ConvertDocumentIDToID extends EvalFunc { 36 | 37 | private Map documentIndex; 38 | 39 | private void loadDocumentIndex(String documentIndexPath) throws IOException { 40 | if (documentIndex == null) { 41 | documentIndex = new HashMap(); 42 | 43 | Path p = new Path(documentIndexPath); 44 | FileSystem fs = FileSystem.get(p.toUri(), new Configuration()); 45 | int index = 0; 46 | for (FileStatus status : fs.listStatus(p)) { 47 | Path currPath = status.getPath(); 48 | if (!status.isDir() && !currPath.getName().startsWith("_")) { 49 | BufferedReader reader = null; 50 | try { 51 | reader = new BufferedReader(new InputStreamReader(fs.open(currPath))); 52 | String line = null; 53 | while ((line = reader.readLine()) != null) { 54 | documentIndex.put(line.trim(), index++); 55 | } 56 | } finally { 57 | if (reader != null) { 58 | reader.close(); 59 | } 60 | } 61 | } 62 | } 63 | 64 | log.info("Loaded document index with size: " + documentIndex.size()); 65 | } 66 | } 67 | 68 | @Override 69 | public Integer exec(Tuple input) throws IOException { 70 | if (input == null || input.size() == 0) { 71 | return null; 72 | } 73 | if (input.size() != 2) { 74 | throw new IOException("ConvertDocumentIDToID requires 2 parameters"); 75 | } 76 | 77 | String documentIndexPath = (String) input.get(0); 78 | if (documentIndex == null) { 79 | loadDocumentIndex(documentIndexPath); 80 | } 81 | String docID = (String) input.get(1); 82 | return documentIndex.get(docID); 83 | } 
84 | } 85 | -------------------------------------------------------------------------------- /transforms/coclustering/src/main/java/com/mozilla/grouperfish/transforms/coclustering/pig/eval/text/ConvertFeatureToID.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package com.mozilla.grouperfish.transforms.coclustering.pig.eval.text; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.IOException; 24 | import java.io.InputStreamReader; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | import org.apache.hadoop.conf.Configuration; 29 | import org.apache.hadoop.fs.FileStatus; 30 | import org.apache.hadoop.fs.FileSystem; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.pig.EvalFunc; 33 | import org.apache.pig.data.Tuple; 34 | 35 | public class ConvertFeatureToID extends EvalFunc { 36 | 37 | private Map featureIndex; 38 | 39 | private void loadFeatureIndex(String featureIndexPath) throws IOException { 40 | if (featureIndex == null) { 41 | featureIndex = new HashMap(); 42 | 43 | Path p = new Path(featureIndexPath); 44 | FileSystem fs = FileSystem.get(p.toUri(), new Configuration()); 45 | int index = 0; 46 | for (FileStatus status : fs.listStatus(p)) { 47 | Path currPath = status.getPath(); 48 | if (!status.isDir() && !currPath.getName().startsWith("_")) { 49 | BufferedReader reader = null; 50 | try { 51 | reader = new BufferedReader(new InputStreamReader(fs.open(currPath))); 52 | String line = null; 53 | while ((line = reader.readLine()) != null) { 54 | featureIndex.put(line.trim(), index++); 55 | } 56 | } finally { 57 | if (reader != null) { 58 | reader.close(); 59 | } 60 | } 61 | } 62 | } 63 | 64 | log.info("Loaded feature index with size: " + featureIndex.size()); 65 | } 66 | } 67 | 68 | @Override 69 | public Integer exec(Tuple input) throws IOException { 70 | if (input == null || input.size() == 0) { 71 | return null; 72 | } 73 | if (input.size() != 2) { 74 | throw new IOException("ConvertFeatureToID requires 2 parameters"); 75 | } 76 | 77 | String featureIndexPath = (String) input.get(0); 78 | if (featureIndex == null) { 79 | loadFeatureIndex(featureIndexPath); 80 | } 81 | String feature = (String) input.get(1); 82 | return featureIndex.get(feature); 83 | 84 | } 85 | } 86 | 
-------------------------------------------------------------------------------- /transforms/coclustering/src/main/java/com/mozilla/grouperfish/transforms/coclustering/pig/eval/text/TermFrequency.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package com.mozilla.grouperfish.transforms.coclustering.pig.eval.text; 21 | 22 | import java.io.IOException; 23 | import java.util.HashMap; 24 | import java.util.Map; 25 | 26 | import org.apache.pig.EvalFunc; 27 | import org.apache.pig.data.BagFactory; 28 | import org.apache.pig.data.DataBag; 29 | import org.apache.pig.data.Tuple; 30 | import org.apache.pig.data.TupleFactory; 31 | 32 | public class TermFrequency extends EvalFunc { 33 | 34 | private static BagFactory bagFactory = BagFactory.getInstance(); 35 | private static TupleFactory tupleFactory = TupleFactory.getInstance(); 36 | 37 | @Override 38 | public DataBag exec(Tuple input) throws IOException { 39 | if (input == null || input.size() == 0) { 40 | return null; 41 | } 42 | 43 | DataBag db = (DataBag) input.get(0); 44 | Map termFreq = new HashMap(); 45 | for (Tuple t : db) { 46 | String word = (String) t.get(0); 47 | int curCount = 0; 48 | if (termFreq.containsKey(word)) { 49 | curCount = termFreq.get(word); 50 | } 51 | termFreq.put(word, ++curCount); 52 | } 53 | 54 | DataBag output = bagFactory.newDefaultBag(); 55 | for (Map.Entry entry : termFreq.entrySet()) { 56 | Tuple t = tupleFactory.newTuple(2); 57 | t.set(0, entry.getKey()); 58 | t.set(1, (double) entry.getValue()); 59 | output.add(t); 60 | } 61 | 62 | return output; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /transforms/coclustering/src/main/json_sample_files/tags.json: -------------------------------------------------------------------------------- 1 | 
{"48":[479],"91":[479],"71":[479],"121":[479],"107":[479],"153":[479],"80":[479],"289":[479],"162":[479],"190":[479],"61":[479],"286":[479],"201":[479],"213":[479],"223":[479],"194":[479],"258":[479],"49":[479],"104":[479],"265":[479],"321":[479],"8":[479],"202":[479],"263":[479],"23":[479],"88":[479],"299":[479],"228":[408],"65":[408],"255":[408],"93":[408],"182":[408],"244":[408],"113":[408],"24":[408],"221":[408],"131":[408],"144":[408],"231":[408],"5":[408],"17":[408],"86":[408],"169":[408],"353":[408],"207":[408],"220":[408],"19":[408],"339":[408],"215":[408],"63":[408],"229":[408],"212":[408],"348":[408],"325":[408],"30":[408],"101":[408],"147":[408],"195":[408],"318":[408],"248":[408],"328":[408],"72":[408],"160":[408],"22":[408],"272":[519],"94":[519],"297":[519],"133":[519],"87":[519],"18":[519],"290":[519],"350":[519],"139":[519],"193":[519],"343":[519],"208":[519],"206":[519],"27":[519],"294":[519],"307":[519],"165":[519],"241":[321],"1":[321],"10":[321],"188":[321],"129":[321],"14":[321],"2":[321],"187":[321],"327":[321],"214":[321],"261":[321],"12":[321],"302":[321],"301":[321],"227":[321],"264":[321],"191":[321],"21":[321],"59":[434],"324":[434],"185":[434],"349":[434],"138":[434],"58":[434],"120":[434],"67":[434],"108":[434],"171":[434],"68":[434],"197":[434],"167":[434],"181":[434],"172":[434],"316":[434],"283":[434],"122":[434],"236":[434],"89":[434],"184":[434],"132":[434],"352":[434],"205":[434],"240":[434],"159":[434],"239":[434],"274":[434],"106":[434],"3":[434],"232":[434],"92":[434],"178":[434],"312":[434],"25":[434],"176":[434],"82":[434],"224":[434],"15":[434],"243":[434],"119":[434],"37":[434],"341":[434],"300":[434],"73":[434],"320":[434],"317":[434],"98":[434],"170":[434],"151":[434],"260":[434],"196":[434],"156":[434],"285":[434],"45":[434],"296":[434],"270":[176],"180":[176],"36":[176],"309":[176],"118":[176],"211":[176],"218":[176],"173":[176],"308":[176],"79":[176],"314":[176],"112":[176],"60":[176],"102":[176],"235":[176],"152":[176]
,"254":[176],"310":[176],"340":[176],"35":[176],"292":[176],"39":[176],"150":[176],"334":[176],"338":[176],"38":[176],"20":[176],"46":[176],"53":[176],"251":[436],"105":[436],"81":[436],"315":[436],"110":[436],"346":[436],"31":[436],"186":[436],"168":[436],"313":[436],"116":[436],"311":[436],"124":[436],"331":[436],"99":[436],"78":[436],"41":[436],"189":[436],"164":[436],"303":[436],"225":[436],"13":[436],"257":[436],"280":[436],"335":[436],"149":[436],"256":[436],"336":[436],"304":[436],"155":[436],"200":[436],"70":[436],"109":[436],"135":[436],"288":[481],"268":[481],"57":[481],"351":[481],"238":[481],"295":[481],"247":[481],"271":[481],"323":[481],"83":[481],"6":[481],"141":[481],"337":[481],"9":[481],"273":[481],"333":[481],"217":[481],"16":[481],"174":[481],"175":[481],"127":[481],"75":[481],"233":[481],"34":[466],"145":[466],"95":[466],"230":[466],"4":[466],"234":[466],"276":[466],"204":[466],"47":[466],"262":[466],"322":[466],"242":[466],"253":[466],"66":[466],"40":[466],"142":[466],"111":[466],"281":[466],"279":[466],"306":[466],"55":[466],"140":[466],"33":[466],"293":[466],"56":[466],"44":[466],"69":[466],"329":[466],"298":[466],"237":[362],"125":[362],"29":[362],"166":[362],"269":[362],"97":[362],"134":[362],"342":[362],"114":[362],"42":[362],"209":[362],"26":[362],"54":[362],"198":[362],"291":[362],"74":[362],"64":[362],"76":[362],"347":[362],"52":[362],"136":[362],"115":[362],"85":[362]} -------------------------------------------------------------------------------- /transforms/coclustering/src/main/pig/co_cluster_generate_tags.pig: -------------------------------------------------------------------------------- 1 | -- 2 | -- Script to compute Tags 3 | -- 4 | -- See "Co-clustering documents and words using Bipartite Spectral Graph 5 | -- Partitioning" by Dhillon for more details. 
6 | -- 7 | 8 | %default TEMP 'cct' 9 | %default NUM_REDUCERS 7 10 | register './lib/grouperfish-transforms-coclustering-0.3-SNAPSHOT.jar' 11 | register './lib/mahout-core-0.5.jar' 12 | register './lib/mahout-math-0.5.jar' 13 | register './lib/mahout-utils-0.5.jar' 14 | register './lib/mahout-collections-1.0.jar' 15 | SET default_parallel $NUM_REDUCERS 16 | SET pig.splitCombination 'false'; 17 | 18 | -- Load clustered Points which are in the format , 19 | clustered_points = LOAD '$TEMP/kmeans/out/clusteredPoints' USING com.mozilla.grouperfish.transforms.coclustering.pig.storage.KMeansOutputLoader() 20 | AS (cluster_id:int, v_id:int, v_info:bag{t:tuple(col_id:int, 21 | eblement:double)}); 22 | describe clustered_points; 23 | points_clusters = FOREACH clustered_points 24 | GENERATE v_id, cluster_id; 25 | describe points_clusters 26 | doc_map = LOAD '$TEMP/doc_map' AS (doc_id:int, doc: chararray); 27 | doc_clusters = JOIN doc_map BY doc_id, points_clusters BY v_id; 28 | tags = FOREACH doc_clusters 29 | GENERATE doc AS doc, 30 | cluster_id AS cluster_id; 31 | describe tags; 32 | STORE tags INTO '$TEMP/tags' USING PigStorage('\t'); 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /transforms/commons/src/main/java/com/mozilla/grouperfish/pig/eval/ml/Vectorizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | package com.mozilla.grouperfish.pig.eval.ml; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.IOException; 24 | import java.io.InputStreamReader; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | import org.apache.hadoop.conf.Configuration; 29 | import org.apache.hadoop.fs.FileStatus; 30 | import org.apache.hadoop.fs.FileSystem; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.pig.EvalFunc; 33 | import org.apache.pig.data.DataBag; 34 | import org.apache.pig.data.Tuple; 35 | import org.apache.pig.data.TupleFactory; 36 | 37 | public class Vectorizer extends EvalFunc { 38 | 39 | private Map featureIndex; 40 | private static final TupleFactory tupleFactory = TupleFactory.getInstance(); 41 | 42 | private void loadFeatureIndex(String featureIndexPath) throws IOException { 43 | if (featureIndex == null) { 44 | featureIndex = new HashMap(); 45 | 46 | Path p = new Path(featureIndexPath); 47 | FileSystem fs = FileSystem.get(p.toUri(), new Configuration()); 48 | int index = 0; 49 | for (FileStatus status : fs.listStatus(p)) { 50 | if (!status.isDir()) { 51 | BufferedReader reader = null; 52 | try { 53 | reader = new BufferedReader(new InputStreamReader(fs.open(status.getPath()))); 54 | String line = null; 55 | while ((line = reader.readLine()) != null) { 56 | featureIndex.put(line.trim(), index++); 57 | } 58 | } finally { 59 | if (reader != null) { 60 | reader.close(); 61 | } 62 | } 63 | } 64 | } 65 | 66 | log.info("Loaded feature index with size: " + featureIndex.size()); 67 | } 68 | 
} 69 | 70 | public Tuple exec(Tuple input) throws IOException { 71 | if (input == null) { 72 | return null; 73 | } 74 | 75 | if (input.size() != 2) { 76 | throw new IOException("Vectorizer requires exactly 2 parameters"); 77 | } 78 | 79 | String featureIndexPath = (String)input.get(0); 80 | if (featureIndex == null) { 81 | loadFeatureIndex(featureIndexPath); 82 | } 83 | 84 | Tuple output = tupleFactory.newTuple(); 85 | DataBag db = (DataBag)input.get(1); 86 | for (Tuple t : db) { 87 | // Expects each tuple's first element to be the feature 88 | Integer idx = featureIndex.get((String)t.get(0)); 89 | if (idx != null) { 90 | output.append(idx); 91 | } 92 | } 93 | 94 | return output; 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /transforms/commons/src/main/java/com/mozilla/grouperfish/pig/eval/text/TermFrequency.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package com.mozilla.grouperfish.pig.eval.text; 21 | 22 | import java.io.IOException; 23 | import java.util.HashMap; 24 | import java.util.Map; 25 | 26 | import org.apache.pig.EvalFunc; 27 | import org.apache.pig.data.BagFactory; 28 | import org.apache.pig.data.DataBag; 29 | import org.apache.pig.data.Tuple; 30 | import org.apache.pig.data.TupleFactory; 31 | 32 | public class TermFrequency extends EvalFunc { 33 | 34 | private static BagFactory bagFactory = BagFactory.getInstance(); 35 | private static TupleFactory tupleFactory = TupleFactory.getInstance(); 36 | 37 | @Override 38 | public DataBag exec(Tuple input) throws IOException { 39 | if (input == null || input.size() == 0) { 40 | return null; 41 | } 42 | 43 | DataBag db = (DataBag) input.get(0); 44 | Map termFreq = new HashMap(); 45 | for (Tuple t : db) { 46 | String word = (String) t.get(0); 47 | int curCount = 0; 48 | if (termFreq.containsKey(word)) { 49 | curCount = termFreq.get(word); 50 | } 51 | termFreq.put(word, ++curCount); 52 | } 53 | 54 | DataBag output = bagFactory.newDefaultBag(); 55 | for (Map.Entry entry : termFreq.entrySet()) { 56 | Tuple t = tupleFactory.newTuple(2); 57 | t.set(0, entry.getKey()); 58 | t.set(1, (double) entry.getValue()); 59 | output.add(t); 60 | } 61 | 62 | return output; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /transforms/commons/src/main/pig/generate_document_vectors.pig: -------------------------------------------------------------------------------- 1 | /* Not sure why we have to register this JAR when it's already in Pig's classpath but we do */ 2 | register '/usr/lib/hbase/hbase-0.90.1-cdh3u0.jar' 3 | register './lib/akela-0.1.jar' 4 | register './lib/lucene-core-3.1.0.jar' 5 | register './lib/lucene-analyzers-3.1.0.jar' 6 | register './lib/mahout-core-0.5.jar' 7 | register './lib/mahout-math-0.5.jar' 8 | register './lib/mahout-utils-0.5.jar' 9 | register './lib/mahout-collections-1.0.jar' 10 | 11 | 
SET default_parallel 7; 12 | SET pig.splitCombination 'false'; 13 | 14 | %default INPUT 'opinions.tsv' 15 | %default STOPWORDS 'stopwords-en.txt' 16 | %default STEM 'true' 17 | %default FEATUREINDEX 'feature-index' 18 | %default OUTPUT 'document-vectors' 19 | 20 | /* 21 | raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray; 22 | genmap = FOREACH raw GENERATE com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[]; 23 | document_word_bag = FOREACH genmap GENERATE (chararray)json_map#'id' AS docid:chararray,com.mozilla.pig.eval.text.UnigramExtractor(json_map#'text') AS word_bag; 24 | document_word_vectors = FOREACH document_word_bag GENERATE docid, com.mozilla.pig.eval.ConvertBagToTuple(word_bag) AS word_vector; 25 | 26 | vectors = FOREACH document_word_vectors GENERATE (chararray)docid,com.mozilla.pig.eval.ml.Vectorizer('feature-index', word_vector) AS vec; 27 | STORE vectors INTO 'document-vectors' USING com.mozilla.pig.storage.DocumentVectorStorage(); 28 | */ 29 | 30 | /* Use this output if you're not using Mahout */ 31 | /* 32 | flat_vectors = FOREACH vectors GENERATE docid,FLATTEN(vec); 33 | STORE flat_vectors INTO 'document-vectors'; 34 | */ 35 | 36 | /* Same as above except using tsv file for experimenting */ 37 | raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,datetime:long,praise_issue:chararray,product:chararray,version:chararray,os:chararray,locale:chararray,text:chararray); 38 | filtered_raw = FILTER raw BY locale == 'en-US' AND praise_issue == 'issue' AND version == '5.0'; 39 | tokenized = FOREACH filtered_raw GENERATE doc_id,com.mozilla.pig.eval.text.Tokenize(text,'$STOPWORDS', '$STEM') AS token_bag; 40 | vectors = FOREACH tokenized GENERATE (chararray)docid,com.mozilla.pig.eval.ml.Vectorizer('$FEATUREINDEX', token_bag) AS vec; 41 | STORE vectors INTO '$OUTPUT' USING com.mozilla.pig.storage.DocumentVectorStorage('$NFEATURES'); 
/*
 * generate_feature_index.pig — builds the term feature index consumed by the
 * vectorizer scripts: tokenizes the 'text' field of each JSON document, then
 * keeps terms by word length and document-frequency bounds.  Stores both the
 * surviving (term, count) pairs and the bare term list.
 */
register './akela-0.2-SNAPSHOT.jar'
register './grouperfish-transforms-commons-0.1-SNAPSHOT.jar'
register './lib/lucene-core-3.1.0.jar'
register './lib/lucene-analyzers-3.1.0.jar'

SET default_parallel 7;

%default INPUT 'input.json.tsv'
%default STOPWORDS 'stopwords-en.txt'
%default STEM 'false'
%default FREQ_OUTPUT 'feature-freq'
%default OUTPUT 'feature-index'
%default MIN_WORD_LENGTH 3
%default MIN_DF 2
%default MAX_DF_PERCENTAGE 0.9

/*raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray;*/
raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,json:chararray);
genmap = FOREACH raw GENERATE com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[];

/* Total document count, used for the max-DF percentage cut below. */
grouped_raw = GROUP raw ALL;
ndocs = FOREACH grouped_raw GENERATE COUNT(raw);

tokenized = FOREACH genmap GENERATE FLATTEN(com.mozilla.grouperfish.pig.eval.text.Tokenize(json_map#'text', '$STOPWORDS', '$STEM')) AS token:chararray;
grouped_words = GROUP tokenized BY token;
word_freq = FOREACH grouped_words GENERATE FLATTEN($0) AS word:chararray, COUNT($1) as count;
/* Keep a term iff length(word) > $MIN_WORD_LENGTH, DF > $MIN_DF, and
   DF/ndocs < $MAX_DF_PERCENTAGE.
   NOTE(review): all three comparisons are strict, so e.g. the default
   MIN_WORD_LENGTH=3 keeps only words of length 4+ and MIN_DF=2 keeps only
   DF 3+ — confirm whether ">=" was intended.  (The previous comment here
   claimed "count > 10", which never matched the $MIN_DF default of 2.) */
filtered_freq = FILTER word_freq BY SIZE(word) > $MIN_WORD_LENGTH AND count > $MIN_DF AND ((double)count / (double)ndocs.$0) < $MAX_DF_PERCENTAGE;
index = FOREACH filtered_freq GENERATE word;

STORE filtered_freq INTO '$FREQ_OUTPUT';
STORE index INTO '$OUTPUT';
/* generate_sequence_files.pig — write (docid, text) pairs as Hadoop sequence
   files for consumption by Mahout (see follow-up commands at the bottom). */
register './akela-0.1.jar'
/* Not sure why we have to register this JAR when it's already in Pig's classpath but we do */
register '/usr/lib/hbase/hbase-0.90.1-cdh3u0.jar'

/* HBase variant.
   FIX: the original projected docid/text straight out of `raw` (whose only
   field is json) and filtered on an unnamed normtext field — all three
   aliases were undefined and the script could not run.  The fields now come
   from the parsed JSON map, and the stopword-stripped text is named so the
   FILTER can reference it. */
raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray;
genmap = FOREACH raw GENERATE com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[];
documents = FOREACH genmap GENERATE (chararray)json_map#'id' AS docid,com.mozilla.pig.eval.text.RemoveStopwords((chararray)json_map#'text') AS normtext;
filtered_documents = FILTER documents BY normtext IS NOT NULL AND SIZE(normtext) > 0;
STORE filtered_documents INTO 'documents' USING com.mozilla.pig.storage.SequenceFileStorage();

/* Same as above except using tsv file for experimenting */
raw = LOAD 'opinions-en.tsv' USING PigStorage('\t') AS (docid:int,datetime:long,praise_issue:chararray,product:chararray,version:chararray,os:chararray,language:chararray,text:chararray);
documents = FOREACH raw GENERATE (chararray)docid,text;
/* filtered_documents = FILTER documents BY normtext IS NOT NULL AND SIZE(normtext) > 0; */
/* FIX: the original stored `filtered_documents` — stale, since the FILTER in
   this tsv section is commented out — and reused the 'documents' output path
   already claimed by the STORE above; store `documents` to a distinct path. */
STORE documents INTO 'documents-tsv' USING com.mozilla.pig.storage.SequenceFileStorage();

/*
Follow up steps:

hadoop jar mahout-examples-0.5-job.jar org.apache.mahout.driver.MahoutDriver seq2sparse -i documents -wt tfidf --minDF 2 --maxDFPercent 90 -o seq2sparse-out
hadoop jar mahout-core-0.5-job.jar org.apache.mahout.driver.MahoutDriver kmeans -i seq2sparse-out/tfidf-vectors -o kmeans-cosine-out -dm org.apache.mahout.common.distance.CosineDistanceMeasure -c random-clusters -ow -k 20 -x 10 -cl

*/
/*
 * generate_tf_document_vectors.pig — tokenizes filtered feedback documents
 * and stores per-document term-frequency vectors in Vowpal Wabbit format.
 */
register './akela-0.2-SNAPSHOT.jar'
register './grouperfish-transforms-commons-0.1-SNAPSHOT.jar'
register './lib/lucene-core-3.1.0.jar'
register './lib/lucene-analyzers-3.1.0.jar'

SET default_parallel 7;

%default INPUT 'input.json.tsv'
/* NOTE(review): other scripts here spell the stem flag in lowercase
   ('true'/'false'); confirm Tokenize parses 'FALSE' case-insensitively. */
%default STOPWORDS 'stopwords-en.txt'
%default STEM 'FALSE'
%default MIN_TOKENS 4
%default FEATUREINDEX 'feature-index'
%default OUTPUT 'document-vectors-tf'

/*raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray;*/
raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,json:chararray);
genmap = FOREACH raw GENERATE doc_id,com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[];
/* Restrict to one product/version/platform slice of issue-type feedback. */
filtered_genmap = FILTER genmap BY json_map#'type' == 'issue' AND json_map#'product' == 'firefox' AND json_map#'version' == '5.0' AND json_map#'platform' == 'win7';
tokenized = FOREACH filtered_genmap GENERATE doc_id,com.mozilla.grouperfish.pig.eval.text.Tokenize(json_map#'text','$STOPWORDS', '$STEM') AS token_bag;
/* Comment out the line above and uncomment the line below if you are using an ngram feature-index */
/*tokenized = FOREACH filtered_genmap GENERATE doc_id,com.mozilla.pig.eval.text.NGramTokenize(json_map#'text','$STOPWORDS', '$STEM', 'true') AS token_bag;*/
/* Drop documents that are too short to cluster meaningfully. */
filtered_tokenized = FILTER tokenized BY SIZE(token_bag) > $MIN_TOKENS;
doc_vectors = FOREACH filtered_tokenized GENERATE doc_id,com.mozilla.grouperfish.pig.eval.text.TermFrequency(token_bag) AS tf_bag;

/* Put things back into document vector form before storing in Mahout's vector format */
feature_vectors = FOREACH doc_vectors GENERATE (chararray)doc_id,com.mozilla.grouperfish.pig.eval.ml.TFVectorizer('$FEATUREINDEX', tf_bag) AS vec;
STORE feature_vectors INTO '$OUTPUT' USING com.mozilla.grouperfish.pig.storage.VWStorage();

/* Run VW LDA on this output */
/*
./vw
*/
/* Run Mahout's Clustering on
this output */
/*
/usr/lib/hadoop/bin/hadoop jar /usr/lib/mahout/mahout-core-0.5-job.jar org.apache.mahout.driver.MahoutDriver lda
-i document-vectors-tf
-o lda-out
-ow
-k 20
-v 12000
-x 20
*/

#! /bin/bash

# A minimal hadoop based transform that performs a line-count on the input and
# generates a result like this
#
# {"count": 12345}
#
# Usage: count WORKDIR   (an HDFS directory containing input.json.tsv)

work=${1}

# FIX: fail fast when no work directory is given — with an empty $work the
# rmr/mkdir below would have operated on the bare path "/output".
if [[ -z "${work}" ]]; then
    echo "usage: ${0} WORKDIR" >&2
    exit 1
fi

# Thin wrappers around the hadoop fs CLI; arguments quoted so paths with
# unusual characters survive word splitting.
fs_mkdir() {
    hadoop fs -mkdir "$1"
}

fs_rmr() {
    hadoop fs -rmr "$1"
}

fs_cat() {
    hadoop fs -cat "$1"
}

fs_put() {
    hadoop fs -put - "$1"
}

fs_rmr "${work}/output"
fs_mkdir "${work}/output"
# Count the input lines and publish the result as a tiny JSON document.
echo '{"count": '"$(fs_cat "${work}/input.json.tsv" | wc -l | awk '{print $1}')"'}' | fs_put "${work}/output/results.json"

#!/usr/bin/env python
# encoding: utf-8
"""
filter.py

Created by Xavier Stevens on 2011-09-19.
Copyright (c) 2011 Mozilla. All rights reserved.
"""

import sys
import getopt
import re
import json

help_message = '''
The help message goes here.
17 | ''' 18 | 19 | 20 | class Usage(Exception): 21 | def __init__(self, msg): 22 | self.msg = msg 23 | 24 | def filter_data(input_file, output_file, product="firefox", version="5.0", feedback_type="issues"): 25 | fin = open(input_file, "r") 26 | fout = open(output_file, "w") 27 | tab_pattern = re.compile("\t") 28 | for line in fin: 29 | line_splits = tab_pattern.split(line.strip()) 30 | doc_json = json.loads(line_splits[1]) 31 | if doc_json["product"] == product and doc_json["version"] == version and doc_json["type"] == feedback_type: 32 | fout.write(line) 33 | fin.close() 34 | fout.close() 35 | 36 | def main(argv=None): 37 | if argv is None: 38 | argv = sys.argv 39 | try: 40 | try: 41 | opts, args = getopt.getopt(argv[1:], "ho:d:p:v:t:", ["help", "output="]) 42 | except getopt.error, msg: 43 | raise Usage(msg) 44 | 45 | # option processing 46 | data_path = None 47 | output_path = None 48 | product = None 49 | version = None 50 | feedback_type = None 51 | for option, value in opts: 52 | if option == "-d": 53 | data_path = value 54 | if option in ("-h", "--help"): 55 | raise Usage(help_message) 56 | if option in ("-o", "--output"): 57 | output_path = value 58 | if option == "-p": 59 | product = value 60 | if option == "-v": 61 | version = value 62 | if option == "-t": 63 | feedback_type = value 64 | 65 | filter_data(data_path, output_path, product, version, feedback_type) 66 | except Usage, err: 67 | print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg) 68 | print >> sys.stderr, "\t for help use --help" 69 | return 2 70 | 71 | 72 | if __name__ == "__main__": 73 | sys.exit(main()) 74 | -------------------------------------------------------------------------------- /transforms/lda_r/src/R/lda.r: -------------------------------------------------------------------------------- 1 | library(lda) 2 | library(RJSONIO) 3 | 4 | # Example: R -f src/R/lda.r --no-save --slave --args input-firefox-5.0-issues-ldac.dat feature-index-en-10-0.7.txt 10 topics.dat 
args<-commandArgs(TRUE)

# args: 1 = LDA-C formatted corpus, 2 = vocabulary file, 3 = number of topics K
docs<-read.documents(args[1])
vocab<-read.vocab(args[2])
K<-as.integer(args[3])
# Dirichlet hyperparameters: alpha = document-topic, eta = topic-word
alpha<-0.01
eta<-0.01
# 100 collapsed Gibbs sampling sweeps
model<-lda.collapsed.gibbs.sampler(docs, K, vocab, 100, alpha, eta)
# Transposed for saving so we can read rows rather than columns
top_10_topic_words<-t(top.topic.words(model$topics, num.words = 10, by.score = TRUE))
top_20_docs_per_topic<-t(top.topic.documents(model$document_sums, num.documents=20, alpha))

# e1: topic id (named 0-based) -> its top-10 words
e1<-sapply(1:ncol(top_10_topic_words),function(r) top_10_topic_words[,r],simplify=FALSE)
names(e1)<-0:(length(e1)-1)

# e2: topic id (named 0-based) -> its top-20 document indices
e2<-sapply(1:ncol(top_20_docs_per_topic),function(r) top_20_docs_per_topic[,r],simplify=FALSE)
names(e2)<-0:(length(e2)-1)

# e3: per-document topic proportions (fraction of the document's tokens
# assigned to each topic in the final sample)
e3<-lapply(model$assignments,function(r) {
a0<-table(r)
a1<-as.numeric(a0/length(r))
names(a1)<-names(a0)
return(a1)
})
# NOTE(review): e3 is named 1-based while e1/e2 above are named 0-based —
# confirm whether downstream consumers expect document ids starting at 1.
names(e3) <- 1:length(e3)

json_doc_list <- list(TOP_FEATURES=e1, TOP_DOCS=e2, DOC_TOPICS=e3)
json_docs <- toJSON(json_doc_list)
writeLines(json_docs, "output.json")

#write.table(top_10_topic_words, file=args[4], quote=FALSE, row.names=FALSE, col.names=FALSE)
#write.table(top_20_docs_per_topic, file=args[5], quote=FALSE, row.names=FALSE, col.names=FALSE)
# Model assignments of topics per word per doc (post process in python)
#lapply(model$assignments, function(x) write.table(t(data.frame(x)), file="assignments.dat", append=TRUE, quote=FALSE, row.names=FALSE, col.names=FALSE))

#!/usr/bin/python

# printtopics.py: Prints the words that are most prominent in a set of
# topics.
#
# Copyright (C) 2010 Matthew D. Hoffman
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .

import sys

def loadtxt(filename):
    # Parse vw's --readable_model output: skip everything up to and including
    # the header line starting with 'lda:', then read one row of floats per
    # remaining line.  The result is transposed, i.e. one row per topic.
    data = []
    passed_header = False
    for line in open(filename).readlines():
        line = line.strip()
        if passed_header:
            data.append(map(float, line.split()))
        elif line.startswith('lda:'):
            passed_header = True
    data = zip(*data) # transpose data
    return data


def main():
    """
    Displays topics fit by vw's LDA. The first column gives the
    (expected) most prominent words in the topics, the second column
    gives their (expected) relative prominence.
    """
    if len(sys.argv) != 3:
        print >>sys.stderr, "Usage: vw-printtopics.py vocab-file topic-score-file"
        sys.exit(1)
    vocab = str.split(file(sys.argv[1]).read())
    testlambda = loadtxt(sys.argv[2])

    for k in range(0, len(testlambda)):
        lambdak = testlambda[k]

        # pitch extra topic rows
        # NOTE(review): this keeps len(vocab)-1 entries, dropping the last
        # vocabulary slot — confirm the intended off-by-one here.
        lambdak = lambdak[0:(len(vocab)-1)]

        # normalize row
        the_sum = sum(lambdak)
        lambdak = [val / the_sum for val in lambdak]

        # resort by normalized value
        temp = zip(lambdak, range(0, len(lambdak)))
        temp = sorted(temp, key = lambda x: x[0], reverse=True)
        print 'topic %d:' % (k)
        # Print the 20 most prominent words per topic; adjust the range bound
        # to taste for your screen.  (A previous comment referred to "53".)
        for i in range(0, 20):
            print '%20s \t---\t %.4f' % (vocab[temp[i][1]], temp[i][0])
        print

if __name__ == '__main__':
    main()

#!/bin/bash
# vw-lda.sh DATA K B FEATURE_INDEX — run Vowpal Wabbit LDA over DATA with K
# topics and B hash bits, then pretty-print the topics via vowpalwabbit.py.

DATA=$1
K=$2
ALPHA=0.1
RHO=0.1
# D = number of documents (lines) in the input
D=`wc -l $DATA | cut -f 1 -d " "`
echo $D
B=$3
FEATURE_INDEX=$4
POWER_T=0.5
INITIAL_T=1.0
BATCH_SIZE=256

rm /tmp/vw.cache
./vowpal_wabbit/vw "$DATA" --lda "$K" --lda_alpha "$ALPHA" --lda_rho "$RHO" --lda_D "$D" --minibatch "$BATCH_SIZE" --power_t "$POWER_T" --initial_t "$INITIAL_T" -b "$B" --cache_file /tmp/vw.cache --passes 10 -p "lda-$K-predictions.dat" --readable_model "lda-$K-topics.dat"
python vowpalwabbit.py -t "lda-$K-topics.dat" -f "$FEATURE_INDEX" > "lda-$K-topics.txt"

tests/*/output
lib

#! /bin/bash

# Build/test/clean helper for the textcluster transform.

# normalize work directory
wd=`dirname "$0"`
wd=`cd "$wd"; pwd`

cmd="--build"
if [[ "${#}" -eq "1" ]]; then
    if [[ "${1}" == --* ]]; then cmd=$1; fi
fi


dest=../../build/transforms/textcluster
case "${cmd}" in
    --build|--package)
        mkdir -p lib
        cd lib
        # Fetch third-party dependencies once; skipped when already cloned.
        [[ -d stemming ]] ||
            hg clone https://www.bitbucket.org/mchaput/stemming
        [[ -d textcluster ]] ||
            git clone https://github.com/davedash/textcluster.git
        cd ..
23 | 24 | mkdir -p "${dest}" 25 | cp textcluster run.py "${dest}/" 26 | rm -rf "${dest}/lib" 27 | cp -r lib "${dest}/lib" 28 | ;; 29 | --test) 30 | for d in $( ls tests ); do 31 | ./textcluster "tests/${d}" || exit 1 32 | pushd "tests/${d}" > /dev/null 33 | diff results.expected.json output/results.json || 34 | ( echo "Test '${d}': Result seems to be wrong"; exit 1 ) 35 | popd > /dev/null 36 | done 37 | ;; 38 | --clean) 39 | rm -rf ./lib 40 | find . -type f -name '*.pyc' | xargs rm 41 | rm -rf "${dest}" 42 | ;; 43 | --help) 44 | echo "Usage: ${0} [--build|--clean|--test]" 45 | ;; 46 | *) 47 | echo "Usage: ${0} [--build|--clean|--test]" 48 | exit 1 49 | ;; 50 | esac 51 | -------------------------------------------------------------------------------- /transforms/textcluster/run.py: -------------------------------------------------------------------------------- 1 | import json, sys 2 | 3 | from textcluster import Corpus 4 | 5 | 6 | def process(inStream, outStream, 7 | fields={"id": "id", "text": "text"}, 8 | limits={"clusters": 10, "top_documents": 10}): 9 | all = {} 10 | 11 | text_field = fields["text"] 12 | key_field = fields["id"] 13 | max_clusters = limits["clusters"] 14 | max_top_docs = limits["top_documents"] 15 | 16 | c = Corpus() 17 | for line in inStream: 18 | data = line.split('\t', 1)[1] 19 | doc = json.loads(data.decode("utf8")) 20 | key = doc[key_field] 21 | all[key] = doc 22 | text = c.add((key, doc[text_field]), key=key) 23 | 24 | clusters = c.cluster() 25 | results = [] 26 | for c in clusters[:max_clusters]: 27 | tophits = [c.primary] 28 | tophits += [hit["object"] for hit in c.similars[:max_top_docs-1]] 29 | topdocs = [] 30 | for (key, text) in tophits: 31 | topdocs.append(all[key]) 32 | results.append({"top_documents": topdocs}) 33 | 34 | json.dump({"clusters": results}, outStream) 35 | 36 | 37 | def main(args): 38 | work_dir = args[1] 39 | parameters = json.load(open("%s/parameters.json" % work_dir)) 40 | with open("%s/input.json.tsv" % work_dir) 
as inFile: 41 | with open("%s/output/results.json" % work_dir, "w+") as outFile: 42 | process(inFile, outFile, **parameters) 43 | 44 | if __name__ == "__main__": 45 | main(sys.argv) 46 | -------------------------------------------------------------------------------- /transforms/textcluster/tests/small/parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": { 3 | "id": "id", 4 | "text": "text" 5 | }, 6 | "limits": { 7 | "clusters": 10, 8 | "top_documents": 10 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /transforms/textcluster/tests/standard/parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": { 3 | "id": "id", 4 | "text": "text" 5 | }, 6 | "limits": { 7 | "clusters": 10, 8 | "top_documents": 10 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /transforms/textcluster/textcluster: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export PYTHONPATH="lib/stemming:lib/textcluster:$PYTHONPATH" 4 | 5 | fail() { 6 | echo $1 7 | exit 1 8 | } 9 | 10 | [[ -d "${1}" ]] || fail "usage: ${0} WORKDIR" 11 | 12 | mkdir -p "${1}/output" 13 | env python run.py "${1}" || exit 1 14 | --------------------------------------------------------------------------------