├── .gitignore ├── LICENSE ├── README.md ├── docs ├── Makefile ├── architecture.rst ├── batch_system.rst ├── conf.py ├── filters.rst ├── hacking.rst ├── index.rst ├── install ├── installation.rst ├── introduction.rst ├── queries.rst ├── rest_api.rst ├── todo.rst ├── transforms.rst └── usage.rst ├── filters └── language │ ├── lib │ └── language-profiles │ │ ├── af │ │ ├── ar │ │ ├── bg │ │ ├── bn │ │ ├── cs │ │ ├── da │ │ ├── de │ │ ├── el │ │ ├── en │ │ ├── es │ │ ├── fa │ │ ├── fi │ │ ├── fr │ │ ├── gu │ │ ├── he │ │ ├── hi │ │ ├── hr │ │ ├── hu │ │ ├── id │ │ ├── it │ │ ├── ja │ │ ├── kn │ │ ├── ko │ │ ├── mk │ │ ├── ml │ │ ├── mr │ │ ├── ne │ │ ├── nl │ │ ├── no │ │ ├── pa │ │ ├── pl │ │ ├── pt │ │ ├── ro │ │ ├── ru │ │ ├── sk │ │ ├── so │ │ ├── sq │ │ ├── sv │ │ ├── sw │ │ ├── ta │ │ ├── te │ │ ├── th │ │ ├── tl │ │ ├── tr │ │ ├── uk │ │ ├── ur │ │ ├── vi │ │ ├── zh-cn │ │ └── zh-tw │ └── src │ └── main │ └── java │ └── com │ └── mozilla │ └── grouperfish │ └── text │ ├── Dictionary.java │ └── filter │ └── LanguageFilter.java ├── install ├── integration-test ├── .gitignore ├── config │ ├── elasticsearch.yml │ └── hazelcast.xml ├── install ├── pom.xml └── src │ └── test │ ├── java │ └── com │ │ └── mozilla │ │ └── grouperfish │ │ └── integration │ │ ├── IntegrationTestHelper.java │ │ ├── batch │ │ └── RunResourceTest.java │ │ └── rest │ │ ├── ConfigurationsResourceTest.java │ │ ├── DocumentLoaderTest.java │ │ ├── DocumentsResourceTest.java │ │ └── QueriesResourceTest.java │ └── resources │ └── ng_integration.xml ├── project ├── VERSION └── pom.xml ├── service ├── .gitignore ├── bin │ ├── create_hbase_tables │ ├── grouperfish │ └── littlefish ├── conf │ ├── elasticsearch.yml │ ├── elasticsearch_hc.yml │ ├── grouperfish.properties │ ├── hazelcast.xml │ └── hazelcast_hbase.xml ├── install ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── mozilla │ │ │ └── grouperfish │ │ │ ├── base │ │ │ ├── ArrayTool.java │ │ │ ├── Assert.java │ │ │ ├── Box.java │ │ 
│ ├── Configuration.java │ │ │ ├── ImmutableTools.java │ │ │ ├── PropertiesTool.java │ │ │ ├── SlugTool.java │ │ │ ├── StreamTool.java │ │ │ └── json │ │ │ │ ├── JsonValidator.java │ │ │ │ ├── MapStreamer.java │ │ │ │ └── TsvJsonWriter.java │ │ │ ├── batch │ │ │ ├── api │ │ │ │ ├── BatchService.java │ │ │ │ └── guice │ │ │ │ │ └── BatchSystem.java │ │ │ ├── handlers │ │ │ │ ├── CleanupHandler.java │ │ │ │ ├── FetchHandler.java │ │ │ │ ├── PutHandler.java │ │ │ │ ├── RunHandler.java │ │ │ │ ├── SequentialHandler.java │ │ │ │ └── TaskHandler.java │ │ │ ├── scheduling │ │ │ │ ├── AbstractBatchService.java │ │ │ │ ├── Helpers.java │ │ │ │ ├── PipeliningBatchService.java │ │ │ │ ├── SingleQueueBatchService.java │ │ │ │ ├── SynchronousBatchService.java │ │ │ │ └── Worker.java │ │ │ └── transforms │ │ │ │ ├── ExecutableTransform.java │ │ │ │ ├── HadoopTransform.java │ │ │ │ ├── LocalTransform.java │ │ │ │ ├── Transform.java │ │ │ │ └── TransformProvider.java │ │ │ ├── bootstrap │ │ │ └── Grouperfish.java │ │ │ ├── model │ │ │ ├── Access.java │ │ │ ├── Document.java │ │ │ ├── Fail.java │ │ │ ├── NamedSource.java │ │ │ ├── Query.java │ │ │ ├── Task.java │ │ │ ├── TransformConfig.java │ │ │ └── Type.java │ │ │ ├── naming │ │ │ ├── Namespace.java │ │ │ └── Scope.java │ │ │ ├── rest │ │ │ ├── api │ │ │ │ └── RestService.java │ │ │ ├── jaxrs │ │ │ │ ├── ConfigurationsResource.java │ │ │ │ ├── DocumentsResource.java │ │ │ │ ├── HttpAccess.java │ │ │ │ ├── QueriesResource.java │ │ │ │ ├── ResourceBase.java │ │ │ │ ├── RestHelper.java │ │ │ │ ├── ResultsResource.java │ │ │ │ └── RunResource.java │ │ │ └── jersey │ │ │ │ ├── JerseyGuiceRestService.java │ │ │ │ └── ResourceConfig.java │ │ │ ├── services │ │ │ ├── api │ │ │ │ ├── FileSystem.java │ │ │ │ ├── Grid.java │ │ │ │ ├── Index.java │ │ │ │ ├── IndexProvider.java │ │ │ │ └── guice │ │ │ │ │ ├── Local.java │ │ │ │ │ ├── Services.java │ │ │ │ │ └── Shared.java │ │ │ ├── elasticsearch │ │ │ │ ├── ElasticSearchIndex.java │ │ │ │ 
└── ElasticSearchIndexProvider.java │ │ │ ├── hadoop │ │ │ │ └── HadoopFileSystem.java │ │ │ ├── hazelcast │ │ │ │ └── HazelcastGrid.java │ │ │ ├── local │ │ │ │ └── LocalFileSystem.java │ │ │ └── mock │ │ │ │ ├── MockFs.java │ │ │ │ ├── MockGrid.java │ │ │ │ └── MockIndex.java │ │ │ └── util │ │ │ ├── loader │ │ │ ├── DocumentLoader.java │ │ │ └── Loader.java │ │ │ └── logback │ │ │ └── AnsiColorConverter.java │ └── resources │ │ ├── logback-stdout.xml │ │ └── logback.xml │ └── test │ ├── java │ └── com │ │ └── mozilla │ │ └── grouperfish │ │ ├── base │ │ ├── AssertTest.java │ │ ├── SlugToolTest.java │ │ ├── StreamToolTest.java │ │ └── json │ │ │ ├── JsonValidatorTest.java │ │ │ └── MapStreamerTest.java │ │ ├── model │ │ ├── DocumentTest.java │ │ └── DummyAccess.java │ │ ├── naming │ │ └── ScopeTest.java │ │ ├── rest │ │ └── jaxrs │ │ │ └── RestHelperTest.java │ │ └── unit │ │ └── UnitTestHelper.java │ └── resources │ ├── config │ └── hazelcast.xml │ └── ng_unit.xml ├── tools ├── display │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── mozilla │ │ └── grouperfish │ │ └── mahout │ │ └── clustering │ │ └── display │ │ ├── kmeans │ │ ├── DisplayKMeansBase.java │ │ ├── OriginalText.java │ │ └── WordCloud.java │ │ └── lda │ │ ├── DisplayLDABase.java │ │ ├── DisplayLDATopics.java │ │ └── OriginalText.java ├── firefox_input │ ├── .gitignore │ ├── README.md │ ├── install │ ├── load_opinions │ ├── pom.xml │ └── src │ │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── mozilla │ │ │ └── grouperfish │ │ │ └── tools │ │ │ └── firefox_input │ │ │ ├── OpinionLoader.java │ │ │ ├── OpinionStream.java │ │ │ ├── TsvJsonFromInputTsv.java │ │ │ └── TsvReader.java │ │ └── test │ │ ├── java │ │ └── com │ │ │ └── mozilla │ │ │ └── grouperfish │ │ │ └── tools │ │ │ └── firefox_input │ │ │ ├── OpinionStreamTest.java │ │ │ └── TsvReaderTest.java │ │ └── resources │ │ └── ng_unit.xml └── webui │ ├── public │ ├── css │ │ └── topics.css │ └── js │ │ ├── d3.js │ │ ├── jquery.isotope.min.js 
│ │ ├── jquery.js │ │ └── toy_topics.js │ └── topics.html └── transforms ├── coclustering ├── INSTALL.MD ├── coclustering ├── install ├── pom.xml └── src │ ├── assembly │ └── job.xml │ └── main │ ├── java │ └── com │ │ └── mozilla │ │ └── grouperfish │ │ └── transforms │ │ └── coclustering │ │ ├── display │ │ ├── CoCluster.java │ │ └── WriteCoClusteringOutput.java │ │ ├── lucene │ │ └── analysis │ │ │ └── en │ │ │ ├── EnglishAnalyzer.java │ │ │ ├── NGramEnglishAnalyzer.java │ │ │ └── ShingleAllStopFilter.java │ │ ├── pig │ │ ├── eval │ │ │ ├── mahout │ │ │ │ └── Vectorizer.java │ │ │ └── text │ │ │ │ ├── ConvertDocumentIDToID.java │ │ │ │ ├── ConvertFeatureToID.java │ │ │ │ ├── NGramTokenize.java │ │ │ │ ├── TermFrequency.java │ │ │ │ ├── Tokenize.java │ │ │ │ └── UnigramExtractor.java │ │ └── storage │ │ │ ├── KMeansOutputLoader.java │ │ │ └── MahoutVectorStorage.java │ │ └── text │ │ └── Dictionary.java │ ├── json_sample_files │ ├── parameters.json │ ├── results.json │ └── tags.json │ ├── pig │ ├── co_cluster_Z_generator.pig │ ├── co_cluster_generate_tags.pig │ ├── co_cluster_normalized_matrix_generator.pig │ └── co_cluster_preprocessor.pig │ └── python │ └── cocluster.py ├── commons ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── mozilla │ │ └── grouperfish │ │ ├── lucene │ │ └── analysis │ │ │ └── en │ │ │ ├── EnglishAnalyzer.java │ │ │ ├── NGramEnglishAnalyzer.java │ │ │ └── ShingleAllStopFilter.java │ │ ├── pig │ │ ├── eval │ │ │ ├── ml │ │ │ │ ├── TFIDFVectorizer.java │ │ │ │ ├── TFVectorizer.java │ │ │ │ └── Vectorizer.java │ │ │ └── text │ │ │ │ ├── NGramTokenize.java │ │ │ │ ├── TermFrequency.java │ │ │ │ └── Tokenize.java │ │ └── storage │ │ │ ├── LDACStorage.java │ │ │ ├── MahoutVectorStorage.java │ │ │ └── VWStorage.java │ │ └── text │ │ └── Dictionary.java │ └── pig │ ├── generate_document_vectors.pig │ ├── generate_feature_index.pig │ ├── generate_ngram_feature_index.pig │ ├── generate_sequence_files.pig │ ├── 
generate_tf_document_vectors.pig │ └── generate_tfidf_document_vectors.pig ├── count └── count ├── lda_gensim ├── src │ └── python │ │ ├── filter.py │ │ └── lda.py └── stopwords-en.txt ├── lda_r └── src │ ├── R │ └── lda.r │ └── python │ └── convert_r_to_grouperfish.py ├── lda_vw ├── src │ └── main │ │ └── python │ │ ├── vowpalwabbit.py │ │ └── vw-printtopics.py └── vw-lda.sh └── textcluster ├── .gitignore ├── install ├── run.py ├── tests ├── small │ ├── input.json.tsv │ ├── parameters.json │ └── results.expected.json └── standard │ ├── input.json.tsv │ ├── parameters.json │ └── results.expected.json └── textcluster /.gitignore: -------------------------------------------------------------------------------- 1 | grouperfish-*.tar.gz 2 | conf/grouperfish.json 3 | .project 4 | .classpath 5 | .settings/ 6 | target/ 7 | data/ 8 | docs/_build 9 | build 10 | *.jar 11 | *.pyc 12 | .DS_Store 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ***** BEGIN LICENSE BLOCK ***** 2 | Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | 4 | The contents of this file are subject to the Mozilla Public License Version 5 | 1.1 (the "License"); you may not use this file except in compliance with 6 | the License. You may obtain a copy of the License at 7 | http://www.mozilla.org/MPL/ 8 | 9 | Software distributed under the License is distributed on an "AS IS" basis, 10 | WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | for the specific language governing rights and limitations under the 12 | License. 13 | 14 | The Original Code is Mozilla Grouperfish. 15 | 16 | The Initial Developer of the Original Code is Mozilla. 17 | Portions created by the Initial Developer are Copyright (C) 2011 18 | the Initial Developer. All Rights Reserved. 
19 | 20 | Contributor(s): 21 | Michael Kurze 22 | 23 | Alternatively, the contents of this file may be used under the terms of 24 | either the GNU General Public License Version 2 or later (the "GPL"), or 25 | the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | in which case the provisions of the GPL or the LGPL are applicable instead 27 | of those above. If you wish to allow use of your version of this file only 28 | under the terms of either the GPL or the LGPL, and not to allow others to 29 | use your version of this file under the terms of the MPL, indicate your 30 | decision by deleting the provisions above and replace them with the notice 31 | and other provisions required by the GPL or the LGPL. If you do not delete 32 | the provisions above, a recipient may use your version of this file under 33 | the terms of any one of the MPL, the GPL or the LGPL. 34 | 35 | ***** END LICENSE BLOCK ***** 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grouperfish 2 | 3 | ## A Document Transformation Engine 4 | 5 | The nascent Grouperfish project aims to provide a simple, online, scalable text 6 | clustering solution as a REST/JSON service. Initially this service is needed to 7 | drive sites and themes for [Firefox Input](http://input.mozilla.com), as 8 | described in 9 | [mozilla bug 629019](https://bugzilla.mozilla.org/show_bug.cgi?id=629019). 10 | 11 | The main service is written in Java. Individual algorithms can use varying 12 | technologies and platform. 13 | 14 | For more extensive documentation, 15 | [read the docs](http://grouperfish.readthedocs.org) 16 | -------------------------------------------------------------------------------- /docs/filters.rst: -------------------------------------------------------------------------------- 1 | .. 
_filters: 2 | 3 | ======= 4 | Filters 5 | ======= 6 | 7 | As of Grouperfish 0.1, filters are not yet available. 8 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ########### 2 | Grouperfish 3 | ########### 4 | 5 | .. note:: 6 | This documentation serves as a specification. 7 | It describes a system that has not reached a usable state yet. 8 | 9 | 10 | Contents: 11 | 12 | .. toctree:: 13 | :maxdepth: 2 14 | 15 | introduction 16 | architecture 17 | installation 18 | rest_api 19 | usage 20 | filters 21 | batch_system 22 | transforms 23 | queries 24 | todo 25 | hacking 26 | 27 | * :ref:`search` 28 | 29 | -------------------------------------------------------------------------------- /docs/install: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | cmd="--build" 4 | if [[ "${#}" -eq "1" ]]; then 5 | if [[ "${1}" == --* ]]; then 6 | cmd=$1 7 | fi 8 | fi 9 | 10 | case "${cmd}" in 11 | --build|--package) 12 | make html || exit 1 13 | mkdir -p ../build/docs 14 | cp -rf _build/html ../build/docs 15 | ;; 16 | --clean) 17 | make clean 18 | rm -rf ../build/docs 19 | rm -rf _build 20 | ;; 21 | --help) 22 | "Usage: ${0} [--build|--clean]" 23 | ;; 24 | esac 25 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Prerequisites 9 | ------------- 10 | 11 | These are the requirements to run Grouperfish. 12 | For development, see :ref:`hacking`. 13 | 14 | * A machine running a **Unix-style OS** (such as *Linux*). 15 | 16 | Support for windows currently not planned (and probably not easy to add). 
17 | 18 | So far, we have been using Red Hat 5.2 19 | and -- for development -- Mac OS X 10.6+. 20 | 21 | * **JRE 6** or higher 22 | 23 | * **Python 2.6** or higher (*not* tested with 3.x) 24 | 25 | * **ElasticSearch 0.17.6** 26 | 27 | The ElasticSearch cluster does not need to be running on the same machines as 28 | Grouperfish. For Hadoop/HBase you will need to make sure that the 29 | configuration is on your classpath (easiest with a local installation). 30 | 31 | 32 | Prepare your installation 33 | ------------------------- 34 | 35 | * Obtain a grouperfish tarball [#]_ and unpack it into a directory of your choice. 36 | 37 | :: 38 | 39 | > tar xzf grouperfish-0.1.tar 40 | 41 | > cd grouperfish-0.1 42 | 43 | * Under ``config``, modify the ``elasticsearch.yml`` and 44 | ``elasticsearch_hc.yml`` so that Grouperfish will be able to discover your 45 | cluster. 46 | **Advanced:** You can modify the ``elasticsearch.yml`` to make 47 | each Grouperfish instance run its own ElasticSearch data node. By default, 48 | Grouperfish depends on joining an existing cluster though. Refer to the 49 | `ElasticSearch configuration documentation`_ for details. 50 | 51 | .. _`ElasticSearch configuration documentation`: 52 | http://www.elasticsearch.org/guide/reference/setup/configuration.html 53 | 54 | * In the ``hazelcast.xml``, have a look at ```` section. 55 | If your network does not support multicast based discovery, make changes 56 | as described in the `Hazelcast documentation`_. 57 | 58 | .. _`Hazelcast documentation`: 59 | http://www.hazelcast.com/docs/1.9.4/manual/multi_html/ch09.html 60 | 61 | .. [#] right now, the only way is to build it from source. See :ref:`hacking`. 62 | 63 | 64 | Launch the daemon 65 | ----------------- 66 | 67 | To run grouperfish (currently, no service wrapper is available): 68 | 69 | :: 70 | 71 | grouperfish-0.1> ./bin/grouperfish -f 72 | 73 | Grouperfish will be listening on port 61732 74 | (mnemonic: ``FISH = 0xF124 = 61732``). 
75 | 76 | You can safely ignore the logback warning (which will only appear with ``-f`` 77 | given). It is due to an `error`_ in logback. 78 | 79 | .. _error: http://jira.qos.ch/browse/LBCORE-198 80 | 81 | Omit the ``-f`` to run grouperfish as a background process, detached from your 82 | shell. You can use ``jps`` to determine the process id. 83 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | Grouperfish is built to perform text clustering for `Firefox Input`_. 5 | Due to its generic nature, it also serves as a testbed to prototype machine 6 | learning algorithms. 7 | 8 | .. _Firefox Input: http://input.mozilla.com 9 | 10 | How does it work? 11 | ----------------- 12 | 13 | Grouperfish is a *document transformation system*, for high throughput 14 | applications. 15 | 16 | Roughly summarized: 17 | 18 | * users put *documents* into Grouperfish using a REST interface 19 | 20 | * *transformations* are performed on one or several subsets of these documents. 21 | 22 | * *results* can be retrieved by users over the REST interface 23 | 24 | * all components are distributed for high volume applications 25 | 26 | 27 | What can be done? 28 | """"""""""""""""" 29 | 30 | Assume a scenario where a steady stream of documents is generated. 31 | For example: 32 | 33 | * user feedback 34 | * software crash reports 35 | * twitter messages 36 | 37 | Now, these documents can be processed to make them more useful. 38 | For example: 39 | 40 | * clustering (grouping related documents together, detecting common topics) 41 | * classification (associating documents with predefined categories including 42 | spam) 43 | * trending (identifying new topics over time). 
44 | 45 | 46 | Vocabulary 47 | ---------- 48 | 49 | Grouperfish users can assume one of three roles (or any combination thereof): 50 | 51 | Document Producer 52 | Some user (usually another piece of software) that will 53 | put documents into the System. 54 | 55 | Result Consumer 56 | Some user/software that gets the generated results. 57 | 58 | Admin 59 | A user who configures which subsets of documents to transform, but also 60 | how and when to do that. 61 | 62 | 63 | -------------------------------------------------------------------------------- /docs/queries.rst: -------------------------------------------------------------------------------- 1 | .. _queries: 2 | 3 | ======= 4 | Queries 5 | ======= 6 | 7 | Concrete Queries 8 | ---------------- 9 | 10 | A concrete query is just a regular ElasticSearch query, e.g.: 11 | 12 | :: 13 | 14 | { 15 | "query": { 16 | "bool": { 17 | "must": [ 18 | {"field": {"os": "Android"}}, 19 | {"field": {"platform": "ARM"}}, 20 | ] 21 | } 22 | } 23 | } 24 | 25 | All documents matching this query will be processed together in a batch run. 26 | 27 | .. note:: 28 | Find the full `Query DSL documentation`_ on the ElasticSearch Website. 29 | 30 | .. _`Query DSL documentation`: 31 | http://www.elasticsearch.org/guide/reference/query-dsl/ 32 | 33 | 34 | .. _template-queries: 35 | 36 | 37 | Template Queries 38 | ---------------- 39 | 40 | A template query will generate a bunch of concrete queries every time it is 41 | evaluated. It is different in that it has an additional top-level field 42 | "facet_by", which is a list of field names. 
43 | 44 | Let us assume we have these documents in our namespace: 45 | 46 | :: 47 | 48 | {"id": 1, "desc": "Why do you crash?", "os": "win7", "platform": "x64"}, 49 | {"id": 2, "desc": "Don't crash plz", "os": "xp", "platform": "x86"}, 50 | {"id": 3, "desc": "It doesn't crash!", "os": "win7", "platform": "x86"}, 51 | {"id": 3, "desc": "Over 9000!", "os": "linux", "platform": "x86"}, 52 | 53 | 54 | And this template query: 55 | 56 | :: 57 | 58 | { 59 | "query": {"text": {"desc": "crash"}}, 60 | "facet_by": ["platform", "os"] 61 | } 62 | 63 | 64 | This will generate the following set of queries: 65 | 66 | :: 67 | 68 | {"query": {"filtered": 69 | {"query": {"text": {"desc": "crash"}}, "filter": {"and": [ 70 | {"field": {"os": "win7"}}, 71 | {"field": {"platform": "x64"}}, 72 | ]}}}} 73 | {"query": {"filtered": 74 | {"query": {"text": {"desc": "crash"}}, "filter": {"and": [ 75 | {"field": {"os": "win7"}}, 76 | {"field": {"platform": "x86"}}, 77 | ]}}}} 78 | {"query": {"filtered": 79 | {"query": {"text": {"desc": "crash"}}, "filter": {"and": [ 80 | {"field": {"os": "xp"}}, 81 | {"field": {"platform": "x86"}}, 82 | ]}}}} 83 | 84 | Note that no query for ``os=linux`` is generated in this case, because the 85 | query for ``crash`` does not match any document with that ``os`` in the first 86 | place. 87 | -------------------------------------------------------------------------------- /docs/todo.rst: -------------------------------------------------------------------------------- 1 | .. 
_todo: 2 | 3 | ===== 4 | To Do 5 | ===== 6 | 7 | These components are not necessarily listed in the order they need to be 8 | implemented: 9 | 10 | * Filtering functionality (:ref:`filters`) 11 | 12 | * Language detection filter 13 | 14 | * Allow clients to extract sub-results from a result doc (using JSON paths) 15 | 16 | * Add template Queries 17 | 18 | * Add tagging of ElasticSearch documents based on transform results 19 | 20 | * :ref:`Transforms` 21 | 22 | * Co-Clustering 23 | 24 | * LDA 25 | 26 | * Validate configuration pieces based on a schema, specific to 27 | each filter/transform 28 | 29 | * JS client library (possibly hook in with ``pyes``) 30 | E.g. to be used by the admin interface. 31 | 32 | * Admin interface 33 | 34 | * Python client library (possibly hook in with ``pyes``) 35 | 36 | * Online service for ad-hoc requests 37 | 38 | * Define online API (Client/server? JVM using Jython etc.?) 39 | 40 | * Integrate a fast clustering algorithm for this 41 | 42 | -------------------------------------------------------------------------------- /docs/transforms.rst: -------------------------------------------------------------------------------- 1 | .. _transforms: 2 | 3 | ========== 4 | Transforms 5 | ========== 6 | 7 | Transforms are the heart of Grouperfish. They generate the results that will 8 | actually be interesting to consumers. 9 | 10 | Note: The minimal transform interface is defined by the :ref:`batch_system` 11 | 12 | 13 | Transform Configuration 14 | ----------------------- 15 | 16 | The same transform (e.g. a clustering algorithm) might be used with different 17 | parameters to generate different results. For this reason, the system 18 | contains a *transform configurations* for each result that should be 19 | generated. 20 | 21 | Primarily, a transform configuration parameterizes its transform (e.g. for 22 | clustering, it might specify the desired number of clusters). 
It can also be 23 | used to tell the Grouperfish batch system how to interact with a transform. 24 | 25 | Currently, a transform configuration is a JSON document with two fields: The 26 | *transform* determines which piece of software to use, and *parameters* tells 27 | that software what to do. 28 | Example configuration for the *textcluster* transform: 29 | 30 | :: 31 | 32 | { 33 | "transform": "textcluster", 34 | "parameters": { 35 | "fields": {"id": "id", "text": "text"}, 36 | "limits": {"clusters": 10,"top_documents": 10} 37 | } 38 | } 39 | 40 | 41 | Result Types 42 | ------------ 43 | 44 | Topics (or Clusters) 45 | ^^^^^^^^^^^^^^^^^^^^ 46 | 47 | Clustering transforms try to extract the main topics from a set of documents. 48 | As of Grouperfish version 0.1, the only available transform is a clustering 49 | transform named textcluster. The results of clustering transform are topics, 50 | the structure of the result is as follows: 51 | 52 | :: 53 | 54 | { 55 | "clusters": [ 56 | { 57 | "top_documents": [{...}, {...}, ..., {...}], 58 | "top_terms": ["Something", "Else", ..., "Another"] 59 | }, 60 | ... 61 | ] 62 | } 63 | 64 | Depending on the actually configured transform, only top documents *or* top 65 | terms might be generated for a topic. Also, any given transform might add 66 | other top-level fields than just *clusters*. 67 | 68 | 69 | Available Transforms 70 | -------------------- 71 | 72 | textcluster 73 | ^^^^^^^^^^^ 74 | 75 | Textcluster is a relatively simple clustering algorithm written in Python by 76 | Dave Dash for Firefox Input. It is very fast for small input sets, but 77 | requires a lot of memory, especially when processing more than 10,000 78 | documents at a time. Textcluster is `available on github`__. 79 | 80 | .. __: https://github.com/davedash/textcluster 81 | 82 | In Grouperfish, you can select how many topics you want textcluster to 83 | extract, and how many documents to include in the results for each topic. 
84 | 85 | * Parameters 86 | 87 | :: 88 | 89 | { 90 | "fields": { 91 | "id": "id", 92 | "text": "text" 93 | }, 94 | "limits": { 95 | "clusters": 10, 96 | "top_documents": 10 97 | } 98 | } 99 | 100 | These are the default parameters (top 10 topics/clusters, 101 | with 10 documents each). 102 | 103 | 104 | * Results 105 | 106 | Textcluster uses the standard clustering result format (see above), but does 107 | not inclue top terms, only documents. 108 | 109 | -------------------------------------------------------------------------------- /install: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | self="${0}" 4 | usage() { 5 | echo "Usage: ${self} [--help | --package | (--build|--clean) [PATH]]" 6 | echo " --help Show this message." 7 | echo " --clean Discard all build results and intermediate files." 8 | echo " --build Install components to ./build (default)" 9 | echo " --package Create a complete grouperfish tarball." 10 | echo " Cleans and rebuilds every component first." 11 | echo 12 | echo " PATH The component to build. Examples:" 13 | echo " ./service" 14 | echo " ./transforms/coclustering" 15 | echo " If omitted, everything is built." 16 | echo 17 | echo "This script must be called from the root directory of the project." 18 | echo 19 | } 20 | 21 | cmd="--build" 22 | all="YES" 23 | what="project docs service transforms/* tools/* filters/* integration-test" 24 | 25 | if [[ "${#}" -gt "2" ]]; then 26 | usage 27 | exit 1 28 | fi 29 | 30 | if [[ "${#}" -eq "2" ]]; then 31 | cmd=$1 32 | all="NO" 33 | what=$2 34 | fi 35 | 36 | if [[ "${#}" -eq "1" ]]; then 37 | if [[ "${1}" == -* ]]; then 38 | cmd=$1 39 | else 40 | all="NO" 41 | what=$1 42 | fi 43 | fi 44 | 45 | fail() { 46 | echo "Build aborted: ${1}" 47 | exit 1 48 | } 49 | 50 | clean() { 51 | what=$1 52 | for component in $what; do 53 | if [[ -x "$component/install" ]]; then 54 | echo $'\n\n'"Cleaning $component ..." 
55 | ( cd "${component}" 56 | ./install --clean || fail "Clean of '${component}' failed." ) 57 | fi 58 | rm -rf "./build/${component}" 59 | done 60 | if [[ "YES" = "${all}" ]]; then 61 | rm -rf ./build 62 | version="$(cat ./project/VERSION)" 63 | rm -f "./grouperfish-${version}.tar.gz" 64 | cd ./project && mvn clean ; cd .. 65 | fi 66 | } 67 | 68 | build() { 69 | # either --build or --release 70 | mode=$1 71 | what=$2 72 | if [[ "YES" = "${all}" ]]; then 73 | cd ./project && mvn install ; cd .. 74 | fi 75 | for component in $what; do 76 | if [[ -x "$component/install" ]]; then 77 | echo $'\n\n'"Installing $component ..." 78 | ( cd "${component}" 79 | ./install $mode || fail "Installation of '${component}' failed." ) 80 | else 81 | echo "Copying $component ..." 82 | mkdir -p "build/${component}/" 83 | cp -r "${component}"/* "build/${component}/" 84 | fi 85 | done 86 | } 87 | 88 | package() { 89 | what=$1 90 | version="$(cat ./project/VERSION)" 91 | clean "${what}" 92 | build --package "${what}" 93 | mv ./build "./grouperfish-${version}" 94 | tar czf "grouperfish-${version}.tar.gz" "./grouperfish-${version}" 95 | } 96 | 97 | case "${cmd}" in 98 | --help) 99 | usage 100 | ;; 101 | --build) 102 | build --build "${what}" 103 | ;; 104 | --clean) 105 | clean "${what}" 106 | ;; 107 | --package) 108 | package "${what}" 109 | ;; 110 | *) 111 | usage 112 | exit 1 113 | ;; 114 | esac 115 | -------------------------------------------------------------------------------- /integration-test/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | data 3 | -------------------------------------------------------------------------------- /integration-test/config/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | # Cluster Settings 2 | cluster: 3 | name: gfintegration 4 | 5 | path: 6 | data: data/elasticsearch 7 | 
-------------------------------------------------------------------------------- /integration-test/install: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | cmd="--build" 4 | if [[ "${#}" -eq "1" ]]; then 5 | if [[ "${1}" == --* ]]; then 6 | cmd=$1 7 | fi 8 | fi 9 | 10 | case "${cmd}" in 11 | --build|--package) 12 | mvn test || exit 1 13 | ;; 14 | --clean) 15 | mvn clean 16 | rm -rf target 17 | ;; 18 | --help) 19 | "Usage: ${0} [--build|--clean]" 20 | ;; 21 | esac 22 | -------------------------------------------------------------------------------- /integration-test/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | grouperfish-integration-test 6 | ${grouperfishVersion} 7 | 8 | com.mozilla 9 | grouperfish-parent 10 | ../project 11 | FIXED 12 | 13 | 14 | https://github.com/mozilla-metrics/grouperfish 15 | 16 | 17 | UTF-8 18 | false 19 | 20 | 21 | 22 | 23 | 24 | com.mozilla 25 | grouperfish-service 26 | ${grouperfishVersion} 27 | 28 | 29 | 30 | com.jayway.restassured 31 | rest-assured 32 | 1.2.2 33 | test 34 | 35 | 36 | 37 | 38 | 39 | 40 | grouperfish-service 41 | 42 | 43 | 44 | 45 | org.apache.maven.plugins 46 | maven-compiler-plugin 47 | 2.3.2 48 | 49 | 1.6 50 | 1.6 51 | 52 | 53 | 54 | 55 | 56 | org.apache.maven.plugins 57 | maven-surefire-plugin 58 | 2.5 59 | 60 | ../build/ 61 | ${skip.tests.unit} 62 | -Xms128m -Xmx1024m -XX:PermSize=128m -XX:MaxPermSize=512m 63 | methods 64 | 1 65 | 66 | src/test/resources/ng_integration.xml 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /integration-test/src/test/java/com/mozilla/grouperfish/integration/IntegrationTestHelper.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.integration; 2 | 3 | //import org.apache.hadoop.conf.Configuration; 4 | //import 
org.apache.hadoop.hbase.HBaseConfiguration; 5 | //import org.apache.hadoop.hbase.LocalHBaseCluster; 6 | import org.testng.annotations.AfterGroups; 7 | import org.testng.annotations.BeforeGroups; 8 | import org.testng.annotations.BeforeTest; 9 | import org.testng.annotations.Test; 10 | 11 | import groovyx.net.http.ContentType; 12 | 13 | import com.mozilla.grouperfish.bootstrap.Grouperfish; 14 | 15 | import com.hazelcast.core.Hazelcast; 16 | import com.jayway.restassured.RestAssured; 17 | import com.mozilla.grouperfish.base.Assert; 18 | import com.mozilla.grouperfish.rest.jersey.JerseyGuiceRestService; 19 | 20 | 21 | @Test(groups="integration") 22 | public class IntegrationTestHelper { 23 | 24 | public static final int port = Grouperfish.DEFAULT_PORT + 100; 25 | static { 26 | setUpRestAssured(); 27 | } 28 | 29 | public static String NS = "integration"; 30 | 31 | // private LocalHBaseCluster hbase; 32 | 33 | private final Thread grouperfish = new Thread() { 34 | @Override 35 | public void run() { 36 | System.setProperty("hazelcast.config", "config/hazelcast.xml"); 37 | System.setProperty(JerseyGuiceRestService.PROPERTY_PORT, String.valueOf(port)); 38 | try { 39 | Grouperfish.main(new String[]{}); 40 | } 41 | catch (InterruptedException interrupt) { 42 | Hazelcast.getMap("documents_" + NS).destroy(); 43 | Thread.currentThread().interrupt(); 44 | } 45 | catch (Exception e) { 46 | Assert.unreachable(null, e); 47 | } 48 | } 49 | }; 50 | 51 | 52 | @BeforeGroups(groups="integration") 53 | void setUp() throws Exception { 54 | 55 | // Local HBaseCluster to use. 
56 | // hbase = new LocalHBaseCluster(HBaseConfiguration.create(new Configuration())); 57 | // hbase.startup(); 58 | // Thread.sleep(3000); 59 | 60 | // Set required bagheera configuration: 61 | 62 | // Give some time for Grouperfish (and especially HazelCast) to come up: 63 | grouperfish.start(); 64 | Thread.sleep(10000); 65 | 66 | setUpRestAssured(); 67 | } 68 | 69 | 70 | @AfterGroups(groups="integration") 71 | void tearDown() throws InterruptedException { 72 | grouperfish.interrupt(); 73 | Thread.sleep(2000); 74 | //hbase.shutdown(); 75 | //hbase.join(); 76 | } 77 | 78 | 79 | @BeforeTest(groups="integration") 80 | public static void setUpRestAssured() { 81 | RestAssured.baseURI = "http://127.0.0.1"; 82 | RestAssured.port = port; 83 | RestAssured.basePath = ""; 84 | RestAssured.requestContentType(ContentType.JSON); 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /integration-test/src/test/java/com/mozilla/grouperfish/integration/batch/RunResourceTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.integration.batch; 2 | 3 | public class RunResourceTest { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /integration-test/src/test/java/com/mozilla/grouperfish/integration/rest/QueriesResourceTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.integration.rest; 2 | 3 | import static com.jayway.restassured.RestAssured.expect; 4 | import static com.jayway.restassured.RestAssured.given; 5 | import static java.lang.String.format; 6 | 7 | import org.json.simple.JSONObject; 8 | import org.testng.annotations.Test; 9 | 10 | import com.mozilla.grouperfish.integration.IntegrationTestHelper; 11 | 12 | 13 | @Test(groups="integration") 14 | @SuppressWarnings({ "unchecked", "serial" }) 15 | public class QueriesResourceTest { 16 | 
    // Instantiating the helper ensures the shared server fixture class is
    // loaded (its static initializer configures REST-assured).
    final IntegrationTestHelper helper = new IntegrationTestHelper();
    final String NS = IntegrationTestHelper.NS;

    // ElasticSearch-style "match_all" query, serialized once for reuse.
    private static final String QUERY_ALL = (new JSONObject() {{
        put("query", new JSONObject(){{
            put("match_all", new JSONObject());
        }});
    }}).toJSONString();

    // Storing a well-formed query responds 201 Created.
    public void testPutQuery() {
        given().body(QUERY_ALL).
        expect().statusCode(201).
        when().put(format("/queries/%s/ALL", NS));
    }

    // An empty request body is rejected as a bad request.
    public void testPutTooEmpty() {
        given().body("").
        expect().statusCode(400).
        when().put(format("/queries/%s/Z", NS));
    }

    // // These tests cannot work yet (we first need to verify queries using ES).
    // public void testPutInvalidQuery() {
    //     ...
    // }
    //
    // public void testPutEmptyQuery() {
    //     given().body("{}").
    //     expect().statusCode(400).
    //     when().put(format("/queries/%s/MYBAD", NS));
    // }


    // Deleting an existing query responds 204 No Content.
    public void testDeleteQuery() {
        testPutQuery();
        expect().
        statusCode(204).
        when().delete(format("/queries/%s/ALL", NS));
    }

    // DELETE is idempotent: a repeated delete also responds 204.
    public void testRepeatDeleteQuery() {
        testPutQuery();
        expect().
        statusCode(204).
        when().delete(format("/queries/%s/ALL", NS));
        expect().
        statusCode(204).
        when().delete(format("/queries/%s/ALL", NS));
    }

    // A stored query can be fetched back.
    public void testGetQuery() {
        testPutQuery();
        expect().
        statusCode(200).
        when().get(format("/queries/%s/ALL", NS));
    }

    // Unknown query names respond 404.
    public void testNotFound() {
        expect().
        statusCode(404).
77 | when().get(format("/queries/%s/Yeti", NS)); 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /integration-test/src/test/resources/ng_integration.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /project/VERSION: -------------------------------------------------------------------------------- 1 | 0.1-SNAPSHOT 2 | -------------------------------------------------------------------------------- /service/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | data 3 | grouperfish.pid 4 | test-output 5 | -------------------------------------------------------------------------------- /service/bin/create_hbase_tables: -------------------------------------------------------------------------------- 1 | #!hbase shell 2 | 3 | # Alternative Usage: 4 | # cat create_hbase_tables | hbase shell 5 | 6 | create 'documents', {NAME => 'data', VERSIONS => '1', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'} 7 | create 'queries', {NAME => 'data', VERSIONS => '1', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'} 8 | create 'results', {NAME => 'data', VERSIONS => '1', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'} 9 | create 'configurations', {NAME => 'data', VERSIONS => '1', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'} 10 | -------------------------------------------------------------------------------- /service/bin/littlefish: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Starts up a grouperfish instance in the foreground without creating 4 | # a jar first. 5 | # Allows to quickly test changes without running mvn install. 

# Prints usage for this development launcher.
function usage() {
    echo "Usage: $0 [-h] [hazelcast-config-path]"
    echo
    echo "You need to fully build the project once first:"
    echo "Work directory is ../build"
    echo
}

bin=`dirname "$0"`
bin=`cd $bin; pwd`

# Generate the runtime classpath once (maven writes it to target/classpath).
if [ ! -f "${bin}/../target/classpath" ]; then
    if [ ! -d "${bin}/../target/" ]; then
        mvn compile
    fi
    mvn dependency:build-classpath
fi



build=$bin/../../build
build=`cd $build; pwd`

#### Process options

# FISH = 0xF124 = 61732
SERVER_PORT=61732

while getopts ":fhp:" optname ; do
    case "$optname" in
        "h")
            usage
            exit 0
            ;;
        "?")
            echo "Unknown option ${OPTARG}"
            usage
            exit 1
            ;;
        *)
            usage
            exit 1
            ;;
    esac
done
shift $(($OPTIND - 1))


#### Process arguments

HAZELCAST_CONF=$bin/../conf/hazelcast.xml
if [[ $# -gt 1 ]] ; then usage; exit 1; fi
if [[ $# -gt 0 ]]; then HAZELCAST_CONF=$1; fi

# Fix: resolve defaults BEFORE they are used. Previously GROUPERFISH_USER,
# HADOOP_CONF and HBASE_CONF were assigned only after CLASSPATH had already
# been composed, so the classpath always ended with two empty entries.
if [ "$GROUPERFISH_USER" = "" ]; then GROUPERFISH_USER="$USER"; fi
if [ "$HADOOP_CONF" = "" ]; then HADOOP_CONF="/etc/hadoop/conf"; fi
if [ "$HBASE_CONF" = "" ]; then HBASE_CONF="/etc/hbase/conf"; fi

# NOTE(review): `cat target/classpath` is relative to the work directory,
# while the existence check above uses ${bin}/../target/classpath — confirm
# the script is always started from the module root.
CLASSPATH="$bin/../conf":"../service/target/classes":$(cat target/classpath)
CLASSPATH="$CLASSPATH":"$HADOOP_CONF":"$HBASE_CONF"
SERVER_CLASS_NAME="com.mozilla.grouperfish.bootstrap.Grouperfish"
JAVA_OPTS="-Xmx1g -XX:+UseParNewGC -XX:+UseConcMarkSweepGC"

HAZELCAST_OPTS="-Dhazelcast.logging.type=slf4j -Dhazelcast.config=${HAZELCAST_CONF}"
BAGHEERA_OPTS="-Dbagheera.log.dir=${bin}/../logs"
GROUPERFISH_OPTS="-Dgrouperfish.rest.port=${SERVER_PORT}"

mkdir -p "${build}/../logs"


cd "${build}"
echo "Work directory: $(pwd) / ${build}"
echo "Using classpath: " $CLASSPATH

java -Dlogback.configurationFile=logback-stdout.xml \
    $GROUPERFISH_OPTS
$BAGHEERA_OPTS $HAZELCAST_OPTS $JAVA_OPTS \ 82 | -cp $CLASSPATH \ 83 | $SERVER_CLASS_NAME 84 | 85 | RETVAL=$? 86 | exit $RETVAL 87 | -------------------------------------------------------------------------------- /service/conf/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | # Cluster Settings 2 | cluster.name: grouperfish 3 | 4 | node.data: false 5 | node.client: true 6 | http.enabled: false 7 | transport.tcp.port: 9301 8 | -------------------------------------------------------------------------------- /service/conf/elasticsearch_hc.yml: -------------------------------------------------------------------------------- 1 | # Cluster Settings 2 | cluster.name: grouperfish 3 | 4 | node.data: false 5 | node.client: true 6 | http.enabled: false 7 | transport.tcp.port: 9302 8 | -------------------------------------------------------------------------------- /service/conf/grouperfish.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-metrics/grouperfish/71f6074c1b08626437242509126c6f3732d7b036/service/conf/grouperfish.properties -------------------------------------------------------------------------------- /service/install: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # normalize work directory 4 | wd=`dirname "$0"` 5 | wd=`cd "$wd"; pwd` 6 | 7 | 8 | cmd="--build" 9 | if [[ "${#}" -eq "1" ]]; then 10 | if [[ "${1}" == --* ]]; then 11 | cmd=$1 12 | fi 13 | fi 14 | 15 | build() { 16 | mvn install || exit 1 17 | mkdir -p ../build/lib ../build/conf ../build/bin 18 | cp target/grouperfish-service-*.jar ../build/lib/ 19 | cp conf/* ../build/conf/ 20 | cp bin/grouperfish ../build/bin/ 21 | } 22 | 23 | package() { 24 | mvn dependency:copy-dependencies || exit 1 25 | cp target/lib/* ../build/lib 26 | } 27 | 28 | 29 | case "${cmd}" in 30 | --build) 31 | build 32 | ;; 33 | --package) 34 | build 35 | package 36 | ;; 37 | --clean) 38 | mvn clean 39 | rm -f ../build/lib/grouperfish-service-* 40 | rm -f ../build/bin/grouperfish 41 | ;; 42 | --help) 43 | "Usage: ${0} [--build|--clean]" 44 | ;; 45 | esac 46 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/ArrayTool.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | public class ArrayTool { 4 | 5 | public static byte[] concat(final byte[] a, final byte[] b) { 6 | final byte[] c = new byte[a.length + b.length]; 7 | System.arraycopy(a, 0, c, 0, a.length); 8 | System.arraycopy(b, 0, c, a.length, b.length); 9 | return c; 10 | } 11 | 12 | } 13 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/Assert.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | 4 | public class Assert { 5 | 6 | private static final String PREFIX = "[ASSERTION FAILED]"; 7 | 8 | public static void nonNull(Object... 
values) { 9 | int i = 0; 10 | for (Object value : values) { 11 | ++i; 12 | if (value == null) { 13 | String message = String.format("%s Value %d/%d is null.", PREFIX, i, values.length); 14 | throw new IllegalArgumentException(message); 15 | } 16 | } 17 | } 18 | 19 | public static void check(boolean... values) { 20 | int i = 0; 21 | for (boolean value : values) { 22 | ++i; 23 | if (!value) { 24 | String msg = String.format("%s Check %d/%d failed!", PREFIX, i, values.length); 25 | throw new IllegalArgumentException(msg); 26 | } 27 | } 28 | } 29 | 30 | public static void unreachable() { 31 | String message = String.format("%s Code should be unreachable!\n", PREFIX); 32 | throw new IllegalStateException(message); 33 | } 34 | 35 | public static void unreachable(String message, Object... objects) { 36 | String msg = String.format("%s Code should be unreachable: %s\n", PREFIX, String.format(message, objects)); 37 | throw new IllegalStateException(msg); 38 | } 39 | 40 | /** Use this where java wants a return type T. Silly, really... */ 41 | public static T unreachable(Class returnType) { 42 | String msg = String.format("%s Code should be unreachable!\n", PREFIX); 43 | throw new IllegalStateException(msg); 44 | } 45 | 46 | /** @see #unreachable(Class) */ 47 | public static T unreachable(Class returnType, String message, Object... 
objects) { 48 | String msg = String.format("%s Code should be unreachable: %s\n", PREFIX, String.format(message, objects)); 49 | throw new IllegalStateException(msg); 50 | } 51 | 52 | /** @see #unreachable(Class) */ 53 | public static T unreachable(Class returnType, Exception problem) { 54 | String msg = String.format("%s Code should be unreachable\n", PREFIX); 55 | throw new IllegalStateException(msg, problem); 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/Box.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.util.Iterator; 4 | import java.util.NoSuchElementException; 5 | 6 | /** 7 | * Can be used as an out-param or optional return value. 8 | * Either use has+get or iterate over the results. 9 | */ 10 | public class Box implements Iterable { 11 | 12 | private T value; 13 | 14 | public Box put(final T value) { 15 | this.value = value; 16 | return this; 17 | } 18 | 19 | public boolean empty() { 20 | return value == null; 21 | } 22 | 23 | public T get() { 24 | return value; 25 | } 26 | 27 | /** Iterates 0 or 1 times. 
*/ 28 | public Iterator iterator() { 29 | return new Iterator() { 30 | private boolean taken = false; 31 | 32 | @Override public boolean hasNext() { 33 | return !taken && !empty(); 34 | } 35 | 36 | @Override public T next() { 37 | if (empty() || taken) throw new NoSuchElementException(); 38 | taken = true; 39 | return value; 40 | } 41 | 42 | @Override public void remove() { 43 | if (empty() || taken) throw new NoSuchElementException(); 44 | taken = true; 45 | value = null; 46 | } 47 | }; 48 | 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/Configuration.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.util.Properties; 4 | 5 | public interface Configuration { 6 | 7 | Properties properties(); 8 | 9 | } 10 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/ImmutableTools.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | import com.google.common.collect.ImmutableList; 7 | import com.google.common.collect.ImmutableMap; 8 | 9 | /** Inefficient "functional" maps. 
*/ 10 | public class ImmutableTools { 11 | 12 | public static Map immutable(final Map in) { 13 | return new ImmutableMap.Builder().putAll(in).build(); 14 | } 15 | 16 | public static List immutable(final List in) { 17 | return new ImmutableList.Builder().addAll(in).build(); 18 | } 19 | 20 | public static Map put(final Map in, final K key, final V value) { 21 | return new ImmutableMap.Builder().putAll(in).put(key, value).build(); 22 | } 23 | 24 | public static Map putAll(final Map a, final Map b) { 25 | return new ImmutableMap.Builder().putAll(a).putAll(b).build(); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/PropertiesTool.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.net.URL; 7 | import java.util.Properties; 8 | 9 | 10 | public class PropertiesTool { 11 | 12 | public static Properties load(final Class context, final String resourceName) { 13 | final Properties properties = new Properties(); 14 | URL source = context.getResource(resourceName); 15 | if (source == null) return properties; 16 | 17 | InputStream stream = null; 18 | try { 19 | stream = source.openStream(); 20 | properties.load(new InputStreamReader(stream, StreamTool.UTF8)); 21 | } 22 | catch (IOException e) { 23 | throw new RuntimeException(String.format("Failed to load properties from '%s'...", resourceName), e); 24 | } 25 | finally { 26 | if (stream == null) return properties; 27 | try { stream.close(); } 28 | catch (IOException e) { throw new RuntimeException(e); } 29 | } 30 | return properties; 31 | } 32 | 33 | 34 | } 35 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/SlugTool.java: 
-------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.text.Normalizer; 4 | import java.text.Normalizer.Form; 5 | import java.util.Locale; 6 | import java.util.regex.Pattern; 7 | 8 | 9 | public class SlugTool { 10 | 11 | /** 12 | * http://stackoverflow.com/questions/1657193/ 13 | * @param input 14 | * @return A representation of the input string, containing only non-whitespace, latin characters. 15 | */ 16 | public static String toSlug(String input) { 17 | String nowhitespace = WHITESPACE.matcher(input).replaceAll("-"); 18 | String normalized = Normalizer.normalize(nowhitespace, Form.NFD); 19 | String slug = NONLATIN.matcher(normalized).replaceAll(""); 20 | return slug.toLowerCase(Locale.ENGLISH); 21 | } 22 | 23 | private static final Pattern NONLATIN = Pattern.compile("[^\\w-]"); 24 | private static final Pattern WHITESPACE = Pattern.compile("[\\s]"); 25 | 26 | } 27 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/StreamTool.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.io.Reader; 7 | import java.nio.charset.Charset; 8 | 9 | 10 | public class StreamTool { 11 | 12 | public static Charset UTF8 = Charset.forName("UTF-8"); 13 | 14 | /** 15 | * @param stream The character source. 16 | * @param encoding An encoding, e.g. #UTF8 17 | */ 18 | public static String consume(final InputStream stream, final Charset encoding) 19 | throws IOException { 20 | return maybeConsume(stream, encoding, 0); 21 | } 22 | 23 | /** 24 | * Consume everything from this reader into a string. 25 | * Close the reader when done. 
     */
    public static String consume(final Reader in)
            throws IOException {
        Assert.nonNull(in);
        // Limit 0 means "no limit".
        return consume(in, 0);
    }

    /**
     * Consume everything up to limit from this reader into a string.
     * If the stream has more characters than the given limit.
     *
     * @param A reader, will be closed when done.
     * @param limit If limit is reached while consuming the stream,
     *              null is returned.
     *              Set to 0 for no limit.
     * @return The contents, or null if the limit was exceeded.
     */
    public static String consume(final Reader in, final int limit)
            throws IOException {
        Assert.nonNull(in);
        final char[] buffer = new char[8192];
        final StringBuilder out = new StringBuilder();
        int size = 0;

        int read;
        do {
            read = in.read(buffer, 0, buffer.length);
            // At EOF read == -1, so size drops by one; harmless, because the
            // limit was already checked right after the read that could have
            // exceeded it.
            size += read;
            if (limit != 0 && size > limit) {
                in.close();
                return null;
            }
            if (read>0) out.append(buffer, 0, read);
        } while (read>=0);

        in.close();
        return out.toString();
    }

    /**
     * @param stream The character source.
     * @param encoding An encoding, e.g. #UTF8
     * @param limit If limit is reached while consuming the stream,
     *              null is returned.
     *              Set to 0 for no limit.
     */
    public static String maybeConsume(final InputStream stream, final Charset encoding, final int limit)
            throws IOException {
        Assert.nonNull(stream, encoding);
        // consume(Reader, int) closes the reader (and thus the stream).
        return consume(new InputStreamReader(stream, encoding), limit);
    }


}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/base/json/JsonValidator.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.base.json;

import java.io.IOException;

import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.JsonParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/** Validates JSON syntactically by streaming it through a Jackson parser. */
public class JsonValidator {

    static Logger log = LoggerFactory.getLogger(JsonValidator.class);

    private final JsonFactory jsonFactory = new JsonFactory();

    /**
     * @return true if the string parses as JSON; false for empty or
     *         malformed input (parse errors are logged, not thrown).
     * @throws IOException on underlying read failures.
     */
    public boolean isValid(String json) throws IOException {
        if (json.length() == 0) {
            return false;
        }

        try {
            // Pull all tokens; a parse error anywhere invalidates the input.
            JsonParser parser = jsonFactory.createJsonParser(json);
            while (parser.nextToken() != null) { }
        } catch (JsonParseException e) {
            log.error("Error parsing JSON", e);
            return false;
        }

        return true;
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/base/json/MapStreamer.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.base.json;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;

import org.json.simple.JSONValue;

import com.mozilla.grouperfish.base.StreamTool;


/**
* Takes String keys and JSON values and streams them out as one JSON map, 17 | * without composing everything in memory. 18 | */ 19 | public class MapStreamer { 20 | 21 | private final Map map; 22 | 23 | public MapStreamer(final Map map) { 24 | this.map = map; 25 | } 26 | 27 | public void write(OutputStream out) throws IOException { 28 | final Writer writer = new BufferedWriter(new OutputStreamWriter(out, StreamTool.UTF8)); 29 | boolean first = true; 30 | 31 | writer.write('{'); 32 | for (final Map.Entry items : map.entrySet()) { 33 | if (first) { 34 | first = false; 35 | } 36 | else { 37 | writer.append(','); 38 | writer.append('\n'); 39 | } 40 | 41 | writer 42 | .append('"') 43 | .append(JSONValue.escape(items.getKey())) 44 | .append('"') 45 | .append(':') 46 | .append(' ') 47 | .write(items.getValue()); 48 | } 49 | writer.write('}'); 50 | writer.flush(); 51 | } 52 | 53 | } 54 | 55 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/base/json/TsvJsonWriter.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base.json; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.IOException; 5 | import java.io.Writer; 6 | 7 | import com.mozilla.grouperfish.model.Document; 8 | 9 | 10 | /** If using a buffered writer, make sure to {@link #flush()} when you are done. 
 */
public class TsvJsonWriter {

    private final Writer writer;

    public TsvJsonWriter(final Writer writer) {
        this.writer = new BufferedWriter(writer);
    }

    /**
     * Writes one key/JSON pair as a single TSV line.
     * Tabs and newlines in the key are escaped; newlines in the JSON source
     * are stripped so each record stays on one line.
     */
    public void write(final String key, final String source) throws IOException {
        writer.write(key.replace("\t", "\\t").replace("\n", "\\n"));
        writer.write("\t");
        writer.write(source.replace("\n", ""));
        writer.write("\n");
    }

    /** Writes a document as: id TAB json-source. */
    public void write(final Document document) throws IOException {
        write(document.id(), document.source());
    }

    public void flush() throws IOException {
        writer.flush();
    }

    public void close() throws IOException {
        writer.close();
    }
}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/batch/api/BatchService.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.batch.api;

import com.mozilla.grouperfish.model.Query;
import com.mozilla.grouperfish.model.Task;
import com.mozilla.grouperfish.model.TransformConfig;
import com.mozilla.grouperfish.naming.Scope;


/**
 * The batch system component as documented at:
 * http://grouperfish.readthedocs.org/en/latest/batch_system.html
 */
public interface BatchService {

    /** Run this specific task. */
    void schedule(Task task);

    /** Run the configured transform over the query results. */
    void schedule(Scope ns, Query query, TransformConfig transform);

    /** Run all configured transforms over the query results. */
    void schedule(Scope ns, Query query);

    /**
     * Run all transforms configurations of this
     * namespace over the results of all queries.
     */
    void schedule(Scope ns);

    /** Start execution of tasks. */
    void start();

    /**
     * Stop execution of new tasks.
35 | * Should be called before shutting down the node. 36 | * 37 | * :TODO: Next: 38 | * We probably need some sort of lifecycle events so 39 | * services can manage this transparently. 40 | */ 41 | void stop(); 42 | 43 | } 44 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/api/guice/BatchSystem.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.api.guice; 2 | 3 | import java.util.Properties; 4 | 5 | import com.google.common.collect.ImmutableMap; 6 | import com.google.inject.AbstractModule; 7 | import com.google.inject.Inject; 8 | import com.mozilla.grouperfish.batch.api.BatchService; 9 | import com.mozilla.grouperfish.batch.scheduling.SynchronousBatchService; 10 | import com.mozilla.grouperfish.batch.transforms.HadoopTransform; 11 | import com.mozilla.grouperfish.batch.transforms.LocalTransform; 12 | import com.mozilla.grouperfish.batch.transforms.Transform; 13 | import com.mozilla.grouperfish.batch.transforms.TransformProvider; 14 | import com.mozilla.grouperfish.services.api.FileSystem; 15 | import com.mozilla.grouperfish.services.api.guice.Local; 16 | import com.mozilla.grouperfish.services.api.guice.Services; 17 | import com.mozilla.grouperfish.services.api.guice.Shared; 18 | 19 | public class BatchSystem extends AbstractModule { 20 | 21 | @Override 22 | protected void configure() { 23 | bind(BatchService.class).to(SynchronousBatchService.class).asEagerSingleton(); 24 | bind(TransformProvider.class).to(StaticTransformProvider.class).asEagerSingleton(); 25 | } 26 | 27 | static class StaticTransformProvider implements TransformProvider { 28 | 29 | private final ImmutableMap transformsByName; 30 | 31 | @Inject 32 | public StaticTransformProvider( 33 | final Properties properties, 34 | final @Shared FileSystem dfs, 35 | final @Local FileSystem localFs) { 36 | 37 | final ImmutableMap.Builder builder = 
                    new ImmutableMap.Builder();

            builder.put("count", new LocalTransform("count", dfs, localFs));
            builder.put("textcluster", new LocalTransform("textcluster", dfs, localFs));
            // :TODO: Next: autodiscover available transforms

            // Hadoop-backed transforms are only registered when a Hadoop
            // installation is configured in the service properties.
            if (Services.hasHadoop(properties)) {
                builder.put("coclustering", new HadoopTransform("coclustering", dfs));
            }

            transformsByName = builder.build();
        }

        @Override
        public Transform get(final String name) {
            return transformsByName.get(name);
        }

    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/batch/handlers/CleanupHandler.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.batch.handlers;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mozilla.grouperfish.batch.scheduling.Helpers;
import com.mozilla.grouperfish.model.Fail;
import com.mozilla.grouperfish.model.Task;
import com.mozilla.grouperfish.services.api.FileSystem;
import com.mozilla.grouperfish.services.api.FileSystem.Denied;
import com.mozilla.grouperfish.services.api.FileSystem.NotFound;


/** Removes a task's working directory after a run. */
public class CleanupHandler implements TaskHandler {

    private static final Logger log = LoggerFactory.getLogger(CleanupHandler.class);


    private final FileSystem fs;

    public CleanupHandler(FileSystem fs) {
        this.fs = fs;
    }

    @Override
    public Task handle(final Task task) throws Fail {
        try {
            fs.removeRecursively(Helpers.taskDirectory(task));
        }
        catch (final Denied denied) {
            // No permission to clean up is treated as a hard failure.
            throw Fail.hard(task, "Could not cleanup task directory.", denied);
        }
        catch (final NotFound e) {
            // ok, ignore
            log.debug("Missing task directory during cleanup, this can indicate problems.
Task: %s", task);
            // NOTE(review): slf4j uses "{}" placeholders, not "%s" — the task
            // is likely not interpolated into the message above; confirm.
        }

        return task;
    }
}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/batch/handlers/FetchHandler.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.batch.handlers;

import static com.mozilla.grouperfish.batch.scheduling.Helpers.inputFilename;
import static com.mozilla.grouperfish.batch.scheduling.Helpers.parametersFilename;
import static com.mozilla.grouperfish.batch.scheduling.Helpers.writer;

import java.io.Writer;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mozilla.grouperfish.base.Assert;
import com.mozilla.grouperfish.base.json.TsvJsonWriter;
import com.mozilla.grouperfish.batch.scheduling.Helpers;
import com.mozilla.grouperfish.model.Document;
import com.mozilla.grouperfish.model.Fail;
import com.mozilla.grouperfish.model.Task;
import com.mozilla.grouperfish.model.Type;
import com.mozilla.grouperfish.services.api.FileSystem;
import com.mozilla.grouperfish.services.api.Index;
import com.mozilla.grouperfish.services.api.IndexProvider;

/**
 * Stages a task's input on the filesystem: writes all documents matching
 * the task's query (as TSV of id + JSON source) plus the transform's
 * parameters file.
 */
public class FetchHandler implements TaskHandler {

    private static final Logger log = LoggerFactory.getLogger(FetchHandler.class);

    private final IndexProvider indexes;
    private final FileSystem fs;

    public FetchHandler(final FileSystem fs, final IndexProvider index) {
        this.fs = fs;
        this.indexes = index;
    }

    @Override
    public Task handle(final Task task) throws Fail {
        // Documents are looked up in the namespace's document bucket.
        Index index = indexes.index(task.namespace().bucket(Type.DOCUMENT));
        Assert.nonNull(task);
        try {
            // One TSV line per matching document.
            final TsvJsonWriter tsvWriter = new TsvJsonWriter(writer(fs, task, inputFilename(task)));
            for (final Document doc : index.find(task.query())) tsvWriter.write(doc);
            tsvWriter.close();

            // Transform parameters go into a separate JSON file.
            final Writer parametersWriter = writer(fs, task, parametersFilename(task));
            parametersWriter.write(task.transform().parametersJson());
            parametersWriter.close();
        }
        catch (final Exception e) {
            final String message = String.format(
                    "Failed writing doc to %s", Helpers.inputFilename(task));
            log.error("Exception", e);
            throw Fail.hard(task, message, e);
        }
        return task;
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/batch/handlers/PutHandler.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.batch.handlers;

import static com.mozilla.grouperfish.batch.scheduling.Helpers.resultsFilename;

import java.io.Reader;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mozilla.grouperfish.base.StreamTool;
import com.mozilla.grouperfish.model.Fail;
import com.mozilla.grouperfish.model.Task;
import com.mozilla.grouperfish.naming.Scope;
import com.mozilla.grouperfish.rest.jaxrs.ResultsResource;
import com.mozilla.grouperfish.services.api.FileSystem;
import com.mozilla.grouperfish.services.api.Grid;


/**
 * Put run results into results storage.
22 | */ 23 | public class PutHandler implements TaskHandler { 24 | 25 | private static final Logger log = LoggerFactory.getLogger(PutHandler.class); 26 | 27 | private final FileSystem fs; 28 | private final Grid grid; 29 | 30 | public PutHandler(final Grid grid, final FileSystem fs) { 31 | this.grid = grid; 32 | this.fs = fs; 33 | } 34 | 35 | @Override 36 | public Task handle(final Task task) throws Fail { 37 | 38 | if (!task.isOk()) { 39 | log.debug("Not putting result for failed task %s", task); 40 | } 41 | 42 | final String key = ResultsResource.key(task.transform().name(), task.query().name()); 43 | final Map results = new Scope(task.namespace(), grid).results(); 44 | 45 | try { 46 | final Reader reader = fs.reader(resultsFilename(task)); 47 | results.put(key, StreamTool.consume(reader)); 48 | } 49 | catch (final Exception e) { 50 | throw Fail.hard(task, "Could not read results from filesystem.", e); 51 | } 52 | return task; 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/handlers/RunHandler.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.handlers; 2 | 3 | import java.io.IOException; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import com.mozilla.grouperfish.base.Assert; 9 | import com.mozilla.grouperfish.base.StreamTool; 10 | import com.mozilla.grouperfish.batch.scheduling.Helpers; 11 | import com.mozilla.grouperfish.batch.transforms.Transform; 12 | import com.mozilla.grouperfish.batch.transforms.Transform.TransformResult; 13 | import com.mozilla.grouperfish.batch.transforms.TransformProvider; 14 | import com.mozilla.grouperfish.model.Fail; 15 | import com.mozilla.grouperfish.model.Task; 16 | import com.mozilla.grouperfish.model.TransformConfig; 17 | import com.mozilla.grouperfish.services.api.FileSystem; 18 | import 
com.mozilla.grouperfish.services.api.FileSystem.Denied; 19 | import com.mozilla.grouperfish.services.api.FileSystem.NotFound; 20 | 21 | 22 | /** Perform the actual running of the transform. */ 23 | public class RunHandler implements TaskHandler { 24 | 25 | private static final Logger log = LoggerFactory.getLogger(RunHandler.class); 26 | 27 | private final FileSystem fs; 28 | private final TransformProvider transforms; 29 | 30 | public RunHandler(final FileSystem fs, final TransformProvider transforms) { 31 | this.fs = fs; 32 | this.transforms = transforms; 33 | } 34 | 35 | @Override 36 | public Task handle(final Task task) throws Fail { 37 | final String inputDirectory; 38 | try { 39 | inputDirectory = fs.uri(Helpers.taskDirectory(task)); 40 | } 41 | catch (final NotFound e) { 42 | throw Fail.hard(task, "Task input not found...", e); 43 | } 44 | 45 | try { 46 | fs.makeDirectory(Helpers.outputDirectory(task)); 47 | } catch (final Denied e) { 48 | throw Fail.hard(task, "Cannot create output directory.", e); 49 | } 50 | 51 | final TransformConfig config = task.transform(); 52 | final Transform transform = transforms.get(config.transform()); 53 | Assert.nonNull(transform); 54 | log.info(String.format("Launching transform '%s' with input directory '%s'", transform, inputDirectory)); 55 | 56 | try { 57 | final TransformResult result = transform.run(task); 58 | if (result.success()) { 59 | log.info("Transform {} for task {} was run successfully.", transform, task); 60 | } 61 | else { 62 | final String message = String.format("Failed to run transform: %s (task %s)", transform, task); 63 | log.warn(message); 64 | log.warn("STDERR: {}", StreamTool.consume(result.stderr(), StreamTool.UTF8)); 65 | throw Fail.hard(task, message, null); 66 | } 67 | } 68 | catch (final InterruptedException e) { 69 | throw Fail.soft(task, "Interrupted during run.", e); 70 | } 71 | catch (final IOException e) { 72 | throw Fail.hard(task, "Received IO error reading from task STDERR", e); 73 | } 74 | 
75 | return task; 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/handlers/SequentialHandler.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.handlers; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import com.mozilla.grouperfish.model.Fail; 7 | import com.mozilla.grouperfish.model.Task; 8 | 9 | 10 | /** 11 | * Composite handler. 12 | * 13 | * Applies all sub-handlers synchronously, in order. 14 | * Can be helpful to simplify things for development/testing 15 | * (compared to pipelining). 16 | */ 17 | public class SequentialHandler implements TaskHandler { 18 | 19 | private static final Logger log = LoggerFactory.getLogger(SequentialHandler.class); 20 | 21 | private final TaskHandler[] handlers; 22 | 23 | public SequentialHandler(final TaskHandler... handlers) { 24 | this.handlers = handlers; 25 | } 26 | 27 | @Override 28 | public Task handle(Task task) throws Fail { 29 | for (final TaskHandler handler : handlers) { 30 | log.debug("Task {}: starting handler: {}", task, handler.getClass().getSimpleName()); 31 | task = handler.handle(task); 32 | } 33 | return task; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/handlers/TaskHandler.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.handlers; 2 | 3 | import com.mozilla.grouperfish.model.Fail; 4 | import com.mozilla.grouperfish.model.Task; 5 | 6 | public interface TaskHandler { 7 | 8 | /** 9 | * Carry out some processing on this task. 10 | * @return The same task, or some modified version with more information. 
11 | */ 12 | Task handle(Task task) throws Fail; 13 | 14 | } 15 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/scheduling/AbstractBatchService.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.scheduling; 2 | 3 | import java.util.Map; 4 | 5 | import com.mozilla.grouperfish.base.Assert; 6 | import com.mozilla.grouperfish.batch.api.BatchService; 7 | import com.mozilla.grouperfish.model.Type; 8 | import com.mozilla.grouperfish.model.Query; 9 | import com.mozilla.grouperfish.model.Task; 10 | import com.mozilla.grouperfish.model.TransformConfig; 11 | import com.mozilla.grouperfish.naming.Scope; 12 | import com.mozilla.grouperfish.services.api.Index; 13 | import com.mozilla.grouperfish.services.api.IndexProvider; 14 | 15 | abstract class AbstractBatchService implements BatchService { 16 | 17 | private final IndexProvider indexes; 18 | 19 | public AbstractBatchService(final IndexProvider indexes) { 20 | this.indexes = indexes; 21 | } 22 | 23 | /** Run the configured transform over the query results. */ 24 | public void schedule(final Scope ns, final Query query, final TransformConfig transform) { 25 | Assert.nonNull(query, transform); 26 | final Index index = indexes.index(ns.bucket(Type.DOCUMENT)); 27 | for (final Query concreteQuery : index.resolve(query)) { 28 | schedule(new Task(ns, concreteQuery, transform)); 29 | } 30 | } 31 | 32 | /** Run all configured transforms over the query results. */ 33 | public void schedule(final Scope ns, final Query query) { 34 | final Map transforms = ns.map(Type.CONFIGURATION_TRANSFORM); 35 | for (final Map.Entry item : transforms.entrySet()) { 36 | schedule(ns, query, new TransformConfig(item.getKey(), item.getValue())); 37 | } 38 | } 39 | 40 | /** Run all transforms configurations of this namespace over the results of all queries. 
*/ 41 | public void schedule(final Scope ns) { 42 | final Map queries = ns.queries(); 43 | final Map transforms = ns.map(Type.CONFIGURATION_TRANSFORM); 44 | for (final Map.Entry queryEntry : queries.entrySet()) { 45 | final Query query = new Query(queryEntry.getKey(), queryEntry.getValue()); 46 | for (final Map.Entry item : transforms.entrySet()) { 47 | schedule(ns, query, new TransformConfig(item.getKey(), item.getValue())); 48 | } 49 | } 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/scheduling/SingleQueueBatchService.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.scheduling; 2 | 3 | import java.util.concurrent.BlockingQueue; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import com.google.inject.Inject; 9 | import com.mozilla.grouperfish.batch.transforms.TransformProvider; 10 | import com.mozilla.grouperfish.model.Task; 11 | import com.mozilla.grouperfish.services.api.FileSystem; 12 | import com.mozilla.grouperfish.services.api.Grid; 13 | import com.mozilla.grouperfish.services.api.IndexProvider; 14 | 15 | /** 16 | * Run everything using one queue and a single worker. 17 | * Mostly useful to test the worker. 
18 | */ 19 | public class SingleQueueBatchService extends AbstractBatchService { 20 | 21 | private static final Logger log = LoggerFactory.getLogger(SingleQueueBatchService.class); 22 | 23 | private final Worker worker; 24 | private final BlockingQueue inQueue; 25 | private final BlockingQueue failQueue; 26 | 27 | @Override 28 | public void schedule(Task task) { 29 | inQueue.add(task); 30 | } 31 | 32 | @Inject 33 | public SingleQueueBatchService( 34 | final Grid grid, 35 | final IndexProvider indexes, 36 | final FileSystem fs, 37 | final TransformProvider transforms) { 38 | 39 | super(indexes); 40 | inQueue = grid.queue("grouperfish_in"); 41 | failQueue = grid.queue("grouperfish_fail"); 42 | 43 | worker = new Worker(failQueue, inQueue, null, Helpers.sequentialHandler(grid, fs, indexes, transforms)); 44 | 45 | log.info("Instantiated service: {}", getClass().getSimpleName()); 46 | } 47 | 48 | public void start() { 49 | worker.start(); 50 | } 51 | 52 | public void stop() { 53 | worker.cancel(); 54 | } 55 | 56 | 57 | } 58 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/scheduling/SynchronousBatchService.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.scheduling; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import com.google.inject.Inject; 7 | import com.mozilla.grouperfish.batch.handlers.TaskHandler; 8 | import com.mozilla.grouperfish.batch.transforms.TransformProvider; 9 | import com.mozilla.grouperfish.model.Fail; 10 | import com.mozilla.grouperfish.model.Task; 11 | import com.mozilla.grouperfish.services.api.FileSystem; 12 | import com.mozilla.grouperfish.services.api.Grid; 13 | import com.mozilla.grouperfish.services.api.IndexProvider; 14 | 15 | 16 | /** 17 | * Braindead fully synchronous "batch" service. 18 | * 19 | * It has no queue, no multithreading. 
20 | * It just executes everything right away, while you wait for results. 21 | * 22 | * Can be useful in testing/development. 23 | */ 24 | public class SynchronousBatchService extends AbstractBatchService { 25 | 26 | private static final Logger log = LoggerFactory.getLogger(SynchronousBatchService.class); 27 | 28 | private final TaskHandler handler; 29 | 30 | @Inject 31 | public SynchronousBatchService( 32 | final Grid grid, 33 | final IndexProvider indexes, 34 | final FileSystem fs, 35 | final TransformProvider transforms) { 36 | super(indexes); 37 | handler = Helpers.sequentialHandler(grid, fs, indexes, transforms); 38 | 39 | log.info("Instantiated service: {}", getClass().getSimpleName()); 40 | } 41 | 42 | @Override 43 | public void schedule(final Task task) { 44 | try { 45 | handler.handle(task); 46 | } 47 | catch (Fail e) { 48 | throw new RuntimeException(e); 49 | } 50 | } 51 | 52 | @Override 53 | public void start() { } 54 | 55 | @Override 56 | public void stop() { } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/scheduling/Worker.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.scheduling; 2 | 3 | import java.util.concurrent.BlockingQueue; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import com.mozilla.grouperfish.batch.handlers.TaskHandler; 9 | import com.mozilla.grouperfish.model.Fail; 10 | import com.mozilla.grouperfish.model.Task; 11 | 12 | class Worker extends Thread { 13 | 14 | private static Logger log = LoggerFactory.getLogger(Worker.class); 15 | private static final int NUM_TRIES = 3; 16 | 17 | private final BlockingQueue inQueue; 18 | private final BlockingQueue outQueue; 19 | private final BlockingQueue failQueue; 20 | private final TaskHandler handler; 21 | private final String name; 22 | 23 | public Worker(final BlockingQueue 
failQueue, 24 | final BlockingQueue inQueue, 25 | final BlockingQueue outQueue, 26 | final TaskHandler actor) { 27 | this.inQueue = inQueue; 28 | this.outQueue = outQueue; 29 | this.failQueue = failQueue; 30 | this.handler = actor; 31 | this.name = String.format("[Worker for %s]", actor.getClass().getSimpleName()); 32 | } 33 | 34 | public String toString() { 35 | return name; 36 | } 37 | 38 | public void run() { 39 | Task task = null; 40 | try { 41 | while (!Thread.currentThread().isInterrupted()) { 42 | task = inQueue.take(); 43 | try { 44 | // :TODO: NEXT: 45 | // If power fails, tasks can go MIA here. 46 | // We should maintain a global map of tasks, check it periodically, and restart tasks that went MIA. 47 | // Task update their status there, and clients could check the status using a GET /run/... call. 48 | task = handler.handle(task); 49 | } 50 | catch (final Fail e) { 51 | log.warn(String.format("%s %s: failed with message '%s'", name, task, e.getMessage())); 52 | if (task.failures().size() >= NUM_TRIES) { 53 | log.error(String.format("%s %s: Error details:", name, task), e); 54 | log.error(String.format("%s %s: Retries exhausted. 
Failing.", name, task)); 55 | failQueue.put(task); 56 | } 57 | else { 58 | log.warn(String.format("%s %s: recording failure & requeuing...", name, task)); 59 | inQueue.put(task.fail(e.getMessage())); 60 | } 61 | continue; 62 | } 63 | catch (final Exception e) { 64 | log.error(String.format("%s %s: Exception while handling.", name, task)); 65 | log.error(String.format("%s %s: Error details:", name, task), e); 66 | failQueue.put(task.fail(e.getMessage())); 67 | continue; 68 | } 69 | 70 | if (outQueue != null) outQueue.put(task); 71 | task = null; 72 | } 73 | } 74 | catch (InterruptedException ex) { 75 | Thread.currentThread().interrupt(); 76 | } 77 | } 78 | 79 | public void cancel() { 80 | interrupt(); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/transforms/HadoopTransform.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.transforms; 2 | 3 | import com.mozilla.grouperfish.batch.scheduling.Helpers; 4 | import com.mozilla.grouperfish.model.Task; 5 | import com.mozilla.grouperfish.services.api.FileSystem; 6 | import com.mozilla.grouperfish.services.api.FileSystem.FsError; 7 | 8 | 9 | /** Transform that relies on a distributed fs for processing. 
*/ 10 | public class HadoopTransform extends ExecutableTransform { 11 | 12 | public HadoopTransform(final String name, final FileSystem dfs) { 13 | super(name, dfs); 14 | } 15 | 16 | @Override 17 | protected String taskDirectoryUri(final Task task) throws FsError { 18 | return dataFs().uri(Helpers.taskDirectory(task)); 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/transforms/LocalTransform.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.transforms; 2 | 3 | import com.mozilla.grouperfish.base.Assert; 4 | import com.mozilla.grouperfish.batch.scheduling.Helpers; 5 | import com.mozilla.grouperfish.model.Fail; 6 | import com.mozilla.grouperfish.model.Task; 7 | import com.mozilla.grouperfish.services.api.FileSystem; 8 | import com.mozilla.grouperfish.services.api.FileSystem.FsError; 9 | 10 | 11 | /** 12 | * A transform can be implemented as a local executable that does not 13 | * know about hadoop or how to talk to HDFS, and instead uses a 14 | * (temporary) local work directory. 15 | * 16 | * Such transforms are made available through the LocalTransform 17 | * wrapper which will copy inputs from HDFS to the local file system, 18 | * and results back to HDFS. 19 | * 20 | * The actual executable will receive a local directory (as an absolute 21 | * path) instead of an HDFS uri. 22 | */ 23 | public class LocalTransform extends ExecutableTransform { 24 | 25 | private final FileSystem localFs; 26 | private final boolean needsToCopy; 27 | 28 | /** 29 | * A local transform in a distributed environment: 30 | * Task input data is copied from the dfs to the local fs before 31 | * running, and results are copied back afterwards. 32 | * 33 | * @param name The transform executable. It should take the location of the input data 34 | * as its single argument. 
35 | * @param dfs The distributed filesystem used by grouperfish (e.g. HDFS). 36 | * @param localFs The local filesystem where working directories for local processes can be created. 37 | */ 38 | public LocalTransform( 39 | final String name, 40 | final FileSystem dfs, 41 | final FileSystem localFs) { 42 | super(name, dfs); 43 | Assert.nonNull(localFs); 44 | this.localFs = localFs; 45 | this.needsToCopy = !dfs.equals(localFs); 46 | } 47 | 48 | @Override 49 | protected String taskDirectoryUri(final Task task) throws FsError { 50 | return localFs.uri(Helpers.taskDirectory(task)).substring("file://".length()); 51 | } 52 | 53 | @Override 54 | public TransformResult run(Task task) throws Fail, InterruptedException { 55 | if (needsToCopy) { 56 | try { 57 | Helpers.copy(Helpers.inputFilename(task), dataFs(), localFs); 58 | Helpers.copy(Helpers.parametersFilename(task), dataFs(), localFs); 59 | } 60 | catch (final Exception e) { 61 | throw Fail.hard(task, "Could not copy data to local fs.", e); 62 | } 63 | } 64 | 65 | final TransformResult result = super.run(task); 66 | 67 | if (needsToCopy) { 68 | try { 69 | Helpers.copy(Helpers.resultsFilename(task), localFs, dataFs()); 70 | } 71 | catch (final Exception e) { 72 | throw Fail.hard(task, "Could not copy results back to distributed fs.", e); 73 | } 74 | } 75 | 76 | return result; 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/transforms/Transform.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.transforms; 2 | 3 | import java.io.InputStream; 4 | 5 | import com.mozilla.grouperfish.model.Fail; 6 | import com.mozilla.grouperfish.model.Task; 7 | 8 | 9 | /** 10 | * Proxy to the real transform implementation (which can be a java class, a local executable, a RPC call...). 
11 | */ 12 | public interface Transform { 13 | 14 | public interface TransformResult { 15 | InputStream stderr(); 16 | boolean success(); 17 | } 18 | 19 | public TransformResult run(Task task) throws Fail, InterruptedException; 20 | 21 | } 22 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/batch/transforms/TransformProvider.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.batch.transforms; 2 | 3 | public interface TransformProvider { 4 | 5 | Transform get(String name); 6 | 7 | } 8 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/bootstrap/Grouperfish.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.bootstrap; 2 | 3 | import java.util.Properties; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.slf4j.bridge.SLF4JBridgeHandler; 8 | 9 | import com.google.inject.AbstractModule; 10 | import com.google.inject.Guice; 11 | import com.google.inject.Injector; 12 | import com.google.inject.Module; 13 | import com.google.inject.Provider; 14 | import com.mozilla.grouperfish.base.PropertiesTool; 15 | import com.mozilla.grouperfish.batch.api.guice.BatchSystem; 16 | import com.mozilla.grouperfish.rest.api.RestService; 17 | import com.mozilla.grouperfish.rest.jersey.JerseyGuiceRestService; 18 | import com.mozilla.grouperfish.rest.jersey.ResourceConfig; 19 | import com.mozilla.grouperfish.services.api.guice.Services; 20 | 21 | 22 | /** Entry class to set up the Grouperfish service. */ 23 | public class Grouperfish { 24 | 25 | public static final int DEFAULT_PORT = 0xF124; 26 | 27 | static final Logger log = LoggerFactory.getLogger(Grouperfish.class); 28 | 29 | /** 30 | * Starts the Grouperfish engine. 
31 | * REST resources will be autodiscovered by Jersey (JAX-RS). 32 | * 33 | * @param arguments not used 34 | * @throws Exception 35 | */ 36 | public static void main(final String[] arguments) throws Exception { 37 | final Properties properties = 38 | PropertiesTool.load(Grouperfish.class, "grouperfish.properties"); 39 | new Grouperfish( 40 | new Services(properties), 41 | new BatchSystem(), 42 | new AbstractModule() { 43 | @Override protected void configure() { 44 | bind(Properties.class).toProvider(new Provider() { 45 | @Override public Properties get() { return properties; } 46 | }).asEagerSingleton(); 47 | } 48 | } 49 | ); 50 | } 51 | 52 | public Grouperfish(final Module... modules) { 53 | SLF4JBridgeHandler.install(); 54 | final Injector injector = Guice.createInjector(modules); 55 | final RestService rest = new JerseyGuiceRestService(injector, ResourceConfig.class); 56 | rest.start(); 57 | log.info("Grouperfish started."); 58 | log.debug("Configured port: {}, default: {}", 59 | System.getProperty(JerseyGuiceRestService.PROPERTY_PORT), DEFAULT_PORT); 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Access.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | public interface Access { 4 | 5 | enum Operation {CREATE, READ, RUN, DELETE, LIST}; 6 | 7 | String origin(); 8 | 9 | Operation type(); 10 | 11 | } 12 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Document.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import java.util.Map; 4 | 5 | import com.mozilla.grouperfish.base.Assert; 6 | 7 | 8 | /** Simple multi-field text document. Each document has at least id and (full) text. 
*/ 9 | public class Document extends NamedSource { 10 | 11 | public Document(final String id, final String source) { 12 | super(id, source); 13 | } 14 | 15 | public Document(final String id, final Map fields) { 16 | super(id, fields); 17 | } 18 | 19 | public Document(final Map fields) { 20 | super(String.valueOf(fields.get("id")), fields); 21 | Assert.nonNull(fields.get("id")); 22 | } 23 | 24 | /** 25 | * For documents this is the same as name. 26 | */ 27 | public String id() { 28 | return name(); 29 | } 30 | 31 | private static final long serialVersionUID = 0; 32 | 33 | } 34 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Fail.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | 4 | /** 5 | * Describes how a task failed. Generated by handlers. 6 | * This information should be made available somehow under the run resource. 7 | */ 8 | public abstract class Fail extends Exception { 9 | 10 | private static final long serialVersionUID = 0; 11 | 12 | private final Task task; 13 | 14 | public static Fail hard(final Task task, final String message, final Throwable maybeCause) { 15 | if (maybeCause == null) return new HardFail(task, message); 16 | return new HardFail(task, message, maybeCause); 17 | } 18 | 19 | public static Fail soft(final Task task, final String message, final Throwable maybeCause) { 20 | if (maybeCause == null) return new SoftFail(task, message); 21 | return new SoftFail(task, message, maybeCause); 22 | } 23 | 24 | public Fail(final Task task, final String message) { 25 | super(String.format("Task %s failed. 
%s", task, message)); 26 | this.task = task; 27 | } 28 | 29 | public Fail(final Task task, final String message, final Throwable cause) { 30 | super(message, cause); 31 | this.task = task; 32 | } 33 | 34 | public Task task() { 35 | return task; 36 | } 37 | 38 | /** 39 | * Handlers can throw a hard failure if they are fairly certain that 40 | * retrying will not help. 41 | */ 42 | public static final class HardFail extends Fail { 43 | HardFail(final Task task, final String message, final Throwable cause) { 44 | super(task, message, cause); 45 | } 46 | 47 | HardFail(final Task task, final String message) { 48 | super(task, message); 49 | } 50 | 51 | private static final long serialVersionUID = 1L; 52 | } 53 | 54 | 55 | /** 56 | * Handlers can throw a soft failure if they think that 57 | * retrying might help, e.g. if they were interrupted during execution. 58 | */ 59 | public static final class SoftFail extends Fail { 60 | SoftFail(final Task task, final String message, final Throwable cause) { 61 | super(task, message, cause); 62 | } 63 | 64 | SoftFail(final Task task, final String message) { 65 | super(task, message); 66 | } 67 | 68 | private static final long serialVersionUID = 1L; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/NamedSource.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import static com.mozilla.grouperfish.base.ImmutableTools.immutable; 4 | 5 | import java.io.Serializable; 6 | import java.util.Map; 7 | 8 | import org.json.simple.JSONObject; 9 | import org.json.simple.parser.JSONParser; 10 | 11 | import com.mozilla.grouperfish.base.Assert; 12 | 13 | 14 | public abstract class NamedSource implements Serializable { 15 | 16 | private final String name; 17 | private final String source; 18 | private transient Map fields; 19 | 20 | NamedSource(final String 
name, final String source) { 21 | Assert.nonNull(name, source); 22 | Assert.check(!name.isEmpty(), !source.isEmpty()); 23 | this.name = name; 24 | this.source = source; 25 | } 26 | 27 | /** @param fields Must be directly mappable to a JSONObject. 28 | * That means, a java.util.Map with string keys and mappable values. 29 | * http://code.google.com/p/json-simple/wiki/MappingBetweenJSONAndJavaEntities 30 | */ 31 | NamedSource(final String name, final Map fields) { 32 | Assert.nonNull(name, fields); 33 | Assert.check(!name.isEmpty()); 34 | this.name = name; 35 | this.fields = fields; 36 | this.source = JSONObject.toJSONString(fields); 37 | } 38 | 39 | public String toString() { 40 | return String.format("[%s %s, source.length=%s]", getClass().getSimpleName(), name(), source().length()); 41 | } 42 | 43 | 44 | public String name() { 45 | return name; 46 | } 47 | 48 | public String source() { 49 | return source; 50 | } 51 | 52 | @SuppressWarnings("unchecked") 53 | public Map fields() { 54 | if (fields != null) return fields; 55 | try { 56 | fields = immutable((Map) new JSONParser().parse(source())); 57 | } catch (Exception e) { 58 | String message = String.format("Failed to parse source for %s with id='%s'", 59 | getClass().getSimpleName(), name); 60 | Assert.unreachable(message, e); 61 | } 62 | Assert.check(fields instanceof Map); 63 | return fields; 64 | } 65 | 66 | private static final long serialVersionUID = 0; 67 | 68 | } 69 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Query.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import com.mozilla.grouperfish.base.Assert; 4 | 5 | 6 | /** Simple query+name wrapper. 
*/ 7 | public class Query extends NamedSource { 8 | 9 | public Query(final String name, final String json) { 10 | super(name, json); 11 | Assert.check(!name.isEmpty(), !json.isEmpty()); 12 | } 13 | 14 | public boolean isTemplate() { 15 | // :TODO: NEXT: 16 | // Implement templates 17 | return false; 18 | } 19 | 20 | private static final long serialVersionUID = 0; 21 | 22 | } 23 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Task.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | 6 | import org.joda.time.Instant; 7 | 8 | import com.google.common.collect.ImmutableList; 9 | import com.mozilla.grouperfish.base.Assert; 10 | import com.mozilla.grouperfish.naming.Namespace; 11 | 12 | 13 | /** Immutable task description. */ 14 | public class Task implements Serializable { 15 | 16 | // We do not want to serialize the namespace Object itself, but rather its name. 
17 | private final String namespace; 18 | private final Query query; 19 | private final TransformConfig transform; 20 | private final Instant created; 21 | private final List failures; 22 | 23 | public Task(final Namespace ns, final Query query, final TransformConfig transform) { 24 | Assert.nonNull(ns, query, transform); 25 | this.namespace = ns.raw(); 26 | this.query = query; 27 | this.transform = transform; 28 | created = Instant.now(); 29 | failures = ImmutableList.of(); 30 | } 31 | 32 | private Task(final Task task, final String failure) { 33 | Assert.nonNull(task, failure); 34 | Assert.check(!failure.isEmpty()); 35 | this.namespace = task.namespace; 36 | this.query = task.query; 37 | this.transform = task.transform; 38 | this.created = task.created(); 39 | this.failures = new ImmutableList.Builder().addAll(task.failures).add(failure).build(); 40 | } 41 | 42 | public boolean isOk() { 43 | return failures.isEmpty(); 44 | } 45 | 46 | public Namespace namespace() { 47 | return new Namespace(namespace); 48 | } 49 | 50 | public Query query() { 51 | return query; 52 | } 53 | 54 | public String toString() { 55 | final String faildesc = (failures.size() == 0) ? 
"" : String.format(" (%s failed attempts)", failures.size()); 56 | return String.format("[Task @%s, T:%s, Q:%s%s]", created(), transform.name(), query.name(), faildesc); 57 | } 58 | 59 | public TransformConfig transform() { 60 | return transform; 61 | } 62 | 63 | public Task fail(final String failureMessage) { 64 | return new Task(this, failureMessage); 65 | } 66 | 67 | public List failures() { 68 | return failures; 69 | } 70 | 71 | public Instant created() { 72 | return created; 73 | } 74 | 75 | private static final long serialVersionUID = 0; 76 | 77 | } 78 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/TransformConfig.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import java.util.Map; 4 | 5 | import org.json.simple.JSONObject; 6 | 7 | import com.mozilla.grouperfish.base.Assert; 8 | 9 | 10 | /** Simple config+name wrapper. 
*/ 11 | public class TransformConfig extends NamedSource { 12 | 13 | public TransformConfig(final String name, final String source) { 14 | super(name, source); 15 | Assert.nonNull(name, source); 16 | } 17 | 18 | private static final long serialVersionUID = 0; 19 | 20 | @SuppressWarnings("rawtypes") 21 | public String parametersJson() { 22 | return JSONObject.toJSONString((Map) fields().get("parameters")); 23 | } 24 | 25 | public String transform() { 26 | return (String) fields().get("transform"); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/model/Type.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | public enum Type { 4 | DOCUMENT, 5 | QUERY, 6 | CONFIGURATION_FILTER, 7 | CONFIGURATION_TRANSFORM, 8 | RESULT 9 | } -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/naming/Namespace.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.naming; 2 | 3 | import java.util.EnumMap; 4 | 5 | import com.mozilla.grouperfish.base.Assert; 6 | import com.mozilla.grouperfish.model.Type; 7 | 8 | 9 | /** 10 | * Scopes resource access to a namespace. 
11 | */ 12 | public class Namespace { 13 | 14 | protected final String namespace; 15 | 16 | public Namespace(final String namespace) { 17 | if (namespace.indexOf('.') != -1) { 18 | IllegalStateException e =new IllegalStateException("Illegal namespace: " + namespace); 19 | e.printStackTrace(); 20 | throw e; 21 | } 22 | this.namespace = namespace; 23 | } 24 | 25 | @SuppressWarnings("serial") 26 | private static final EnumMap prefixes = new EnumMap(Type.class) {{ 27 | for (Type t : Type.values()) { 28 | switch (t) { 29 | case DOCUMENT: put(t, "documents_"); break; 30 | case QUERY: put(t, "queries_"); break; 31 | case CONFIGURATION_FILTER: put(t, "configurations_filters_"); break; 32 | case CONFIGURATION_TRANSFORM: put(t, "configurations_transforms_"); break; 33 | case RESULT: put(t, "results_"); break; 34 | default: Assert.unreachable(); 35 | } 36 | } 37 | }}; 38 | 39 | /** Buckets are used to name maps on the grid and indexes. */ 40 | public final String bucket(final Type type) { 41 | return prefixes.get(type) + namespace; 42 | } 43 | 44 | public String raw() { 45 | return namespace; 46 | } 47 | 48 | public String toString() { 49 | return String.format("[Namespace %s]", raw()); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/naming/Scope.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.naming; 2 | 3 | import java.util.Map; 4 | 5 | import com.mozilla.grouperfish.base.Assert; 6 | import com.mozilla.grouperfish.base.json.JsonValidator; 7 | import com.mozilla.grouperfish.model.Access; 8 | import com.mozilla.grouperfish.model.Type; 9 | import com.mozilla.grouperfish.rest.jaxrs.DocumentsResource; 10 | import com.mozilla.grouperfish.rest.jaxrs.QueriesResource; 11 | import com.mozilla.grouperfish.rest.jaxrs.ResultsResource; 12 | import 
com.mozilla.grouperfish.rest.jaxrs.ConfigurationsResource.FilterConfigsResource;
import com.mozilla.grouperfish.rest.jaxrs.ConfigurationsResource.TransformConfigsResource;
import com.mozilla.grouperfish.services.api.Grid;


/**
 * Helps to consistently associate resource access to a namespace.
 * Gatekeeper for each access, allows to implement permissions
 * (in {@link #allows(Class, Access)}).
 */
public class Scope extends Namespace {

    private final Grid grid;

    // NOTE(review): 512 MiB seems very generous for a single document --
    // confirm this limit is intentional.
    private final int maxDocumentLength = 512 * 1024 * 1024;

    public Scope(final String namespace, final Grid grid) {
        super(namespace);
        this.grid = grid;
    }

    /** Re-scopes an existing namespace onto the given grid. */
    public Scope(final Namespace ns, final Grid grid) {
        super(ns.raw());
        this.grid = grid;
    }

    /** @return The grid map holding this namespace's documents. */
    public Map documents() {
        return grid.map(bucket(Type.DOCUMENT));
    }

    /** @return The grid map holding this namespace's queries. */
    public Map queries() {
        return grid.map(bucket(Type.QUERY));
    }

    /** @return The grid map holding this namespace's transform results. */
    public Map results() {
        return grid.map(bucket(Type.RESULT));
    }

    /** @return The grid map for the given resource type. */
    public Map map(final Type type) {
        Assert.nonNull(type);
        return grid.map(bucket(type));
    }

    /**
     * Maps a JAX-RS resource class to the grid map backing it.
     * Unreachable-asserts for resource types not listed here.
     */
    public Map resourceMap(final Class resourceType) {
        Assert.nonNull(resourceType);
        if (resourceType == ResultsResource.class) return results();
        if (resourceType == DocumentsResource.class) return documents();
        if (resourceType == QueriesResource.class) return queries();
        if (resourceType == TransformConfigsResource.class) return map(Type.CONFIGURATION_TRANSFORM);
        if (resourceType == FilterConfigsResource.class) return map(Type.CONFIGURATION_FILTER);
        Assert.unreachable("Unhandled resource type: %s", resourceType.getName());
        return null;
    }

    /** Maximum payload size in bytes (currently the same for all resources). */
    public int maxLength(final Class resourceType, final Access access) {
        return maxDocumentLength;
    }

    /** Permission hook: currently everything is allowed. */
    public boolean allows(final Class resourceType, final Access access) {
        return true;
    }

    /** Validator hook: currently plain JSON well-formedness only. */
    public JsonValidator validator(final Class resourceType) {
        return new JsonValidator();
    }

    public String toString() {
        return String.format("[Scope %s]", raw());
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/api/RestService.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.api;


public interface RestService {

    /** Starts the service and returns a handle to wait on. */
    public Daemon start();

    public static interface Daemon {
        /** Wait for shutdown of the daemon. Intercept the interrupt to clean up your resources. */
        void join() throws InterruptedException;
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jaxrs/DocumentsResource.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jaxrs;

import java.io.IOException;

import javax.servlet.http.HttpServletRequest;
import javax.ws.rs.Consumes;
import javax.ws.rs.DELETE;
import javax.ws.rs.GET;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

import com.google.inject.Inject;
import com.mozilla.grouperfish.services.api.Grid;


/** REST resource for individual documents within a namespace. */
@Path("/documents/{namespace}/{id}")
public class DocumentsResource extends ResourceBase {

    @Inject
    public DocumentsResource(final Grid grid) { super(grid); }

    @GET
    @Produces(MediaType.APPLICATION_JSON)
    public Response getDocument(@PathParam("namespace") String namespace,
                                @PathParam("id") String id,
                                @Context HttpServletRequest request) {
        return RestHelper.getAny(getClass(), scope(namespace), id, request);
    }

    @PUT
    @Consumes(MediaType.APPLICATION_JSON)
    public Response putDocument(@PathParam("namespace") String namespace,
                                @PathParam("id") String id,
                                @Context HttpServletRequest request) throws IOException {
        return RestHelper.putAny(getClass(), scope(namespace), id, request);
    }

    @DELETE
    public Response deleteDocument(@PathParam("namespace") String namespace,
                                   @PathParam("id") String id,
                                   @Context HttpServletRequest request) throws IOException {
        return RestHelper.deleteAny(getClass(), scope(namespace), id, request);
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jaxrs/HttpAccess.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jaxrs;

import java.util.HashMap;
import java.util.Map;

import javax.servlet.http.HttpServletRequest;

import com.mozilla.grouperfish.base.Assert;
import com.mozilla.grouperfish.model.Access;


/** Adapts an incoming HTTP servlet request to the {@link Access} abstraction. */
public class HttpAccess implements Access {

    private final Access.Operation type;
    private final HttpServletRequest request;

    // Default mapping from HTTP method to operation. Methods not listed here
    // (e.g. HEAD, OPTIONS) map to null, which the constructor rejects via
    // Assert.nonNull.
    @SuppressWarnings("serial")
    private static final Map defaultType = new HashMap() {{
        put("PUT", Access.Operation.CREATE);
        put("GET", Access.Operation.READ);
        put("POST", Access.Operation.RUN);
        put("DELETE", Access.Operation.DELETE);
    }};

    /** Derives the operation from the request's HTTP method. */
    public HttpAccess(final HttpServletRequest request) {
        this(defaultType.get(request.getMethod()), request);
    }

    public HttpAccess(final Operation type,
                      final HttpServletRequest request) {
        Assert.nonNull(type);
        this.type = type;
        this.request = request;
    }

    /** @return The remote host the request originated from. */
    @Override
    public String origin() {
        return request.getRemoteHost();
    }

    @Override
    public Operation type() {
        return type;
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jaxrs/QueriesResource.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jaxrs;

import java.io.IOException;

import javax.servlet.http.HttpServletRequest;
import javax.ws.rs.Consumes;
import javax.ws.rs.DELETE;
import javax.ws.rs.GET;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

import com.google.inject.Inject;
import com.mozilla.grouperfish.services.api.Grid;


/** REST resource for (saved) queries within a namespace. */
@Path("/queries/{namespace}")
public class QueriesResource extends ResourceBase {

    @Inject
    public QueriesResource(final Grid grid) { super(grid); }

    /** Lists all queries of the namespace. */
    @GET
    @Produces(MediaType.APPLICATION_JSON)
    public Response list(@PathParam("namespace") String namespace,
                         @Context HttpServletRequest request) {
        return RestHelper.listAny(getClass(), scope(namespace), request);
    }

    @GET
    @Path("/{queryName}")
    @Produces(MediaType.APPLICATION_JSON)
    public Response getQuery(@PathParam("namespace") String namespace,
                             @PathParam("queryName") String queryName,
                             @Context HttpServletRequest request) {
        return RestHelper.getAny(getClass(), scope(namespace), queryName, request);
    }

    @PUT
    @Path("/{queryName}")
    @Consumes(MediaType.APPLICATION_JSON)
    public Response putQuery(@PathParam("namespace") String namespace,
                             @PathParam("queryName") String queryName,
                             @Context HttpServletRequest request) throws IOException {
        return RestHelper.putAny(getClass(),
                scope(namespace), queryName, request);
    }


    @DELETE
    @Path("/{queryName}")
    public Response deleteQuery(@PathParam("namespace") String namespace,
                                @PathParam("queryName") String queryName,
                                @Context HttpServletRequest request) throws IOException {
        return RestHelper.deleteAny(getClass(), scope(namespace), queryName, request);
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jaxrs/ResourceBase.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jaxrs;


import com.mozilla.grouperfish.naming.Scope;
import com.mozilla.grouperfish.services.api.Grid;

/** Common base for JAX-RS resources: binds requests to a namespace scope. */
public class ResourceBase {

    private final Grid grid;

    public ResourceBase(final Grid grid) {
        this.grid = grid;
    }

    /** @return A scope for the given namespace, backed by this resource's grid. */
    protected Scope scope(final String namespace) {
        return new Scope(namespace, grid);
    }


}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jaxrs/ResultsResource.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jaxrs;

import java.io.IOException;

import javax.servlet.http.HttpServletRequest;
import javax.ws.rs.Consumes;
import javax.ws.rs.DELETE;
import javax.ws.rs.GET;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

import com.google.inject.Inject;
import com.mozilla.grouperfish.services.api.Grid;


//:TODO: v0.1
// Integrate facet query parameters
/** REST resource for transform results, keyed by (transform, query). */
@Path("/results/{namespace}/{transform}/{query}")
public class ResultsResource extends ResourceBase {

    @Inject
    public ResultsResource(final Grid grid) { super(grid); }

    @GET
    @Produces(MediaType.APPLICATION_JSON)
    public Response getResult(@PathParam("namespace") String namespace,
                              @PathParam("transform") String transformName,
                              @PathParam("query") String queryName,
                              @Context HttpServletRequest request) {
        return RestHelper.getAny(getClass(), scope(namespace), key(transformName, queryName), request);
    }

    @PUT
    @Consumes(MediaType.APPLICATION_JSON)
    public Response putResult(@PathParam("namespace") String namespace,
                              @PathParam("transform") String transformName,
                              @PathParam("query") String queryName,
                              @Context HttpServletRequest request) throws IOException {
        return RestHelper.putAny(getClass(), scope(namespace), key(transformName, queryName), request);
    }

    @DELETE
    public Response deleteResult(@PathParam("namespace") String namespace,
                                 @PathParam("transform") String transformName,
                                 @PathParam("query") String queryName,
                                 @Context HttpServletRequest request) throws IOException {
        return RestHelper.deleteAny(getClass(), scope(namespace), key(transformName, queryName), request);
    }

    /** Storage key for a result: transform name and query name, joined by '_'. */
    public static String key(final String transformName, final String queryName) {
        return String.format("%s_%s", transformName, queryName);
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jersey/JerseyGuiceRestService.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jersey;

import java.util.HashMap;
import java.util.Map;

import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.DefaultServlet;
import 
org.eclipse.jetty.servlet.ServletContextHandler;

import com.google.inject.Injector;
import com.google.inject.servlet.GuiceFilter;
import com.google.inject.servlet.GuiceServletContextListener;
import com.mozilla.grouperfish.base.Assert;
import com.mozilla.grouperfish.bootstrap.Grouperfish;
import com.mozilla.grouperfish.rest.api.RestService;
import com.sun.jersey.api.core.PackagesResourceConfig;
import com.sun.jersey.api.core.ResourceConfig;
import com.sun.jersey.guice.JerseyServletModule;
import com.sun.jersey.guice.spi.container.servlet.GuiceContainer;


/** Jetty-hosted REST service wiring Jersey (JAX-RS) into Guice. */
public class JerseyGuiceRestService implements RestService {

    public static final String PROPERTY_PORT = "grouperfish.rest.port";
    public static final String PROPERTY_PORT_DEFAULT = String.valueOf(Grouperfish.DEFAULT_PORT);

    private final Server server;

    /**
     * Initializes a Jersey based JAX-RS service using the given resource configuration.
     * The provided configuration class must not be anonymous.
     */
    public JerseyGuiceRestService(final Injector parentInjector,
                                  final Class resourceConfigClass) {

        Assert.nonNull(parentInjector);
        // Anonymous classes have no canonical name, hence these checks.
        Assert.nonNull(resourceConfigClass, resourceConfigClass.getCanonicalName());
        Assert.check(!resourceConfigClass.getCanonicalName().isEmpty());

        final int port = Integer.parseInt(System.getProperty(PROPERTY_PORT, PROPERTY_PORT_DEFAULT));

        server = new Server(port);
        final ServletContextHandler root =
                new ServletContextHandler(server, "/", ServletContextHandler.NO_SESSIONS);

        root.addEventListener(new GuiceServletContextListener() {

            @Override
            protected Injector getInjector() {
                return parentInjector.createChildInjector(new JerseyServletModule() {
                    protected void configureServlets() {
                        final Map params = new HashMap();
                        // NOTE(review): PROPERTY_PACKAGES is set to "jetty",
                        // which does not look like a resource package name --
                        // presumably the actual packages come from the
                        // resourceConfigClass below. Confirm this is intended.
                        params.put(PackagesResourceConfig.PROPERTY_PACKAGES,
                                "jetty");
                        params.put("com.sun.jersey.config.property.resourceConfigClass",
                                resourceConfigClass.getCanonicalName());
                        serve("/*").with(GuiceContainer.class, params);
                    }
                });
            }
        });

        root.addFilter(GuiceFilter.class, "/*", null);
        root.addServlet(DefaultServlet.class, "/");

        // Do not advertise server/version details; shut down with the JVM.
        server.setSendServerVersion(false);
        server.setSendDateHeader(false);
        server.setStopAtShutdown(true);

    }

    /** Starts the embedded Jetty server and returns a join-able handle. */
    @Override
    public Daemon start() {
        try {
            server.start();
        }
        catch(final Exception e) {
            throw new RuntimeException(e);
        }

        return new Daemon() {
            @Override
            public void join() throws InterruptedException {
                server.join();
            }
        };
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/rest/jersey/ResourceConfig.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.rest.jersey;

import 
com.sun.jersey.api.core.PackagesResourceConfig;

/** Tells Jersey which package to scan for JAX-RS resource classes. */
public class ResourceConfig extends PackagesResourceConfig {

    public ResourceConfig() {
        super("com.mozilla.grouperfish.rest.jaxrs");
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/FileSystem.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api;

import java.io.Reader;
import java.io.Writer;


/**
 * A (virtual) file system root with the essential primitives for
 * Grouperfish batch operation:
 * - create/remove directories
 * - read/write textual data (tsv, json)
 * - obtain global uris that can be passed to external components.
 *
 * All operations except {@link #uri(String)} work with relative
 * (virtual) paths that are only meaningful to this filesystem
 * instance (in-memory fs, temp fs) or instances created with the same
 * parameters (local fs, distributed fs).
 */
public interface FileSystem {

    /** Result: The absolute path that was removed. */
    String removeRecursively(String relativePath) throws Denied, NotFound;

    /**
     * Creates the given directory if it does not exist already.
     * Fails if path exists but is not a directory.
     * @return The uri of the directory that was created.
     */
    String makeDirectory(String relativePath) throws Denied;

    /**
     * Opens a file for writing (creates the file if not present).
     * @return A suitable writer for string data.
     */
    Writer writer(String path) throws Denied;

    /**
     * Opens a file for reading.
     * @param path The filesystem local path.
     * @return A suitable reader for string data.
     */
    Reader reader(String path) throws Denied, NotFound;

    /**
     * Generate a url that can be used to reference this relative path externally.
     * Ensures that the referee actually exists (at least currently).
     */
    String uri(String path) throws NotFound;

    /** Base class of all filesystem errors. */
    public static class FsError extends Exception {
        public FsError(final String message) { super(message); }
        public FsError(final String message, final Exception reason) { super(message, reason); }
        private static final long serialVersionUID = 1L;
    };

    /** Access was denied by the underlying filesystem. */
    public static class Denied extends FsError {
        public Denied(final String more) { super("Denied: " + more); }
        public Denied(final String more, final Exception reason) { super("Denied: " + more, reason); }
        private static final long serialVersionUID = 1L;
    };

    /** The given path does not exist. */
    public static class NotFound extends FsError {
        public NotFound(final String uri) { super("Not found: " + uri); }
        public NotFound(final String uri, final Exception reason) { super("Not found: " + uri, reason); }
        private static final long serialVersionUID = 1L;
    };

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/Grid.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api;

import java.util.Map;
import java.util.concurrent.BlockingQueue;

/** Named shared maps and queues (e.g. backed by Hazelcast, or in-memory mocks). */
public interface Grid {

    Map map(String name);

    BlockingQueue queue(String name);

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/Index.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api;

import com.mozilla.grouperfish.model.Document;
import 
com.mozilla.grouperfish.model.Query;


/** Full text index over documents and stored queries. */
public interface Index {

    /** @return The documents matching the given query. */
    Iterable find(Query query);

    /** @return The stored queries matching the given query. */
    Iterable resolve(Query query);

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/IndexProvider.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api;

/** Factory for named indexes. */
public interface IndexProvider {

    Index index(String name);

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/guice/Local.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api.guice;

import static java.lang.annotation.ElementType.PARAMETER;
import static java.lang.annotation.RetentionPolicy.RUNTIME;

import java.lang.annotation.Retention;
import java.lang.annotation.Target;

import com.google.inject.BindingAnnotation;

/**
 * When a local resource is requested as a parameter, the client
 * will not try to share it. Examples: The local FileSystem, a
 * local memory grid instead of a HazelCast grid.
 *
 * Whether injectors can still pass shared resources depends on
 * the resource: A local FileSystem needs to be local, because
 * other local processes need to read from it. A 'local' grid
 * might simply be an optimization over a shared grid.
 */
@BindingAnnotation @Target(PARAMETER) @Retention(RUNTIME)
public @interface Local { }
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/api/guice/Shared.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.api.guice;

import static java.lang.annotation.ElementType.PARAMETER;
import static java.lang.annotation.RetentionPolicy.RUNTIME;

import java.lang.annotation.Retention;
import java.lang.annotation.Target;

import com.google.inject.BindingAnnotation;


/**
 * When a shared resource is requested as a parameter, the client
 * intends to share the service with other cluster members. Modules
 * should provide appropriate services (Hadoop FS, HazelCast map...).
 *
 * Shared is the default.
 *
 * Injection might still pass a local resource, but should do so
 * only if the modus operandi is guaranteed to be standalone: Here
 * local/shared makes no difference. Examples for this are
 * testing/development setups.
 */
@BindingAnnotation @Target(PARAMETER) @Retention(RUNTIME)
public @interface Shared { }
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/elasticsearch/ElasticSearchIndexProvider.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.elasticsearch;

import java.util.Properties;

import org.elasticsearch.client.Client;
import org.elasticsearch.node.Node;
import org.elasticsearch.node.NodeBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mozilla.grouperfish.services.api.Index;
import com.mozilla.grouperfish.services.api.IndexProvider;

/** Provides ElasticSearch-backed indexes by joining the cluster as a client-only node. */
public class ElasticSearchIndexProvider implements IndexProvider {

    private static final Logger log = LoggerFactory.getLogger(ElasticSearchIndexProvider.class);

    public static final String PROPERTY_CLUSTER = "grouperfish.services.elasticsearch.cluster";
    public static final String PROPERTY_CLUSTER_DEFAULT = "grouperfish";

    public static final String PROPERTY_TYPE = "grouperfish.services.elasticsearch.type";
    // :TODO: Hack... to simplify, we should use 1 index for all HC maps, and differentiate solely using type.
    public static final String PROPERTY_TYPE_DEFAULT = "documents";

    private final String type;
    private final Client client;

    // NOTE(review): the passed-in Properties are never read; configuration
    // comes from System properties instead. Confirm whether that is intended.
    public ElasticSearchIndexProvider(final Properties properties) {
        type = System.getProperty(PROPERTY_TYPE, PROPERTY_TYPE_DEFAULT);
        final String clusterName = System.getProperty(PROPERTY_CLUSTER, PROPERTY_CLUSTER_DEFAULT);
        // client(true)/data(false): this node holds no data, it only routes
        // requests into the cluster.
        final Node node = NodeBuilder.nodeBuilder().loadConfigSettings(false).client(true).data(false).clusterName(clusterName).build();
        node.start();
        client = node.client();

        log.info(String.format("Instantiated index provider: %s (cluster.name=%s)",
                getClass().getSimpleName(), clusterName));
    }

    @Override
    public Index index(final String name) {
        return new ElasticSearchIndex(client, name, type);
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/services/hazelcast/HazelcastGrid.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.services.hazelcast;

import java.util.Map;
import java.util.concurrent.BlockingQueue;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hazelcast.config.Config;
import com.hazelcast.config.MapConfig;
import com.hazelcast.core.Hazelcast;
import com.mozilla.grouperfish.services.api.Grid;


/** Hazelcast-backed implementation of the {@link Grid} service. */
public class HazelcastGrid implements Grid {

    private static final Logger log = LoggerFactory.getLogger(HazelcastGrid.class);

    public HazelcastGrid() {
        // Initialize some of Hazelcast now rather than waiting for the first request
        Hazelcast.getDefaultInstance();
        final Config config = Hazelcast.getConfig();
        final StringBuilder sb = new StringBuilder();
        for (final Map.Entry entry : config.getMapConfigs().entrySet()) {
sb.append(entry.getKey()).append(", "); 26 | } 27 | final int numMembers = Hazelcast.getCluster().getMembers().size(); 28 | 29 | // Force initialization of index. 30 | // :TODO: make less hacky... 31 | log.info("Initializing HC ES node..."); 32 | Hazelcast.getMap("documents_grouperfish").get("unused"); 33 | 34 | log.info(String.format("Instantiated service: %s (maps=%smembers=%s)", 35 | getClass().getSimpleName(), sb.toString(), numMembers)); 36 | } 37 | 38 | @Override 39 | public Map map(final String name) { 40 | return Hazelcast.getMap(name); 41 | } 42 | 43 | @Override 44 | public BlockingQueue queue(final String name) { 45 | return Hazelcast.getQueue(name); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/services/mock/MockFs.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.services.mock; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.ByteArrayOutputStream; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.io.OutputStreamWriter; 8 | import java.io.Reader; 9 | import java.io.Writer; 10 | import java.util.ArrayList; 11 | import java.util.Hashtable; 12 | import java.util.List; 13 | import java.util.Map; 14 | 15 | import com.mozilla.grouperfish.base.ArrayTool; 16 | import com.mozilla.grouperfish.base.Assert; 17 | import com.mozilla.grouperfish.services.api.FileSystem; 18 | 19 | 20 | public class MockFs implements FileSystem { 21 | 22 | private final String root; 23 | 24 | private final Map files = new Hashtable(); 25 | 26 | public MockFs(final String root) { 27 | this.root = root; 28 | } 29 | 30 | @Override 31 | public synchronized String removeRecursively(final String relativePath) throws Denied, NotFound { 32 | Assert.nonNull(relativePath); 33 | Assert.check(!relativePath.isEmpty()); 34 | 35 | final List toRemove = new ArrayList(); 36 | 
for (final String key : files.keySet()) { 37 | if (!key.startsWith(relativePath)) continue; 38 | final String rest = key.substring(key.length()); 39 | if (rest.startsWith("/") || relativePath.endsWith("/") || rest.isEmpty()) { 40 | toRemove.add(key); 41 | } 42 | } 43 | for (final String key : toRemove) files.remove(key); 44 | return uncheckedUri(relativePath); 45 | } 46 | 47 | @Override 48 | public synchronized String makeDirectory(final String relativePath) throws Denied { 49 | Assert.nonNull(relativePath); 50 | Assert.check(!relativePath.isEmpty()); 51 | if (files.containsKey(relativePath)) throw new Denied("used as file: " + uncheckedUri(relativePath)); 52 | return uncheckedUri(relativePath); 53 | } 54 | 55 | @Override 56 | public synchronized Writer writer(final String path) throws Denied { 57 | return new OutputStreamWriter(new ByteArrayOutputStream() { 58 | @Override 59 | public void close() throws IOException { 60 | if (files.containsKey(path)) { 61 | files.put(path, ArrayTool.concat(files.get(path), toByteArray())); 62 | } 63 | else { 64 | files.put(path, toByteArray()); 65 | } 66 | } 67 | }); 68 | } 69 | 70 | @Override 71 | public synchronized Reader reader(final String path) throws Denied, NotFound { 72 | if (!files.containsKey(path)) throw new NotFound(uri(path)); 73 | return new InputStreamReader(new ByteArrayInputStream(files.get(path))); 74 | } 75 | 76 | @Override 77 | public String uri(final String relativePath) throws NotFound { 78 | if (!files.containsKey(relativePath)) throw new NotFound(relativePath); 79 | return uncheckedUri(relativePath); 80 | } 81 | 82 | private String uncheckedUri(final String relativePath) { 83 | return "mockfs://" + root + (relativePath.startsWith("/") ? 
"" : "/" ) + relativePath; 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/services/mock/MockGrid.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.services.mock; 2 | 3 | import java.util.Hashtable; 4 | import java.util.Map; 5 | import java.util.concurrent.ArrayBlockingQueue; 6 | import java.util.concurrent.BlockingQueue; 7 | 8 | import com.mozilla.grouperfish.services.api.Grid; 9 | 10 | 11 | /** 12 | * In memory grid service, usable for some mocking. 13 | * 14 | * This cannot be used as an actual replacement for Hazelcast 15 | * because it lacks the persistence/indexing provided by Bagheera. 16 | * 17 | * Make sure to instantiate this as a singleton (e.g. using Guice). 18 | */ 19 | public class MockGrid implements Grid { 20 | 21 | // We want a concurrent map, like Hazelcast provides. 22 | private final Map> maps = 23 | new Hashtable>(); 24 | 25 | private final int queueCapacity = 1000; 26 | 27 | private final Map> queues = 28 | new Hashtable>(); 29 | 30 | @Override 31 | public synchronized Map map(final String name) { 32 | if (!maps.containsKey(name)) { 33 | maps.put(name, new Hashtable()); 34 | } 35 | return maps.get(name); 36 | } 37 | 38 | @SuppressWarnings("unchecked") 39 | @Override 40 | public synchronized BlockingQueue queue(final String name) { 41 | 42 | if (!queues.containsKey(name)) { 43 | queues.put(name, new ArrayBlockingQueue(queueCapacity)); 44 | } 45 | 46 | return (BlockingQueue) queues.get(name); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /service/src/main/java/com/mozilla/grouperfish/services/mock/MockIndex.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.services.mock; 2 | 3 | import java.util.List; 4 | 5 | import 
org.elasticsearch.common.collect.ImmutableList;

import com.mozilla.grouperfish.model.Document;
import com.mozilla.grouperfish.model.Query;
import com.mozilla.grouperfish.services.api.Index;

/** Canned {@link Index}: returns fixed documents/queries regardless of input. */
public class MockIndex implements Index {

    // Chosen by fair dice roll.
    private final List randomDocuments =
            new ImmutableList.Builder().
            add(new Document("A", "{\"id\": \"A\", \"text\": \"Some random text.\"}")).
            add(new Document("B", "{\"id\": \"B\", \"text\": \"Another text which is completely random.\"}")).
            add(new Document("C", "{\"id\": \"C\", \"text\": \"Only an ape with typewriter could think of this.\"}")).
            build();

    private final List randomQueries =
            new ImmutableList.Builder().
            add(new Query("A", "{\"query\": {\"field\": {\"x\": \"some\"}}}")).
            add(new Query("B", "{\"query\": {\"field\": {\"x\": \"thing\"}}}")).
            build();

    /** Ignores the query and returns the fixed document list. */
    @Override
    public Iterable find(final Query query) {
        return randomDocuments;
    }

    /** Ignores the query and returns the fixed query list. */
    @Override
    public Iterable resolve(Query query) {
        return randomQueries;
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/util/loader/DocumentLoader.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.util.loader;

import com.mozilla.grouperfish.model.Document;


/** Loader preconfigured for the /documents/{namespace} REST endpoint. */
public class DocumentLoader extends Loader {

    public DocumentLoader(final String baseUrl, final String namespace) {
        super(baseUrl + "/documents/" + namespace);
    }

}
--------------------------------------------------------------------------------
/service/src/main/java/com/mozilla/grouperfish/util/logback/AnsiColorConverter.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.util.logback;

import ch.qos.logback.classic.Level;
import ch.qos.logback.classic.pattern.ClassicConverter;
import ch.qos.logback.classic.spi.ILoggingEvent;

/** Logback converter that wraps the log level in ANSI color escape codes. */
@SuppressWarnings("unused")
public class AnsiColorConverter extends ClassicConverter {

    private static final int NORMAL = 0;
    private static final int BRIGHT = 1;
    private static final int FOREGROUND_BLACK = 30;
    private static final int FOREGROUND_RED = 31;
    private static final int FOREGROUND_GREEN = 32;
    private static final int FOREGROUND_YELLOW = 33;
    private static final int FOREGROUND_BLUE = 34;
    private static final int FOREGROUND_MAGENTA = 35;
    private static final int FOREGROUND_CYAN = 36;
    private static final int FOREGROUND_WHITE = 37;

    // ANSI escape sequence building blocks: ESC [ <attributes> m
    private static final String PREFIX = "\u001b[";
    private static final String SUFFIX = "m";
    private static final char SEPARATOR = ';';
    private static final String END_COLOR = PREFIX + SUFFIX;

    private static final String ERROR_COLOR = PREFIX + BRIGHT + SEPARATOR + FOREGROUND_RED + SUFFIX;
    private static final String WARN_COLOR = PREFIX + NORMAL + SEPARATOR + FOREGROUND_YELLOW + SUFFIX;
    private static final String INFO_COLOR = PREFIX + NORMAL + SEPARATOR + FOREGROUND_GREEN + SUFFIX;
    private static final String DEBUG_COLOR = PREFIX + NORMAL + SEPARATOR + FOREGROUND_CYAN + SUFFIX;
    private static final String TRACE_COLOR = PREFIX + NORMAL + SEPARATOR + FOREGROUND_BLUE + SUFFIX;

    /** Renders the event's level, colorized according to severity. */
    @Override
    public String convert(final ILoggingEvent event) {
        final StringBuilder sb = new StringBuilder();
        sb.append(getColor(event.getLevel()));
        sb.append(event.getLevel());
        sb.append(END_COLOR);
        return sb.toString();
    }

    /**
     * Returns the appropriate characters to change the color for the specified
     * logging level.
     */
    private String getColor(final Level level) {
        switch (level.toInt()) {
            case Level.ERROR_INT: return ERROR_COLOR;
            case Level.WARN_INT: return WARN_COLOR;
            case Level.INFO_INT: return INFO_COLOR;
            case Level.DEBUG_INT: return DEBUG_COLOR;
            case Level.TRACE_INT: return TRACE_COLOR;
            default:
                return "";
        }
    }
}
--------------------------------------------------------------------------------
/service/src/main/resources/logback-stdout.xml:
--------------------------------------------------------------------------------
%d{HH:mm:ss} [%thread] %ansiLevel %logger{35} - %msg %ex{full} %n
--------------------------------------------------------------------------------
/service/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
%d{HH:mm:ss.SSS} [%thread] %-5level %logger{35} - %msg %ex{full} %n
logs/grouperfish.log
logs/archive/grouperfish.%d{yyyy/MM}.log.gz
365
--------------------------------------------------------------------------------
/service/src/test/java/com/mozilla/grouperfish/base/AssertTest.java:
--------------------------------------------------------------------------------
package com.mozilla.grouperfish.base;

import org.testng.annotations.Test;

import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.fail;


@Test(groups="unit")
public class AssertTest {

    public void testNonNullPass() {
        Assert.nonNull(new int[100]);
        Assert.nonNull("a", 123, new Object());
    }

    @Test(expectedExceptions=IllegalArgumentException.class)
    public void testNonNullFailSingle() {
        final String nothing = null;
        Assert.nonNull(nothing);
    }

@Test(expectedExceptions=IllegalArgumentException.class) 24 | public void testNonNullFailMulti() { 25 | Assert.nonNull("a", 123, new Object(), null); 26 | } 27 | 28 | public void testCheckPass() { 29 | Assert.check(true); 30 | Assert.check(true, true); 31 | Assert.check(true, true, true); 32 | } 33 | 34 | @Test(expectedExceptions=IllegalArgumentException.class) 35 | public void testCheckFailSingle() { 36 | Assert.check(false); 37 | } 38 | 39 | @Test(expectedExceptions=IllegalArgumentException.class) 40 | public void testCheckFailMulti() { 41 | Assert.check(true, true, false); 42 | } 43 | 44 | @Test(expectedExceptions=IllegalStateException.class) 45 | public void testUnreachable() { 46 | Assert.unreachable(); 47 | } 48 | 49 | @Test(expectedExceptions = IllegalStateException.class) 50 | public void testUnreachableType() { 51 | String bogus = Assert.unreachable(String.class); 52 | fail(bogus); 53 | } 54 | 55 | public void testUnreachableWrap() { 56 | Exception inner = new RuntimeException(); 57 | 58 | try { 59 | String neverAssigned = Assert.unreachable(String.class, inner); 60 | fail(neverAssigned); 61 | } 62 | catch (IllegalStateException e) { 63 | assertEquals(inner, e.getCause()); 64 | } 65 | } 66 | 67 | public void testUnreachableArgs() { 68 | try { 69 | Assert.unreachable("Arrrgh"); 70 | fail(); 71 | } 72 | catch (IllegalStateException e) { 73 | assertEquals( 74 | "[ASSERTION FAILED] Code should be unreachable: Arrrgh\n", 75 | e.getMessage()); 76 | } 77 | 78 | try { 79 | Assert.unreachable("Wut: %s %s???", "Over", 9000); 80 | fail(); 81 | } 82 | catch (IllegalStateException e) { 83 | assertEquals( 84 | "[ASSERTION FAILED] Code should be unreachable: Wut: Over 9000???\n", 85 | e.getMessage()); 86 | } 87 | 88 | try { 89 | String neverAssigned = 90 | Assert.unreachable(String.class, "Wut: %s %s???", "Over", 9000); 91 | fail(neverAssigned); 92 | } 93 | catch (IllegalStateException e) { 94 | assertEquals( 95 | "[ASSERTION FAILED] Code should be unreachable: Wut: Over 
9000???\n", 96 | e.getMessage()); 97 | } 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/base/SlugToolTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import org.testng.annotations.Test; 4 | 5 | import static org.testng.AssertJUnit.assertEquals; 6 | 7 | 8 | @Test(groups="unit") 9 | public class SlugToolTest { 10 | 11 | public void testToSlug() { 12 | 13 | assertEquals("my-name-is-joe", SlugTool.toSlug("My Name is Joe")); 14 | assertEquals("wut-over-9000", SlugTool.toSlug("Wut, over 9000?!?")); 15 | assertEquals("space-----madness", SlugTool.toSlug("Space Madness")); 16 | 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/base/StreamToolTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.nio.charset.Charset; 7 | 8 | import org.testng.annotations.Test; 9 | 10 | import static org.testng.AssertJUnit.assertEquals; 11 | import static org.testng.AssertJUnit.fail; 12 | 13 | 14 | @Test(groups="unit") 15 | public class StreamToolTest { 16 | 17 | Charset UTF8 = Charset.forName("UTF-8"); 18 | 19 | private String[] fixtures() { 20 | final String empty = ""; 21 | final String single = "A"; 22 | final String shortish = 23 | "The Mozilla project is a global community of people who believe that openness, " 24 | + "innovation, and opportunity are key to the continued health of the Internet. " 25 | + "We have worked together since 1998 to ensure that the Internet is developed " 26 | + "in a way that benefits everyone. 
As a result of the community's efforts, we " 27 | + "have distilled a set of principles that we believe are critical for the " 28 | + "Internet to continue to benefit the public good. These principles are " 29 | + "contained in the Mozilla Manifesto."; 30 | 31 | final String longish = shortish + shortish + shortish + shortish + shortish; 32 | final String longer = longish + longish + longish + longish + longish; 33 | final String reallyLong = longer + longer + longer + longer + longer; 34 | 35 | final String unicode = "Internet se stává důležitou součástí našich životů."; 36 | 37 | return new String[]{empty, single, shortish, 38 | longish, longer, reallyLong, unicode}; 39 | } 40 | 41 | public void testConsumeInputStreamCharset() { 42 | for (String fixture : fixtures()) { 43 | InputStream stream = new ByteArrayInputStream(fixture.getBytes(UTF8)); 44 | try { 45 | assertEquals(fixture, StreamTool.consume(stream, UTF8)); 46 | } catch (IOException e) { 47 | fail(e.getMessage()); 48 | } 49 | } 50 | } 51 | 52 | public void testConsumeInputStreamCharsetLimit() { 53 | for (String fixture : fixtures()) { 54 | try { 55 | InputStream stream = new ByteArrayInputStream(fixture.getBytes(UTF8)); 56 | assertEquals(fixture, StreamTool.maybeConsume(stream, UTF8, fixture.length())); 57 | 58 | if (fixture.length() <= 1) continue; 59 | stream = new ByteArrayInputStream(fixture.getBytes(UTF8)); 60 | assertEquals(null, StreamTool.maybeConsume(stream, UTF8, fixture.length() - 1)); 61 | } catch (IOException e) { 62 | fail(e.getMessage()); 63 | } 64 | } 65 | } 66 | 67 | @Test(expectedExceptions = IllegalArgumentException.class) 68 | public void testMissingStream() throws IOException { 69 | StreamTool.maybeConsume(null, UTF8, 0); 70 | } 71 | 72 | @Test(expectedExceptions = IllegalArgumentException.class) 73 | public void testMissingEncoding() throws IOException { 74 | StreamTool.consume(new ByteArrayInputStream("lolwut".getBytes(UTF8)), null); 75 | } 76 | } 77 | 
-------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/base/json/JsonValidatorTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base.json; 2 | 3 | import java.io.IOException; 4 | 5 | import org.testng.annotations.Test; 6 | 7 | import com.mozilla.grouperfish.base.json.JsonValidator; 8 | 9 | import static org.testng.AssertJUnit.assertTrue; 10 | import static org.testng.AssertJUnit.assertFalse; 11 | 12 | 13 | @Test(groups="unit") 14 | public class JsonValidatorTest { 15 | 16 | public void testInvalidDocument() throws IOException { 17 | assertFalse(new JsonValidator().isValid("Your mom is valit!!!!")); 18 | assertFalse(new JsonValidator().isValid("{{{}}")); 19 | } 20 | 21 | public void testTooEmptyDocument() throws IOException { 22 | assertFalse(new JsonValidator().isValid("")); 23 | } 24 | 25 | public void testValidDocument() throws IOException { 26 | assertTrue(new JsonValidator().isValid("{}")); 27 | assertTrue(new JsonValidator().isValid("{\"a\": 1}")); 28 | assertTrue(new JsonValidator().isValid("{\"a\": 1, \"b\": 2}")); 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/base/json/MapStreamerTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.base.json; 2 | 3 | import java.io.ByteArrayOutputStream; 4 | import java.io.IOException; 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | import java.util.TreeMap; 8 | 9 | import org.testng.annotations.Test; 10 | 11 | import com.mozilla.grouperfish.base.json.MapStreamer; 12 | 13 | import static org.testng.AssertJUnit.assertEquals; 14 | 15 | 16 | @Test(groups="unit") 17 | public class MapStreamerTest { 18 | 19 | @SuppressWarnings("serial") 20 | enum Fixture { 21 | EMPTY( 22 | new HashMap(), 23 
| "{}"), 24 | ONE_ENTRY( 25 | new HashMap() {{ 26 | put("item", "{\"something\": 123}"); 27 | }}, 28 | "{\"item\": {\"something\": 123}}"), 29 | MULTIPLE( 30 | new TreeMap() {{ 31 | put("A", "{\"x\": 123}"); 32 | put("B", "{\"y\": [45, 67]}"); 33 | put("C", "{\"z\": 89}"); 34 | }}, 35 | "{\"A\": {\"x\": 123},\n\"B\": {\"y\": [45, 67]},\n\"C\": {\"z\": 89}}"); 36 | 37 | 38 | Map in; 39 | String expected; 40 | 41 | Fixture(Map in, String out) { 42 | this.in = in; 43 | this.expected = out; 44 | } 45 | } 46 | 47 | private void check(Map in, String expected) throws IOException { 48 | MapStreamer streamer = new MapStreamer(in); 49 | final ByteArrayOutputStream out = new ByteArrayOutputStream(); 50 | streamer.write(out); 51 | assertEquals(expected, out.toString("UTF-8")); 52 | } 53 | 54 | public void testEmpty() throws IOException { 55 | check(Fixture.EMPTY.in, Fixture.EMPTY.expected); 56 | } 57 | 58 | public void testOneEntry() throws IOException { 59 | check(Fixture.ONE_ENTRY.in, Fixture.ONE_ENTRY.expected); 60 | } 61 | 62 | public void testMultiple() throws IOException { 63 | check(Fixture.MULTIPLE.in, Fixture.MULTIPLE.expected); 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/model/DocumentTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | import java.util.Collections; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import org.testng.annotations.Test; 8 | 9 | import static org.testng.AssertJUnit.assertEquals; 10 | 11 | 12 | @Test(groups="unit") 13 | @SuppressWarnings("serial") 14 | public class DocumentTest { 15 | 16 | @Test(expectedExceptions=IllegalArgumentException.class) 17 | public void testEmptyDocument() { 18 | final Map empty = Collections.emptyMap(); 19 | new Document(empty).source(); 20 | } 21 | 22 | public void testVerySimpleDocument() { 23 | final 
Map fields = new HashMap() {{ 24 | put("id", 1323); 25 | }}; 26 | Document doc = new Document(fields); 27 | assertEquals("{\"id\":1323}", doc.source()); 28 | assertEquals("1323", doc.name()); 29 | assertEquals("1323", doc.id()); 30 | } 31 | 32 | public void testSimpleDocument() { 33 | final Map fields = new HashMap() {{ 34 | put("id", 1323); 35 | put("something", "else"); 36 | }}; 37 | Document doc = new Document(fields); 38 | assertEquals("1323", doc.id()); 39 | assertEquals("else", doc.fields().get("something")); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/model/DummyAccess.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.model; 2 | 3 | 4 | public class DummyAccess implements Access { 5 | 6 | private final String origin; 7 | private final Operation type; 8 | 9 | public DummyAccess(Operation type, String origin) { 10 | this.origin = origin; 11 | this.type = type; 12 | } 13 | 14 | @Override 15 | public String origin() { 16 | return origin; 17 | } 18 | 19 | @Override 20 | public Operation type() { 21 | return type; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/naming/ScopeTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.naming; 2 | 3 | import static org.testng.AssertJUnit.assertEquals; 4 | import static org.testng.AssertJUnit.assertNotNull; 5 | import static org.testng.AssertJUnit.assertTrue; 6 | 7 | import org.testng.annotations.Test; 8 | 9 | import com.mozilla.grouperfish.model.Access; 10 | import com.mozilla.grouperfish.model.Type; 11 | import com.mozilla.grouperfish.model.DummyAccess; 12 | import com.mozilla.grouperfish.model.Access.Operation; 13 | import com.mozilla.grouperfish.naming.Scope; 14 | 
import com.mozilla.grouperfish.rest.jaxrs.ConfigurationsResource; 15 | import com.mozilla.grouperfish.rest.jaxrs.DocumentsResource; 16 | import com.mozilla.grouperfish.rest.jaxrs.QueriesResource; 17 | import com.mozilla.grouperfish.rest.jaxrs.ResultsResource; 18 | import com.mozilla.grouperfish.services.api.Grid; 19 | import com.mozilla.grouperfish.services.mock.MockGrid; 20 | 21 | 22 | @Test(groups="unit") 23 | public class ScopeTest { 24 | 25 | private final String NS = "unit-test"; 26 | private final Grid grid = new MockGrid(); 27 | private final Access DUMMY_ACCESS = new DummyAccess(Operation.CREATE, "dummy.example.com"); 28 | 29 | public void testAllows() { 30 | assertTrue(scope(NS).allows(DocumentsResource.class, DUMMY_ACCESS)); 31 | } 32 | 33 | public void testExistingConfigurations() { 34 | for (final Type type : Type.values()) { 35 | assertNotNull(scope(NS).map(type)); 36 | } 37 | } 38 | 39 | @Test(expectedExceptions=IllegalArgumentException.class) 40 | public void testInvalidConfigurations() { 41 | scope(NS).map(null); 42 | } 43 | 44 | public void testDocuments() { 45 | assertNotNull(scope(NS).documents()); 46 | } 47 | 48 | public void testMaxLength() { 49 | Access access = new DummyAccess(Operation.CREATE, "dummy.example.com"); 50 | assertTrue(0 < scope(NS).maxLength(DocumentsResource.class, access)); 51 | } 52 | 53 | public void testQueries() { 54 | assertNotNull(scope(NS).queries()); 55 | } 56 | 57 | public void testResourceMap() { 58 | Scope ns = scope(NS); 59 | assertEquals( 60 | ns.documents(), ns.resourceMap(DocumentsResource.class)); 61 | assertEquals( 62 | ns.queries(), ns.resourceMap(QueriesResource.class)); 63 | assertEquals( 64 | ns.results(), ns.resourceMap(ResultsResource.class)); 65 | assertEquals( 66 | ns.map(Type.CONFIGURATION_FILTER), 67 | ns.resourceMap(ConfigurationsResource.FilterConfigsResource.class)); 68 | assertEquals( 69 | ns.map(Type.CONFIGURATION_TRANSFORM), 70 | 
ns.resourceMap(ConfigurationsResource.TransformConfigsResource.class)); 71 | } 72 | 73 | @Test(expectedExceptions=IllegalStateException.class) 74 | public void testInvalidResourceMap() { 75 | final Scope ns = scope(NS); 76 | ns.resourceMap(Object.class); 77 | } 78 | 79 | public void testResults() { 80 | assertNotNull(scope(NS).results()); 81 | } 82 | 83 | public void testToString() { 84 | assertEquals(NS, scope(NS).raw()); 85 | } 86 | 87 | public void testValidator() { 88 | assertNotNull(scope(NS).validator(DocumentsResource.class)); 89 | } 90 | 91 | private Scope scope(String namespace) { 92 | return new Scope(namespace, grid); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/rest/jaxrs/RestHelperTest.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.rest.jaxrs; 2 | 3 | import static org.mockito.Mockito.mock; 4 | import static org.mockito.Mockito.when; 5 | import static org.testng.AssertJUnit.assertEquals; 6 | import static org.testng.AssertJUnit.assertNotNull; 7 | 8 | import java.io.ByteArrayInputStream; 9 | import java.io.IOException; 10 | 11 | import javax.servlet.ServletInputStream; 12 | import javax.servlet.http.HttpServletRequest; 13 | import javax.ws.rs.core.Response; 14 | 15 | import org.testng.annotations.Test; 16 | 17 | import com.mozilla.grouperfish.naming.Scope; 18 | import com.mozilla.grouperfish.rest.jaxrs.DocumentsResource; 19 | import com.mozilla.grouperfish.rest.jaxrs.RestHelper; 20 | import com.mozilla.grouperfish.rest.jaxrs.ConfigurationsResource.TransformConfigsResource; 21 | import com.mozilla.grouperfish.services.api.Grid; 22 | import com.mozilla.grouperfish.services.mock.MockGrid; 23 | 24 | 25 | @Test(groups="unit") 26 | public class RestHelperTest { 27 | 28 | private final Grid grid = new MockGrid(); 29 | private final Scope NS = new Scope("unit-test", grid); 30 | 31 | 
public void testPutAny() throws IOException { 32 | final HttpServletRequest mock = mock(HttpServletRequest.class); 33 | 34 | final String body = "{\"id\": \"mydoc\"}"; 35 | when(mock.getMethod()).thenReturn("PUT"); 36 | when(mock.getContentLength()).thenReturn(body.length()); 37 | when(mock.getInputStream()).thenReturn(new ServletInputStream() { 38 | final ByteArrayInputStream byteStream = new ByteArrayInputStream(body.getBytes()); 39 | @Override 40 | public int read() throws IOException { 41 | return byteStream.read(); 42 | } 43 | }); 44 | 45 | final Response response = RestHelper.putAny(DocumentsResource.class, NS, "mydoc", mock); 46 | assertNotNull(response); 47 | assertEquals(201, response.getStatus()); 48 | } 49 | 50 | 51 | public void testDeleteAny() throws IOException { 52 | final HttpServletRequest mock = mock(HttpServletRequest.class); 53 | when(mock.getMethod()).thenReturn("DELETE"); 54 | 55 | final Response response = RestHelper.deleteAny(DocumentsResource.class, NS, "somedoc", mock); 56 | assertNotNull(response); 57 | assertEquals(204, response.getStatus()); 58 | } 59 | 60 | 61 | public void testGetAny() { 62 | // Put stuff in, to get afterwards: 63 | NS.documents().put("myGetDoc", "{\"id\": \"myGetDoc\"}"); 64 | 65 | final HttpServletRequest mock = mock(HttpServletRequest.class); 66 | final Response response = RestHelper.getAny(DocumentsResource.class, NS, "myGetDoc", mock); 67 | assertNotNull(response); 68 | assertEquals(200, response.getStatus()); 69 | 70 | final Response response404 = RestHelper.getAny(DocumentsResource.class, NS, "no such doc", mock); 71 | assertNotNull(response404); 72 | assertEquals(404, response404.getStatus()); 73 | } 74 | 75 | 76 | public void testListAny() { 77 | final HttpServletRequest mock = mock(HttpServletRequest.class); 78 | final Response response = RestHelper.listAny(TransformConfigsResource.class, NS, mock); 79 | assertNotNull(response); 80 | assertEquals(200, response.getStatus()); 81 | } 82 | 83 | } 84 | 
-------------------------------------------------------------------------------- /service/src/test/java/com/mozilla/grouperfish/unit/UnitTestHelper.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.unit; 2 | 3 | import org.testng.annotations.BeforeGroups; 4 | import org.testng.annotations.Test; 5 | 6 | import com.mozilla.grouperfish.bootstrap.Grouperfish; 7 | 8 | 9 | @Test(groups="unit") 10 | public class UnitTestHelper { 11 | 12 | private final int port = Grouperfish.DEFAULT_PORT + 10; 13 | 14 | @BeforeGroups(groups="unit") 15 | void setUp() throws Exception { 16 | System.setProperty("hazelcast.config", "config/hazelcast.xml"); 17 | System.setProperty("server.port", String.valueOf(port)); 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /service/src/test/resources/config/hazelcast.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 8 | 9 | grouperfish-unit 10 | grouperfish-unit 11 | 12 | 13 | 14 | 5701 15 | 16 | 17 | 18 | 127.0.0.1 19 | 20 | 21 | 22 | * 23 | 24 | 25 | 26 | 27 | 16 28 | 64 29 | 60 30 | 31 | 32 | 33 | 35 | 36 | 0 37 | LRU 38 | 10000 39 | 25 40 | 41 | 42 | 43 | 44 | 45 | 0 46 | 0 47 | LRU 48 | 5000 49 | 25 50 | 51 | 52 | 53 | 54 | 55 | 0 56 | 0 57 | LRU 58 | 5000 59 | 25 60 | 61 | 62 | 63 | 64 | 65 | 0 66 | 0 67 | LRU 68 | 5000 69 | 25 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /service/src/test/resources/ng_unit.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tools/display/src/main/java/com/mozilla/grouperfish/mahout/clustering/display/kmeans/DisplayKMeansBase.java: 
-------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.mahout.clustering.display.kmeans; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.log4j.Logger; 11 | import org.apache.mahout.clustering.WeightedVectorWritable; 12 | import org.apache.mahout.clustering.kmeans.Cluster; 13 | import org.apache.mahout.common.Pair; 14 | import org.apache.mahout.math.Vector; 15 | 16 | import com.mozilla.hadoop.fs.SequenceFileDirectoryReader; 17 | 18 | public class DisplayKMeansBase { 19 | 20 | private static final Logger LOG = Logger.getLogger(DisplayKMeansBase.class); 21 | 22 | public List> readClusteredPoints(Path clusteredPointsPath) { 23 | List> clusteredPoints = new ArrayList>(); 24 | SequenceFileDirectoryReader pointsReader = null; 25 | try { 26 | IntWritable k = new IntWritable(); 27 | WeightedVectorWritable wvw = new WeightedVectorWritable(); 28 | pointsReader = new SequenceFileDirectoryReader(clusteredPointsPath); 29 | while (pointsReader.next(k, wvw)) { 30 | clusteredPoints.add(new Pair(k.get(), wvw.getVector())); 31 | } 32 | } catch (IOException e) { 33 | LOG.error("IOException caught while reading clustered points", e); 34 | } finally { 35 | if (pointsReader != null) { 36 | pointsReader.close(); 37 | } 38 | } 39 | 40 | return clusteredPoints; 41 | } 42 | 43 | public List readClustersIteration(Path clusterIterationPath) { 44 | List clusters = new ArrayList(); 45 | SequenceFileDirectoryReader iterationReader = null; 46 | try { 47 | Text k = new Text(); 48 | Cluster c = new Cluster(); 49 | iterationReader = new SequenceFileDirectoryReader(clusterIterationPath); 50 | while (iterationReader.next(k, c)) { 51 | clusters.add(c); 52 | } 53 | } catch (IOException e) { 54 | LOG.error("IOException caught while reading clustered 
points", e); 55 | } finally { 56 | if (iterationReader != null) { 57 | iterationReader.close(); 58 | } 59 | } 60 | 61 | return clusters; 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /tools/firefox_input/.gitignore: -------------------------------------------------------------------------------- 1 | opinions.* 2 | *.tsv 3 | logs 4 | -------------------------------------------------------------------------------- /tools/firefox_input/README.md: -------------------------------------------------------------------------------- 1 | # Firefox Input support 2 | 3 | This tool loads opinion data, as exposed by Firefox Input, into grouperfish. 4 | 5 | [Firefox Input](https://input.mozilla.com) 6 | 7 | [Data Format](https://wiki.mozilla.org/Firefox/Input/Data) 8 | -------------------------------------------------------------------------------- /tools/firefox_input/install: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # normalize work directory 4 | wd=`dirname "$0"` 5 | wd=`cd "$wd"; pwd` 6 | 7 | cmd="--build" 8 | if [[ "${#}" -eq "1" ]]; then 9 | if [[ "${1}" == --* ]]; then cmd=$1; fi 10 | fi 11 | 12 | dest=../../build/tools/firefox_input/ 13 | case "${cmd}" in 14 | --build|--package) 15 | mvn install || exit 1 16 | mkdir -p "${dest}/lib" 17 | cp target/grouperfish-*.jar "${dest}/lib/" 18 | cp ./load_opinions "${dest}/" 19 | ;; 20 | --clean) 21 | mvn clean 22 | rm -rf "${dest}" 23 | ;; 24 | --help) 25 | "Usage: ${0} [--build|--clean]" 26 | ;; 27 | esac 28 | -------------------------------------------------------------------------------- /tools/firefox_input/load_opinions: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # normalize work directory 4 | wd=`dirname "$0"` 5 | wd=`cd -P "$wd"; pwd` 6 | 7 | 8 | # Load Firefox Input opinions into Grouperfish. 
9 | 10 | self="${0}" 11 | usage() { 12 | echo "Usage: ${self} [http://grouperfish:port] namespace" 13 | echo 14 | echo "Reads opinion data from standard input and loads it into the" 15 | echo "given namespace." 16 | echo 17 | } 18 | 19 | cp_add_dir() { 20 | d="${1}" 21 | for lib in `find "${d}" -type f -name '*.jar'`; do 22 | CLASSPATH=${CLASSPATH}:"${lib}" 23 | done 24 | } 25 | 26 | 27 | tool=`dirname "$self"` 28 | 29 | LOGBACK_OPTS=-Dlogback.configurationFile=logback-stdout.xml 30 | 31 | load() { 32 | if [ -d $tool/target ]; then 33 | # we are on source tree 34 | grouperfish_home="${tool}/../../build" 35 | cp_add_dir "${tool}/target" 36 | else 37 | # we are on build tree 38 | grouperfish_home="${tool}/../.." 39 | cp_add_dir "${tool}/lib" 40 | fi 41 | 42 | cp_add_dir "${grouperfish_home}/lib" 43 | echo "CP $CLASSPATH" 44 | 45 | main=com.mozilla.grouperfish.tools.firefox_input.OpinionLoader 46 | java -cp $CLASSPATH $LOGBACK_OPTS ${main} $@ 47 | } 48 | 49 | case "${1}" in 50 | --help) 51 | usage 52 | ;; 53 | *) 54 | load $@ 55 | esac 56 | -------------------------------------------------------------------------------- /tools/firefox_input/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | grouperfish-tools-fxinput 6 | ${grouperfishVersion} 7 | 8 | com.mozilla 9 | grouperfish-parent 10 | ../../project 11 | FIXED 12 | 13 | 14 | https://github.com/mozilla-metrics/grouperfish 15 | 16 | jar 17 | 18 | 19 | UTF-8 20 | false 21 | 22 | 23 | 24 | 25 | com.mozilla 26 | grouperfish-service 27 | ${grouperfishVersion} 28 | 29 | 30 | 31 | 32 | grouperfish-service 33 | 34 | 35 | 36 | 37 | org.apache.maven.plugins 38 | maven-surefire-plugin 39 | 2.5 40 | 41 | ${skip.tests.unit} 42 | -Xms128m -Xmx768m -XX:PermSize=128m -XX:MaxPermSize=512m 43 | methods 44 | 1 45 | ${project.build.directory}/test-classes 46 | 47 | src/test/resources/ng_unit.xml 48 | 49 | 50 | 51 | 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-jar-plugin 
56 | 2.3.1 57 | 58 | ${project.name}-${project.version} 59 | 60 | 61 | true 62 | ${settings.localRepository} 63 | repository 64 | com.mozilla.grouperfish.tools.firefox_input.OpinionLoader 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /tools/firefox_input/src/main/java/com/mozilla/grouperfish/tools/firefox_input/OpinionLoader.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.tools.firefox_input; 2 | 3 | import com.mozilla.grouperfish.bootstrap.Grouperfish; 4 | import com.mozilla.grouperfish.util.loader.DocumentLoader; 5 | 6 | public class OpinionLoader { 7 | 8 | public static void main(final String[] arguments) { 9 | 10 | if (arguments.length > 2 || (arguments.length >= 1 && "--help".equals(arguments[0]))) { 11 | System.err.println("arguments: [BASE_URL] NAMESPACE"); 12 | System.exit(1); 13 | } 14 | 15 | final String baseUrl; 16 | final String namespace; 17 | if (arguments.length == 2) { 18 | baseUrl = arguments[0]; 19 | namespace = arguments[1]; 20 | } 21 | else { 22 | baseUrl = "http://localhost:" + Grouperfish.DEFAULT_PORT; 23 | namespace = arguments[0]; 24 | } 25 | 26 | new DocumentLoader(baseUrl, namespace).load(new OpinionStream(System.in)); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /tools/firefox_input/src/main/java/com/mozilla/grouperfish/tools/firefox_input/OpinionStream.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.tools.firefox_input; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Arrays; 6 | import java.util.HashMap; 7 | import java.util.Iterator; 8 | 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import com.mozilla.grouperfish.model.Document; 13 | 14 | 15 | public class OpinionStream 
implements Iterable { 16 | 17 | public OpinionStream(final InputStream in) { 18 | in_ = in; 19 | } 20 | 21 | static enum Field { 22 | ID(0), TIMESTAMP(1), TYPE(2), PRODUCT(3), VERSION(4), PLATFORM(5), LOCALE(6), 23 | MANUFACTURER(7), DEVICE(8), URL(9), TEXT(10); 24 | public int i; 25 | 26 | Field(int c) { 27 | i = c; 28 | } 29 | } 30 | 31 | @Override 32 | public Iterator iterator() { 33 | return new OpinionsIterator(new TsvReader(in_)); 34 | } 35 | 36 | private class OpinionsIterator implements Iterator { 37 | 38 | final TsvReader reader_; 39 | int i_ = 0; 40 | String[] row_; 41 | 42 | public OpinionsIterator(TsvReader reader) { 43 | reader_ = reader; 44 | } 45 | 46 | @Override 47 | public Document next() { 48 | @SuppressWarnings("serial") 49 | Document doc = new Document( 50 | row_[Field.ID.i], 51 | new HashMap() {{ 52 | for (Field f : Field.values()) 53 | put(f.name().toLowerCase(), row_[f.i]); 54 | }}); 55 | row_ = null; 56 | return doc; 57 | } 58 | 59 | @Override 60 | public boolean hasNext() { 61 | if (row_ != null) 62 | return true; 63 | try { 64 | row_ = reader_.nextRow(); 65 | if (row_ == null) 66 | return false; 67 | if (row_.length != Field.values().length) { 68 | log.warn( 69 | "L{} skipping record (wrong number of columns) {}\n", 70 | i_, Arrays.toString(row_)); 71 | ++i_; 72 | row_ = null; 73 | return hasNext(); 74 | } 75 | ++i_; 76 | } catch (IOException e) { 77 | e.printStackTrace(); 78 | throw new RuntimeException(e); 79 | } 80 | return true; 81 | } 82 | 83 | @Override 84 | public void remove() { 85 | throw new UnsupportedOperationException(); 86 | } 87 | 88 | } 89 | 90 | private static final Logger log = LoggerFactory.getLogger(OpinionStream.class); 91 | 92 | private final InputStream in_; 93 | 94 | } 95 | -------------------------------------------------------------------------------- /tools/firefox_input/src/main/java/com/mozilla/grouperfish/tools/firefox_input/TsvJsonFromInputTsv.java: 
-------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.tools.firefox_input; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.IOException; 5 | import java.io.OutputStreamWriter; 6 | 7 | import com.mozilla.grouperfish.base.StreamTool; 8 | import com.mozilla.grouperfish.base.json.TsvJsonWriter; 9 | import com.mozilla.grouperfish.model.Document; 10 | 11 | 12 | /** 13 | * Produces a TSV/JSON (our algorithm format) directly from input data, 14 | * without need for a running grouperfish instance. 15 | */ 16 | public class TsvJsonFromInputTsv { 17 | 18 | public static void main(String[] args) throws IOException { 19 | 20 | TsvJsonWriter writer = 21 | new TsvJsonWriter( 22 | new BufferedWriter( 23 | new OutputStreamWriter(System.out, StreamTool.UTF8))); 24 | 25 | 26 | for (final Document doc : new OpinionStream(System.in)) { 27 | writer.write(doc); 28 | }; 29 | 30 | writer.flush(); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /tools/firefox_input/src/main/java/com/mozilla/grouperfish/tools/firefox_input/TsvReader.java: -------------------------------------------------------------------------------- 1 | package com.mozilla.grouperfish.tools.firefox_input; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.nio.charset.Charset; 8 | import java.util.LinkedList; 9 | import java.util.List; 10 | 11 | 12 | /** 13 | * TSV reading state machine. The opencsv lib does not support the 14 | * input.mozilla.com export format (escape without quotes). 
15 | * 16 | * More information: https://wiki.mozilla.org/Firefox/Input/Data 17 | */ 18 | public class TsvReader { 19 | 20 | private static final int EOF = -1; 21 | private static final Charset UTF8 = Charset.forName("UTF-8"); 22 | private static final int BUF_SIZE = 32768 * 32; 23 | 24 | private boolean escaped = false; 25 | private boolean done = false; 26 | private final StringBuilder builder = new StringBuilder(); 27 | private final BufferedReader reader; 28 | 29 | public TsvReader(final InputStream in) { 30 | reader = new BufferedReader(new InputStreamReader(in, UTF8), BUF_SIZE); 31 | } 32 | 33 | public String[] nextRow() throws IOException { 34 | final List row = new LinkedList(); 35 | char c; 36 | while (true) { 37 | if (done) { 38 | return null; 39 | } 40 | int i = reader.read(); 41 | if (i == EOF) { 42 | done = true; 43 | if (builder.length() == 0) 44 | return null; 45 | row.add(builder.toString()); 46 | return row.toArray(new String[row.size()]); 47 | } 48 | 49 | c = (char) i; 50 | if (!escaped) { 51 | switch (c) { 52 | case '\\': 53 | escaped = true; 54 | continue; 55 | case '\t': 56 | row.add(builder.toString()); 57 | builder.setLength(0); 58 | continue; 59 | case '\n': 60 | row.add(builder.toString()); 61 | builder.setLength(0); 62 | return row.toArray(new String[row.size()]); 63 | } 64 | } 65 | builder.append(c); 66 | escaped = false; 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /tools/firefox_input/src/test/resources/ng_unit.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /tools/webui/public/css/topics.css: -------------------------------------------------------------------------------- 1 | a:link, a, a.visited { text-decoration:none; } 2 | body,div,dl,dt,dd,ul,ol,li,h1,h2,h3,h4,h5,h6,pre,form, 
fieldset,input,textarea,blockquote,th,td,p { margin:0; padding:0; } 3 | h1 a, h2 a, h3 a, h4 a, h5 a, h6 a, h1 a:hover, h2 a:hover, h3 a:hover, h4 a:hover, h5 a:hover, h6 a:hover { text-decoration:none; } 4 | table { border-collapse:collapse; border-spacing:0; } 5 | fieldset,img { border:0; } 6 | address,caption,cite,code,dfn,em,strong,th,var { font-style:normal; font-weight:normal; } 7 | ul { list-style:none; } 8 | caption,th { text-align:left; } 9 | q:before,q:after { content:''; } 10 | abbr,acronym { border:0; } 11 | 12 | body { 13 | font-family: Gill Sans, sans-serif; 14 | margin-left:2em; 15 | margin-top:2em; 16 | } 17 | 18 | td { 19 | padding-right:.5em; 20 | padding-left:.5em; 21 | padding-bottom:.25em; 22 | padding-top:.25em; 23 | } 24 | 25 | #words { 26 | margin-bottom:.25em; 27 | border-bottom:1px solid gray; 28 | } 29 | 30 | .clicked { 31 | background-color:black; 32 | color:white; 33 | } 34 | 35 | .hovered { 36 | background-color:gray; 37 | color:white; 38 | } 39 | 40 | #docs { 41 | margin-top:1em; 42 | width:800px; 43 | } 44 | 45 | #docs p { 46 | margin-top:.5em; 47 | } -------------------------------------------------------------------------------- /tools/webui/topics.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Topic Prototype 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |

Topics

13 |
14 |
15 |
16 | 79 | 80 | -------------------------------------------------------------------------------- /transforms/coclustering/INSTALL.MD: -------------------------------------------------------------------------------- 1 | 1. Copy all pig scripts to a directory of your choice. This directory will also 2 | be used as local directory for the scripts. 3 | 2. Copy src/main/python/cocluster.py to this directory. 4 | 3. Copy the bash script ``cocluster`` to this directory. 5 | 4. Ensure you have Python 2.6 with ``python26 `` the command to invoke python 6 | 2.6. 7 | 5. Do mvn package and mvn assembly:assembly in the transforms/coclustering to 8 | generate two JARS in the target folder. 9 | 6. Create a lib directory in your directory of choice that contains 10 | A. lucene-analyzers-3.1.0.jar 11 | B. lucene-core-3.1.0.jar 12 | C. akela-0.2-SNAPSHOT.jar 13 | D. grouperfish-transforms-coclustering-0.3-SNAPSHOT.jar 14 | E. mahout-collections-1.0.jar (Patched for CDH3 u0) 15 | F. mahout-core-0.5.jar (Patched for CDH3 u0) 16 | G. mahout-examples-0.5-job.jar (Patched for CDH3 u0) 17 | H. mahout-math-0.5.jar (Patched for CDH3 u0) 18 | 7. Copy grouperfish-transforms-coclustering-0.3-SNAPSHOT-job.jar to top level of 19 | the directory of your choice. 20 | 8. Create an HDFS directory that contains: 21 | A. input.TSV (Refer readthedocs for format ) 22 | B. parameters.json (Refer this directory for example file) 23 | 9. Do ./cocluster 24 | 10. You will find results.json and tags.json inside the HDFS directory. 25 | 26 | Contact evijayakumar@mozilla.com for any questions. 27 | 28 | -------------------------------------------------------------------------------- /transforms/coclustering/coclustering: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Call python script that performs co-clustering. 
3 | exec python cocluster.py $1 4 | -------------------------------------------------------------------------------- /transforms/coclustering/install: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # normalize work directory 4 | wd=`dirname "$0"` 5 | wd=`cd "$wd"; pwd` 6 | 7 | 8 | cmd="--build" 9 | if [[ "${#}" -eq "1" ]]; then 10 | if [[ "${1}" == --* ]]; then 11 | cmd=$1 12 | fi 13 | fi 14 | 15 | 16 | dest=../../build/transforms/coclustering 17 | case "${cmd}" in 18 | --build|--package) 19 | mvn assembly:assembly || exit 1 20 | mkdir -p $dest 21 | cp target/grouperfish-*.jar $dest/ 22 | cp ./src/main/pig/* $dest/ 23 | cp ./coclustering $dest/ 24 | cp ./src/main/python/cocluster.py $dest/ 25 | # TODO: move the (job) jars into the right place 26 | ;; 27 | --clean) 28 | mvn clean 29 | rm -rf "${dest}" 30 | ;; 31 | --help) 32 | "Usage: ${0} [--build|--clean]" 33 | ;; 34 | *) 35 | "Usage: ${0} [--build|--clean]" 36 | exit 1 37 | ;; 38 | esac 39 | -------------------------------------------------------------------------------- /transforms/coclustering/src/assembly/job.xml: -------------------------------------------------------------------------------- 1 | 3 | job 4 | 5 | jar 6 | 7 | false 8 | 9 | 10 | false 11 | runtime 12 | lib 13 | 14 | ${artifact.groupId}:${artifact.artifactId} 15 | 16 | 17 | 18 | false 19 | system 20 | lib 21 | 22 | ${artifact.groupId}:${artifact.artifactId} 23 | 24 | 25 | 26 | 27 | 28 | ${basedir}/target/classes 29 | / 30 | 31 | *.jar 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /transforms/coclustering/src/main/java/com/mozilla/grouperfish/transforms/coclustering/pig/eval/mahout/Vectorizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license 
agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | package com.mozilla.grouperfish.transforms.coclustering.pig.eval.mahout; 21 | 22 | import java.io.IOException; 23 | 24 | import org.apache.pig.EvalFunc; 25 | import org.apache.pig.data.DataBag; 26 | import org.apache.pig.data.Tuple; 27 | import org.apache.pig.data.TupleFactory; 28 | 29 | public class Vectorizer extends EvalFunc { 30 | 31 | private static final TupleFactory tupleFactory = TupleFactory.getInstance(); 32 | 33 | public Tuple exec(Tuple input) throws IOException { 34 | if (input == null) { 35 | return null; 36 | } 37 | 38 | if (input.size() != 1) { 39 | throw new IOException("Vectorizer requires exactly 1 parameter"); 40 | } 41 | Tuple output = tupleFactory.newTuple(); 42 | DataBag db = (DataBag) input.get(0); 43 | for (Tuple t : db) { 44 | if (t.size() == 2) { 45 | Integer rowId = (Integer) t.get(0); 46 | if (rowId != null) { 47 | Tuple subt = tupleFactory.newTuple(2); 48 | subt.set(0, rowId); 49 | subt.set(1, t.get(1)); 50 | output.append(subt); 51 | } 52 | } 53 | } 54 | return output; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- 
/transforms/coclustering/src/main/java/com/mozilla/grouperfish/transforms/coclustering/pig/eval/text/ConvertDocumentIDToID.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package com.mozilla.grouperfish.transforms.coclustering.pig.eval.text; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.IOException; 24 | import java.io.InputStreamReader; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | import org.apache.hadoop.conf.Configuration; 29 | import org.apache.hadoop.fs.FileStatus; 30 | import org.apache.hadoop.fs.FileSystem; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.pig.EvalFunc; 33 | import org.apache.pig.data.Tuple; 34 | 35 | public class ConvertDocumentIDToID extends EvalFunc { 36 | 37 | private Map documentIndex; 38 | 39 | private void loadDocumentIndex(String documentIndexPath) throws IOException { 40 | if (documentIndex == null) { 41 | documentIndex = new HashMap(); 42 | 43 | Path p = new Path(documentIndexPath); 44 | FileSystem fs = FileSystem.get(p.toUri(), new Configuration()); 45 | int index = 0; 46 | for (FileStatus status : fs.listStatus(p)) { 47 | Path currPath = status.getPath(); 48 | if (!status.isDir() && !currPath.getName().startsWith("_")) { 49 | BufferedReader reader = null; 50 | try { 51 | reader = new BufferedReader(new InputStreamReader(fs.open(currPath))); 52 | String line = null; 53 | while ((line = reader.readLine()) != null) { 54 | documentIndex.put(line.trim(), index++); 55 | } 56 | } finally { 57 | if (reader != null) { 58 | reader.close(); 59 | } 60 | } 61 | } 62 | } 63 | 64 | log.info("Loaded document index with size: " + documentIndex.size()); 65 | } 66 | } 67 | 68 | @Override 69 | public Integer exec(Tuple input) throws IOException { 70 | if (input == null || input.size() == 0) { 71 | return null; 72 | } 73 | if (input.size() != 2) { 74 | throw new IOException("ConvertDocumentIDToID requires 2 parameters"); 75 | } 76 | 77 | String documentIndexPath = (String) input.get(0); 78 | if (documentIndex == null) { 79 | loadDocumentIndex(documentIndexPath); 80 | } 81 | String docID = (String) input.get(1); 82 | return documentIndex.get(docID); 83 | } 
84 | } 85 | -------------------------------------------------------------------------------- /transforms/coclustering/src/main/java/com/mozilla/grouperfish/transforms/coclustering/pig/eval/text/ConvertFeatureToID.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package com.mozilla.grouperfish.transforms.coclustering.pig.eval.text; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.IOException; 24 | import java.io.InputStreamReader; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | import org.apache.hadoop.conf.Configuration; 29 | import org.apache.hadoop.fs.FileStatus; 30 | import org.apache.hadoop.fs.FileSystem; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.pig.EvalFunc; 33 | import org.apache.pig.data.Tuple; 34 | 35 | public class ConvertFeatureToID extends EvalFunc { 36 | 37 | private Map featureIndex; 38 | 39 | private void loadFeatureIndex(String featureIndexPath) throws IOException { 40 | if (featureIndex == null) { 41 | featureIndex = new HashMap(); 42 | 43 | Path p = new Path(featureIndexPath); 44 | FileSystem fs = FileSystem.get(p.toUri(), new Configuration()); 45 | int index = 0; 46 | for (FileStatus status : fs.listStatus(p)) { 47 | Path currPath = status.getPath(); 48 | if (!status.isDir() && !currPath.getName().startsWith("_")) { 49 | BufferedReader reader = null; 50 | try { 51 | reader = new BufferedReader(new InputStreamReader(fs.open(currPath))); 52 | String line = null; 53 | while ((line = reader.readLine()) != null) { 54 | featureIndex.put(line.trim(), index++); 55 | } 56 | } finally { 57 | if (reader != null) { 58 | reader.close(); 59 | } 60 | } 61 | } 62 | } 63 | 64 | log.info("Loaded feature index with size: " + featureIndex.size()); 65 | } 66 | } 67 | 68 | @Override 69 | public Integer exec(Tuple input) throws IOException { 70 | if (input == null || input.size() == 0) { 71 | return null; 72 | } 73 | if (input.size() != 2) { 74 | throw new IOException("ConvertFeatureToID requires 2 parameters"); 75 | } 76 | 77 | String featureIndexPath = (String) input.get(0); 78 | if (featureIndex == null) { 79 | loadFeatureIndex(featureIndexPath); 80 | } 81 | String feature = (String) input.get(1); 82 | return featureIndex.get(feature); 83 | 84 | } 85 | } 86 | 
-------------------------------------------------------------------------------- /transforms/coclustering/src/main/java/com/mozilla/grouperfish/transforms/coclustering/pig/eval/text/TermFrequency.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package com.mozilla.grouperfish.transforms.coclustering.pig.eval.text; 21 | 22 | import java.io.IOException; 23 | import java.util.HashMap; 24 | import java.util.Map; 25 | 26 | import org.apache.pig.EvalFunc; 27 | import org.apache.pig.data.BagFactory; 28 | import org.apache.pig.data.DataBag; 29 | import org.apache.pig.data.Tuple; 30 | import org.apache.pig.data.TupleFactory; 31 | 32 | public class TermFrequency extends EvalFunc { 33 | 34 | private static BagFactory bagFactory = BagFactory.getInstance(); 35 | private static TupleFactory tupleFactory = TupleFactory.getInstance(); 36 | 37 | @Override 38 | public DataBag exec(Tuple input) throws IOException { 39 | if (input == null || input.size() == 0) { 40 | return null; 41 | } 42 | 43 | DataBag db = (DataBag) input.get(0); 44 | Map termFreq = new HashMap(); 45 | for (Tuple t : db) { 46 | String word = (String) t.get(0); 47 | int curCount = 0; 48 | if (termFreq.containsKey(word)) { 49 | curCount = termFreq.get(word); 50 | } 51 | termFreq.put(word, ++curCount); 52 | } 53 | 54 | DataBag output = bagFactory.newDefaultBag(); 55 | for (Map.Entry entry : termFreq.entrySet()) { 56 | Tuple t = tupleFactory.newTuple(2); 57 | t.set(0, entry.getKey()); 58 | t.set(1, (double) entry.getValue()); 59 | output.add(t); 60 | } 61 | 62 | return output; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /transforms/coclustering/src/main/json_sample_files/tags.json: -------------------------------------------------------------------------------- 1 | 
{"48":[479],"91":[479],"71":[479],"121":[479],"107":[479],"153":[479],"80":[479],"289":[479],"162":[479],"190":[479],"61":[479],"286":[479],"201":[479],"213":[479],"223":[479],"194":[479],"258":[479],"49":[479],"104":[479],"265":[479],"321":[479],"8":[479],"202":[479],"263":[479],"23":[479],"88":[479],"299":[479],"228":[408],"65":[408],"255":[408],"93":[408],"182":[408],"244":[408],"113":[408],"24":[408],"221":[408],"131":[408],"144":[408],"231":[408],"5":[408],"17":[408],"86":[408],"169":[408],"353":[408],"207":[408],"220":[408],"19":[408],"339":[408],"215":[408],"63":[408],"229":[408],"212":[408],"348":[408],"325":[408],"30":[408],"101":[408],"147":[408],"195":[408],"318":[408],"248":[408],"328":[408],"72":[408],"160":[408],"22":[408],"272":[519],"94":[519],"297":[519],"133":[519],"87":[519],"18":[519],"290":[519],"350":[519],"139":[519],"193":[519],"343":[519],"208":[519],"206":[519],"27":[519],"294":[519],"307":[519],"165":[519],"241":[321],"1":[321],"10":[321],"188":[321],"129":[321],"14":[321],"2":[321],"187":[321],"327":[321],"214":[321],"261":[321],"12":[321],"302":[321],"301":[321],"227":[321],"264":[321],"191":[321],"21":[321],"59":[434],"324":[434],"185":[434],"349":[434],"138":[434],"58":[434],"120":[434],"67":[434],"108":[434],"171":[434],"68":[434],"197":[434],"167":[434],"181":[434],"172":[434],"316":[434],"283":[434],"122":[434],"236":[434],"89":[434],"184":[434],"132":[434],"352":[434],"205":[434],"240":[434],"159":[434],"239":[434],"274":[434],"106":[434],"3":[434],"232":[434],"92":[434],"178":[434],"312":[434],"25":[434],"176":[434],"82":[434],"224":[434],"15":[434],"243":[434],"119":[434],"37":[434],"341":[434],"300":[434],"73":[434],"320":[434],"317":[434],"98":[434],"170":[434],"151":[434],"260":[434],"196":[434],"156":[434],"285":[434],"45":[434],"296":[434],"270":[176],"180":[176],"36":[176],"309":[176],"118":[176],"211":[176],"218":[176],"173":[176],"308":[176],"79":[176],"314":[176],"112":[176],"60":[176],"102":[176],"235":[176],"152":[176]
,"254":[176],"310":[176],"340":[176],"35":[176],"292":[176],"39":[176],"150":[176],"334":[176],"338":[176],"38":[176],"20":[176],"46":[176],"53":[176],"251":[436],"105":[436],"81":[436],"315":[436],"110":[436],"346":[436],"31":[436],"186":[436],"168":[436],"313":[436],"116":[436],"311":[436],"124":[436],"331":[436],"99":[436],"78":[436],"41":[436],"189":[436],"164":[436],"303":[436],"225":[436],"13":[436],"257":[436],"280":[436],"335":[436],"149":[436],"256":[436],"336":[436],"304":[436],"155":[436],"200":[436],"70":[436],"109":[436],"135":[436],"288":[481],"268":[481],"57":[481],"351":[481],"238":[481],"295":[481],"247":[481],"271":[481],"323":[481],"83":[481],"6":[481],"141":[481],"337":[481],"9":[481],"273":[481],"333":[481],"217":[481],"16":[481],"174":[481],"175":[481],"127":[481],"75":[481],"233":[481],"34":[466],"145":[466],"95":[466],"230":[466],"4":[466],"234":[466],"276":[466],"204":[466],"47":[466],"262":[466],"322":[466],"242":[466],"253":[466],"66":[466],"40":[466],"142":[466],"111":[466],"281":[466],"279":[466],"306":[466],"55":[466],"140":[466],"33":[466],"293":[466],"56":[466],"44":[466],"69":[466],"329":[466],"298":[466],"237":[362],"125":[362],"29":[362],"166":[362],"269":[362],"97":[362],"134":[362],"342":[362],"114":[362],"42":[362],"209":[362],"26":[362],"54":[362],"198":[362],"291":[362],"74":[362],"64":[362],"76":[362],"347":[362],"52":[362],"136":[362],"115":[362],"85":[362]} -------------------------------------------------------------------------------- /transforms/coclustering/src/main/pig/co_cluster_generate_tags.pig: -------------------------------------------------------------------------------- 1 | -- 2 | -- Script to compute Tags 3 | -- 4 | -- See "Co-clustering documents and words using Bipartite Spectral Graph 5 | -- Partitioning" by Dhillon for more details. 
6 | -- 7 | 8 | %default TEMP 'cct' 9 | %default NUM_REDUCERS 7 10 | register './lib/grouperfish-transforms-coclustering-0.3-SNAPSHOT.jar' 11 | register './lib/mahout-core-0.5.jar' 12 | register './lib/mahout-math-0.5.jar' 13 | register './lib/mahout-utils-0.5.jar' 14 | register './lib/mahout-collections-1.0.jar' 15 | SET default_parallel $NUM_REDUCERS 16 | SET pig.splitCombination 'false'; 17 | 18 | -- Load clustered Points which are in the format , 19 | clustered_points = LOAD '$TEMP/kmeans/out/clusteredPoints' USING com.mozilla.grouperfish.transforms.coclustering.pig.storage.KMeansOutputLoader() 20 | AS (cluster_id:int, v_id:int, v_info:bag{t:tuple(col_id:int, 21 | eblement:double)}); 22 | describe clustered_points; 23 | points_clusters = FOREACH clustered_points 24 | GENERATE v_id, cluster_id; 25 | describe points_clusters 26 | doc_map = LOAD '$TEMP/doc_map' AS (doc_id:int, doc: chararray); 27 | doc_clusters = JOIN doc_map BY doc_id, points_clusters BY v_id; 28 | tags = FOREACH doc_clusters 29 | GENERATE doc AS doc, 30 | cluster_id AS cluster_id; 31 | describe tags; 32 | STORE tags INTO '$TEMP/tags' USING PigStorage('\t'); 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /transforms/commons/src/main/java/com/mozilla/grouperfish/pig/eval/ml/Vectorizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | package com.mozilla.grouperfish.pig.eval.ml; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.IOException; 24 | import java.io.InputStreamReader; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | import org.apache.hadoop.conf.Configuration; 29 | import org.apache.hadoop.fs.FileStatus; 30 | import org.apache.hadoop.fs.FileSystem; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.pig.EvalFunc; 33 | import org.apache.pig.data.DataBag; 34 | import org.apache.pig.data.Tuple; 35 | import org.apache.pig.data.TupleFactory; 36 | 37 | public class Vectorizer extends EvalFunc { 38 | 39 | private Map featureIndex; 40 | private static final TupleFactory tupleFactory = TupleFactory.getInstance(); 41 | 42 | private void loadFeatureIndex(String featureIndexPath) throws IOException { 43 | if (featureIndex == null) { 44 | featureIndex = new HashMap(); 45 | 46 | Path p = new Path(featureIndexPath); 47 | FileSystem fs = FileSystem.get(p.toUri(), new Configuration()); 48 | int index = 0; 49 | for (FileStatus status : fs.listStatus(p)) { 50 | if (!status.isDir()) { 51 | BufferedReader reader = null; 52 | try { 53 | reader = new BufferedReader(new InputStreamReader(fs.open(status.getPath()))); 54 | String line = null; 55 | while ((line = reader.readLine()) != null) { 56 | featureIndex.put(line.trim(), index++); 57 | } 58 | } finally { 59 | if (reader != null) { 60 | reader.close(); 61 | } 62 | } 63 | } 64 | } 65 | 66 | log.info("Loaded feature index with size: " + featureIndex.size()); 67 | } 68 | 
} 69 | 70 | public Tuple exec(Tuple input) throws IOException { 71 | if (input == null) { 72 | return null; 73 | } 74 | 75 | if (input.size() != 2) { 76 | throw new IOException("Vectorizer requires exactly 2 parameters"); 77 | } 78 | 79 | String featureIndexPath = (String)input.get(0); 80 | if (featureIndex == null) { 81 | loadFeatureIndex(featureIndexPath); 82 | } 83 | 84 | Tuple output = tupleFactory.newTuple(); 85 | DataBag db = (DataBag)input.get(1); 86 | for (Tuple t : db) { 87 | // Expects each tuple's first element to be the feature 88 | Integer idx = featureIndex.get((String)t.get(0)); 89 | if (idx != null) { 90 | output.append(idx); 91 | } 92 | } 93 | 94 | return output; 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /transforms/commons/src/main/java/com/mozilla/grouperfish/pig/eval/text/TermFrequency.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Mozilla Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package com.mozilla.grouperfish.pig.eval.text; 21 | 22 | import java.io.IOException; 23 | import java.util.HashMap; 24 | import java.util.Map; 25 | 26 | import org.apache.pig.EvalFunc; 27 | import org.apache.pig.data.BagFactory; 28 | import org.apache.pig.data.DataBag; 29 | import org.apache.pig.data.Tuple; 30 | import org.apache.pig.data.TupleFactory; 31 | 32 | public class TermFrequency extends EvalFunc { 33 | 34 | private static BagFactory bagFactory = BagFactory.getInstance(); 35 | private static TupleFactory tupleFactory = TupleFactory.getInstance(); 36 | 37 | @Override 38 | public DataBag exec(Tuple input) throws IOException { 39 | if (input == null || input.size() == 0) { 40 | return null; 41 | } 42 | 43 | DataBag db = (DataBag) input.get(0); 44 | Map termFreq = new HashMap(); 45 | for (Tuple t : db) { 46 | String word = (String) t.get(0); 47 | int curCount = 0; 48 | if (termFreq.containsKey(word)) { 49 | curCount = termFreq.get(word); 50 | } 51 | termFreq.put(word, ++curCount); 52 | } 53 | 54 | DataBag output = bagFactory.newDefaultBag(); 55 | for (Map.Entry entry : termFreq.entrySet()) { 56 | Tuple t = tupleFactory.newTuple(2); 57 | t.set(0, entry.getKey()); 58 | t.set(1, (double) entry.getValue()); 59 | output.add(t); 60 | } 61 | 62 | return output; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /transforms/commons/src/main/pig/generate_document_vectors.pig: -------------------------------------------------------------------------------- 1 | /* Not sure why we have to register this JAR when it's already in Pig's classpath but we do */ 2 | register '/usr/lib/hbase/hbase-0.90.1-cdh3u0.jar' 3 | register './lib/akela-0.1.jar' 4 | register './lib/lucene-core-3.1.0.jar' 5 | register './lib/lucene-analyzers-3.1.0.jar' 6 | register './lib/mahout-core-0.5.jar' 7 | register './lib/mahout-math-0.5.jar' 8 | register './lib/mahout-utils-0.5.jar' 9 | register './lib/mahout-collections-1.0.jar' 10 | 11 | 
SET default_parallel 7; 12 | SET pig.splitCombination 'false'; 13 | 14 | %default INPUT 'opinions.tsv' 15 | %default STOPWORDS 'stopwords-en.txt' 16 | %default STEM 'true' 17 | %default FEATUREINDEX 'feature-index' 18 | %default OUTPUT 'document-vectors' 19 | 20 | /* 21 | raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray; 22 | genmap = FOREACH raw GENERATE com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[]; 23 | document_word_bag = FOREACH genmap GENERATE (chararray)json_map#'id' AS docid:chararray,com.mozilla.pig.eval.text.UnigramExtractor(json_map#'text') AS word_bag; 24 | document_word_vectors = FOREACH document_word_bag GENERATE docid, com.mozilla.pig.eval.ConvertBagToTuple(word_bag) AS word_vector; 25 | 26 | vectors = FOREACH document_word_vectors GENERATE (chararray)docid,com.mozilla.pig.eval.ml.Vectorizer('feature-index', word_vector) AS vec; 27 | STORE vectors INTO 'document-vectors' USING com.mozilla.pig.storage.DocumentVectorStorage(); 28 | */ 29 | 30 | /* Use this output if you're not using Mahout */ 31 | /* 32 | flat_vectors = FOREACH vectors GENERATE docid,FLATTEN(vec); 33 | STORE flat_vectors INTO 'document-vectors'; 34 | */ 35 | 36 | /* Same as above except using tsv file for experimenting */ 37 | raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,datetime:long,praise_issue:chararray,product:chararray,version:chararray,os:chararray,locale:chararray,text:chararray); 38 | filtered_raw = FILTER raw BY locale == 'en-US' AND praise_issue == 'issue' AND version == '5.0'; 39 | tokenized = FOREACH filtered_raw GENERATE doc_id,com.mozilla.pig.eval.text.Tokenize(text,'$STOPWORDS', '$STEM') AS token_bag; 40 | vectors = FOREACH tokenized GENERATE (chararray)docid,com.mozilla.pig.eval.ml.Vectorizer('$FEATUREINDEX', token_bag) AS vec; 41 | STORE vectors INTO '$OUTPUT' USING com.mozilla.pig.storage.DocumentVectorStorage('$NFEATURES'); 
/*
 * generate_feature_index.pig — builds the term feature index consumed by the
 * vectorizer scripts: tokenizes the 'text' field of each JSON document, then
 * keeps terms by word length and document-frequency bounds.  Stores both the
 * surviving (term, count) pairs and the bare term list.
 */
register './akela-0.2-SNAPSHOT.jar'
register './grouperfish-transforms-commons-0.1-SNAPSHOT.jar'
register './lib/lucene-core-3.1.0.jar'
register './lib/lucene-analyzers-3.1.0.jar'

SET default_parallel 7;

%default INPUT 'input.json.tsv'
%default STOPWORDS 'stopwords-en.txt'
%default STEM 'false'
%default FREQ_OUTPUT 'feature-freq'
%default OUTPUT 'feature-index'
%default MIN_WORD_LENGTH 3
%default MIN_DF 2
%default MAX_DF_PERCENTAGE 0.9

/*raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray;*/
raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,json:chararray);
genmap = FOREACH raw GENERATE com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[];

/* Total document count, used for the max-DF percentage cut below. */
grouped_raw = GROUP raw ALL;
ndocs = FOREACH grouped_raw GENERATE COUNT(raw);

tokenized = FOREACH genmap GENERATE FLATTEN(com.mozilla.grouperfish.pig.eval.text.Tokenize(json_map#'text', '$STOPWORDS', '$STEM')) AS token:chararray;
grouped_words = GROUP tokenized BY token;
word_freq = FOREACH grouped_words GENERATE FLATTEN($0) AS word:chararray, COUNT($1) as count;
/* Keep a term iff length(word) > $MIN_WORD_LENGTH, DF > $MIN_DF, and
   DF/ndocs < $MAX_DF_PERCENTAGE.
   NOTE(review): all three comparisons are strict, so e.g. the default
   MIN_WORD_LENGTH=3 keeps only words of length 4+ and MIN_DF=2 keeps only
   DF 3+ — confirm whether ">=" was intended.  (The previous comment here
   claimed "count > 10", which never matched the $MIN_DF default of 2.) */
filtered_freq = FILTER word_freq BY SIZE(word) > $MIN_WORD_LENGTH AND count > $MIN_DF AND ((double)count / (double)ndocs.$0) < $MAX_DF_PERCENTAGE;
index = FOREACH filtered_freq GENERATE word;

STORE filtered_freq INTO '$FREQ_OUTPUT';
STORE index INTO '$OUTPUT';
/* generate_sequence_files.pig — write (docid, text) pairs as Hadoop sequence
   files for consumption by Mahout (see follow-up commands at the bottom). */
register './akela-0.1.jar'
/* Not sure why we have to register this JAR when it's already in Pig's classpath but we do */
register '/usr/lib/hbase/hbase-0.90.1-cdh3u0.jar'

/* HBase variant.
   FIX: the original projected docid/text straight out of `raw` (whose only
   field is json) and filtered on an unnamed normtext field — all three
   aliases were undefined and the script could not run.  The fields now come
   from the parsed JSON map, and the stopword-stripped text is named so the
   FILTER can reference it. */
raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray;
genmap = FOREACH raw GENERATE com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[];
documents = FOREACH genmap GENERATE (chararray)json_map#'id' AS docid,com.mozilla.pig.eval.text.RemoveStopwords((chararray)json_map#'text') AS normtext;
filtered_documents = FILTER documents BY normtext IS NOT NULL AND SIZE(normtext) > 0;
STORE filtered_documents INTO 'documents' USING com.mozilla.pig.storage.SequenceFileStorage();

/* Same as above except using tsv file for experimenting */
raw = LOAD 'opinions-en.tsv' USING PigStorage('\t') AS (docid:int,datetime:long,praise_issue:chararray,product:chararray,version:chararray,os:chararray,language:chararray,text:chararray);
documents = FOREACH raw GENERATE (chararray)docid,text;
/* filtered_documents = FILTER documents BY normtext IS NOT NULL AND SIZE(normtext) > 0; */
/* FIX: the original stored `filtered_documents` — stale, since the FILTER in
   this tsv section is commented out — and reused the 'documents' output path
   already claimed by the STORE above; store `documents` to a distinct path. */
STORE documents INTO 'documents-tsv' USING com.mozilla.pig.storage.SequenceFileStorage();

/*
Follow up steps:

hadoop jar mahout-examples-0.5-job.jar org.apache.mahout.driver.MahoutDriver seq2sparse -i documents -wt tfidf --minDF 2 --maxDFPercent 90 -o seq2sparse-out
hadoop jar mahout-core-0.5-job.jar org.apache.mahout.driver.MahoutDriver kmeans -i seq2sparse-out/tfidf-vectors -o kmeans-cosine-out -dm org.apache.mahout.common.distance.CosineDistanceMeasure -c random-clusters -ow -k 20 -x 10 -cl

*/
/*
 * generate_tf_document_vectors.pig — tokenizes filtered feedback documents
 * and stores per-document term-frequency vectors in Vowpal Wabbit format.
 */
register './akela-0.2-SNAPSHOT.jar'
register './grouperfish-transforms-commons-0.1-SNAPSHOT.jar'
register './lib/lucene-core-3.1.0.jar'
register './lib/lucene-analyzers-3.1.0.jar'

SET default_parallel 7;

%default INPUT 'input.json.tsv'
/* NOTE(review): other scripts here spell the stem flag in lowercase
   ('true'/'false'); confirm Tokenize parses 'FALSE' case-insensitively. */
%default STOPWORDS 'stopwords-en.txt'
%default STEM 'FALSE'
%default MIN_TOKENS 4
%default FEATUREINDEX 'feature-index'
%default OUTPUT 'document-vectors-tf'

/*raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray;*/
raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,json:chararray);
genmap = FOREACH raw GENERATE doc_id,com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[];
/* Restrict to one product/version/platform slice of issue-type feedback. */
filtered_genmap = FILTER genmap BY json_map#'type' == 'issue' AND json_map#'product' == 'firefox' AND json_map#'version' == '5.0' AND json_map#'platform' == 'win7';
tokenized = FOREACH filtered_genmap GENERATE doc_id,com.mozilla.grouperfish.pig.eval.text.Tokenize(json_map#'text','$STOPWORDS', '$STEM') AS token_bag;
/* Comment out the line above and uncomment the line below if you are using an ngram feature-index */
/*tokenized = FOREACH filtered_genmap GENERATE doc_id,com.mozilla.pig.eval.text.NGramTokenize(json_map#'text','$STOPWORDS', '$STEM', 'true') AS token_bag;*/
/* Drop documents that are too short to cluster meaningfully. */
filtered_tokenized = FILTER tokenized BY SIZE(token_bag) > $MIN_TOKENS;
doc_vectors = FOREACH filtered_tokenized GENERATE doc_id,com.mozilla.grouperfish.pig.eval.text.TermFrequency(token_bag) AS tf_bag;

/* Put things back into document vector form before storing in Mahout's vector format */
feature_vectors = FOREACH doc_vectors GENERATE (chararray)doc_id,com.mozilla.grouperfish.pig.eval.ml.TFVectorizer('$FEATUREINDEX', tf_bag) AS vec;
STORE feature_vectors INTO '$OUTPUT' USING com.mozilla.grouperfish.pig.storage.VWStorage();

/* Run VW LDA on this output */
/*
./vw
*/
/* Run Mahout's Clustering on
this output */
/*
/usr/lib/hadoop/bin/hadoop jar /usr/lib/mahout/mahout-core-0.5-job.jar org.apache.mahout.driver.MahoutDriver lda
-i document-vectors-tf
-o lda-out
-ow
-k 20
-v 12000
-x 20
*/

#! /bin/bash

# A minimal hadoop based transform that performs a line-count on the input and
# generates a result like this
#
# {"count": 12345}
#
# Usage: count WORKDIR   (an HDFS directory containing input.json.tsv)

work=${1}

# FIX: fail fast when no work directory is given — with an empty $work the
# rmr/mkdir below would have operated on the bare path "/output".
if [[ -z "${work}" ]]; then
    echo "usage: ${0} WORKDIR" >&2
    exit 1
fi

# Thin wrappers around the hadoop fs CLI; arguments quoted so paths with
# unusual characters survive word splitting.
fs_mkdir() {
    hadoop fs -mkdir "$1"
}

fs_rmr() {
    hadoop fs -rmr "$1"
}

fs_cat() {
    hadoop fs -cat "$1"
}

fs_put() {
    hadoop fs -put - "$1"
}

fs_rmr "${work}/output"
fs_mkdir "${work}/output"
# Count the input lines and publish the result as a tiny JSON document.
echo '{"count": '"$(fs_cat "${work}/input.json.tsv" | wc -l | awk '{print $1}')"'}' | fs_put "${work}/output/results.json"

#!/usr/bin/env python
# encoding: utf-8
"""
filter.py

Created by Xavier Stevens on 2011-09-19.
Copyright (c) 2011 Mozilla. All rights reserved.
"""

import sys
import getopt
import re
import json

help_message = '''
The help message goes here.
17 | ''' 18 | 19 | 20 | class Usage(Exception): 21 | def __init__(self, msg): 22 | self.msg = msg 23 | 24 | def filter_data(input_file, output_file, product="firefox", version="5.0", feedback_type="issues"): 25 | fin = open(input_file, "r") 26 | fout = open(output_file, "w") 27 | tab_pattern = re.compile("\t") 28 | for line in fin: 29 | line_splits = tab_pattern.split(line.strip()) 30 | doc_json = json.loads(line_splits[1]) 31 | if doc_json["product"] == product and doc_json["version"] == version and doc_json["type"] == feedback_type: 32 | fout.write(line) 33 | fin.close() 34 | fout.close() 35 | 36 | def main(argv=None): 37 | if argv is None: 38 | argv = sys.argv 39 | try: 40 | try: 41 | opts, args = getopt.getopt(argv[1:], "ho:d:p:v:t:", ["help", "output="]) 42 | except getopt.error, msg: 43 | raise Usage(msg) 44 | 45 | # option processing 46 | data_path = None 47 | output_path = None 48 | product = None 49 | version = None 50 | feedback_type = None 51 | for option, value in opts: 52 | if option == "-d": 53 | data_path = value 54 | if option in ("-h", "--help"): 55 | raise Usage(help_message) 56 | if option in ("-o", "--output"): 57 | output_path = value 58 | if option == "-p": 59 | product = value 60 | if option == "-v": 61 | version = value 62 | if option == "-t": 63 | feedback_type = value 64 | 65 | filter_data(data_path, output_path, product, version, feedback_type) 66 | except Usage, err: 67 | print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg) 68 | print >> sys.stderr, "\t for help use --help" 69 | return 2 70 | 71 | 72 | if __name__ == "__main__": 73 | sys.exit(main()) 74 | -------------------------------------------------------------------------------- /transforms/lda_r/src/R/lda.r: -------------------------------------------------------------------------------- 1 | library(lda) 2 | library(RJSONIO) 3 | 4 | # Example: R -f src/R/lda.r --no-save --slave --args input-firefox-5.0-issues-ldac.dat feature-index-en-10-0.7.txt 10 topics.dat 
args<-commandArgs(TRUE)

# args: 1 = LDA-C formatted corpus, 2 = vocabulary file, 3 = number of topics K
docs<-read.documents(args[1])
vocab<-read.vocab(args[2])
K<-as.integer(args[3])
# Dirichlet hyperparameters: alpha = document-topic, eta = topic-word
alpha<-0.01
eta<-0.01
# 100 collapsed Gibbs sampling sweeps
model<-lda.collapsed.gibbs.sampler(docs, K, vocab, 100, alpha, eta)
# Transposed for saving so we can read rows rather than columns
top_10_topic_words<-t(top.topic.words(model$topics, num.words = 10, by.score = TRUE))
top_20_docs_per_topic<-t(top.topic.documents(model$document_sums, num.documents=20, alpha))

# e1: topic id (named 0-based) -> its top-10 words
e1<-sapply(1:ncol(top_10_topic_words),function(r) top_10_topic_words[,r],simplify=FALSE)
names(e1)<-0:(length(e1)-1)

# e2: topic id (named 0-based) -> its top-20 document indices
e2<-sapply(1:ncol(top_20_docs_per_topic),function(r) top_20_docs_per_topic[,r],simplify=FALSE)
names(e2)<-0:(length(e2)-1)

# e3: per-document topic proportions (fraction of the document's tokens
# assigned to each topic in the final sample)
e3<-lapply(model$assignments,function(r) {
a0<-table(r)
a1<-as.numeric(a0/length(r))
names(a1)<-names(a0)
return(a1)
})
# NOTE(review): e3 is named 1-based while e1/e2 above are named 0-based —
# confirm whether downstream consumers expect document ids starting at 1.
names(e3) <- 1:length(e3)

json_doc_list <- list(TOP_FEATURES=e1, TOP_DOCS=e2, DOC_TOPICS=e3)
json_docs <- toJSON(json_doc_list)
writeLines(json_docs, "output.json")

#write.table(top_10_topic_words, file=args[4], quote=FALSE, row.names=FALSE, col.names=FALSE)
#write.table(top_20_docs_per_topic, file=args[5], quote=FALSE, row.names=FALSE, col.names=FALSE)
# Model assignments of topics per word per doc (post process in python)
#lapply(model$assignments, function(x) write.table(t(data.frame(x)), file="assignments.dat", append=TRUE, quote=FALSE, row.names=FALSE, col.names=FALSE))

#!/usr/bin/python

# printtopics.py: Prints the words that are most prominent in a set of
# topics.
#
# Copyright (C) 2010 Matthew D. Hoffman
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .

import sys

def loadtxt(filename):
    # Parse vw's --readable_model output: skip everything up to and including
    # the header line starting with 'lda:', then read one row of floats per
    # remaining line.  The result is transposed, i.e. one row per topic.
    data = []
    passed_header = False
    for line in open(filename).readlines():
        line = line.strip()
        if passed_header:
            data.append(map(float, line.split()))
        elif line.startswith('lda:'):
            passed_header = True
    data = zip(*data) # transpose data
    return data


def main():
    """
    Displays topics fit by vw's LDA. The first column gives the
    (expected) most prominent words in the topics, the second column
    gives their (expected) relative prominence.
    """
    if len(sys.argv) != 3:
        print >>sys.stderr, "Usage: vw-printtopics.py vocab-file topic-score-file"
        sys.exit(1)
    vocab = str.split(file(sys.argv[1]).read())
    testlambda = loadtxt(sys.argv[2])

    for k in range(0, len(testlambda)):
        lambdak = testlambda[k]

        # pitch extra topic rows
        # NOTE(review): this keeps len(vocab)-1 entries, dropping the last
        # vocabulary slot — confirm the intended off-by-one here.
        lambdak = lambdak[0:(len(vocab)-1)]

        # normalize row
        the_sum = sum(lambdak)
        lambdak = [val / the_sum for val in lambdak]

        # resort by normalized value
        temp = zip(lambdak, range(0, len(lambdak)))
        temp = sorted(temp, key = lambda x: x[0], reverse=True)
        print 'topic %d:' % (k)
        # Print the 20 most prominent words per topic; adjust the range bound
        # to taste for your screen.  (A previous comment referred to "53".)
        for i in range(0, 20):
            print '%20s \t---\t %.4f' % (vocab[temp[i][1]], temp[i][0])
        print

if __name__ == '__main__':
    main()

#!/bin/bash
# vw-lda.sh DATA K B FEATURE_INDEX — run Vowpal Wabbit LDA over DATA with K
# topics and B hash bits, then pretty-print the topics via vowpalwabbit.py.

DATA=$1
K=$2
ALPHA=0.1
RHO=0.1
# D = number of documents (lines) in the input
D=`wc -l $DATA | cut -f 1 -d " "`
echo $D
B=$3
FEATURE_INDEX=$4
POWER_T=0.5
INITIAL_T=1.0
BATCH_SIZE=256

rm /tmp/vw.cache
./vowpal_wabbit/vw "$DATA" --lda "$K" --lda_alpha "$ALPHA" --lda_rho "$RHO" --lda_D "$D" --minibatch "$BATCH_SIZE" --power_t "$POWER_T" --initial_t "$INITIAL_T" -b "$B" --cache_file /tmp/vw.cache --passes 10 -p "lda-$K-predictions.dat" --readable_model "lda-$K-topics.dat"
python vowpalwabbit.py -t "lda-$K-topics.dat" -f "$FEATURE_INDEX" > "lda-$K-topics.txt"

tests/*/output
lib

#! /bin/bash

# Build/test/clean helper for the textcluster transform.

# normalize work directory
wd=`dirname "$0"`
wd=`cd "$wd"; pwd`

cmd="--build"
if [[ "${#}" -eq "1" ]]; then
    if [[ "${1}" == --* ]]; then cmd=$1; fi
fi


dest=../../build/transforms/textcluster
case "${cmd}" in
    --build|--package)
        mkdir -p lib
        cd lib
        # Fetch third-party dependencies once; skipped when already cloned.
        [[ -d stemming ]] ||
            hg clone https://www.bitbucket.org/mchaput/stemming
        [[ -d textcluster ]] ||
            git clone https://github.com/davedash/textcluster.git
        cd ..
23 | 24 | mkdir -p "${dest}" 25 | cp textcluster run.py "${dest}/" 26 | rm -rf "${dest}/lib" 27 | cp -r lib "${dest}/lib" 28 | ;; 29 | --test) 30 | for d in $( ls tests ); do 31 | ./textcluster "tests/${d}" || exit 1 32 | pushd "tests/${d}" > /dev/null 33 | diff results.expected.json output/results.json || 34 | ( echo "Test '${d}': Result seems to be wrong"; exit 1 ) 35 | popd > /dev/null 36 | done 37 | ;; 38 | --clean) 39 | rm -rf ./lib 40 | find . -type f -name '*.pyc' | xargs rm 41 | rm -rf "${dest}" 42 | ;; 43 | --help) 44 | echo "Usage: ${0} [--build|--clean|--test]" 45 | ;; 46 | *) 47 | echo "Usage: ${0} [--build|--clean|--test]" 48 | exit 1 49 | ;; 50 | esac 51 | -------------------------------------------------------------------------------- /transforms/textcluster/run.py: -------------------------------------------------------------------------------- 1 | import json, sys 2 | 3 | from textcluster import Corpus 4 | 5 | 6 | def process(inStream, outStream, 7 | fields={"id": "id", "text": "text"}, 8 | limits={"clusters": 10, "top_documents": 10}): 9 | all = {} 10 | 11 | text_field = fields["text"] 12 | key_field = fields["id"] 13 | max_clusters = limits["clusters"] 14 | max_top_docs = limits["top_documents"] 15 | 16 | c = Corpus() 17 | for line in inStream: 18 | data = line.split('\t', 1)[1] 19 | doc = json.loads(data.decode("utf8")) 20 | key = doc[key_field] 21 | all[key] = doc 22 | text = c.add((key, doc[text_field]), key=key) 23 | 24 | clusters = c.cluster() 25 | results = [] 26 | for c in clusters[:max_clusters]: 27 | tophits = [c.primary] 28 | tophits += [hit["object"] for hit in c.similars[:max_top_docs-1]] 29 | topdocs = [] 30 | for (key, text) in tophits: 31 | topdocs.append(all[key]) 32 | results.append({"top_documents": topdocs}) 33 | 34 | json.dump({"clusters": results}, outStream) 35 | 36 | 37 | def main(args): 38 | work_dir = args[1] 39 | parameters = json.load(open("%s/parameters.json" % work_dir)) 40 | with open("%s/input.json.tsv" % work_dir) 
as inFile: 41 | with open("%s/output/results.json" % work_dir, "w+") as outFile: 42 | process(inFile, outFile, **parameters) 43 | 44 | if __name__ == "__main__": 45 | main(sys.argv) 46 | -------------------------------------------------------------------------------- /transforms/textcluster/tests/small/parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": { 3 | "id": "id", 4 | "text": "text" 5 | }, 6 | "limits": { 7 | "clusters": 10, 8 | "top_documents": 10 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /transforms/textcluster/tests/standard/parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": { 3 | "id": "id", 4 | "text": "text" 5 | }, 6 | "limits": { 7 | "clusters": 10, 8 | "top_documents": 10 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /transforms/textcluster/textcluster: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export PYTHONPATH="lib/stemming:lib/textcluster:$PYTHONPATH" 4 | 5 | fail() { 6 | echo $1 7 | exit 1 8 | } 9 | 10 | [[ -d "${1}" ]] || fail "usage: ${0} WORKDIR" 11 | 12 | mkdir -p "${1}/output" 13 | env python run.py "${1}" || exit 1 14 | --------------------------------------------------------------------------------