├── .gitignore ├── README.md ├── condor ├── README.md └── shared │ ├── URLS │ ├── do.sh │ └── html2text.py ├── distribute ├── .gitignore ├── README.md ├── azure-client.py ├── ec2-client.py ├── env_local.sh ├── fabfile.py ├── generate-keys.sh ├── installer │ ├── .gitignore │ ├── build │ ├── decompress │ └── payload │ │ └── installer ├── setup.sh └── test │ └── input.json ├── parser ├── .gitignore ├── README.md ├── build.sbt ├── project │ └── plugins.sbt ├── run.sh ├── run_parallel.sh ├── sbt │ ├── sbt │ └── sbt-launch.jar ├── setup.sh ├── src │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── clearcut │ │ │ └── nlp │ │ │ ├── DocumentParseResult.scala │ │ │ ├── DocumentParser.scala │ │ │ ├── JSONReader.scala │ │ │ ├── Main.scala │ │ │ ├── Server.scala │ │ │ └── TSVReader.scala │ └── test │ │ ├── resources │ │ ├── input.json.txt │ │ ├── testdoc.html │ │ └── testdoc.txt │ │ └── scala │ │ └── DocumentParserSpec.scala └── test │ ├── input.json │ ├── input.tsv │ └── output.tsv ├── pipe ├── .gitignore ├── PLANS.md ├── README.md ├── build.sbt ├── config.properties.template ├── example │ ├── input.json │ └── parse.sh ├── project │ └── plugins.sbt ├── run.sh ├── run_parallel.sh ├── run_test.sh ├── sbt │ ├── sbt │ └── sbt-launch.jar ├── setup.sh └── src │ ├── main │ └── scala │ │ └── com │ │ └── clearcut │ │ └── pipe │ │ ├── Main.scala │ │ ├── Schema.scala │ │ ├── Server.scala │ │ ├── annotator │ │ ├── Annotator.scala │ │ ├── ExtendedCleanHtmlStanfordPipeline.scala │ │ ├── ExtendedHtmlStanfordPipeline.scala │ │ ├── ExtendedStanfordPipeline.scala │ │ ├── SimpleStanfordPipeline.scala │ │ ├── StanfordCoreferenceResolver.scala │ │ ├── StanfordDependencyExtractor.scala │ │ ├── StanfordLemmatizer.scala │ │ ├── StanfordNERTagger.scala │ │ ├── StanfordPOSTagger.scala │ │ ├── StanfordSRParser.scala │ │ ├── StanfordSentenceSplitter.scala │ │ ├── StanfordTokenizer.scala │ │ ├── StanfordTrueCaseAnnotator.scala │ │ └── StanfordUtil.scala │ │ ├── io │ │ ├── ColumnReader.scala │ │ ├── ColumnWriter.scala │ │ ├── JSONWriter.scala │ │ ├── Json.scala │ │ ├── JsonReader.scala │ │ ├── Reader.scala │ │ ├── TsvReader.scala │ │ ├── TsvWriter.scala │ │ └── Writer.scala │ │ └── model │ │ ├── Util.scala │ │ └── package.scala │ └── test │ ├── resources │ ├── testdoc.html │ ├── testdoc.json │ └── testdoc.txt │ └── scala │ └── BasicSpec.scala └── view ├── .gitignore ├── README.md ├── app.js ├── bin └── www ├── build.sh ├── env.sh ├── gulpfile.js ├── package.json ├── public ├── bundle.js ├── css │ └── main.css ├── index.html └── js │ ├── help │ └── Help.js │ ├── main.js │ ├── vis.js │ ├── vis │ ├── AnnotationsSelector.js │ ├── TextWithAnnotations.js │ ├── core │ │ ├── CharOffsets.js │ │ ├── EdgesVisualization.js │ │ ├── FramesVisualization.js │ │ ├── SpansVisualization.js │ │ └── TokenTagsVisualization.js │ ├── vis.js │ ├── visedge.js │ ├── visframe.js │ ├── visspan.js │ └── vistokentag.js │ ├── visedge.js │ └── visframe.js ├── routes ├── index.js └── users.js ├── run.sh ├── setup.sh ├── util ├── cat.sh ├── create_index.sh ├── fetch-annotations.py ├── fetch-sentences-table.py ├── generate_sentence_table.py ├── get.sh ├── index_docs.py ├── index_extr.py ├── index_extrlist.py ├── pipe.py ├── refresh-annotations.py ├── refresh-documents.py ├── search.sh └── tab ├── view.conf ├── view ├── help │ └── Help.js ├── main.js └── vis │ ├── AnnotationsSelector.js │ ├── TextWithAnnotations.js │ └── core │ ├── CharOffsets.js │ ├── EdgesVisualization.js │ ├── FramesVisualization.js │ ├── SentenceUtils.js │ ├── SpansVisualization.js │ └── 
TokenTagsVisualization.js └── views ├── error.jade └── layout.jade /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | parser/lib/ 3 | 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Bazaar 2 | ====== 3 | 4 | A collection of tools to generate input for DeepDive. 5 | 6 | ## [Parser](parser) 7 | 8 | Parser is a wrapper around Stanford CoreNLP that takes a simple JSON format as 9 | input and generates a TSV file that can be directly loaded into a database. 10 | 11 | There are five different ways in which the parser package is used. 12 | 13 | 1. `parser/run.sh` runs the parser as a single process. 14 | 2. `parser/run_parallel.sh` runs multiple instances of the parser on a single machine. 15 | 3. [Distribute](distribute) runs multiple instances of the parser on multiple machines. 16 | 4. [Condor](condor) contains instructions on how to run the parser on the Condor cluster. 17 | 5. `parser/run.sh -p 8080` runs the parser as a REST service. 18 | 19 | ## [XML](http://github.com/hazyresearch/dd-genomics) 20 | 21 | Many external datasets are in XML format. To consume these datasets with DeepDive, 22 | the XML has to be parsed into the simple JSON representation that the Parser package 23 | uses as input. 24 | 25 | An example of using an XML parser is contained in the dd-genomics project. 26 | 27 | ## [Distribute](distribute) 28 | 29 | It is often desirable to run the parser on multiple machines on EC2 or Azure. Distribute contains tools to automatically provision machines, distribute data, perform parsing, and collect results. 30 | 31 | -------------------------------------------------------------------------------- /condor/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Condor 3 | 4 | Use this folder together with the CHTC job wrapper: http://chtc.cs.wisc.edu/chtcjobwrapper.shtml 5 | 6 | Follow the instructions at http://chtc.cs.wisc.edu/chtcjobwrapper.shtml to start the job. 7 | 8 | ### Notes 9 | 10 | It is critically important to add the flags 11 | 12 | -Xmx4g -XX:CICompilerCount=1 -XX:ConcGCThreads=1 -XX:ParallelGCThreads=1 13 | 14 | whenever you start a JVM on Condor. Otherwise, Stanford CoreNLP will use more 15 | than one core. 16 | 17 | The file URLS lists two things: 18 | - jre-8u31-linux-x64.gz: pack the Oracle Java distribution into this tarball. When you 19 | need a JVM, start it as ./jre1.8.0_31/bin/java 20 | - nlp_2015_2.jar: put the jar you want to run here. Make sure it is compiled with JRE 8u31.
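For reference, the core of a Condor job then reduces to the following sketch, mirroring `shared/do.sh` below; `input.txt` is only a placeholder for whatever text file your job actually processes.

```bash
# Unpack the JRE shipped via URLS, then run the parser jar with the
# single-core JVM flags described above.
tar xf jre-8u31-linux-x64.gz
./jre1.8.0_31/bin/java -Xmx4g -XX:CICompilerCount=1 -XX:ConcGCThreads=1 \
    -XX:ParallelGCThreads=1 -jar nlp_2015_2.jar input.txt
```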
-------------------------------------------------------------------------------- /condor/shared/URLS: -------------------------------------------------------------------------------- 1 | /czhang/cde-package/jre-8u31-linux-x64.gz 2 | /czhang/cde-package/nlp_2015_2.jar 3 | -------------------------------------------------------------------------------- /condor/shared/do.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #### 4 | # First, log which machine is running this job 5 | #### 6 | uname -a 7 | 8 | tar xf jre-8u31-linux-x64.gz 9 | 10 | for l in `ls *.nxml` 11 | do 12 | sed -i 's/title>/p>/g' $l 13 | python html2text.py $l > $l.txt 14 | 15 | time ./jre1.8.0_31/bin/java -Xmx4g -XX:CICompilerCount=1 -XX:ConcGCThreads=1 -XX:ParallelGCThreads=1 -jar nlp_2015_2.jar $l.txt 16 | if [ -f $l.txt.nlp ] 17 | then 18 | echo "SUCCEED!" > SUCCEED.txt 19 | fi 20 | done 21 | 22 | rm nlp_2015_2.jar 23 | 24 | 25 | -------------------------------------------------------------------------------- /distribute/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | ssh 3 | conf/credentials.publishsettings 4 | .state 5 | env 6 | result 7 | segments 8 | -------------------------------------------------------------------------------- /distribute/README.md: -------------------------------------------------------------------------------- 1 | Distribute 2 | ========== 3 | 4 | Runs the [Parser](/parser) on multiple machines in parallel. Distribute provisions machines 5 | on EC2 or Azure, then processes chunks of your data on each machine, and 6 | finally terminates the machines. 7 | 8 | Before you begin, follow the instructions in [Setup](#setup) to install Distribute. 9 | 10 | 1. Launch instances on EC2 or Azure. **Note: For EC2, a General Purpose instance type is recommended (e.g. `m3.2xlarge`); instance types with less memory per core may cause the parser to abort.** 11 | 12 | ```bash 13 | fab launch:cloud=ec2,num=1 14 | ``` 15 | This will launch 1 instance on EC2. It will also put status information 16 | about the launched instance into `.state`. 17 | 18 | 2. Install dependencies on the remote machines: 19 | ```bash 20 | fab install > install.log 21 | ``` 22 | 23 | 3. Copy chunks to the remote machines, run the parser on them, and collect the results: 24 | ```bash 25 | time fab copy_parse_collect > parse.log 26 | ``` 27 | Tip: You can schedule the remote machines to be terminated automatically on task completion; note, though, that if the `parse` operation fails, nodes may not terminate: 28 | ```bash 29 | time fab copy_parse_collect terminate > parse.log 30 | ``` 31 | Tip: You can provide additional parameters to override defaults: 32 | ```bash 33 | time fab copy_parse_collect:input=test/input.json,batch_size=1000,parallelism=8,key_id='item_id',content_id='content' > parse.log 34 | ``` 35 | If `batch_size` is left unspecified, it will be computed automatically. Note that very large batch sizes may cause memory errors. See the [Parser](/parser) documentation for details on parser parameters. *Note also that commas need to be backslash-escaped when passed in as parameters.* 36 | 37 | 4. To check the global status of the distributed parse, run: 38 | ```bash 39 | fab get_status 40 | ``` 41 | 42 | 5. If the machines were not automatically terminated as above, or if an error occurred, terminate them: 43 | ```bash 44 | fab terminate 45 | ``` 46 | If termination is successful, the status information in `.state` will be deleted.
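Putting steps 1 through 5 together, a minimal end-to-end session with the defaults shown above looks roughly like this (a condensed sketch, not a verbatim transcript):

```bash
fab launch:cloud=ec2,num=1                          # provision one worker; state is recorded in .state
fab install > install.log                           # install the parser and its dependencies remotely
time fab copy_parse_collect terminate > parse.log   # distribute, parse, collect, then terminate
fab get_status                                      # run from another shell to check progress
```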
47 | 48 | Your parsed information should now be available as a tsv file named `result` in your working directory. 49 | 50 | ## Setup 51 | 52 | ### Dependencies 53 | 54 | If you have sudo rights, run `./setup.sh`. 55 | 56 | If you don't have sudo rights, follow these steps instead. These have been tested on raiders3 (Stanford): 57 | ``` 58 | cd bazaar 59 | wget https://raw.githubusercontent.com/pypa/virtualenv/develop/virtualenv.py 60 | python virtualenv.py env --no-setuptools 61 | source env/bin/activate 62 | wget https://raw.github.com/pypa/pip/master/contrib/get-pip.py 63 | python get-pip.py 64 | pip install fabric 65 | pip install urltools 66 | pip install azure 67 | pip install botocore 68 | ``` 69 | 70 | The fab command line tool should work now. 71 | 72 | Note that you will have to run `source env/bin/activate` after each login to initialize the environment. 73 | 74 | ### Generate SSH Keys 75 | 76 | Now, generate SSH keys. 77 | ``` 78 | ./generate-keys.sh 79 | ``` 80 | 81 | ### Build 82 | 83 | Finally, create a self-extracting installer that will be run on worker nodes. 84 | ``` 85 | cd installer 86 | ./build 87 | cd .. 88 | ``` 89 | 90 | ### Set EC2 or Azure credentials 91 | 92 | See variables in `env_local.sh` and override as needed. 93 | 94 | For ec2, we recommend storing credentials at `~/.aws/credentials` following this 95 | [documentation](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html). 96 | Make sure to `chmod 400 ~/.aws/credentials` and insert your access key and secret key: 97 | 98 | ``` 99 | [default] 100 | aws_access_key_id = 101 | aws_secret_access_key = 102 | ``` 103 | 104 | For azure, upload `ssh/mycert.cer` to the management portal via the "Upload" action of the "Settings" tab, and set the following variable in `env_local.sh`: 105 | ``` 106 | export AZURE_SUBSCRIPTION_ID= 107 | ``` 108 | 109 | ## Tips 110 | 111 | * You can log into any of the launched nodes on ec-2 or azure: 112 | ``` 113 | ssh -i ssh/bazaar.key -p PORT USER@HOST 114 | ``` 115 | where USER, HOST, PORT are contained in `.state/HOSTS`. 116 | 117 | * You can choose different instance types (see `env_local.sh`). 118 | 119 | * Test your distribution setup on smaller samples of your data, 120 | and more basic instance types (eg. Standard_D2 for azure). 121 | Then, when you are confident that everything works as expected, 122 | choose a more powerful instance type (eg. Standard_D14 on azure), 123 | and increase the parallelism in step 4 above (eg. 8 or 16). 124 | 125 | * By default, Azure only allows you to use a maximum of 20 cores 126 | in total. This means you can not launch more than one instance 127 | of type Standard_D14 (16 cores) at a time. You can submit a 128 | request to Microsoft to increase your quota of cores. 129 | 130 | * In case of errors, make sure you stop running VMs through the 131 | Azure management portal or AWS management console. You may have 132 | to `rm -r .state` to continue using Distribute. 133 | -------------------------------------------------------------------------------- /distribute/env_local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # AZURE SETTINGS 4 | 5 | # your azure subscription ID (look it up under 'Settings' in the management portal) 6 | # It has the following form '00000000-0000-0000-0000-000000000000' 7 | export AZURE_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID:-} 8 | 9 | # name for service (must be unique among all azure users), eg. 
'ddbazaa' 10 | export AZURE_SERVICE_NAME=${AZURE_SERVICE_NAME:-ddbazaa} 11 | 12 | # name for storage account (must be unique among all azure users) 13 | export AZURE_STORAGE_ACCOUNT=${AZURE_STORAGE_ACCOUNT:-ddbazaastore} 14 | 15 | # eg. 'Standard_D2', or 'Standard_D14' 16 | export AZURE_ROLE_SIZE=${AZURE_ROLE_SIZE:-Standard_D2} 17 | 18 | # EC2 SETTINGS 19 | 20 | # For ec-2, we recommend that you keep your AWS_ACCESS_KEY_ID and your 21 | # AWS_SECRET_ACCESS_KEY in ~/.aws/credentials. 22 | 23 | # eg. 'm3.large' 24 | #export EC2_INSTANCE_TYPE=${EC2_INSTANCE_TYPE:-r3.4xlarge} 25 | export EC2_INSTANCE_TYPE=m3.2xlarge 26 | -------------------------------------------------------------------------------- /distribute/generate-keys.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ -d "./ssh" ]; then 4 | echo 'Directory ./ssh exists already. Abort.' 5 | echo 'If you would like to re-generate the keys, please remove ./ssh and try again.' 6 | exit 1 7 | fi 8 | 9 | # install SSH keys 10 | echo "Creating SSH keys" 11 | rm -rf ./ssh 12 | mkdir ./ssh 13 | cd ./ssh 14 | 15 | # generate private/public key pair 16 | ssh-keygen -t rsa -b 2048 -f bazaar.key -N '' -C bazaar 17 | 18 | # generate azure pem file from openssh private key 19 | openssl req \ 20 | -x509 \ 21 | -days 365 \ 22 | -nodes \ 23 | -key bazaar.key \ 24 | -out bazaar.pem \ 25 | -newkey rsa:2048 \ 26 | -subj "/" 27 | 28 | # install (separate) management certificates for azure 29 | openssl req -x509 \ 30 | -nodes \ 31 | -days 365 \ 32 | -newkey rsa:1024 \ 33 | -keyout mycert.pem \ 34 | -out mycert.pem \ 35 | -subj "/" 36 | 37 | openssl x509 -inform pem -in mycert.pem -outform der -out mycert.cer 38 | 39 | echo 'All keys have been generated and placed into ./ssh.' 40 | echo ' ssh/bazaar.key is the private key used to log in to worker machines' 41 | echo ' ssh/bazaar.key.pub is the corresponding public key in OpenSSH format (ec2)' 42 | echo ' ssh/bazaar.pem is the corresponding public key in OpenSSL format (azure)' 43 | echo ' ssh/mycert.cer is a management certificate used for azure only' 44 | echo 'NOTE: If you would like to use Azure, you must upload ssh/mycert.cer via the "Upload" action of the "Settings" tab of the management portal.' 45 | 46 | -------------------------------------------------------------------------------- /distribute/installer/.gitignore: -------------------------------------------------------------------------------- 1 | install-parser 2 | payload.tar.gz 3 | -------------------------------------------------------------------------------- /distribute/installer/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Create parser package" 4 | 5 | INSTALLER_DIR="$(pwd)" 6 | PARSER_DIR=../../parser 7 | 8 | cd $PARSER_DIR 9 | tar cf $INSTALLER_DIR/payload/parser.tar \ 10 | build.sbt \ 11 | src \ 12 | run.sh \ 13 | run_parallel.sh \ 14 | project \ 15 | sbt \ 16 | setup.sh 17 | 18 | echo "Create self-extracting installer" 19 | 20 | cd $INSTALLER_DIR/payload 21 | tar cf ../payload.tar ./* 22 | cd .. 
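# Note: the self-extracting installer is assembled below by concatenating the
# 'decompress' stub with payload.tar.gz; at run time, 'decompress' untars
# everything after its __ARCHIVE_BELOW__ marker into a temp dir and runs
# ./installer from there.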
23 | 24 | if [ -e "payload.tar" ]; then 25 | gzip payload.tar 26 | 27 | if [ -e "payload.tar.gz" ]; then 28 | cat decompress payload.tar.gz > install-parser 29 | else 30 | echo "payload.tar.gz does not exist" 31 | exit 1 32 | fi 33 | else 34 | echo "payload.tar does not exist" 35 | exit 1 36 | fi 37 | 38 | chmod +x install-parser 39 | echo "install-parser created" 40 | exit 0 41 | -------------------------------------------------------------------------------- /distribute/installer/decompress: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "" 3 | echo "Self Extracting Installer" 4 | echo "" 5 | 6 | export TMPDIR=`mktemp -d /tmp/selfextract.XXXXXX` 7 | 8 | ARCHIVE=`awk '/^__ARCHIVE_BELOW__/ {print NR + 1; exit 0; }' $0` 9 | 10 | tail -n+$ARCHIVE $0 | tar xzv -C $TMPDIR 11 | 12 | CDIR=`pwd` 13 | cd $TMPDIR 14 | ./installer 15 | 16 | cd $CDIR 17 | rm -rf $TMPDIR 18 | 19 | exit 0 20 | 21 | __ARCHIVE_BELOW__ 22 | -------------------------------------------------------------------------------- /distribute/installer/payload/installer: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Running Installer" 3 | rm -rf $HOME/parser 4 | mkdir $HOME/parser 5 | tar xf ./parser.tar -C $HOME/parser 6 | 7 | # install java? 8 | if type -p java; then 9 | echo "Found java" 10 | else 11 | sys=`uname -s` 12 | if [ ! "$sys" == "Linux" ]; then 13 | echo "Only supporting Ubuntu" 14 | exit 1 15 | fi 16 | #echo "Getting openjdk" 17 | #mkdir /tmp/openjdk 18 | #cd /tmp/openjdk 19 | #apt-get -y --print-uris install openjdk-7-jdk | grep "http.*deb" -o | xargs -0 echo 20 | #apt-get -y --print-uris install openjdk-7-jdk | grep "http.*deb" -o | xargs -0 wget 21 | #for d in *.deb; do dpkg -x "$d" openjdk; done 22 | #rm *.deb 23 | #_java=/tmp/openjdk/TODO 24 | 25 | cd $HOME 26 | wget --no-check-certificate --no-cookies --header "Cookie: oraclelicense=accept-securebackup-cookie" http://download.oracle.com/otn-pub/java/jdk/8u45-b14/jdk-8u45-linux-x64.tar.gz 27 | tar xvzf jdk-8u45-linux-x64.tar.gz 28 | 29 | echo 'export PATH=~/jdk1.8.0_45/bin:$PATH' >> ~/.bashrc 30 | echo 'export JAVA_HOME=~/jdk1.8.0_45' >> ~/.bashrc 31 | export PATH=~/jdk1.8.0_45/bin:$PATH 32 | export JAVA_HOME=~/jdk1.8.0_45 33 | fi 34 | 35 | DIRNAME=$HOME/parser 36 | DESTDIR=$DIRNAME/lib 37 | FILENAME='stanford-srparser-2014-10-23-models.jar' 38 | if [ ! -e "$DESTDIR/$FILENAME" ]; then 39 | mkdir -p $DESTDIR 40 | wget -P $DESTDIR http://nlp.stanford.edu/software/stanford-srparser-2014-10-23-models.jar 41 | else 42 | echo "Skipping download: $DESTDIR/$FILENAME already exists" 43 | fi 44 | 45 | cd $HOME/parser 46 | which java 47 | sbt/sbt stage 48 | -------------------------------------------------------------------------------- /distribute/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # install virtualenv 4 | command -v virtualenv >/dev/null 2>&1 || { 5 | echo >&2 "virtualenv required but not installed. 
Aborting."; 6 | echo >&2 "You can install virtualenv with:" 7 | echo >&2 " sudo pip install virtualenv" 8 | } 9 | 10 | virtualenv env 11 | source env/bin/activate 12 | 13 | pip install azure 14 | pip install botocore 15 | pip install fabric 16 | pip install urltools 17 | 18 | -------------------------------------------------------------------------------- /parser/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | -------------------------------------------------------------------------------- /parser/README.md: -------------------------------------------------------------------------------- 1 | Parser 2 | ====== 3 | 4 | Run `setup.sh` to install dependencies and build the parser. 5 | 6 | We assume that your input has the following format. There's one line per document and each document is a JSON object with a key and content field. 7 | 8 | ```json 9 | { "item_id":"doc1", "content":"Here is the content of my document.\nAnd here's another line." } 10 | { "item_id":"doc2", "content":"Here's another document." } 11 | ``` 12 | 13 | You can run the NLP pipeline on 1 core as follows: 14 | 15 | ```bash 16 | cat input.json | ./run.sh -i json -k "item_id" -v "content" > output.tsv 17 | ``` 18 | 19 | You can run the NLP pipeline on 16 cores as follows: 20 | ```bash 21 | ./run_parallel.sh -in="input.json" --parallelism=16 -i json -k "item_id" -v "content" 22 | ``` 23 | 24 | You can run the NLP pipeline as a REST service as follows: 25 | 26 | ```bash 27 | ./run.sh -p 8080 28 | ``` 29 | 30 | The output will be files in tsv-format that you can directly load into the database. 31 | 32 | 33 | ## Setup 34 | 35 | This package requires Java 8. 36 | 37 | -------------------------------------------------------------------------------- /parser/build.sbt: -------------------------------------------------------------------------------- 1 | import com.typesafe.sbt.SbtStartScript 2 | 3 | name := "deepdive-nlp-parser" 4 | 5 | version := "0.1" 6 | 7 | scalaVersion := "2.10.3" 8 | 9 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 10 | 11 | resolvers += "Scalaz Bintray Repo" at "https://dl.bintray.com/scalaz/releases" 12 | 13 | libraryDependencies ++= List( 14 | "ch.qos.logback" % "logback-classic" % "1.0.7", 15 | "com.typesafe.play" %% "play-json" % "2.2.1", 16 | "com.github.scopt" %% "scopt" % "3.2.0", 17 | "edu.stanford.nlp" % "stanford-corenlp" % "3.5.1", 18 | "edu.stanford.nlp" % "stanford-corenlp" % "3.5.1" classifier "models", 19 | "org.scalatest" % "scalatest_2.10" % "2.0.RC2" % "test", 20 | "org.http4s" %% "http4s-dsl" % "0.7.0", 21 | "org.http4s" %% "http4s-jetty" % "0.7.0" 22 | ) 23 | 24 | unmanagedJars in Compile += file("lib/stanford-srparser-2014-10-23-models.jar") 25 | 26 | parallelExecution in Test := false 27 | 28 | test in assembly := {} 29 | 30 | seq(SbtStartScript.startScriptForClassesSettings: _*) 31 | 32 | -------------------------------------------------------------------------------- /parser/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0") 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") 4 | -------------------------------------------------------------------------------- /parser/run.sh: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env bash 2 | 3 | export JAVA_OPTS="-Xmx4g -Dfile.encoding=UTF-8" 4 | 5 | $(dirname $0)/target/start $@ 6 | -------------------------------------------------------------------------------- /parser/run_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Parse sentences in parallel 3 | 4 | set -eu 5 | 6 | # Usage: this_script input_file parallelism input_batch_size 7 | 8 | if [ "$#" -le 1 ]; then 9 | echo "Usage: $0 input_file parallelism [input_batch_size=1000] [sentence_words_limit=120]" 10 | exit 11 | fi 12 | 13 | for i in "$@" 14 | do 15 | case $i in 16 | -in=*|--input=*) 17 | INPUT_FILE="${i#*=}" 18 | shift 19 | ;; 20 | -p=*|--parallelism=*) 21 | PARALLELISM="${i#*=}" 22 | shift 23 | ;; 24 | -b=*|--batch-size=*) 25 | BATCH_SIZE="${i#*=}" 26 | shift 27 | ;; 28 | *) 29 | echo "NO MATCH" 30 | break 31 | ;; 32 | esac 33 | done 34 | 35 | if [ -z "$INPUT_FILE" ]; then 36 | echo "Usage: $0 -i=input_file [--parallelism=PARALLELISM] [--batch-size=BATCH_SIZE ] " 37 | exit 38 | fi 39 | 40 | PARALLELISM=${PARALLELISM:-2} 41 | BATCH_SIZE=${BATCH_SIZE:-1000} 42 | 43 | echo "parallelism = $PARALLELISM" 44 | echo "batch-size = $BATCH_SIZE" 45 | 46 | RUN_SCRIPT=`cd $(dirname $0)/; pwd`"/run.sh $@" 47 | echo $RUN_SCRIPT 48 | mkdir -p $INPUT_FILE.split 49 | rm -f $INPUT_FILE.split/* 50 | 51 | # Split the input file into subfiles 52 | split -a 10 -l $BATCH_SIZE $INPUT_FILE $INPUT_FILE.split/input- 53 | 54 | # Match all files in the split directory 55 | find $INPUT_FILE.split -name "input-*" 2>/dev/null -print0 | xargs -0 -P $PARALLELISM -L 1 bash -c "${RUN_SCRIPT}"' -f "$0"' 56 | 57 | echo "Output TSV files are in: $INPUT_FILE.split/*.parsed" 58 | echo "To load them into the databse, run: cat $INPUT_FILE.split/*.parsed | psql YOUR_DB_NAME -c "'"COPY sentences FROM STDIN"' 59 | -------------------------------------------------------------------------------- /parser/sbt/sbt: -------------------------------------------------------------------------------- 1 | java $SBT_OPTS -jar `dirname $0`/sbt-launch.jar "$@" -------------------------------------------------------------------------------- /parser/sbt/sbt-launch.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/bazaar/c09dce20f16a90c359f804f9e83d6107547d442c/parser/sbt/sbt-launch.jar -------------------------------------------------------------------------------- /parser/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | DIRNAME=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 6 | 7 | # fetch SR models 8 | DESTDIR="$DIRNAME"/lib 9 | FILENAME='stanford-srparser-2014-10-23-models.jar' 10 | if [ ! -e "$DESTDIR/$FILENAME" ]; then 11 | mkdir -p "$DESTDIR" 12 | url="http://nlp.stanford.edu/software/stanford-srparser-2014-10-23-models.jar" 13 | if type wget &>/dev/null; then 14 | wget -P "$DESTDIR" "$url" 15 | elif type curl &>/dev/null; then 16 | ( cd "$DESTDIR" && curl -LO "$url" ) 17 | else 18 | echo >&2 "Could not find curl or wget. 
Manually download $url to $DESTDIR/" 19 | false 20 | fi 21 | else 22 | echo "Skipping download: $DESTDIR/$FILENAME already exists" 23 | fi 24 | 25 | # java 26 | #sudo add-apt-repository -y ppa:openjdk-r/ppa 27 | #sudo apt-get update 28 | #sudo apt-get install -y openjdk-8-jdk 29 | 30 | # check if java -version >= 1.8 31 | javaVersion=$(java -version 2>&1 | sed -e '1!d; s/^java version "//; s/"$//') 32 | [[ ! $javaVersion < 1.8 ]] || { 33 | echo >&2 "java -version >= 1.8 required but found: $javaVersion" 34 | false 35 | } 36 | 37 | # build parser 38 | cd "$DIRNAME" 39 | sbt/sbt stage 40 | 41 | -------------------------------------------------------------------------------- /parser/src/main/scala/com/clearcut/nlp/DocumentParseResult.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.nlp 2 | 3 | case class SentenceParseResult( 4 | sentence: String, 5 | words: List[String], 6 | lemma: List[String], 7 | pos_tags: List[String], 8 | ner_tags: List[String], 9 | offsets: List[Int], 10 | dep_labels: List[String], 11 | dep_parents: List[Int], 12 | collapsed_deps: List[String] 13 | ) 14 | 15 | case class DocumentParseResult( 16 | sentences: List[SentenceParseResult] 17 | ) 18 | -------------------------------------------------------------------------------- /parser/src/main/scala/com/clearcut/nlp/DocumentParser.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.nlp 2 | 3 | import edu.stanford.nlp.ling.CoreAnnotations._ 4 | import edu.stanford.nlp.pipeline._ 5 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.{CollapsedCCProcessedDependenciesAnnotation, CollapsedDependenciesAnnotation} 6 | 7 | // import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation 8 | import java.util.Properties 9 | 10 | import scala.collection.JavaConversions._ 11 | 12 | 13 | class DocumentParser(props: Properties) { 14 | 15 | val pipeline = new StanfordCoreNLP(props) 16 | 17 | def parseDocumentString(doc: String) = { 18 | 19 | // Temporary fix for bug where brackets are being incorrectly treated as punct 20 | // and somehow this messes up the whole dep parse -> change them to round braces 21 | val doc2 = doc.replaceAll("""\[""", "(").replaceAll("""\]""", ")") 22 | 23 | val document = new Annotation(doc2) 24 | pipeline.annotate(document) 25 | // val dcoref = document.get(classOf[CorefChainAnnotation]) 26 | val sentences = document.get(classOf[SentencesAnnotation]) 27 | 28 | val sentenceResults = sentences.zipWithIndex.map { case(sentence, sentIdx) => 29 | val content = sentence.toString 30 | val tokens = sentence.get(classOf[TokensAnnotation]) 31 | val wordList = tokens.map(_.get(classOf[TextAnnotation])) 32 | val posList = tokens.map(_.get(classOf[PartOfSpeechAnnotation])) 33 | val nerList = tokens.map(_.get(classOf[NamedEntityTagAnnotation])) 34 | val lemmaList = tokens.map(_.get(classOf[LemmaAnnotation])) 35 | val offsetList = tokens.map(_.get(classOf[CharacterOffsetBeginAnnotation]).intValue) 36 | 37 | // This kind of dep paths seem to be a tree. Need CoreNLP guys to confirm. 38 | // Ce has been using this all along. 
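// Note: the arrays below are indexed by the dependent (target) token, using its
// 1-based CoreNLP index minus one; depParents holds the 1-based index of the
// governor, and tokens with no incoming collapsed edge (e.g. the root) keep 0.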
39 | val depCollapsedPaths = sentence.get(classOf[CollapsedDependenciesAnnotation]).edgeIterable 40 | val depLabels = Array.fill(tokens.size)("") 41 | val depParents = Array.fill(tokens.size)(0) 42 | for (path <- depCollapsedPaths) { 43 | depLabels(path.getTarget.index - 1) = path.getRelation.toString 44 | depParents(path.getTarget.index - 1) = path.getSource.index 45 | } 46 | 47 | // This kind of dep paths may have cycles. 48 | val depCCPPaths = sentence.get(classOf[CollapsedCCProcessedDependenciesAnnotation]).edgeIterable 49 | val ccpPathTriples = for(path <- depCCPPaths) yield 50 | List(path.getSource.index, path.getRelation, path.getTarget.index).mkString(",") 51 | 52 | SentenceParseResult( 53 | content, 54 | wordList.toList, 55 | lemmaList.toList, 56 | posList.toList, 57 | nerList.toList, 58 | offsetList.toList, 59 | depLabels.toList, 60 | depParents.toList, 61 | ccpPathTriples.toList 62 | ) 63 | } 64 | 65 | DocumentParseResult(sentenceResults.toList) 66 | } 67 | 68 | /** 69 | Construct a Postgres-acceptable array in the TSV format, from a list 70 | */ 71 | def list2TSVArray(arr: List[String]) : String = { 72 | return arr.map( x => 73 | // Replace '\' with '\\\\' to be accepted by COPY FROM 74 | // Replace '"' with '\\"' to be accepted by COPY FROM 75 | if (x.contains("\\")) 76 | "\"" + x.replace("\\", "\\\\\\\\").replace("\"", "\\\\\"") + "\"" 77 | else 78 | "\"" + x + "\"" 79 | ).mkString("{", ",", "}") 80 | } 81 | 82 | def intList2TSVArray(arr: List[Int]) : String = { 83 | return arr.map( x => 84 | "" + x 85 | ).mkString("{", ",", "}") 86 | } 87 | 88 | def string2TSVString(str: String) : String = { 89 | if (str.contains("\\")) 90 | str.replace("\\", "\\\\") 91 | else 92 | str 93 | } 94 | 95 | // NOTE: an alternative would be to quote the field correctly 96 | // http://stackoverflow.com/questions/3089077/new-lines-in-tab-delimited-or-comma-delimtted-output 97 | def replaceChars(str: String) : String = { 98 | str.replace("\n", " ").replace("\t", " ") 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /parser/src/main/scala/com/clearcut/nlp/JSONReader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.nlp 2 | 3 | import play.api.libs.json.{JsObject, JsString, JsValue, Json} 4 | 5 | import scala.io.Source 6 | 7 | class JSONReader(input:Source, docIdKeys:Array[String], documentKey:String) 8 | extends Iterator[(Array[String], String)] { 9 | 10 | var it = input.getLines.zipWithIndex 11 | var _next = fetchNext() 12 | 13 | override def hasNext: Boolean = 14 | _next != null 15 | 16 | override def next(): (Array[String], String) = { 17 | val n = _next 18 | _next = fetchNext() 19 | n 20 | } 21 | 22 | private def fetchNext(): (Array[String], String) = { 23 | var n:(Array[String], String) = null 24 | while (n == null && it.hasNext) { 25 | val (line, num) = it.next 26 | 27 | val jsObj = Json.parse(line).asInstanceOf[JsObject] 28 | 29 | val maybeDocumentIds = new Array[String](docIdKeys.length) 30 | docIdKeys.zipWithIndex.foreach { case (idk, i) => 31 | val maybeDocumentId = jsObj.value.get(idk); 32 | (maybeDocumentId) match { 33 | case (Some(documentId:JsString)) => 34 | maybeDocumentIds(i) = documentId.value 35 | case (_) => 36 | maybeDocumentIds(i) = "\\N" 37 | } 38 | } 39 | 40 | val maybeDocumentStr = jsObj.value.get(documentKey).map(_.asInstanceOf[JsString].value) 41 | 42 | (maybeDocumentIds, maybeDocumentStr) match { 43 | case (documentIds:Array[String], 
Some(documentStr:String)) => 44 | n = (documentIds, documentStr) 45 | //case (Array[None],_) => 46 | // System.err.println(s"Warning: skipped malformed line ${num}: ${line}") 47 | case (_, None) => 48 | System.err.println(s"Warning: skipped malformed line ${num}: ${line}") 49 | } 50 | } 51 | n 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /parser/src/main/scala/com/clearcut/nlp/Server.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.nlp 2 | 3 | import org.http4s._ 4 | import org.http4s.dsl._ 5 | import org.http4s.server.HttpService 6 | import org.http4s.server.jetty.JettyBuilder 7 | import scala.collection.mutable.ListBuffer 8 | import scalaz.stream.Process._ 9 | 10 | 11 | class Server(dp: DocumentParser, port: Integer) { 12 | 13 | val route = HttpService { 14 | case req @ GET -> Root => 15 | Ok("Hello. I can parse stuff. Just POST the text to me.\n") 16 | 17 | case req @ POST -> Root => 18 | // WARNING: when request body is empty, http4s seems to hang here 19 | val content = new String(req.body.runLog.run.reduce(_ ++ _).toArray, Charset.`UTF-8`.nioCharset) 20 | 21 | val lines = ListBuffer[String]() 22 | dp.parseDocumentString(content).sentences.zipWithIndex 23 | .foreach { case (sentenceResult, sentence_idx) => 24 | 25 | val outline = List( 26 | sentence_idx + 1, 27 | sentenceResult.sentence, 28 | dp.list2TSVArray(sentenceResult.words), 29 | dp.list2TSVArray(sentenceResult.lemma), 30 | dp.list2TSVArray(sentenceResult.pos_tags), 31 | dp.list2TSVArray(sentenceResult.ner_tags), 32 | dp.intList2TSVArray(sentenceResult.offsets), 33 | dp.list2TSVArray(sentenceResult.dep_labels), 34 | dp.intList2TSVArray(sentenceResult.dep_parents) 35 | // dp.list2TSVArray(sentenceResult.collapsed_deps) 36 | ).mkString("\t") 37 | 38 | lines += outline 39 | } 40 | Ok(lines.toList.mkString("\n") + "\n") 41 | } 42 | 43 | def run() = { 44 | JettyBuilder 45 | .mountService(route, "") 46 | .bindHttp(port) 47 | .run 48 | .awaitShutdown() 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /parser/src/main/scala/com/clearcut/nlp/TSVReader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.nlp 2 | 3 | import scala.io.BufferedSource 4 | 5 | class TSVReader(input:BufferedSource, 6 | idCols:Array[Int], documentCol:Int) 7 | extends Iterator[(Array[String], String)] { 8 | 9 | var it = input.getLines.zipWithIndex 10 | var _next = fetchNext() 11 | 12 | override def hasNext: Boolean = 13 | _next != null 14 | 15 | override def next(): (Array[String], String) = { 16 | val n = _next 17 | _next = fetchNext() 18 | n 19 | } 20 | 21 | private def fetchNext(): (Array[String], String) = { 22 | var n:(Array[String], String) = null 23 | while (n == null && it.hasNext) { 24 | val (line, num) = it.next 25 | val tsvArr = line.trim.split("\t") 26 | if (tsvArr.length >= idCols.length + 1) { 27 | val documentIds = idCols.map(idc => tsvArr(idc)) 28 | val documentStr = tsvArr(documentCol) 29 | n = (documentIds, documentStr) 30 | } else { 31 | System.err.println(s"Warning: skipped malformed line ${num}: ${line}") 32 | } 33 | } 34 | n 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /parser/src/test/resources/input.json.txt: -------------------------------------------------------------------------------- 1 | {"documents.id" : 5, "documents.text" : "I am document one. 
I am sentence twp, really. I am another sentence, called sentence three."} 2 | {"documents.id" : 7, "documents.text" : "John drove to Judy’s house and he made her dinner. This sentence should have some corefs."} -------------------------------------------------------------------------------- /parser/src/test/resources/testdoc.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |

Wrike has launched a new version of its project management platform with an emphasis on real-time analysis and new features such as syncing calendars to work projects. The new platform, Wrike Enterprise, gives the company a deeper focus on the corporate market for its collaboration-centered tools. It gives customers a way to crunch project management data in the order of a million updates per day. This is data around work items such as tasks completed, the original time planned for the project and the historical data that is associated with the project. The data is presented in “instant infographics,” that help people see the latest updates to projects, said Wrike CEO Andrew Filev in an email interview. Wrike-Enterprise-visual-reports Historically, project managers have done detailed plans that they then track. The manager periodically updates the projects and then compares the current state to the baseline established at the start of the project. With the Wrike platform, the data from every interaction is stored and then compared to historical data and then presented in a chart. A customer can see the state of the project from different dimensions such as the realistic amount of time a project will take to get done,  what requires immediate action and how performance of an employee has evolved over time.

5 |

A new  user group feature in Wrike Enterprise allows the project manager to include employees in multiple work groups by project, department, or any other ad hoc query. It can share the needed data with the whole group and keep permissions organized. This allows the manager to keep track of the overall project without hundreds of people making their own changes.  Wrike-Enterprise-user-groups

6 |
Wrike’s new “Custom Calendars,” syncs projects with the calendars of other members on the team. It allows the manager to track a colleague’s vacations, PTO and extra working days. It is designed to avoid schedule overlaps and build more accurate plans. Wrike-Enterprise-custom-calendars There are also new ways to integrate a company’s  identity into the service. Wrike has also added new security controls for larger customers. In October, Wrike raised $10 million in funding.  It was the first round since the company was originally founded seven years ago. The company has traditionally served the small business community but this release points to its additonal focus on the larger enterprise companies of the world. Wrike competes with the likes of Atlassian and Asana. But its advantage is in its crisp user interface which it can now leverage even more as it embraces data as a way for project managers to better keep track of their projects.  
7 |
8 |
Feature image courtesy of VFS Digital Design on Flickr via Creative Commons)
9 |
10 | 11 | 12 | 13 |
-------------------------------------------------------------------------------- /parser/src/test/resources/testdoc.txt: -------------------------------------------------------------------------------- 1 | In a decision that could have far-reaching consequences, the D.C. Circuit Court of Appeals today struck down the FCC’s Open Internet Order. That Order, put into force in 2010 by then-chairman Julius Genachowski, was designed to make it so that broadband service providers couldn’t meddle with traffic on the web based on its type – in other words, they couldn’t block certain kinds of online data transmission just because it didn’t align with their own goals and financial strategy. 2 | 3 | Media watchdog and advocacy agency Free Press released the following statement about the decision via President and CEO Craig Aaron, condemning it while also acknowledging that the Open Internet Order probably wasn’t the best possible solution for enforcing net neutrality: -------------------------------------------------------------------------------- /parser/src/test/scala/DocumentParserSpec.scala: -------------------------------------------------------------------------------- 1 | package org.deepdive.udf.nlp.test 2 | 3 | import java.io._ 4 | import com.clearcut.nlp.{Main, DocumentParser} 5 | import org.deepdive.udf.nlp._ 6 | import org.scalatest._ 7 | import play.api.libs.json._ 8 | import scala.io.Source 9 | import java.util.Properties 10 | 11 | class DocumentParserSpec extends FunSpec { 12 | 13 | describe("Parsing documents") { 14 | 15 | it("should work with plain text") { 16 | val inputFile = getClass.getResource("/testdoc.txt").getFile 17 | val documentStr = Source.fromFile(inputFile).mkString 18 | 19 | val props = new Properties() 20 | props.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, dcoref") 21 | val dp = new DocumentParser(props) 22 | 23 | val result = dp.parseDocumentString(documentStr) 24 | assert(result.sentences.size == 3) 25 | } 26 | 27 | it("should work with HTML documents") { 28 | val inputFile = getClass.getResource("/testdoc.html").getFile 29 | val documentStr = Source.fromFile(inputFile).mkString 30 | 31 | val props = new Properties() 32 | props.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, dcoref") 33 | val dp = new DocumentParser(props) 34 | 35 | val result = dp.parseDocumentString(documentStr) 36 | assert(result.sentences.size == 23) 37 | } 38 | 39 | } 40 | 41 | describe("Running the main method from the command line") { 42 | 43 | it("should work with valid JSON") { 44 | // Read stdin from file 45 | val inputFile = getClass.getResource("/input.json.txt").getFile 46 | val is = new FileInputStream(inputFile) 47 | System.setIn(is) 48 | 49 | // Execute the main method 50 | Main.main(Array("--valueKey", "documents.text", "--idKey", "documents.id")) 51 | } 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /parser/test/input.tsv: -------------------------------------------------------------------------------- 1 | 1 This is a test document. 2 | 2 This is another test document. It contains two sentences. 3 | 3 This is yet another one. It contains three sentences. The last sentence is, however, the longest. 4 | 4 Barack Obama, the current U.S. president, married to his wife Michelle several years ago. 5 | 5 In a decision that could have far-reaching consequences, the D.C. Circuit Court of Appeals today struck down the FCC’s Open Internet Order. 
That Order, put into force in 2010 by then-chairman Julius Genachowski, was designed to make it so that broadband service providers couldn’t meddle with traffic on the web based on its type – in other words, they couldn’t block certain kinds of online data transmission just because it didn’t align with their own goals and financial strategy. 6 | 7 | 6 Media watchdog and advocacy agency Free Press released the following statement about the decision via President and CEO Craig Aaron, condemning it while also acknowledging that the Open Internet Order probably wasn’t the best possible solution for enforcing net neutrality: 8 | 9 | 7 This is a "test document", with "quotation marks and slashes: \ \\ \\\ \\\\" 10 | 8 This sentence contains a mean token\. 11 | -------------------------------------------------------------------------------- /pipe/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | lib/ -------------------------------------------------------------------------------- /pipe/PLANS.md: -------------------------------------------------------------------------------- 1 | 2 | Plans 3 | ===== 4 | 5 | Change annotator schema to something like 6 | 7 | ``` 8 | class Annotator[A,B] {} 9 | 10 | class StanfordTokenizer[C <: HasText, D <: HasTokens with HasTokenOffsets] extends Annotator[C,D] {} 11 | ``` 12 | 13 | Then the readers can be type-safe, too: 14 | 15 | ``` 16 | val r = new ColumnReader[HasText with HasID] 17 | ``` 18 | 19 | Internally, the reader can look for the right files. 20 | 21 | And to build a concrete object: 22 | 23 | ``` 24 | trait HasInt { def getInt:Int } 25 | trait HasString { def getString:String } 26 | 27 | val obj:HasInt with HasString = 28 | new HasInt with HasString { 29 | def getInt = 12 30 | def getString = "hello" 31 | } 32 | ``` 33 | 34 | Open question: how to merge two objects? 35 | 36 | If we do multiple extractions, and then want to merge the results: 37 | 38 | val obj1:HasX with HasY 39 | val obj2:HasZ 40 | 41 | val obj = ???? 42 | 43 | -------------------------------------------------------------------------------- /pipe/README.md: -------------------------------------------------------------------------------- 1 | Pipe 2 | ==== 3 | 4 | Lightweight schemas and processing framework for NLP. 5 | 6 | Pipe addresses the following problems: 7 | 8 | * In many DeepDive applications, errors in pre-processing become more relevant as one tries to push up precision and recall. Often no further quality improvement is possible without targeting these errors. 9 | 10 | An example: 11 | ``` 12 | $300. 00 per hour 13 | ``` 14 | Our sentence splitter would break on the period and create two sentences. 15 | 16 | For some extractors we have tried work-arounds by adding complex rules to our extractors which target these errors. In fact, a significant portion of code in our 'rates' extractor is code to workaround this problem, but this code is complex and difficult to maintain. 17 | 18 | The right approach, of course, should be to fix the pre-processing components directly. Unfortunately, this is tricky because we treat all pre-processing as a black box, making changes nearly impossible. 19 | 20 | Pipe solves this problem by breaking up the preprocessing components. It is now easy to add your custom tokenization or sentence splitting rules. For almost any domain, we want to add a few such domain-specific rules to improve pre-processing. 21 | 22 | * We have a few problems with our current schemas for NLP. 23 | 1. 
Our NLP parser outputs a file in psql-specific format that no other application can read. 24 | 2. When running extractors, we manually serialize and deserialize using custom logic consisting of UDFs (array_to_string) and language-specific code (String.split). 25 | 3. Our sentences table is very wide, but most extractors only need 2 or 3 columns. This creates unnecessary I/O. 26 | 4. Our output is lossy, because we don't store the original text (only a tokenized version), and we have lost the mapping to the original characters. 27 | 5. It would be difficult to add coreference information to the sentences table, because it is not document-based. 28 | 29 | ## Schemas 30 | 31 | With Pipe, we create a set of *minimal* schemas for the different NLP annotations. There's one schema for each type of annotation, and we currently have 18 schemas in total. The schemas are in JSON, which makes it trivial to read from and write in any programming language. 32 | 33 | Examples: 34 | 35 | ann.id 36 | ``` 37 | "doc123" 38 | ``` 39 | 40 | ann.text 41 | ``` 42 | "This is a very simple text file.\nIt contains two sentences." 43 | ``` 44 | 45 | ann.poss 46 | ``` 47 | ["DT","VBZ","DT","RB","JJ","NN","NN",".","PRP","VBZ","CD","NNS","."] 48 | ``` 49 | 50 | ann.tokens 51 | ``` 52 | ["This","is","a","very","simple","text","file",".","It","contains","two","sentences","."] 53 | ``` 54 | ann.tokenOffsets 55 | ``` 56 | [[0,4],[5,7],[8,9],[10,14],[15,21],[22,26],[27,31],[31,32],[33,35],[36,44],[45,48],[49,58],[58,59]] 57 | ``` 58 | 59 | ann.sentenceOffsets 60 | ``` 61 | [[0,32],[33,59]] 62 | ``` 63 | 64 | ann.sentenceTokenOffsets 65 | ``` 66 | [[0,8],[8,13]] 67 | ``` 68 | 69 | ## Storage 70 | 71 | We propose to store these in column format, where there exists one file for each type of schema. 72 | Pipe contains readers and writers for column format in both [scala](src/main/scala/com/clearcut/pipe/io) and [python](../view/util/pipe.py). 73 | 74 | For compatibility reasons, Pipe also allows you to read and write as single JSON: 75 | ``` 76 | { 77 | "id": "doc123", 78 | "text": "This is a very simple text file.\nIt contains two sentences.", 79 | "poss": ["DT","VBZ","DT","RB","JJ","NN","NN",".","PRP","VBZ","CD","NNS","."], 80 | "tokens": ["This","is","a","very","simple","text","file",".","It","contains","two","sentences","."], 81 | ... 82 | } 83 | ``` 84 | And for backwards compatibility, Pipe also allows you to write in our psql-specific TSV. 85 | 86 | ## Framework 87 | 88 | The framework allows you to plug together different preprocessing components. Currently, Pipe contains wrappers for most components of Stanford CoreNLP, as well as a components that can run an entire Stanford pipeline. 89 | 90 | Since the components read and write our language-agnostic schemas, we can now plug together components in arbitrary programming languages including python, scala, julia. 91 | 92 | When working with Scala, you can choose to use static typing or not. If you use static typing, [our typedefs](src/main/scala/com/clearcut/pipe/model/package.scala) make code compact and clean: 93 | ``` 94 | type ID = String 95 | type Poss = Array[String] 96 | type Offsets = Array[Int] 97 | type SentenceDependencies = Array[Array[Dependency]] 98 | type SentenceOffsets = Array[Offsets] 99 | type SentenceTokenOffsets = Array[Offsets] 100 | type Text = String 101 | ... 102 | ``` 103 | An example is [here](src/test/scala/BasicSpec.scala). 104 | 105 | To build a custom tokenizer that solves the `$300. 
00` problem, you can write something like 106 | ``` 107 | import com.clearcut.pipe.annotator.Annotator 108 | import com.clearcut.pipe.model._ 109 | 110 | class MyTokenizer extends Annotator[Text,(TokenOffsets,Tokens)] { 111 | override def annotate(t:(Text)):(TokenOffsets, Tokens) = { 112 | // add custom logic here 113 | } 114 | } 115 | ``` 116 | 117 | ## Tip 118 | 119 | You can run Pipe in a regular scala REPL and manipulate your data or processing components interactively. 120 | 121 | You can also run our python readers and writers in a python REPL and create your own components there. 122 | 123 | ## Setup 124 | 125 | Run `setup.sh` to install dependencies and build the parser. Pipe requires Java 8. 126 | 127 | ## Usage 128 | 129 | Here are a few examples showing how to call Pipe with the provided launcher scripts. 130 | 131 | ``` 132 | ./run.sh -i INPUT.json --formatIn json --formatOut json -v content -k doc_id -a SimpleStanfordPipeline -o OUTPUT 133 | ``` 134 | Reads INPUT.json which contains json objects with fields "doc_id" and "content". Writes results as json objects to file OUTPUT. 135 | 136 | ``` 137 | ./run.sh -i INPUT.json --formatIn json --formatOut column -v content -k doc_id -a StanfordTokenizer,StanfordSentenceSplitter,StanfordPOSTagger,StanfordLemmatizer,StanfordNERTagger,StanfordSRParser -o test 138 | ``` 139 | Runs a custom set of annotators and stores results in column format. 140 | 141 | ``` 142 | /run_parallel.sh --input=INPUT.json --parallelism=10 '--formatIn json --formatOut column -v content -k doc_id -a ExtendedStanfordPipeline' 143 | ``` 144 | Splits the input file into segments and runs 10 parallel processes at a time. The ExtendedStanfordPipeline adds parse trees and true case annotations. 145 | 146 | -------------------------------------------------------------------------------- /pipe/build.sbt: -------------------------------------------------------------------------------- 1 | import com.typesafe.sbt.SbtStartScript 2 | 3 | organization := "com.clearcut" 4 | 5 | name := "pipe" 6 | 7 | version := "0.1-SNAPSHOT" 8 | 9 | scalaVersion := "2.11.7" 10 | 11 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 12 | 13 | resolvers += "Scalaz Bintray Repo" at "https://dl.bintray.com/scalaz/releases" 14 | 15 | libraryDependencies ++= List( 16 | "ch.qos.logback" % "logback-classic" % "1.0.7", 17 | "com.typesafe.play" %% "play-json" % "2.3.4", 18 | "com.github.scopt" %% "scopt" % "3.2.0", 19 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0", 20 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" classifier "models", 21 | "org.scalatest" % "scalatest_2.11" % "2.2.5" % "test", 22 | "org.http4s" %% "http4s-dsl" % "0.7.0", 23 | "org.http4s" %% "http4s-jetty" % "0.7.0", 24 | "org.json4s" %% "json4s-jackson" % "3.2.11", 25 | "org.jsoup" % "jsoup" % "1.8.3" 26 | ) 27 | 28 | unmanagedJars in Compile += file("lib/stanford-srparser-2014-10-23-models.jar") 29 | 30 | parallelExecution in Test := false 31 | 32 | test in assembly := {} 33 | 34 | seq(SbtStartScript.startScriptForClassesSettings: _*) 35 | 36 | -------------------------------------------------------------------------------- /pipe/config.properties.template: -------------------------------------------------------------------------------- 1 | tokenize.whitespace = true 2 | ssplit.eolonly = true 3 | -------------------------------------------------------------------------------- /pipe/example/input.json: -------------------------------------------------------------------------------- 1 | { 
"doc_id":"1", "content":"I was robbed by this girl. 
:(" } 2 | -------------------------------------------------------------------------------- /pipe/example/parse.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ../run.sh --formatIn json --formatOut json -v content -k doc_id -a ExtendedCleanHtmlStanfordPipeline -i input.json -o output.json 4 | -------------------------------------------------------------------------------- /pipe/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0") 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") 4 | -------------------------------------------------------------------------------- /pipe/run.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | export JAVA_OPTS="-Xmx4g -Dfile.encoding=UTF-8" 4 | 5 | $(dirname $0)/target/start $@ 6 | -------------------------------------------------------------------------------- /pipe/run_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Parses documents in parallel. 4 | # 5 | # Input is a single file that contains one JSON record per line. 6 | # Output is a single file that contains one JSON record per line. 7 | # 8 | # The number of records and their order is the same in input and output. 9 | # 10 | # Example: 11 | # ./run_parallel.sh --input=INPUT.json --output=OUTPUT.json \ 12 | # --params='-v content -k doc_id -a ExtendedCleanHtmlStanfordPipeline' 13 | # 14 | # The following environment variables are used when available. 15 | # PARALLELISM (default 2) 16 | # BATCH_SIZE (default 1000) 17 | 18 | set -eu 19 | 20 | for i in "$@" 21 | do 22 | case $i in 23 | -in=*|--input=*) 24 | INPUT_FILE="${i#*=}" 25 | shift 26 | ;; 27 | -out=*|--output=*) 28 | OUTPUT_FILE="${i#*=}" 29 | shift 30 | ;; 31 | -pa=*|--params=*) 32 | PARAMS="${i#*=}" 33 | shift 34 | ;; 35 | -p=*|--parallelism=*) 36 | PARALLELISM="${i#*=}" 37 | shift 38 | ;; 39 | -b=*|--batch-size=*) 40 | BATCH_SIZE="${i#*=}" 41 | shift 42 | ;; 43 | --keepsplit) 44 | KEEP_SPLIT=true 45 | shift 46 | ;; 47 | --compress) 48 | COMPRESS_OUTPUT=true 49 | shift 50 | ;; 51 | *) 52 | echo "Ignoring parameter: $i" 53 | break 54 | ;; 55 | esac 56 | done 57 | 58 | if [ -z "$INPUT_FILE" ]; then 59 | echo "Usage: $0 -in=INPUT.json [-out=OUTPUT.json] [--parallelism=PARALLELISM] \\" 60 | echo " [--batch-size=BATCH_SIZE ] --params=''" 61 | exit 62 | fi 63 | 64 | # Setting defaults 65 | PARALLELISM=${PARALLELISM:-2} 66 | BATCH_SIZE=${BATCH_SIZE:-1000} 67 | PARAMS=${PARAMS:-} 68 | KEEP_SPLIT=${KEEP_SPLIT:-false} 69 | COMPRESS_OUTPUT=${COMPRESS_OUTPUT:-false} 70 | if [ "$COMPRESS_OUTPUT" = false ]; then 71 | OUTPUT_FILE=${OUTPUT_FILE:-$INPUT_FILE.out} 72 | else 73 | OUTPUT_FILE=${OUTPUT_FILE:-$INPUT_FILE.out.gz} 74 | fi 75 | 76 | echo "parallelism = $PARALLELISM" 77 | echo "batch-size = $BATCH_SIZE" 78 | echo "compress = $COMPRESS_OUTPUT" 79 | 80 | # Fixed a bug when "config.properties" does not exists 81 | touch config.properties 82 | 83 | RUN_SCRIPT=`cd $(dirname $0)/; pwd`"/run.sh --formatIn json --formatOut json $PARAMS" 84 | echo $RUN_SCRIPT 85 | 86 | SPLIT_DIR=$INPUT_FILE.split 87 | mkdir -p $SPLIT_DIR 88 | rm -rf $SPLIT_DIR/* 89 | 90 | # Split the input file into subfiles 91 | split -a 10 -l $BATCH_SIZE $INPUT_FILE $SPLIT_DIR/input- 92 | 93 | # Match all files in the split directory 94 | find 
$INPUT_FILE.split -name "input-*" 2>/dev/null -print0 | xargs -0 -P $PARALLELISM -L 1 bash -c "${RUN_SCRIPT}"' -i "$0" -o "$0.out"' 95 | 96 | function merge_json_format { 97 | SPLIT_DIR=$1 98 | OUTPUT_FILE=$2 99 | # merging json files 100 | for file in $SPLIT_DIR/*.out 101 | do 102 | if [ "$COMPRESS_OUTPUT" = false ]; then 103 | cat $file >> $OUTPUT_FILE 104 | else 105 | cat $file | gzip >> $OUTPUT_FILE 106 | fi 107 | done 108 | } 109 | 110 | 111 | function merge_column_format { 112 | SPLIT_DIR=$1 113 | OUTPUT_FILE=$2 114 | # merging column format segments 115 | 116 | OUTDIR=$INPUT_FILE.out 117 | if [ -d "$OUTDIR" ]; then 118 | echo "$OUTDIR already exists. Aborting." 119 | exit 1 120 | fi 121 | mkdir $OUTDIR 122 | 123 | # first we determine the different annotators by looking at only one segment 124 | annotations=() 125 | for file in $SPLIT_DIR/* 126 | do 127 | if [[ -d $file ]]; then 128 | for ann in $file/* 129 | do 130 | annotations+=("${ann##*.}") 131 | done 132 | break 133 | fi 134 | done 135 | 136 | # now cat them all together 137 | for file in $SPLIT_DIR/* 138 | do 139 | if [[ -d $file ]]; then 140 | for ann in "${annotations[@]}" 141 | do 142 | cat $file/ann.$ann >> $OUTDIR/ann.$ann 143 | done 144 | fi 145 | done 146 | } 147 | 148 | merge_json_format $SPLIT_DIR $OUTPUT_FILE 149 | 150 | # remove split dir 151 | if [ "$KEEP_SPLIT" = false ]; then 152 | rm -rf $SPLIT_DIR 153 | fi 154 | 155 | echo "The output is in $OUTPUT_FILE" 156 | -------------------------------------------------------------------------------- /pipe/run_test.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | ./run.sh --formatIn tsv --tsvValue 2 --tsvKey 0 -i test/input.tsv -o test/out 4 | 5 | -------------------------------------------------------------------------------- /pipe/sbt/sbt: -------------------------------------------------------------------------------- 1 | java $SBT_OPTS -jar `dirname $0`/sbt-launch.jar "$@" -------------------------------------------------------------------------------- /pipe/sbt/sbt-launch.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/bazaar/c09dce20f16a90c359f804f9e83d6107547d442c/pipe/sbt/sbt-launch.jar -------------------------------------------------------------------------------- /pipe/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIRNAME=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 4 | 5 | # fetch SR models 6 | DESTDIR=$DIRNAME/lib 7 | FILENAME='stanford-srparser-2014-10-23-models.jar' 8 | if [ ! 
-e "$DESTDIR/$FILENAME" ]; then 9 | mkdir -p $DESTDIR 10 | wget -P $DESTDIR http://nlp.stanford.edu/software/stanford-srparser-2014-10-23-models.jar 11 | else 12 | echo "Skipping download: $DESTDIR/$FILENAME already exists" 13 | fi 14 | 15 | # On Ubuntu, install java 8 16 | #sudo add-apt-repository -y ppa:openjdk-r/ppa 17 | #sudo apt-get update 18 | #sudo apt-get install -y openjdk-8-jdk 19 | 20 | # build parser 21 | cd $DIRNAME 22 | sbt/sbt stage 23 | 24 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/Main.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe 2 | 3 | import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter} 4 | 5 | import com.clearcut.pipe.annotator.Annotator 6 | import com.clearcut.pipe.io._ 7 | 8 | object Main extends App { 9 | 10 | // Parse command line options 11 | case class Config(serverPort: Integer = null, 12 | in: String = null, 13 | out: String = null, 14 | formatIn: String = "column", 15 | formatOut: String = "column", 16 | documentKey: String = "text", 17 | idKey: String = "id", 18 | documentCol: Int = 1, 19 | idCol: Int = 0, 20 | annotators: String = "SimpleStanfordPipeline") 21 | 22 | val optionsParser = new scopt.OptionParser[Config]("Pipe") { 23 | head("Run CoreNLP annotators and read/write column/json/tsv formats", "0.1") 24 | head("Input: column dir, json file, or tsv file") 25 | head("Output: column files, json file, or tsv file") 26 | opt[String]("formatIn") action { (x, c) => 27 | c.copy(formatIn = x) 28 | } text("column, json or tsv") 29 | opt[String]("formatOut") action { (x, c) => 30 | c.copy(formatOut = x) 31 | } text("column, json or tsv") 32 | opt[String]('v', "jsonValue") action { (x, c) => 33 | c.copy(documentKey = x) 34 | } text("JSON key that contains the document content, for example \"documents.text\"") 35 | opt[String]('k', "jsonKey") action { (x, c) => 36 | c.copy(idKey = x) 37 | } text("JSON key that contains the document id, for example \"documents.id\"") 38 | opt[Int]("tsvValue") action { (x, c) => 39 | c.copy(documentCol = x) 40 | } text("Column number that contains the document content, for example 1") 41 | opt[Int]("tsvKey") action { (x, c) => 42 | c.copy(idCol = x) 43 | } text("Column number that contains the document id, for example 0") 44 | opt[String]('i', "input") action { (x, c) => 45 | c.copy(in = x) 46 | } text("Input dir (column) or file (json, tsv)") 47 | opt[String]('o', "output") action { (x, c) => 48 | c.copy(out = x) 49 | } text("Output dir (column) or file (json, tsv)") 50 | opt[String]('a', "annotators") action { (x, c) => 51 | c.copy(annotators = x) 52 | } text("Comma-separated list of annotators. Default: SimpleStanfordPipeline") 53 | opt[Int]('p', "serverPort") action { (x, c) => 54 | c.copy(serverPort = x) 55 | } text("Run as an HTTP service") 56 | } 57 | 58 | val conf = optionsParser.parse(args, Config()) getOrElse { 59 | throw new IllegalArgumentException 60 | } 61 | 62 | if (conf.serverPort != null) { 63 | Console.println("Listening on port " + conf.serverPort + "...") 64 | new Server(conf.serverPort).run() 65 | System.exit(0) 66 | } 67 | 68 | val errors = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(conf.out + ".errors"))) 69 | 70 | val annotators:Array[Annotator[_,_]] = conf.annotators.split(",").map (s => 71 | Class.forName("com.clearcut.pipe.annotator." 
+ s.trim).newInstance().asInstanceOf[Annotator[_,_]]) 72 | 73 | // load configuration properties from properties file 74 | if (new java.io.File("config.properties").exists) { 75 | println("config.properties exists") 76 | val prop = new java.util.Properties() 77 | val fromFile = new java.io.FileReader("config.properties") 78 | prop.load(fromFile) 79 | fromFile.close 80 | for (ann <- annotators) 81 | ann.setProperties(prop) 82 | } 83 | 84 | val reader:Reader = conf.formatIn match { 85 | case "column" => new ColumnReader(conf.in) 86 | case "json" => new JsonReader(conf.in, conf.idKey, conf.documentKey) 87 | case "tsv" => new TsvReader(conf.in, conf.idCol, conf.documentCol) 88 | } 89 | 90 | val writer:Writer = conf.formatOut match { 91 | case "column" => new ColumnWriter(conf.out) 92 | case "json" => new JsonWriter(conf.out) 93 | case "tsv" => new TsvWriter(conf.out) 94 | } 95 | 96 | run(annotators, reader, writer, errors) 97 | 98 | writer.close 99 | reader.close 100 | errors.close 101 | 102 | def run(annotators:Array[Annotator[_,_]], reader:Reader, writer:Writer, errors:BufferedWriter) = { 103 | val schema = Schema.extendSchema(reader.getSchema, annotators) 104 | val indices = annotators.map(a => Schema.defaultAnnotationIndices(schema, a.requires)) 105 | writer.setSchema(schema) 106 | 107 | for (t <- reader) { 108 | var all = t 109 | for ((a, i) <- annotators.zip(indices)) { 110 | val input = i.map(index => all(index)) 111 | all = all ++ a.annotateUnsafe(input:_*) 112 | } 113 | writer.write(all) 114 | } 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/Schema.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe 2 | 3 | import com.clearcut.pipe.annotator.Annotator 4 | import scala.collection.mutable.Map 5 | 6 | case class Schema 7 | ( 8 | annTyps: Array[String] = Array(), 9 | defaults: Map[String, Int] = Map(), 10 | provenance: Array[String] = Array() 11 | ) 12 | 13 | 14 | object Schema { 15 | 16 | def defaultAnnotations(schema: Schema, needed: Seq[String], all: Seq[String]): Seq[AnyRef] = { 17 | defaultAnnotationIndices(schema, needed).map(all(_)) 18 | } 19 | 20 | def defaultAnnotationIndices(schema: Schema, needed: Seq[String]): Seq[Int] = { 21 | needed.map(schema.defaults(_)) 22 | } 23 | 24 | def extendSchema(before: Schema, annotators: Array[Annotator[_,_]]): Schema = { 25 | val annTyps = Array.concat(before.annTyps, annotators.flatMap(_.generates)) 26 | val defaults = Map[String, Int]() 27 | defaults ++= before.defaults 28 | annTyps.zipWithIndex.foreach { case (c, i) => if (!defaults.contains(c)) defaults += (c -> i) } 29 | val provenance = Array.concat(before.provenance, annotators.flatMap(_.generates)) 30 | new Schema(annTyps, defaults, provenance) 31 | } 32 | 33 | def createSchema(annTyps: String*): Schema = { 34 | val defaults = Map[String, Int]() 35 | annTyps.zipWithIndex.foreach { case (c, i) => if (!defaults.contains(c)) defaults += (c -> i) } 36 | val provenance = annTyps.toArray.map(_ => "provided") 37 | new Schema(annTyps.toArray, defaults, provenance) 38 | } 39 | 40 | def prettyPrint(s:Schema) = { 41 | s.annTyps.map(println(_)) 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/Server.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe 2 | 3 | import java.io._ 4 | 5 
| import com.clearcut.pipe.annotator._ 6 | import com.clearcut.pipe.io.{TsvWriter, TsvReader, ColumnWriter, ColumnReader} 7 | import org.http4s._ 8 | import org.http4s.dsl._ 9 | import org.http4s.server.HttpService 10 | import org.http4s.server.jetty.JettyBuilder 11 | import scala.collection.mutable.ListBuffer 12 | import scala.io.Source 13 | 14 | class Server(port: Integer) { 15 | 16 | val route = HttpService { 17 | case req @ GET -> Root => 18 | Ok("Hello. I can parse stuff. Just POST the text to me.\n") 19 | 20 | case req @ POST -> Root => 21 | // WARNING: when request body is empty, http4s seems to hang here 22 | val content = new String(req.body.runLog.run.reduce(_ ++ _).toArray, Charset.`UTF-8`.nioCharset) 23 | 24 | val lines = ListBuffer[String]() 25 | 26 | val annotators:Array[Annotator[_,_]] = Array(new SimpleStanfordPipeline) 27 | 28 | val reader = new TsvReader(inSource = Source.fromString("id\t" + content.replace("\t", " ").replace("\n", " ") + "\n")) 29 | val baos = new ByteArrayOutputStream 30 | val writer = new TsvWriter(outWriter = new BufferedWriter(new OutputStreamWriter(baos, "utf-8"))) 31 | val errors = new BufferedWriter(new PrintWriter(new OutputStreamWriter(System.err, "utf-8"))) 32 | 33 | Main.run(annotators, reader, writer, errors) 34 | 35 | reader.close 36 | writer.close 37 | 38 | Ok(baos.toString("utf-8")) 39 | } 40 | 41 | def run() = { 42 | JettyBuilder 43 | .mountService(route, "") 44 | .bindHttp(port) 45 | .run 46 | .awaitShutdown() 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/Annotator.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import java.util.Properties 4 | import scala.reflect.runtime.universe._ 5 | import com.clearcut.pipe.model._ 6 | 7 | abstract class Annotator[In,Out](implicit inTag:TypeTag[In], outTag:TypeTag[Out]) 8 | extends java.io.Serializable { 9 | 10 | var properties = new Properties() 11 | 12 | def setProperties(p:java.util.Properties) = { 13 | properties = p 14 | } 15 | 16 | def annotate(in:In):Out 17 | 18 | def init = {} 19 | 20 | def close = {} 21 | 22 | def requires = inTypes 23 | 24 | def generates = outTypes 25 | 26 | val inTypes:Seq[String] = toTypes(inTag) 27 | val outTypes:Seq[String] = toTypes(outTag) 28 | 29 | private def toTypes[A](tag:TypeTag[A]):Seq[String] = { 30 | if (tag.tpe <:< typeOf[Product]) 31 | tag.tpe.typeArgs.map(t => { 32 | val s = t.toString 33 | lowerFirst(s.substring(s.lastIndexOf(".") + 1)) 34 | }) else { 35 | val s = tag.tpe.toString 36 | Array(lowerFirst(s.substring(s.lastIndexOf(".") + 1))) 37 | } 38 | } 39 | 40 | val inClazz = inTag.mirror.runtimeClass(inTag.tpe.typeSymbol.asClass) 41 | 42 | def annotateUnsafe(in:AnyRef*):Seq[AnyRef] = { 43 | var outTuple:Out = if (inTypes.size == 1) { 44 | val inTuple = in(0).asInstanceOf[In] 45 | annotate(inTuple) 46 | } else { 47 | val inTuple = inClazz.getConstructors.apply(0).newInstance(in:_*).asInstanceOf[In] 48 | annotate(inTuple) 49 | } 50 | var outSeq = if (outTypes.size == 1) 51 | Seq(outTuple.asInstanceOf[AnyRef]) 52 | else 53 | outTuple.asInstanceOf[Product].productIterator.toSeq.asInstanceOf[Seq[AnyRef]] 54 | outSeq 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/ExtendedCleanHtmlStanfordPipeline.scala: 
-------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import scala.collection.JavaConversions._ 4 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation 5 | import edu.stanford.nlp.pipeline.{StanfordCoreNLP, Annotation} 6 | import java.util.Properties 7 | import com.clearcut.pipe.model._ 8 | import java.util.regex._ 9 | import org.jsoup.Jsoup 10 | import org.jsoup.safety._ 11 | 12 | class ExtendedCleanHtmlStanfordPipeline extends Annotator[(Text), (Html, SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, 13 | SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets)] { 14 | 15 | override def setProperties(p:Properties) { 16 | super.setProperties(p) 17 | properties.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, truecase") 18 | properties.put("clean.xmltags", ".*") 19 | properties.put("parse.maxlen", "100") 20 | properties.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz") 21 | properties.put("truecase.model", "edu/stanford/nlp/models/truecase/truecasing.fast.qn.ser.gz") 22 | properties.put("threads", "1") // Should use extractor-level parallelism 23 | properties.put("clean.allowflawedxml", "true") 24 | properties.put("clean.sentenceendingtags", "p|br|div|li|ul|ol|h1|h2|h3|h4|h5|blockquote|section|article") 25 | } 26 | 27 | @transient lazy val pipeline = new StanfordCoreNLP(properties) 28 | 29 | val stripHtml = Pattern.compile("<\\/?a|A[^>]*>") 30 | 31 | override def annotate(t:Text):(Html, SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets) = { 32 | 33 | // clean up Html 34 | var text = extractCleanHtml(t) 35 | 36 | // Temporary fix for bug where brackets are being incorrectly treated as punct 37 | // and somehow this messes up the whole dep parse -> change them to round braces 38 | text = text.replaceAll( """\[""", "(").replaceAll( """\]""", ")") 39 | 40 | var stanAnn = new Annotation(text) 41 | try { 42 | pipeline.annotate(stanAnn) 43 | 44 | } catch { 45 | // If our pipeline still fails on this input, we return an empty tuple. 
46 | case e:Exception => 47 | System.err.println(text) 48 | e.printStackTrace(System.err) 49 | System.err.flush() 50 | return (text, Array[Offsets](), Array[Offsets](), Array[String](), Array[String](), Array[String](), Array[String](), Array[Array[Dependency]](), Array[String](), Array[String](), Array[Offsets]()) 51 | } 52 | 53 | val (toa, to) = StanfordTokenizer.fromStanford(stanAnn) 54 | val poss = StanfordPOSTagger.fromStanford(stanAnn) 55 | val nertags = StanfordNERTagger.fromStanford(stanAnn) 56 | val lemmas = StanfordLemmatizer.fromStanford(stanAnn) 57 | val deps = StanfordDependencyExtractor.fromStanford(stanAnn) 58 | val (so, sto) = StanfordSentenceSplitter.fromStanford(stanAnn) 59 | val pa = StanfordSRParser.fromStanford(stanAnn) 60 | val tcs = StanfordTrueCaseAnnotator.fromStanford(stanAnn) 61 | 62 | (text, so, toa, to, poss, nertags, lemmas, deps, pa, tcs, sto) 63 | } 64 | 65 | def extractCleanHtml(html:String):String = { 66 | val doc = Jsoup.parseBodyFragment(html).body() 67 | doc.html() 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/ExtendedHtmlStanfordPipeline.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import scala.collection.JavaConversions._ 4 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation 5 | import edu.stanford.nlp.pipeline.{StanfordCoreNLP, Annotation} 6 | import java.util.Properties 7 | import com.clearcut.pipe.model._ 8 | import java.util.regex._ 9 | import org.jsoup.Jsoup 10 | import org.jsoup.safety._ 11 | 12 | class ExtendedHtmlStanfordPipeline extends Annotator[(Text), (Html, SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, 13 | SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets)] { 14 | 15 | override def setProperties(p:Properties) { 16 | super.setProperties(p) 17 | properties.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, truecase") 18 | properties.put("clean.xmltags", ".*") 19 | properties.put("parse.maxlen", "100") 20 | properties.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz") 21 | properties.put("truecase.model", "edu/stanford/nlp/models/truecase/truecasing.fast.qn.ser.gz") 22 | properties.put("threads", "1") // Should use extractor-level parallelism 23 | properties.put("clean.allowflawedxml", "true") 24 | properties.put("clean.sentenceendingtags", "p|br|div|li|ul|ol|h1|h2|h3|h4|h5|blockquote|section|article") 25 | } 26 | 27 | @transient lazy val pipeline = new StanfordCoreNLP(properties) 28 | 29 | val stripHtml = Pattern.compile("<\\/?a|A[^>]*>") 30 | 31 | override def annotate(t:Text):(Html, SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets) = { 32 | 33 | // clean up Html 34 | //var text = extractCleanHtml(t) 35 | var text = t 36 | 37 | // Temporary fix for bug where brackets are being incorrectly treated as punct 38 | // and somehow this messes up the whole dep parse -> change them to round braces 39 | //text = text.replaceAll( """\[""", "(").replaceAll( """\]""", ")") 40 | 41 | var stanAnn = new Annotation(text) 42 | try { 43 | pipeline.annotate(stanAnn) 44 | 45 | } catch { 46 | // If our pipeline still fails on this input, we return an empty tuple. 
47 | case e:Exception => 48 | System.err.println(text) 49 | e.printStackTrace(System.err) 50 | System.err.flush() 51 | return (text, Array[Offsets](), Array[Offsets](), Array[String](), Array[String](), Array[String](), Array[String](), Array[Array[Dependency]](), Array[String](), Array[String](), Array[Offsets]()) 52 | } 53 | 54 | val (toa, to) = StanfordTokenizer.fromStanford(stanAnn) 55 | val poss = StanfordPOSTagger.fromStanford(stanAnn) 56 | val nertags = StanfordNERTagger.fromStanford(stanAnn) 57 | val lemmas = StanfordLemmatizer.fromStanford(stanAnn) 58 | val deps = StanfordDependencyExtractor.fromStanford(stanAnn) 59 | val (so, sto) = StanfordSentenceSplitter.fromStanford(stanAnn) 60 | val pa = StanfordSRParser.fromStanford(stanAnn) 61 | val tcs = StanfordTrueCaseAnnotator.fromStanford(stanAnn) 62 | 63 | (text, so, toa, to, poss, nertags, lemmas, deps, pa, tcs, sto) 64 | } 65 | 66 | def extractCleanHtml(html:String):String = { 67 | val doc = Jsoup.parseBodyFragment(html).body() 68 | doc.html() 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/ExtendedStanfordPipeline.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import scala.collection.JavaConversions._ 4 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation 5 | import edu.stanford.nlp.pipeline.{StanfordCoreNLP, Annotation} 6 | import java.util.Properties 7 | import com.clearcut.pipe.model._ 8 | 9 | class ExtendedStanfordPipeline extends Annotator[(Text), (SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, 10 | SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets)] { 11 | 12 | override def setProperties(p:Properties) { 13 | super.setProperties(p) 14 | properties.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, truecase") 15 | properties.put("parse.maxlen", "100") 16 | properties.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz") 17 | properties.put("threads", "1") // Should use extractor-level parallelism 18 | properties.put("clean.allowflawedxml", "true") 19 | properties.put("clean.sentenceendingtags", "p|br|div|li|ul|ol|h1|h2|h3|h4|h5|blockquote|section|article") 20 | } 21 | 22 | @transient lazy val pipeline = new StanfordCoreNLP(properties) 23 | 24 | override def annotate(t:Text):(SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets) = { 25 | // Temporary fix for bug where brackets are being incorrectly treated as punct 26 | // and somehow this messes up the whole dep parse -> change them to round braces 27 | // val text = t.replaceAll( """\[""", "(").replaceAll( """\]""", ")") 28 | val text = t 29 | 30 | val stanAnn = new Annotation(text) 31 | pipeline.annotate(stanAnn) 32 | 33 | val (toa, to) = StanfordTokenizer.fromStanford(stanAnn) 34 | val poss = StanfordPOSTagger.fromStanford(stanAnn) 35 | val nertags = StanfordNERTagger.fromStanford(stanAnn) 36 | val lemmas = StanfordLemmatizer.fromStanford(stanAnn) 37 | val deps = StanfordDependencyExtractor.fromStanford(stanAnn) 38 | val (so, sto) = StanfordSentenceSplitter.fromStanford(stanAnn) 39 | val pa = StanfordSRParser.fromStanford(stanAnn) 40 | val tcs = StanfordTrueCaseAnnotator.fromStanford(stanAnn) 41 | 42 | (so, toa, to, poss, nertags, lemmas, deps, pa, tcs, sto) 43 | } 44 | } 45 | 46 | 
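The three Extended*StanfordPipeline classes above, and the SimpleStanfordPipeline that follows, are ordinary Annotator[In, Out] subclasses: the element types of their In and Out tuples double as annotation-type names, and Schema.extendSchema plus Main.run use those names to wire each annotator's inputs to the outputs of earlier annotators. Below is a minimal, self-contained sketch of that requires/generates bookkeeping; MiniAnnotator, Tokenize, Shout and ChainingSketch are illustrative names that do not exist in the pipe code, no CoreNLP classes are involved, and the real implementation derives the names reflectively from the tuple types rather than declaring them by hand.

    // Sketch of the schema-driven chaining used by Main.run (simplified, hypothetical types).
    object ChainingSketch extends App {

      trait MiniAnnotator {
        def requires: Seq[String]   // annotation types read from the row
        def generates: Seq[String]  // annotation types appended to the row
        def annotate(in: Seq[AnyRef]): Seq[AnyRef]
      }

      // Stands in for StanfordTokenizer: turns "text" into "tokens".
      object Tokenize extends MiniAnnotator {
        val requires = Seq("text")
        val generates = Seq("tokens")
        def annotate(in: Seq[AnyRef]) =
          Seq(in(0).asInstanceOf[String].split("\\s+").toVector)
      }

      // Stands in for any downstream annotator that consumes "tokens".
      object Shout extends MiniAnnotator {
        val requires = Seq("tokens")
        val generates = Seq("shouted")
        def annotate(in: Seq[AnyRef]) =
          Seq(in(0).asInstanceOf[Vector[String]].map(_.toUpperCase))
      }

      // Grows a name -> column-index map as annotators are applied in order,
      // mirroring Schema.extendSchema plus the loop in Main.run.
      def run(annotators: Seq[MiniAnnotator], row: Seq[AnyRef], schema: Seq[String]): Seq[AnyRef] = {
        var defaults = schema.zipWithIndex.toMap
        var all = row
        for (a <- annotators) {
          val inputs = a.requires.map(n => all(defaults(n)))
          val outputs = a.annotate(inputs)
          a.generates.zipWithIndex.foreach { case (n, i) =>
            if (!defaults.contains(n)) defaults += (n -> (all.size + i))
          }
          all = all ++ outputs
        }
        all
      }

      println(run(Seq(Tokenize, Shout), Seq("1", "hello parser world"), Seq("id", "text")))
      // List(1, hello parser world, Vector(hello, parser, world), Vector(HELLO, PARSER, WORLD))
    }

Keeping the schema as a plain name-to-index map is what allows annotator lists, readers and writers to be mixed freely through the -a, --formatIn and --formatOut options handled in Main.scala.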
-------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/SimpleStanfordPipeline.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import edu.stanford.nlp.pipeline.{StanfordCoreNLP, Annotation} 4 | import java.util.Properties 5 | import com.clearcut.pipe.model._ 6 | 7 | class SimpleStanfordPipeline extends Annotator[(Text), (SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, 8 | SentenceDependencies)] { 9 | 10 | //val props = new Properties() 11 | override def setProperties(p:Properties) { 12 | super.setProperties(p) 13 | properties.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse") 14 | properties.put("parse.maxlen", "100") 15 | properties.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz") 16 | properties.put("threads", "1") // Should use extractor-level parallelism 17 | properties.put("clean.allowflawedxml", "true") 18 | properties.put("clean.sentenceendingtags", "p|br|div|li|ul|ol|h1|h2|h3|h4|h5|blockquote|section|article") 19 | } 20 | 21 | @transient lazy val pipeline = new StanfordCoreNLP(properties) 22 | 23 | override def annotate(t:Text):(SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, SentenceDependencies) = { 24 | // Temporary fix for bug where brackets are being incorrectly treated as punct 25 | // and somehow this messes up the whole dep parse -> change them to round braces 26 | val text = t.replaceAll( """\[""", "(").replaceAll( """\]""", ")") 27 | 28 | val stanAnn = new Annotation(text) 29 | pipeline.annotate(stanAnn) 30 | 31 | val (toa, to) = StanfordTokenizer.fromStanford(stanAnn) 32 | val poss = StanfordPOSTagger.fromStanford(stanAnn) 33 | val nertags = StanfordNERTagger.fromStanford(stanAnn) 34 | val lemmas = StanfordLemmatizer.fromStanford(stanAnn) 35 | val deps = StanfordDependencyExtractor.fromStanford(stanAnn) 36 | val (so, sto) = StanfordSentenceSplitter.fromStanford(stanAnn) 37 | 38 | (so, toa, to, poss, nertags, lemmas, deps) 39 | } 40 | } 41 | 42 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordCoreferenceResolver.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import java.util 4 | import java.util.{Properties, Set} 5 | 6 | import com.clearcut.pipe.model._ 7 | import edu.stanford.nlp.dcoref.CorefChain.{CorefMention => StCorefMention} 8 | import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation 9 | import edu.stanford.nlp.dcoref.{CorefChain => StCorefChain, Dictionaries} 10 | import edu.stanford.nlp.ling.CoreAnnotations.{SentencesAnnotation, TokenBeginAnnotation} 11 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 12 | import edu.stanford.nlp.util.{CoreMap, IntPair, IntTuple} 13 | 14 | import scala.collection.JavaConversions.{asScalaBuffer, collectionAsScalaIterable} 15 | import scala.collection.mutable.ArrayBuffer 16 | 17 | class StanfordCoreferenceResolver extends Annotator[(Text,TokenOffsets,Tokens,SentenceOffsets,SentenceTokenOffsets, 18 | Poss,NerTags,Parses,SentenceDependencies),(Mentions,Coreferences)] { 19 | 20 | // make sure StanfordCoreNLP has parse annotator, which is needed by dcoref 21 | @transient lazy val stanfordAnnotator = 22 | AnnotatorFactories.coref(properties, 
StanfordUtil.annotatorImplementations).create() 23 | 24 | override def annotate(in:(Text,TokenOffsets,Tokens,SentenceOffsets,SentenceTokenOffsets,Poss,NerTags,Parses, 25 | SentenceDependencies)):(Mentions, Coreferences) = { 26 | val (t, toa, to, soa, stoa, posa, nerta, pa, sda) = in 27 | val stanAnn = new StAnnotation(t) 28 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 29 | StanfordSentenceSplitter.toStanford(soa, stoa, stanAnn) 30 | StanfordPOSTagger.toStanford(posa, stanAnn) 31 | StanfordNERTagger.toStanford(nerta, stanAnn) 32 | StanfordSRParser.toStanford(pa, stanAnn) 33 | StanfordDependencyExtractor.toStanford("DepCollapsed", sda, stanAnn) 34 | 35 | stanfordAnnotator.annotate(stanAnn) 36 | 37 | StanfordCoreferenceResolver.fromStanford(stanAnn) 38 | } 39 | } 40 | 41 | object StanfordCoreferenceResolver { 42 | 43 | def toStanford(fromT:Text, fromO:TokenOffsets, fromS:SentenceTokenOffsets, 44 | fromM:Mentions, fromC:Coreferences, to:StAnnotation):Unit = { 45 | val cm = new java.util.HashMap[Integer, StCorefChain]() 46 | val mentions = fromM 47 | for (c <- fromC) { 48 | 49 | val mentionMap = new java.util.HashMap[IntPair, Set[StCorefMention]]() 50 | var representative:StCorefMention = null 51 | 52 | for (mentionNum <- c.mentionNums) { 53 | val m = mentions(mentionNum) 54 | 55 | // determine sentNum and sentHead 56 | var sentNum = 0 57 | var sentHead = -1 58 | while (sentHead == -1 && sentNum < fromS.size) { 59 | if (fromS(sentNum)(FROM) <= m.head && m.head < fromS(sentNum)(TO)) { 60 | sentHead = m.head - fromS(sentNum)(FROM) 61 | } else 62 | sentNum += 1 63 | } 64 | val mentionSpan = fromT.substring(fromO(m.tokenOffsets(FROM))(FROM), fromO(m.tokenOffsets(TO) - 1)(TO)) 65 | sentNum += 1 66 | 67 | val com = new StCorefMention( 68 | Dictionaries.MentionType.valueOf(Mention.typeFromByte(m.mentionTyp)), 69 | Dictionaries.Number.valueOf(Mention.numberFromByte(m.number)), 70 | Dictionaries.Gender.valueOf(Mention.genderFromByte(m.gender)), 71 | Dictionaries.Animacy.valueOf(Mention.animacyFromByte(m.animacy)), 72 | m.tokenOffsets(FROM) - fromS(sentNum)(FROM) +1, 73 | m.tokenOffsets(FROM) - fromS(sentNum)(FROM) +1, // -1?? 
74 | sentHead, 75 | c.chainNum, 76 | mentionNum, 77 | sentNum, 78 | // the arguments here are probably sentNum and headIndex, TODO: verify 79 | new IntTuple(Array[Int](sentNum, sentHead)), 80 | //new IntTuple(Array[Int](m.positionFrom, m.positionTo)), 81 | mentionSpan 82 | ) 83 | val pos = new IntPair(sentNum, sentHead) 84 | if (!mentionMap.containsKey(pos)) 85 | mentionMap.put(pos, new java.util.HashSet[StCorefMention]()) 86 | mentionMap.get(pos).add(com) 87 | 88 | if (c.representativeMentionNum == mentionNum) 89 | representative = com 90 | } 91 | 92 | val cc = new StCorefChain(c.chainNum, mentionMap, representative) 93 | cm.put(c.chainNum, cc) 94 | } 95 | to.set(classOf[CorefChainAnnotation], cm) 96 | } 97 | 98 | def fromStanford(from:StAnnotation):(Mentions,Coreferences) = { 99 | val ms = new ArrayBuffer[Mention]() 100 | val cl = new ArrayBuffer[CoreferenceChain]() 101 | try { 102 | val cca:java.util.Map[Integer,StCorefChain] = from.get(classOf[CorefChainAnnotation]) 103 | 104 | val sents: util.List[CoreMap] = from.get(classOf[SentencesAnnotation]) 105 | 106 | var chainNum = 0 107 | var mentionNum = 0 108 | for (cc <- cca.values) { 109 | val l = cc.getMentionsInTextualOrder 110 | //val lp = new ArrayBuffer[CMention](l.size) 111 | 112 | var representativeMentionNum = -1 113 | val chainMentions = new ArrayBuffer[Int]() 114 | for (m <- l) { 115 | 116 | // val cpm = CMention( 117 | // m.mentionType.name, 118 | // m.number.name, 119 | // m.gender.name, 120 | // m.animacy.name, 121 | // m.startIndex, 122 | // m.endIndex, 123 | // m.headIndex, 124 | // m.corefClusterID, 125 | // m.mentionID, 126 | // m.sentNum, 127 | // m.position.get(0), 128 | // m.position.get(1), 129 | // m.mentionSpan) 130 | // 131 | // lp += cpm 132 | val sentTokenBegin: Integer = sents(m.sentNum-1).get(classOf[TokenBeginAnnotation]) 133 | 134 | ms += Mention(mentionNum, 135 | sentTokenBegin + m.headIndex-1, 136 | Array(sentTokenBegin + m.startIndex-1, sentTokenBegin + m.endIndex-1), 137 | Mention.typeToByte(m.mentionType.name), 138 | Mention.numberToByte(m.number.name), 139 | Mention.genderToByte(m.gender.name), 140 | Mention.animacyToByte(m.animacy.name)) 141 | 142 | chainMentions += mentionNum 143 | 144 | if (cc.getRepresentativeMention == m) 145 | representativeMentionNum = mentionNum 146 | 147 | mentionNum += 1 148 | } 149 | 150 | cl += CoreferenceChain(chainNum, representativeMentionNum, chainMentions.toArray) 151 | 152 | chainNum += 1 153 | // val cpc = CorefChain(cc.getChainID, 154 | // cc.getRepresentativeMention().mentionID, 155 | // lp.toArray) 156 | // cl += cpc 157 | } 158 | } catch { 159 | case e:Exception => 160 | e.printStackTrace() 161 | println("error in fromStanf") 162 | } 163 | (ms.toArray, cl.toArray) 164 | } 165 | } 166 | 167 | 168 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordDependencyExtractor.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import java.util.List 4 | 5 | import com.clearcut.pipe.model.Dependency 6 | import com.clearcut.pipe.model.SentenceDependencies 7 | import edu.stanford.nlp.ling.CoreAnnotations._ 8 | import edu.stanford.nlp.ling.{IndexedWord, CoreLabel} 9 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation} 10 | import edu.stanford.nlp.semgraph.SemanticGraph 11 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.{BasicDependenciesAnnotation, 12 | 
CollapsedCCProcessedDependenciesAnnotation, CollapsedDependenciesAnnotation} 13 | import edu.stanford.nlp.trees.GrammaticalRelation 14 | 15 | import scala.collection.JavaConversions.asScalaBuffer 16 | import scala.collection.mutable.ArrayBuffer 17 | 18 | object StanfordDependencyExtractor { 19 | 20 | val DEFAULT_DEP_TYPE = "DepCCProcessed" 21 | 22 | val depTypes = Array("DepCollapsed", "DepUncollapsed", "DepCCProcessed") 23 | 24 | def fromStanford(from:StAnnotation, depTyp:String = DEFAULT_DEP_TYPE):SentenceDependencies = { 25 | val sentences = from.get(classOf[SentencesAnnotation]) 26 | val psl = new ArrayBuffer[Array[Dependency]](sentences.size) 27 | for (sentence <- sentences) { 28 | val deps = depTyp match { 29 | case "DepCollapsed" => 30 | sentence.get(classOf[CollapsedDependenciesAnnotation]) 31 | case "DepUncollapsed" => 32 | sentence.get(classOf[BasicDependenciesAnnotation]) 33 | case "DepCCProcessed" => 34 | sentence.get(classOf[CollapsedCCProcessedDependenciesAnnotation]) 35 | } 36 | 37 | if (deps != null) { 38 | val edgeSet = deps.edgeListSorted 39 | val pl = for (e <- edgeSet) yield { 40 | Dependency(e.getRelation.toString, e.getGovernor.index - 1, e.getDependent.index - 1) 41 | } 42 | psl += pl.toArray 43 | } 44 | } 45 | psl.toArray 46 | } 47 | 48 | def toStanford(depTyp:String, from:SentenceDependencies, to:StAnnotation):Unit = { 49 | val toks = to.get(classOf[TokensAnnotation]) 50 | val l = to.get(classOf[SentencesAnnotation]) 51 | for (i <- 0 until l.size) { 52 | val fromIndex = l.get(i).get(classOf[TokenBeginAnnotation]) 53 | val toIndex = l.get(i).get(classOf[TokenEndAnnotation]) 54 | val sntToks = toks.subList(fromIndex, toIndex) 55 | 56 | val sg = toSemanticGraph(sntToks, from(i)) 57 | 58 | depTyp match { 59 | case "DepCollapsed" => 60 | l.get(i).set(classOf[CollapsedDependenciesAnnotation], sg) 61 | case "DepUncollapsed" => 62 | l.get(i).set(classOf[BasicDependenciesAnnotation], sg) 63 | case "DepCCProcessed" => 64 | l.get(i).set(classOf[CollapsedCCProcessedDependenciesAnnotation], sg) 65 | } 66 | } 67 | } 68 | 69 | def toSemanticGraph(tokens:List[CoreLabel], deps:Array[Dependency]):SemanticGraph = { 70 | val sg = new SemanticGraph() 71 | for (i <- 0 until tokens.size) { 72 | val index = i+1 73 | val word = tokens.get(i).value() //getValue(); 74 | 75 | //TODO: not setting root 76 | //(are roots those nodes that have 0 incoming edges) 77 | 78 | val ifl = new IndexedWord(null, 0, index); 79 | // condition added by me, after "/" as token caused IndexOutOfBounds, maybe TokensAnnotation in wrong token format? 80 | val wordAndTag = if (word.length > 1) word.split("/") else Array(word) 81 | ifl.set(classOf[TextAnnotation], wordAndTag(0)) 82 | if (wordAndTag.length > 1) { 83 | ifl.set(classOf[PartOfSpeechAnnotation], wordAndTag(1)) 84 | } 85 | sg.addVertex(ifl) 86 | } 87 | val vertices = sg.vertexListSorted() 88 | 89 | for (d <- deps) { 90 | val govId = d.from 91 | val reln = d.name 92 | val depId = d.to 93 | val gov = vertices.get(govId) 94 | val dep = vertices.get(depId) 95 | val isExtra = false; //? 
96 | sg.addEdge(gov, dep, GrammaticalRelation.valueOf(reln), 97 | java.lang.Double.NEGATIVE_INFINITY, isExtra) 98 | } 99 | sg 100 | } 101 | } -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordLemmatizer.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import java.util.Properties 4 | import com.clearcut.pipe.model._ 5 | import scala.collection.JavaConversions.asScalaBuffer 6 | import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation 7 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 8 | 9 | /** Wraps CoreNLP Lemmatizer as an Annotator. */ 10 | class StanfordLemmatizer extends Annotator[(Text, Poss, SentenceOffsets, TokenOffsets, Tokens), (Lemmas)] { 11 | 12 | @transient lazy val stanfordAnnotator = 13 | AnnotatorFactories.lemma(properties, StanfordUtil.annotatorImplementations).create() 14 | 15 | override def annotate(in:(Text, Poss, SentenceOffsets, TokenOffsets, Tokens)):Lemmas = { 16 | val (t, poa, soa, toa, to) = in 17 | val stanAnn = new StAnnotation(t) 18 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 19 | StanfordSentenceSplitter.toStanford(soa, null, stanAnn) 20 | StanfordPOSTagger.toStanford(poa, stanAnn) 21 | 22 | stanfordAnnotator.annotate(stanAnn) 23 | 24 | StanfordLemmatizer.fromStanford(stanAnn) 25 | } 26 | } 27 | 28 | /** Stanford model mappings for lemmas. */ 29 | object StanfordLemmatizer { 30 | def toStanford(from:Lemmas, to:StAnnotation):Unit = { 31 | val li = to.get(classOf[TokensAnnotation]) 32 | for (i <- 0 until from.size) { 33 | val lemma = from(i) 34 | li.get(i).setLemma(lemma) 35 | } 36 | } 37 | 38 | def fromStanford(from:StAnnotation):Lemmas = { 39 | val tokens = from.get(classOf[TokensAnnotation]) 40 | val li = for (cl <- tokens) yield { 41 | // there may be *NL* tokens outside sentences; the lemmatizer didn't reach 42 | // these, so set these manually to *NL*, so that serialization is OK 43 | var l = cl.lemma() 44 | if (l == null) l = "*NL*" 45 | l 46 | } 47 | li.toArray 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordNERTagger.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import com.clearcut.pipe.model._ 4 | import scala.collection.JavaConversions._ 5 | import edu.stanford.nlp.ling.CoreAnnotations 6 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 7 | import com.clearcut.pipe.model._ 8 | import java.util._ 9 | 10 | /** Wraps CoreNLP NER Tagger as an Annotator. 
*/ 11 | class StanfordNERTagger extends Annotator[(Text,TokenOffsets,Tokens,SentenceOffsets,Lemmas,Poss), (NerTags)] { 12 | 13 | @transient lazy val stanfordAnnotator = 14 | AnnotatorFactories.nerTag(properties, StanfordUtil.annotatorImplementations).create() 15 | 16 | override def annotate(in:(Text,TokenOffsets,Tokens,SentenceOffsets,Lemmas,Poss)): NerTags = { 17 | val (t, toa, to, soa, la, pa) = in 18 | val stanAnn = new StAnnotation(t) 19 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 20 | StanfordSentenceSplitter.toStanford(soa, null, stanAnn) 21 | StanfordPOSTagger.toStanford(pa, stanAnn) 22 | StanfordLemmatizer.toStanford(la, stanAnn) 23 | 24 | stanfordAnnotator.annotate(stanAnn) 25 | 26 | StanfordNERTagger.fromStanford(stanAnn) 27 | } 28 | } 29 | 30 | /** Stanford model mappings for NER. */ 31 | object StanfordNERTagger { 32 | def toStanford(from:NerTags, to:StAnnotation):Unit = { 33 | val li = to.get(classOf[CoreAnnotations.TokensAnnotation]) 34 | for (i <- 0 until li.size) { 35 | val ner = from(i) 36 | li.get(i).setNER(ner) 37 | } 38 | } 39 | 40 | def fromStanford(from:StAnnotation):NerTags = { 41 | val tokens = from.get(classOf[CoreAnnotations.TokensAnnotation]) 42 | val li = for (cl <- tokens) yield { 43 | // there may be *NL* tokens outside sentences that the NER tagger didn't reach; 44 | // default their tag to "O" so that serialization is OK 45 | val n = cl.ner 46 | if (n != null) n else "O" 47 | } 48 | li.toArray 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordPOSTagger.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import com.clearcut.pipe.model._ 4 | import scala.collection.JavaConversions._ 5 | import edu.stanford.nlp.ling.CoreAnnotations 6 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 7 | import java.util._ 8 | 9 | /** Wraps CoreNLP POS Tagger as an Annotator. */ 10 | class StanfordPOSTagger extends Annotator[(Text,TokenOffsets,Tokens,SentenceOffsets),(Poss)] { 11 | 12 | @transient lazy val stanfordAnnotator = 13 | AnnotatorFactories.posTag(properties, StanfordUtil.annotatorImplementations).create() 14 | 15 | override def annotate(in:(Text,TokenOffsets,Tokens,SentenceOffsets)):Poss = { 16 | val (t, toa, to, soa) = in 17 | val stanAnn = new edu.stanford.nlp.pipeline.Annotation(t) 18 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 19 | StanfordSentenceSplitter.toStanford(soa, null, stanAnn) 20 | 21 | stanfordAnnotator.annotate(stanAnn) 22 | 23 | StanfordPOSTagger.fromStanford(stanAnn) 24 | } 25 | } 26 | 27 | /** Stanford model mappings for POS tags.
*/ 28 | object StanfordPOSTagger { 29 | def toStanford(from:Poss, to:StAnnotation):Unit = { 30 | val li = to.get(classOf[CoreAnnotations.TokensAnnotation]) 31 | for (i <- 0 until li.size) { 32 | val pos = from(i) 33 | li.get(i).set(classOf[CoreAnnotations.PartOfSpeechAnnotation], pos) 34 | } 35 | } 36 | 37 | def fromStanford(from:StAnnotation):Poss = { 38 | val tokens = from.get(classOf[CoreAnnotations.TokensAnnotation]) 39 | tokens.map(_.getString(classOf[CoreAnnotations.PartOfSpeechAnnotation])).toArray 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordSRParser.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | // StanfordSRParser is very fast, but needs A LOT of memory 4 | // ~ 4GB per thread 5 | // with less memory it becomes very slow 6 | 7 | import java.util.Properties 8 | 9 | import com.clearcut.pipe.model._ 10 | import edu.stanford.nlp.ling.CoreAnnotations.{SentenceIndexAnnotation, SentencesAnnotation} 11 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 12 | import edu.stanford.nlp.trees.Tree 13 | import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation 14 | 15 | import scala.collection.JavaConversions._ 16 | 17 | class StanfordSRParser extends Annotator[(Text,SentenceOffsets,SentenceTokenOffsets,TokenOffsets,Tokens,Poss), 18 | (Parses,SentenceDependencies)] { 19 | 20 | override def setProperties(p:Properties) { 21 | super.setProperties(p) 22 | p.setProperty("annotators", "tokenize,ssplit") 23 | p.put("parse.maxlen", "100") 24 | p.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz") 25 | p.put("threads", "1") // Should use extractor-level parallelism 26 | } 27 | 28 | @transient lazy val stanfordAnnotator = 29 | AnnotatorFactories.parse(properties, StanfordUtil.annotatorImplementations).create() 30 | 31 | override def annotate(in:(Text,SentenceOffsets,SentenceTokenOffsets,TokenOffsets,Tokens,Poss)): 32 | (Parses,SentenceDependencies) = { 33 | val (t,soa,stoa,toa,to,poa) = in 34 | val stanAnn = new StAnnotation(t) 35 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 36 | StanfordSentenceSplitter.toStanford(soa, null, stanAnn) 37 | StanfordPOSTagger.toStanford(poa, stanAnn) 38 | 39 | // NOTE: stanford parser may take too long for all sentences of a document 40 | // if we run this on Hadoop/Spark, we must parse sentence by sentence and 41 | // then report progress using 42 | //if (reporter != null) reporter.incrementCounter(); 43 | 44 | stanfordAnnotator.annotate(stanAnn) 45 | 46 | val pa = StanfordSRParser.fromStanford(stanAnn) 47 | val da = StanfordDependencyExtractor.fromStanford(stanAnn) 48 | (pa, da) 49 | } 50 | } 51 | 52 | object StanfordSRParser { 53 | def toStanford(from:Parses, to:StAnnotation):Unit = { 54 | val l = from 55 | val sentences = to.get(classOf[SentencesAnnotation]) 56 | for (i <- 0 until l.size) { 57 | var tree:Tree = null 58 | if (l(i) != null) 59 | tree = Tree.valueOf(l(i)) 60 | sentences.get(i).set(classOf[TreeAnnotation], tree) 61 | sentences.get(i).set(classOf[SentenceIndexAnnotation], i.asInstanceOf[Integer]) 62 | } 63 | } 64 | 65 | def fromStanford(from:StAnnotation):Parses = { 66 | val sentences = from.get(classOf[SentencesAnnotation]) 67 | val l = for (sentence <- sentences) yield { 68 | val tree = sentence.get(classOf[TreeAnnotation]) 69 | if (tree != null) tree.pennString else null 70 | } 71 | 
l.toArray 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordTokenizer.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import java.util.{ArrayList, Properties} 4 | import com.clearcut.pipe.model.{Offsets, Text, Tokens, TokenOffsets} 5 | import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} 6 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 7 | import scala.collection.JavaConversions._ 8 | import scala.collection.JavaConverters._ 9 | 10 | /** Wraps CoreNLP Tokenizer as an Annotator. */ 11 | class StanfordTokenizer extends Annotator[Text,(TokenOffsets,Tokens)] { 12 | 13 | @transient lazy val stanfordAnnotator = 14 | AnnotatorFactories.tokenize(properties, StanfordUtil.annotatorImplementations).create() 15 | 16 | override def annotate(t:(Text)):(TokenOffsets, Tokens) = { 17 | val stanAnn = new StAnnotation(t) 18 | stanfordAnnotator.annotate(stanAnn) 19 | StanfordTokenizer.fromStanford(stanAnn) 20 | } 21 | } 22 | 23 | /** Stanford model mappings for tokens. */ 24 | object StanfordTokenizer { 25 | def toStanford(text:Text, tokenOffsets:TokenOffsets, tokens:Tokens, to:StAnnotation):Unit = { 26 | val li = for (i <- 0 until tokens.size) yield { 27 | val to = tokenOffsets(i) 28 | val cl = new CoreLabel 29 | cl.setValue(tokens(i)) 30 | cl.setWord(tokens(i)) 31 | cl.setOriginalText(text.substring(to(0), to(1))) 32 | cl.set(classOf[CoreAnnotations.CharacterOffsetBeginAnnotation], to(0).asInstanceOf[Integer]) 33 | cl.set(classOf[CoreAnnotations.CharacterOffsetEndAnnotation], to(1).asInstanceOf[Integer]) 34 | cl 35 | } 36 | to.set(classOf[CoreAnnotations.TokensAnnotation], li.asJava) 37 | } 38 | 39 | def fromStanford(from:StAnnotation):(TokenOffsets, Tokens) = { 40 | val tokens = from.get(classOf[CoreAnnotations.TokensAnnotation]) 41 | val li = tokens.map(cl => Array(cl.beginPosition, cl.endPosition)) 42 | val ti = tokens.map(_.word) 43 | (li.toArray, ti.toArray) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordTrueCaseAnnotator.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import com.clearcut.pipe.model._ 4 | import scala.collection.JavaConversions._ 5 | import edu.stanford.nlp.ling.CoreAnnotations 6 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 7 | import java.util._ 8 | 9 | /** Wraps CoreNLP TrueCaseAnnotator as an Annotator. */ 10 | class StanfordTrueCaseAnnotator extends Annotator[(Text,TokenOffsets,Tokens,SentenceOffsets),(TrueCases)] { 11 | 12 | @transient lazy val stanfordAnnotator = 13 | AnnotatorFactories.truecase(properties, StanfordUtil.annotatorImplementations).create() 14 | 15 | override def annotate(in:(Text,TokenOffsets,Tokens,SentenceOffsets)):TrueCases = { 16 | val (t, toa, to, soa) = in 17 | val stanAnn = new edu.stanford.nlp.pipeline.Annotation(t) 18 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 19 | StanfordSentenceSplitter.toStanford(soa, null, stanAnn) 20 | 21 | stanfordAnnotator.annotate(stanAnn) 22 | 23 | StanfordTrueCaseAnnotator.fromStanford(stanAnn) 24 | } 25 | } 26 | 27 | /** Stanford model mappings for true cases.
*/ 28 | object StanfordTrueCaseAnnotator { 29 | def toStanford(from:TrueCases, to:StAnnotation):Unit = { 30 | val li = to.get(classOf[CoreAnnotations.TokensAnnotation]) 31 | for (i <- 0 until li.size) { 32 | val tc = from(i) 33 | li.get(i).set(classOf[CoreAnnotations.TrueCaseAnnotation], tc) 34 | } 35 | } 36 | 37 | def fromStanford(from:StAnnotation):TrueCases = { 38 | val tokens = from.get(classOf[CoreAnnotations.TokensAnnotation]) 39 | tokens.map(_.getString(classOf[CoreAnnotations.TrueCaseAnnotation])).toArray 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordUtil.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import edu.stanford.nlp.pipeline.AnnotatorImplementations 4 | 5 | object StanfordUtil { 6 | 7 | lazy val annotatorImplementations = 8 | new AnnotatorImplementations 9 | 10 | } 11 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/ColumnReader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.io._ 4 | import com.clearcut.pipe.Schema 5 | import com.clearcut.pipe.model._ 6 | 7 | class ColumnReader(dir:String) extends Reader with Iterator[Array[AnyRef]] { 8 | 9 | val schema = Schema.createSchema( 10 | // The schema is determined based on file name suffixes 11 | new File(dir).list.map(n => n.substring(n.lastIndexOf(".") + 1)).map(lowerFirst(_)):_* 12 | ) 13 | 14 | val readers = new File(dir).listFiles.map(f => new BufferedReader 15 | (new InputStreamReader(new FileInputStream(f)))) 16 | 17 | var _next = fetchNext() 18 | 19 | def getSchema(): Schema = schema 20 | 21 | override def hasNext: Boolean = 22 | _next != null 23 | 24 | override def next():Array[AnyRef] = { 25 | val n = _next 26 | _next = fetchNext() 27 | n 28 | } 29 | 30 | private def fetchNext(): Array[AnyRef] = { 31 | readers.zip(schema.annTyps).map { case (r,t) => { 32 | val line = r.readLine 33 | if (line == null) 34 | return null 35 | 36 | Json.read[AnyRef](line, Util.name2clazz(t)) 37 | }} 38 | } 39 | 40 | def close = { 41 | readers.map(_.close) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/ColumnWriter.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.io.{File, OutputStreamWriter, FileOutputStream, BufferedWriter} 4 | import com.clearcut.pipe.model._ 5 | 6 | import com.clearcut.pipe.Schema 7 | 8 | class ColumnWriter(dir:String) extends Writer { 9 | val BUFFER_SIZE = 10 * 1024 * 1024 10 | 11 | var writers:Array[BufferedWriter] = null 12 | 13 | def setSchema(schema:Schema): Unit = { 14 | if (! new File(dir).exists) 15 | new File(dir).mkdirs() 16 | writers = schema.annTyps.map(t => { 17 | val name = dir + "/ann." 
+ lowerFirst(t) 18 | if (new File(name).exists) 19 | null 20 | else 21 | new BufferedWriter( 22 | new OutputStreamWriter(new FileOutputStream(name)), BUFFER_SIZE) 23 | }) 24 | } 25 | 26 | def write(annotations:Seq[AnyRef]) = { 27 | for (i <- 0 until writers.length) { 28 | if (writers(i) != null) { 29 | val json = Json.write(annotations(i)) 30 | writers(i).write(json) 31 | writers(i).newLine() 32 | } 33 | } 34 | } 35 | 36 | def close = 37 | for (w <- writers) 38 | if (w != null) w.close() 39 | } 40 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/JSONWriter.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.io.{OutputStreamWriter, FileOutputStream, BufferedWriter} 4 | import com.clearcut.pipe.model._ 5 | 6 | import com.clearcut.pipe.Schema 7 | import org.json4s._ 8 | import org.json4s.JsonDSL._ 9 | import org.json4s.jackson.JsonMethods._ 10 | 11 | class JsonWriter(out:String) extends Writer { 12 | 13 | implicit val formats = DefaultFormats 14 | 15 | val writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out))) 16 | var names:Seq[String] = null 17 | 18 | def setSchema(schema:Schema): Unit = { 19 | names = schema.annTyps.map(t => lowerFirst(t)) 20 | } 21 | 22 | def write(annotations:Seq[AnyRef]) = { 23 | val arr:Seq[JObject] = annotations.zip(names).map { case (x,n) => JObject(JField(n, Extraction.decompose(x)))} 24 | var o:JObject = arr(0) 25 | for (i <- 1 until arr.length) 26 | o = o merge arr(i) 27 | 28 | writer.write(compact(render(o))) 29 | writer.newLine 30 | } 31 | 32 | def close = 33 | writer.close 34 | 35 | } 36 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/Json.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import org.json4s.Extraction._ 4 | import org.json4s.NoTypeHints 5 | import org.json4s.jackson.JsonMethods._ 6 | import org.json4s.jackson.Serialization 7 | import org.json4s.reflect.Reflector 8 | 9 | object Json { 10 | 11 | implicit val formats = Serialization.formats(NoTypeHints) 12 | 13 | def write[A <: AnyRef](o:A)(implicit m:Manifest[A]):String = 14 | Serialization.write[A](o) 15 | 16 | def read[AnyRef](s:String, t:Class[_]):AnyRef = { 17 | val json = parse(s) 18 | extract(json, Reflector.scalaTypeOf(t)).asInstanceOf[AnyRef] 19 | } 20 | 21 | def read[A](s:String)(implicit m:Manifest[A]):A = 22 | Serialization.read[A](s) 23 | } 24 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/JsonReader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.nio.charset.CodingErrorAction 4 | 5 | import com.clearcut.pipe.Schema 6 | import com.clearcut.pipe.model.{Id, Text} 7 | 8 | import org.json4s._ 9 | import org.json4s.jackson.JsonMethods._ 10 | import org.json4s.jackson.Serialization 11 | 12 | import scala.io.Source 13 | 14 | class JsonReader(in:String, 15 | idKey:String, documentKey:String) 16 | extends Reader with Iterator[Array[AnyRef]] { 17 | val BUFFER_SIZE = 10 * 1024 * 1024 18 | 19 | implicit val codec = new scala.io.Codec( 20 | java.nio.charset.Charset.forName("utf-8")) 21 | codec.onMalformedInput(CodingErrorAction.IGNORE) 22 | 
codec.onUnmappableCharacter(CodingErrorAction.IGNORE) 23 | 24 | val reader = Source.fromFile(new java.io.File(in), BUFFER_SIZE) 25 | 26 | var it = reader.getLines.zipWithIndex 27 | var _next = fetchNext() 28 | 29 | override def getSchema:Schema = 30 | Schema.createSchema("id", "text") 31 | 32 | override def hasNext: Boolean = 33 | _next != null 34 | 35 | override def next(): Array[AnyRef] = { 36 | val n = _next 37 | _next = fetchNext() 38 | n 39 | } 40 | 41 | private def fetchNext(): Array[AnyRef] = { 42 | var n:Array[AnyRef] = null 43 | while (n == null && it.hasNext) { 44 | val (line, num) = it.next 45 | 46 | val json = parse(line) 47 | 48 | implicit val formats = DefaultFormats 49 | 50 | try { 51 | val documentId = (json \ idKey).extract[String] 52 | val documentStr = (json \ documentKey).extract[String] 53 | 54 | n = Array(documentId, documentStr) 55 | 56 | } catch { 57 | case e:Exception => 58 | System.err.println(s"Warning: skipped malformed line ${num}: ${line}") 59 | } 60 | } 61 | n 62 | } 63 | 64 | def close = 65 | reader.close 66 | } 67 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/Reader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import com.clearcut.pipe.Schema 4 | 5 | trait Reader extends Iterator[Array[AnyRef]] { 6 | def getSchema:Schema 7 | def close 8 | } 9 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/TsvReader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.nio.charset.CodingErrorAction 4 | 5 | import com.clearcut.pipe.Schema 6 | import com.clearcut.pipe.model.{Text, Id} 7 | 8 | import scala.io.{Source, BufferedSource} 9 | 10 | class TsvReader(in:String = null, 11 | idCol:Int = 0, documentCol:Int = 1, 12 | inSource:Source = null) 13 | extends Reader with Iterator[Array[AnyRef]] { 14 | 15 | implicit val codec = new scala.io.Codec( 16 | java.nio.charset.Charset.forName("utf-8")) 17 | codec.onMalformedInput(CodingErrorAction.IGNORE) 18 | codec.onUnmappableCharacter(CodingErrorAction.IGNORE) 19 | 20 | val reader = if (inSource != null) inSource else Source.fromFile(in) 21 | 22 | var it = reader.getLines.zipWithIndex 23 | var _next = fetchNext() 24 | 25 | override def getSchema:Schema = 26 | Schema.createSchema("id", "text") 27 | 28 | override def hasNext: Boolean = 29 | _next != null 30 | 31 | override def next(): Array[AnyRef] = { 32 | val n = _next 33 | _next = fetchNext() 34 | n 35 | } 36 | 37 | // should unescape \, \r, \n, \t 38 | private def fetchNext(): Array[AnyRef] = { 39 | var n:Array[AnyRef] = null 40 | while (n == null && it.hasNext) { 41 | val (line, num) = it.next 42 | val tsvArr = line.trim.split("\t") 43 | if (tsvArr.length >= 2) { 44 | val documentId = tsvArr(idCol) 45 | val documentStr = unescape(tsvArr(documentCol)) 46 | n = Array(documentId, documentStr) 47 | } else { 48 | System.err.println(s"Warning: skipped malformed line ${num}: ${line}") 49 | } 50 | } 51 | n 52 | } 53 | 54 | private def unescape(s:String):String = { 55 | val sb = new StringBuilder() 56 | val NORMAL = 0 57 | val ESCAPE = 1 58 | 59 | var state = NORMAL 60 | 61 | for (i <- 0 until s.length) { 62 | val c = s.charAt(i) 63 | //val l = if (i == s.length - 1) Character.UNASSIGNED else s.charAt(i+1) 64 | state match { 65 | case NORMAL => 66 | c match { 67 | case 
'\\' => state = ESCAPE 68 | case _ => sb.append(c) 69 | } 70 | case ESCAPE => 71 | c match { 72 | case 'r' => sb.append('\r'); state = NORMAL 73 | case 'n' => sb.append('\n'); state = NORMAL 74 | case 't' => sb.append('\t'); state = NORMAL 75 | case '\\' => sb.append('\\'); state = NORMAL 76 | case _ => 77 | println("ERROR") 78 | } 79 | } 80 | } 81 | return sb.toString 82 | } 83 | 84 | def close = 85 | reader.close 86 | } -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/TsvWriter.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.io.{FileOutputStream, OutputStreamWriter, BufferedWriter} 4 | 5 | import com.clearcut.pipe.Schema 6 | import com.clearcut.pipe.model._ 7 | 8 | /** Legacy writer for psql readable TSV table. 9 | * 10 | * Example output: 11 | * 12 1 This is a simple example. {"This","is","a","simple","example","."} 12 | * {"this","be","a","simple","example","."} {"DT","VBZ","DT","JJ","NN","."} 13 | * {"O","O","O","O","O","O"} {0,5,8,10,17,24} 14 | * {"nsubj","cop","det","amod","",""} {5,5,5,5,0,0} 15 | */ 16 | class TsvWriter(out:String = null, outWriter:BufferedWriter = null) extends Writer { 17 | 18 | val writer = if (outWriter != null) outWriter else 19 | new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "utf-8")) 20 | 21 | var indices:Seq[Int] = null 22 | 23 | def setSchema(schema:Schema) = { 24 | indices = Schema.defaultAnnotationIndices(schema, Seq("Id", "Text", "SentenceOffsets", 25 | "SentenceTokenOffsets", "Tokens", "TokenOffsets", "Lemmas", "Poss", 26 | "NerTags", "SentenceDependencies")) 27 | } 28 | 29 | def write(annotations:Seq[AnyRef]) = { 30 | val is = indices.map(annotations(_)) 31 | val id = is(0).asInstanceOf[Id] 32 | val ta = is(1).asInstanceOf[Text] 33 | val soa = is(2).asInstanceOf[SentenceOffsets] 34 | val stoa = is(3).asInstanceOf[SentenceTokenOffsets] 35 | val toka = is(4).asInstanceOf[Tokens] 36 | val toa = is(5).asInstanceOf[TokenOffsets] 37 | val la = is(6).asInstanceOf[Lemmas] 38 | val posa = is(7).asInstanceOf[Poss] 39 | val nertaga = is(8).asInstanceOf[NerTags] 40 | val sdepa = is(9).asInstanceOf[SentenceDependencies] 41 | 42 | for (sentNum <- 0 until soa.size) { 43 | var columns = new Array[String](10) 44 | 45 | val s_stoa = stoa(sentNum) 46 | 47 | val outline = List( 48 | id, 49 | sentNum.toString, 50 | ta.substring(soa(sentNum)(FROM), soa(sentNum)(TO)), 51 | list2TSVArray(toka.slice(s_stoa(FROM), s_stoa(TO)).toList), 52 | list2TSVArray(la.slice(s_stoa(FROM), s_stoa(TO)).toList), 53 | list2TSVArray(posa.slice(s_stoa(FROM), s_stoa(TO)).toList), 54 | list2TSVArray(nertaga.slice(s_stoa(FROM), s_stoa(TO)).toList), 55 | intList2TSVArray(toa.slice(s_stoa(FROM), s_stoa(TO)).map {_(FROM) - soa(sentNum)(FROM) }.toList), 56 | list2TSVArray(sdepa(sentNum).map(_.name).toList), 57 | intList2TSVArray(sdepa(sentNum).map(_.from).toList) 58 | ) 59 | writer.append(outline.mkString("\t")) 60 | writer.newLine() 61 | } 62 | } 63 | 64 | /** Construct a Postgres-acceptable array in the TSV format, from a list */ 65 | def list2TSVArray(arr: List[String]) : String = { 66 | return arr.map( x => 67 | // Replace '\' with '\\\\' to be accepted by COPY FROM 68 | // Replace '"' with '\\"' to be accepted by COPY FROM 69 | if (x.contains("\\")) 70 | "\"" + x.replace("\\", "\\\\\\\\").replace("\"", "\\\\\"") + "\"" 71 | else 72 | "\"" + x + "\"" 73 | ).mkString("{", ",", "}") 74 | } 75 | 76 | def 
intList2TSVArray(arr: List[Int]) : String = { 77 | return arr.map( x => 78 | "" + x 79 | ).mkString("{", ",", "}") 80 | } 81 | 82 | def string2TSVString(str: String) : String = { 83 | if (str.contains("\\")) 84 | str.replace("\\", "\\\\") 85 | else 86 | str 87 | } 88 | 89 | def close = 90 | writer.close 91 | } -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/Writer.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import com.clearcut.pipe.Schema 4 | 5 | trait Writer { 6 | def setSchema(s:Schema) 7 | def write(annotations:Seq[AnyRef]) 8 | def close 9 | } 10 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/model/Util.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.model 2 | 3 | object Util { 4 | 5 | val types:Array[Class[_ <: AnyRef]] = Array( 6 | classOf[Coreferences], 7 | classOf[Dependencies], 8 | classOf[Lemmas], 9 | classOf[Mentions], 10 | classOf[Ners], 11 | classOf[NerTags], 12 | classOf[Offsets], 13 | classOf[Parses], 14 | classOf[Poss], 15 | classOf[SentenceDependencies], 16 | classOf[SentenceOffsets], 17 | classOf[SentenceTokenOffsets], 18 | classOf[Text], 19 | classOf[TextFragments], 20 | classOf[TextMappings], 21 | classOf[TokenOffsets], 22 | classOf[Tokens], 23 | classOf[TrueCases] 24 | ) 25 | 26 | // val name2clazz = 27 | // Map(types.map(t => lowerFirst(t.getSimpleName) -> t):_*) 28 | 29 | // val clazz2name:Map[Class[_ <: AnyRef], String] = 30 | // name2clazz.map(_.swap) 31 | 32 | 33 | val name2clazz = Map( 34 | "coreferences" -> classOf[Coreferences], 35 | "dependencies" -> classOf[Dependencies], 36 | "lemmas" -> classOf[Lemmas], 37 | "mentions" -> classOf[Mentions], 38 | "ners" -> classOf[Ners], 39 | "nerTags" -> classOf[NerTags], 40 | "parses" -> classOf[Parses], 41 | "poss" -> classOf[Poss], 42 | "sentenceDependencies" -> classOf[SentenceDependencies], 43 | "sentenceOffsets" -> classOf[SentenceOffsets], 44 | "sentenceTokenOffsets" -> classOf[SentenceTokenOffsets], 45 | "text" -> classOf[Text], 46 | "textFragments" -> classOf[TextFragments], 47 | "textMappings" -> classOf[TextMappings], 48 | "tokenOffsets" -> classOf[TokenOffsets], 49 | "tokens" -> classOf[Tokens], 50 | "trueCases" -> classOf[TrueCases] 51 | ) 52 | 53 | 54 | 55 | 56 | 57 | def lowerFirst(s:String) = 58 | if (s == null || s.length < 1) s 59 | else s.charAt(0).toLower + s.substring(1) 60 | } 61 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/model/package.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe 2 | 3 | import com.clearcut.pipe.io.Json 4 | 5 | /** Set of our cross-language, minimalist schema */ 6 | package object model { 7 | type Html = String 8 | type Coreferences = Array[CoreferenceChain] 9 | type Dependencies = Array[Dependency] 10 | type Id = String 11 | type Lemmas = Array[String] 12 | type Mentions = Array[Mention] 13 | type Ners = Array[NamedEntity] 14 | type NerTags = Array[String] 15 | type Offsets = Array[Int] 16 | type Parses = Array[String] 17 | type Poss = Array[String] 18 | type SentenceDependencies = Array[Array[Dependency]] 19 | type SentenceOffsets = Array[Offsets] 20 | type SentenceTokenOffsets = Array[Offsets] 21 | type Text = String 22 | type 
TextFragments = Array[TextFragment] 23 | type TextMappings = Array[TextMapping] 24 | type TokenOffsets = Array[Offsets] 25 | type Tokens = Array[String] 26 | type TrueCases = Array[String] 27 | 28 | /* Constants used for offsets */ 29 | val FROM = 0 30 | val TO = 1 31 | 32 | def print(s: Schema, arr: AnyRef*) = 33 | for ((name, ann) <- s.annTyps.zip(arr)) 34 | println(name + " : " + Json.write(arr)) 35 | 36 | def lowerFirst(s:String) = 37 | if (s == null || s.length < 1) s 38 | else s.charAt(0).toLower + s.substring(1) 39 | 40 | def upperFirst(s:String) = 41 | if (s == null || s.length < 1) s 42 | else s.charAt(0).toUpper + s.substring(1) 43 | 44 | 45 | /* Auxiliary sub-types used above */ 46 | 47 | case class CoreferenceChain 48 | ( 49 | chainNum: Int = -1, 50 | representativeMentionNum: Int = -1, 51 | mentionNums: Array[Int] = Array() 52 | ) 53 | 54 | case class Dependency 55 | ( 56 | name: String, 57 | from: Int, 58 | to: Int 59 | ) 60 | 61 | case class NamedEntity 62 | ( 63 | typ:String, 64 | offsets:Offsets, 65 | head:Int = -1 66 | ) 67 | 68 | 69 | case class Mention 70 | ( 71 | mentionNum:Int = -1, 72 | head:Int = -1, // token offset from begin of document 73 | tokenOffsets:Offsets, 74 | mentionTyp:Byte = -1, //PRONOMINAL, NOMINAL, PROPER, UNKNOWN 75 | number:Byte = -1, //SINGULAR, PLURAL, UNKNOWN 76 | gender:Byte = -1, //MALE, FEMALE, NEUTRAL, UNKNOWN 77 | animacy:Byte = -1 //ANIMATE, INANIMATE, UNKNOWN 78 | ) 79 | 80 | object Mention { 81 | val UNKNOWN = -1.toByte 82 | 83 | // mention types 84 | val PRONOMINAL = 0.toByte 85 | val NOMINAL = 1.toByte 86 | val PROPER = 2.toByte 87 | val LIST = 3.toByte 88 | 89 | // numbers 90 | val SINGULAR = 0.toByte 91 | val PLURAL = 1.toByte 92 | 93 | // genders 94 | val MALE = 0.toByte 95 | val FEMALE = 1.toByte 96 | val NEUTRAL = 2.toByte 97 | 98 | // animacy 99 | val ANIMATE = 0.toByte 100 | val INANIMATE = 1.toByte 101 | 102 | // need bidirectional mappings for stanford conversions 103 | 104 | def typeToByte(s:String) = s match { 105 | case "PRONOMINAL" => PRONOMINAL 106 | case "NOMINAL" => NOMINAL 107 | case "PROPER" => PROPER 108 | case "LIST" => LIST 109 | case "UNKNOWN" => UNKNOWN 110 | } 111 | 112 | def typeFromByte(b:Byte) = b match { 113 | case PRONOMINAL => "PRONOMINAL" 114 | case NOMINAL => "NOMINAL" 115 | case PROPER => "PROPER" 116 | case LIST => "LIST" 117 | case UNKNOWN => "UNKNOWN" 118 | } 119 | 120 | def numberToByte(s:String) = s match { 121 | case "SINGULAR" => SINGULAR 122 | case "PLURAL" => PLURAL 123 | case "UNKNOWN" => UNKNOWN 124 | } 125 | 126 | def numberFromByte(b:Byte) = b match { 127 | case SINGULAR => "SINGULAR" 128 | case PLURAL => "PLURAL" 129 | case UNKNOWN => "UNKNOWN" 130 | } 131 | 132 | def genderToByte(s:String) = s match { 133 | case "MALE" => MALE 134 | case "FEMALE" => FEMALE 135 | case "NEUTRAL" => NEUTRAL 136 | case "UNKNOWN" => UNKNOWN 137 | } 138 | 139 | def genderFromByte(b:Byte) = b match { 140 | case MALE => "MALE" 141 | case FEMALE => "FEMALE" 142 | case NEUTRAL => "NEUTRAL" 143 | case UNKNOWN => "UNKNOWN" 144 | } 145 | 146 | def animacyToByte(s:String) = s match { 147 | case "ANIMATE" => ANIMATE 148 | case "INANIMATE" => INANIMATE 149 | case "UNKNOWN" => UNKNOWN 150 | } 151 | 152 | def animacyFromByte(b:Byte) = b match { 153 | case ANIMATE => "ANIMATE" 154 | case INANIMATE => "INANIMATE" 155 | case UNKNOWN => "UNKNOWN" 156 | } 157 | } 158 | 159 | case class TextFragment 160 | ( 161 | typ:String, 162 | offsets:Offsets, 163 | extract:Boolean 164 | ) 165 | 166 | case class TextMapping 167 | ( 168 | 
documentID:Int, 169 | beginText:Int, 170 | beginSource:Int, 171 | length:Int 172 | ) 173 | } 174 | -------------------------------------------------------------------------------- /pipe/src/test/resources/testdoc.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |

Wrike has launched a new version of its project management platform with an emphasis on real-time analysis and new features such as syncing calendars to work projects. The new platform, Wrike Enterprise, gives the company a deeper focus on the corporate market for its collaboration-centered tools. It gives customers a way to crunch project management data in the order of a million updates per day. This is data around work items such as tasks completed, the original time planned for the project and the historical data that is associated with the project. The data is presented in “instant infographics,” that help people see the latest updates to projects, said Wrike CEO Andrew Filev in an email interview. Wrike-Enterprise-visual-reports Historically, project managers have done detailed plans that they then track. The manager periodically updates the projects and then compares the current state to the baseline established at the start of the project. With the Wrike platform, the data from every interaction is stored and then compared to historical data and then presented in a chart. A customer can see the state of the project from different dimensions such as the realistic amount of time a project will take to get done,  what requires immediate action and how performance of an employee has evolved over time.

5 |

A new  user group feature in Wrike Enterprise allows the project manager to include employees in multiple work groups by project, department, or any other ad hoc query. It can share the needed data with the whole group and keep permissions organized. This allows the manager to keep track of the overall project without hundreds of people making their own changes.  Wrike-Enterprise-user-groups

6 |
Wrike’s new “Custom Calendars,” syncs projects with the calendars of other members on the team. It allows the manager to track a colleague’s vacations, PTO and extra working days. It is designed to avoid schedule overlaps and build more accurate plans. Wrike-Enterprise-custom-calendars There are also new ways to integrate a company’s  identity into the service. Wrike has also added new security controls for larger customers. In October, Wrike raised $10 million in funding.  It was the first round since the company was originally founded seven years ago. The company has traditionally served the small business community but this release points to its additonal focus on the larger enterprise companies of the world. Wrike competes with the likes of Atlassian and Asana. But its advantage is in its crisp user interface which it can now leverage even more as it embraces data as a way for project managers to better keep track of their projects.  
7 |
8 |
Feature image courtesy of VFS Digital Design on Flickr via Creative Commons)
9 |
10 | 11 | 12 | 13 |
-------------------------------------------------------------------------------- /pipe/src/test/resources/testdoc.json: -------------------------------------------------------------------------------- 1 | {"documents.id" : 5, "documents.text" : "I am document one. I am sentence twp, really. I am another sentence, called sentence three."} 2 | {"documents.id" : 7, "documents.text" : "John drove to Judy’s house and he made her dinner. This sentence should have some corefs."} -------------------------------------------------------------------------------- /pipe/src/test/resources/testdoc.txt: -------------------------------------------------------------------------------- 1 | In a decision that could have far-reaching consequences, the D.C. Circuit Court of Appeals today struck down the FCC’s Open Internet Order. That Order, put into force in 2010 by then-chairman Julius Genachowski, was designed to make it so that broadband service providers couldn’t meddle with traffic on the web based on its type – in other words, they couldn’t block certain kinds of online data transmission just because it didn’t align with their own goals and financial strategy. 2 | 3 | Media watchdog and advocacy agency Free Press released the following statement about the decision via President and CEO Craig Aaron, condemning it while also acknowledging that the Open Internet Order probably wasn’t the best possible solution for enforcing net neutrality: -------------------------------------------------------------------------------- /pipe/src/test/scala/BasicSpec.scala: -------------------------------------------------------------------------------- 1 | import java.io.{BufferedWriter, OutputStreamWriter, FileOutputStream} 2 | import java.util.Properties 3 | import javax.swing.text.html.parser.DocumentParser 4 | 5 | import com.clearcut.pipe.annotator._ 6 | import com.clearcut.pipe.{Schema, Main} 7 | import com.clearcut.pipe.io.{ColumnWriter, ColumnReader, Json} 8 | import com.clearcut.pipe.model.Text 9 | import org.scalatest.{Matchers, FlatSpec} 10 | 11 | /** 12 | * 13 | * Note: SRParser needs a lot of memory. 
You have to run the test like this: 14 | * sbt -mem 4096 test 15 | * 16 | */ 17 | class BasicSpec extends FlatSpec with Matchers { 18 | 19 | def createTextFile(dir:String) = { 20 | val w = new BufferedWriter(new OutputStreamWriter 21 | (new FileOutputStream(dir + "/ann.text"))) 22 | w.write(Json.write("This is a very simple text file.\nIt contains two sentences.")) 23 | w.close 24 | } 25 | 26 | "ColumnReader and ColumnWriter" should "work" in { 27 | import java.nio.file.{Path, Paths, Files} 28 | val folderPath: Path = Paths.get(System.getProperty("java.io.tmpdir")) 29 | var dir: Path = Files.createTempDirectory(folderPath, "pipe") 30 | 31 | println(dir.toString) 32 | 33 | createTextFile(dir.toString) 34 | 35 | val annotators:Array[Annotator[_,_]] = Array( 36 | new StanfordTokenizer, 37 | new StanfordSentenceSplitter, 38 | new StanfordPOSTagger 39 | //new StanfordLemmatizer 40 | ) 41 | 42 | val r = new ColumnReader(dir.toString) 43 | val w = new ColumnWriter(dir.toString) 44 | val e = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dir + "/.errors"))) 45 | 46 | Main.run(annotators, r, w, e) 47 | 48 | r.close 49 | w.close 50 | e.close 51 | } 52 | 53 | 54 | } 55 | -------------------------------------------------------------------------------- /view/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | /env 4 | /node_modules 5 | /util/elasticsearch-* 6 | /public/closure-library 7 | /public/js/help 8 | /public/js/vis 9 | /public/js/.module-cache 10 | -------------------------------------------------------------------------------- /view/README.md: -------------------------------------------------------------------------------- 1 | View 2 | ==== 3 | 4 | View visualizations of extractions and NLP annotations. Search by keywords. 5 | 6 | 7 | ## Installation 8 | 9 | Run `./setup.sh` to install dependencies. 10 | 11 | Make sure you run `source env.sh` each time you run view. 12 | 13 | You can use `./run.sh` to run the two servers (elasticsearch and nodejs). 14 | 15 | ## How to index your data 16 | 17 | * To update view's index, adjust `view.conf` and run the tools in `./util`. 18 | 19 | * The documents should be in [Pipe](../pipe)'s column format. We have included the tool `./fetch-sentences-table.py`, which dumps the sentences table from DeepDive and converts it into column format. This tool has been tested with DeepDive's spouse example, so it assumes that the sentences table has that schema. 20 | 21 | * Then fetch extractor output by running `./fetch-annotations.py`. This tool dumps a candidate or inference table from DeepDive and converts it into the right format. 22 | 23 | * Create the elasticsearch indexes by running: 24 | 25 | ``` 26 | ./create_index.sh 27 | ./refresh-documents.py 28 | ./refresh-annotations.py 29 | ``` 30 | 31 | * Visit `http://localhost:3000`. 32 | 33 | View actually uses two elasticsearch indexes: one containing all documents and their NLP annotations, the other containing all extractions. Typically, the documents index is very large and the extractions index relatively small. By separating the two it is now possible to update the extractions index extremely quickly. This is great for extractor development, since an update to an extractor doesn't require rebuilding the documents index. On the spouse example, updating the extractions index now takes only about 5 seconds.
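As a rough illustration of how the two indexes are tied together (see the note on the Parent-Child mapping below), the child type declares its parent in the index mapping at creation time. This is only a sketch: the type names `docs` and `annotations` match the ones queried in `routes/index.js`, but the authoritative mapping is whatever `./create_index.sh` sets up.

```
# Sketch only -- create_index.sh is the source of truth for the real mapping.
# In elasticsearch 1.x, the child type points at its parent type via `_parent`.
curl -XPUT 'http://localhost:9200/view' -d '{
  "mappings": {
    "docs":        { "properties": { "content": { "type": "string" } } },
    "annotations": { "_parent": { "type": "docs" } }
  }
}'
```

With such a mapping, elasticsearch can resolve the `has_parent` and `has_child` queries issued by `routes/index.js` using its in-memory parent/child ID map.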
34 | 35 | To make sure that retrieval of documents and their extractions remains very fast, the two indexes are linked through elasticsearch's Parent-Child mapping. Each document (parent) has a mapping to a set of extractions (children). This mapping is represented as a hashmap over IDs and is cached in memory while elasticsearch is running. 36 | 37 | -------------------------------------------------------------------------------- /view/app.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var path = require('path'); 3 | var favicon = require('serve-favicon'); 4 | var logger = require('morgan'); 5 | var cookieParser = require('cookie-parser'); 6 | var bodyParser = require('body-parser'); 7 | 8 | var routes = require('./routes/index'); 9 | var users = require('./routes/users'); 10 | 11 | var app = express(); 12 | 13 | // view engine setup 14 | app.set('views', path.join(__dirname, 'views')); 15 | app.set('view engine', 'jade'); 16 | 17 | // uncomment after placing your favicon in /public 18 | //app.use(favicon(__dirname + '/public/favicon.ico')); 19 | app.use(logger('dev')); 20 | app.use(bodyParser.json()); 21 | app.use(bodyParser.urlencoded({ extended: false })); 22 | app.use(cookieParser()); 23 | app.use(express.static(path.join(__dirname, 'public'))); 24 | 25 | app.use('/', routes); 26 | app.use('/users', users); 27 | 28 | // catch 404 and forward to error handler 29 | app.use(function(req, res, next) { 30 | var err = new Error('Not Found'); 31 | err.status = 404; 32 | next(err); 33 | }); 34 | 35 | // error handlers 36 | 37 | // development error handler 38 | // will print stacktrace 39 | if (app.get('env') === 'development') { 40 | app.use(function(err, req, res, next) { 41 | res.status(err.status || 500); 42 | res.render('error', { 43 | message: err.message, 44 | error: err 45 | }); 46 | }); 47 | } 48 | 49 | // production error handler 50 | // no stacktraces leaked to user 51 | app.use(function(err, req, res, next) { 52 | res.status(err.status || 500); 53 | res.render('error', { 54 | message: err.message, 55 | error: {} 56 | }); 57 | }); 58 | 59 | 60 | module.exports = app; 61 | -------------------------------------------------------------------------------- /view/bin/www: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /** 4 | * Module dependencies. 5 | */ 6 | 7 | var app = require('../app'); 8 | var debug = require('debug')('view:server'); 9 | var http = require('http'); 10 | 11 | /** 12 | * Get port from environment and store in Express. 13 | */ 14 | 15 | var port = normalizePort(process.env.PORT || '3000'); 16 | app.set('port', port); 17 | 18 | /** 19 | * Create HTTP server. 20 | */ 21 | 22 | var server = http.createServer(app); 23 | 24 | /** 25 | * Listen on provided port, on all network interfaces. 26 | */ 27 | 28 | server.listen(port); 29 | server.on('error', onError); 30 | server.on('listening', onListening); 31 | 32 | /** 33 | * Normalize a port into a number, string, or false. 34 | */ 35 | 36 | function normalizePort(val) { 37 | var port = parseInt(val, 10); 38 | 39 | if (isNaN(port)) { 40 | // named pipe 41 | return val; 42 | } 43 | 44 | if (port >= 0) { 45 | // port number 46 | return port; 47 | } 48 | 49 | return false; 50 | } 51 | 52 | /** 53 | * Event listener for HTTP server "error" event.
54 | */ 55 | 56 | function onError(error) { 57 | if (error.syscall !== 'listen') { 58 | throw error; 59 | } 60 | 61 | var bind = typeof port === 'string' 62 | ? 'Pipe ' + port 63 | : 'Port ' + port; 64 | 65 | // handle specific listen errors with friendly messages 66 | switch (error.code) { 67 | case 'EACCES': 68 | console.error(bind + ' requires elevated privileges'); 69 | process.exit(1); 70 | break; 71 | case 'EADDRINUSE': 72 | console.error(bind + ' is already in use'); 73 | process.exit(1); 74 | break; 75 | default: 76 | throw error; 77 | } 78 | } 79 | 80 | /** 81 | * Event listener for HTTP server "listening" event. 82 | */ 83 | 84 | function onListening() { 85 | var addr = server.address(); 86 | var bind = typeof addr === 'string' 87 | ? 'pipe ' + addr 88 | : 'port ' + addr.port; 89 | debug('Listening on ' + bind); 90 | } 91 | -------------------------------------------------------------------------------- /view/build.sh: -------------------------------------------------------------------------------- 1 | #jsx view/ public/js 2 | #browserify -t reactify public/js/main.js -o public/bundle.js 3 | #browserify public/js/main.js -o public/bundle.js 4 | 5 | browserify -t [ reactify --es6 ] view/main.jsx -o public/bundle.js 6 | -------------------------------------------------------------------------------- /view/env.sh: -------------------------------------------------------------------------------- 1 | # elasticsearch 2 | export INDEX_NAME=view 3 | 4 | # database 5 | export PGPORT=5432 6 | export PGHOST=localhost 7 | export DBNAME=deepdive_spouse_tsv 8 | export PGUSER=raphael 9 | export PGPASSWORD= 10 | 11 | source env/bin/activate 12 | 13 | PATH="$PWD/node_modules/.bin:$PATH" 14 | PATH="$PWD/util/elasticsearch-1.6.0/bin:$PATH" 15 | -------------------------------------------------------------------------------- /view/gulpfile.js: -------------------------------------------------------------------------------- 1 | var browserify = require('browserify'); 2 | var gulp = require('gulp'); 3 | var gutil = require('gulp-util'); 4 | var source = require("vinyl-source-stream"); 5 | var reactify = require('reactify'); 6 | var es6ify = require('es6ify'); 7 | var watchify = require('watchify'); 8 | 9 | //var requireFiles = ['./node_modules/react/react.js'] 10 | var requireFiles = 'react-router' 11 | var rename = require('gulp-rename'); 12 | 13 | function compileScripts(watch) { 14 | gutil.log('Starting browserify'); 15 | 16 | var entryFile = './view/main.js'; 17 | es6ify.traceurOverrides = {experimental: true}; 18 | 19 | var bundler = browserify({entries: entryFile, debug: true}); 20 | 21 | bundler.require(requireFiles); 22 | bundler.transform(reactify, {es6: true}); 23 | bundler.transform(es6ify.configure(/.jsx/)); 24 | 25 | var rebundle = function () { 26 | var stream = bundler.bundle(); 27 | 28 | stream.on('error', function (err) { console.error(err) }); 29 | stream = stream.pipe(source(entryFile)); 30 | 31 | stream.pipe(rename('bundle.js')); 32 | stream.pipe(gulp.dest('public')); 33 | } 34 | bundler.on('update', rebundle); 35 | return rebundle(); 36 | } 37 | 38 | gulp.task('default', [], function () { 39 | compileScripts(true); 40 | }); -------------------------------------------------------------------------------- /view/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "view", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "node ./bin/www" 7 | }, 8 | "dependencies": { 9 | "body-parser": "~1.12.4", 10 | 
"cookie-parser": "~1.3.5", 11 | "react-router": "v1.0.0-beta1", 12 | "debug": "~2.2.0", 13 | "elasticsearch": "^5.0.0", 14 | "express": "~4.12.4", 15 | "jade": "~1.9.2", 16 | "morgan": "~1.5.3", 17 | "nodemon": "^1.3.7", 18 | "serve-favicon": "~2.2.1" 19 | }, 20 | "devDependencies": { 21 | "gulp": "^3.9.0", 22 | "gulp-util": "^3.0.6", 23 | "vinyl-source-stream": "^1.1.0", 24 | "reactify": "^1.1.1", 25 | "es6ify": "^1.6.0", 26 | "watchify": "^3.3.0", 27 | "gulp-rename": "^1.2.2" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /view/public/css/main.css: -------------------------------------------------------------------------------- 1 | /** normalize **/ 2 | *, body, button, input, textarea, select { 3 | text-rendering: optimizeLegibility; 4 | font-size:100%; 5 | } 6 | 7 | body,div,dl,dt,dd,ul,ol,li,h1,h2,h3,h4,h5,h6,pre,form,fieldset,input,textarea,p,blockquote,th,td { 8 | margin:0; 9 | padding:0; 10 | } 11 | table { 12 | border-collapse:collapse; 13 | border-spacing:0; 14 | } 15 | fieldset,img { 16 | border:0; 17 | } 18 | address,caption,cite,code,dfn,em,strong,th,var { 19 | font-style:normal; 20 | font-weight:normal; 21 | } 22 | ol,ul { 23 | list-style:none; 24 | } 25 | caption,th { 26 | text-align:left; 27 | } 28 | h1,h2,h3,h4,h5,h6 { 29 | font-size:100%; 30 | font-weight:normal; 31 | } 32 | q:before,q:after { 33 | content:''; 34 | } 35 | abbr,acronym { border:0;} 36 | /* end normalize.css */ 37 | 38 | body { 39 | font-family: "Helvetica", "Arial", "FreeSans", "Verdana", "Tahoma", "Lucida Sans", "Lucida Sans Unicode", "Luxi Sans", sans-serif; 40 | font-size: 14px; 41 | } 42 | 43 | *, html, body { 44 | -webkit-font-smoothing: antialiased; 45 | } 46 | 47 | input:focus {outline:none;} 48 | 49 | .header input:focus { border:1px solid #AAF !important; } 50 | 51 | 52 | 53 | body { 54 | overflow-x:hidden; 55 | overflow-y:visible; 56 | height:100%; 57 | font: normal 13px arial,sans-serif; 58 | } 59 | 60 | html, body { 61 | height: 100%; 62 | } 63 | 64 | .unselectable { 65 | -webkit-user-select: none; 66 | -khtml-user-select: none; 67 | -moz-user-select: none; 68 | -o-user-select: none; 69 | user-select: none; 70 | } 71 | 72 | /** header **/ 73 | 74 | .header { 75 | z-index:1001; /* now beats that of bootstrap dropdown-menu */ 76 | position: absolute; 77 | white-space: nowrap; 78 | color: black; 79 | top:0; 80 | left:0; 81 | width:100%; 82 | margin:0px; 83 | padding:0px; 84 | height:54px; 85 | background-image:none; 86 | background-color:white; 87 | border-bottom:1px solid rgba(0,0,0,0.175); 88 | box-shadow:0 3px 12px rgba(0,0,0,0.175); 89 | } 90 | 91 | .header input { 92 | font: 16px arial,sans-serif; 93 | line-height: 1.2em !important; 94 | height: 1.2em !important; 95 | width:400px; 96 | padding:5px; 97 | margin-top:10px; 98 | border-radius:3px; 99 | border:1px solid #CCC; 100 | } 101 | 102 | .content { 103 | position:absolute; 104 | width:100%; 105 | margin:0; 106 | top:54px; 107 | bottom:0px; 108 | overflow-x:hidden; 109 | background-color:rgb(247, 247, 247); 110 | } 111 | 112 | .help { 113 | position:absolute; 114 | top:10px; 115 | right:0; 116 | min-height:100%; 117 | overflow-x:hidden; 118 | transition:width .25s; 119 | -webkit-transition:width .25s; 120 | } 121 | 122 | .leftmenu { 123 | position:fixed; 124 | width:200px; 125 | top:54px; 126 | bottom:0px; 127 | } 128 | 129 | .result { 130 | background-color:white; 131 | margin-top:0px; 132 | margin-bottom:10px; 133 | margin-left:0px; 134 | padding:4px; 135 | border:1px solid #DDD; 136 | 
position:relative; 137 | 138 | color: rgb(84, 84, 84); 139 | font-family: arial, sans-serif; 140 | font-size: 13px; 141 | font-weight: normal; 142 | line-height: 18.2px; 143 | } 144 | 145 | .result em { 146 | background-color:yellow; 147 | } 148 | 149 | .result * { 150 | background-color:transparent; 151 | } 152 | 153 | .facet { 154 | cursor:pointer; 155 | padding-left:10px; 156 | font-size:18px; 157 | } 158 | 159 | .facet:hover { 160 | color:#555; 161 | } 162 | 163 | 164 | .facet-inactive div { 165 | visibility:hidden 166 | } 167 | 168 | .extraction { 169 | color:red 170 | } 171 | 172 | .extractionBlue { 173 | color:blue 174 | } 175 | 176 | /* Highlights */ 177 | .highlight_left { 178 | border-top-left-radius:3px; 179 | border-bottom-left-radius:3px; 180 | border-bottom:1px solid; 181 | border-left:1px solid; 182 | border-top:1px solid; 183 | box-sizing:border-box; 184 | margin-left:-1px; 185 | margin-right:0px; 186 | /* 187 | box-shadow: inset 8px 0px 8px -8px red, 188 | inset 0px 8px 8px -8px red, 189 | inset 0px -8px 8px -8px red; 190 | */ 191 | } 192 | 193 | .highlight_right { 194 | border-top-right-radius:3px; 195 | border-bottom-right-radius:3px; 196 | border-bottom:1px solid; 197 | border-right:1px solid; 198 | border-top:1px solid; 199 | box-sizing:border-box; 200 | margin-right:-1px; 201 | margin-left:0px; 202 | /* 203 | box-shadow: inset -8px 0px 8px -8px red, 204 | inset 0px 8px 8px -8px red, 205 | inset 0px -8px 8px -8px red; 206 | */ 207 | } 208 | 209 | .highlight_leftright { 210 | border-radius:3px; 211 | border:1px solid transparent; 212 | box-sizing:border-box; 213 | -webkit-box-sizing:border-box; 214 | margin-left:-1px; 215 | margin-right:-1px; 216 | /* 217 | box-shadow: inset 0px 0px 8px 0px red; 218 | */ 219 | } 220 | 221 | .highlight_inner { 222 | border-top:1px solid; 223 | border-bottom:1px solid; 224 | box-sizing:border-box; 225 | padding:0px; 226 | margin:0px; 227 | /* 228 | box-shadow: inset 0px 8px 8px -8px red, 229 | inset 0px -8px 8px -8px red; 230 | */ 231 | } 232 | 233 | .highlight_red { 234 | /*background-color:rgba(255,0,0,0.3);*/ 235 | background-color:rgba(255,0,0,0.6); 236 | border-color:rgba(255,0,0,0.4); 237 | color:white; 238 | } 239 | 240 | .highlight_strongred { 241 | /* background-color:rgba(255,0,0,0.6) !important; 242 | border-color:rgba(255,0,0,0.4) !important; */ 243 | background-color:rgba(255,0,0,1) !important; 244 | border-color:rgba(255,0,0,1) !important; 245 | color:white !important; 246 | } 247 | 248 | .highlight_grey { 249 | /*background-color:rgba(255,0,0,0.3);*/ 250 | background-color:rgba(200,200,200,0.6); 251 | border-color:rgba(200,200,200,0.4); 252 | color:black; 253 | } 254 | 255 | .highlight_yellow { 256 | background-color:rgba(255,255,0,0.6); 257 | border-color:rgba(255,255,0,0.4); 258 | color:black; 259 | } 260 | 261 | .annotationsSelector { 262 | border-top:1px solid #EEE; 263 | visibility: hidden; 264 | background-color:transparent; 265 | position:absolute; 266 | padding:10px; 267 | top:0px; 268 | right:-170px; 269 | width:150px; 270 | } 271 | 272 | .result:hover .annotationsSelector { 273 | visibility: visible 274 | } 275 | 276 | .help h1 { 277 | font-size:16px; 278 | margin:0px; 279 | } 280 | 281 | .help h3 { 282 | margin:0px; 283 | margin-top:20px; 284 | margin-bottom:5px; 285 | font-weight:bold; 286 | } 287 | 288 | 289 | -------------------------------------------------------------------------------- /view/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
DeepDive 5 | 6 | 7 | 11 | 12 | 13 | 21 | 22 | 23 | 24 | 25 |
26 | 27 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /view/public/js/help/Help.js: -------------------------------------------------------------------------------- 1 | 2 | var Help = React.createClass({displayName: "Help", 3 | 4 | 5 | render: function() { 6 | var show = this.props.isHelp 7 | 8 | var wrapperStyle = {position:'fixed', top: '0px', right:0, minHeight:'100%', overflowX:'hidden', transition:'width .25s', 9 | WebkitTransition:'width .25s', backgroundColor: 'rgb(71, 71, 71)'} 10 | var columnStyle = {position:'absolute', top:'50px', paddingTop:'10px', paddingBottom:'10px', paddingLeft:'10px', paddingRight:'10px', 11 | minHeight:'100%', width:'280px', color:'white', zIndex:3} 12 | 13 | var columnStyleBackground = {} //{position:'fixed', boxSizing:'borderBox', MozBoxSizing:'border-box', WebkitBoxSizing:'border-box', 14 | //top:0, right:0, minHeight:'100%', backgroundColor:'rgba(71,71,71,1)', transition:'width .25s', WebkitTransition:'width .25s', 15 | //zIndex:1} 16 | if (show) { 17 | columnStyleBackground.width = '300px' 18 | wrapperStyle.width = '300px' 19 | } else { 20 | wrapperStyle.width = '0px' 21 | columnStyleBackground.width = '0px'; 22 | } 23 | 24 | return (React.createElement("div", {style: wrapperStyle}, 25 | React.createElement("div", {className: "help", style: columnStyle}, 26 | React.createElement("h1", null, "Query Examples"), 27 | 28 | React.createElement("h3", null, "Words and Phrases"), 29 | React.createElement("code", null, "quick"), " and ", React.createElement("code", null, "\"quick brown\""), 30 | 31 | React.createElement("h3", null, "Field names"), 32 | React.createElement("code", null, "_id:4325235"), React.createElement("br", null), 33 | React.createElement("code", null, "title:(quick OR brown)"), React.createElement("br", null), 34 | React.createElement("code", null, "book.\\*:(quick brown)"), React.createElement("br", null), 35 | React.createElement("code", null, "_missing_:title"), React.createElement("br", null), 36 | React.createElement("code", null, "_exists_:title"), 37 | 38 | React.createElement("h3", null, "Wildcards"), 39 | React.createElement("code", null, "qu?ck bro*"), 40 | 41 | React.createElement("h3", null, "Regular Expressions"), 42 | React.createElement("code", null, "name:/joh?n(ath[oa]n)/"), 43 | 44 | React.createElement("h3", null, "Fuzziness"), 45 | React.createElement("code", null, "quikc~ brwn~ foks~"), React.createElement("br", null), 46 | React.createElement("code", null, "quikc~1"), 47 | 48 | React.createElement("h3", null, "Proximity Searches"), 49 | React.createElement("code", null, "\"fox quick\"~5"), 50 | 51 | React.createElement("h3", null, "Ranges"), 52 | React.createElement("code", null, "date:[2012-01-01 TO 2012-12-31]"), React.createElement("br", null), 53 | React.createElement("code", null, "count:[1 TO 5]"), React.createElement("br", null), 54 | React.createElement("code", null, "tag: ", "{", "alpha TO omega", "}"), React.createElement("br", null), 55 | React.createElement("code", null, "count:[10 TO *]"), React.createElement("br", null), 56 | React.createElement("code", null, "date:", "{", "* TO 2012-01-01", "}"), React.createElement("br", null), 57 | React.createElement("code", null, "count:[1 TO 5", "}"), React.createElement("br", null), 58 | React.createElement("code", null, "age:>=10"), React.createElement("br", null), 59 | React.createElement("code", null, "age:(>=10 AND <20)"), 60 | 61 | React.createElement("h3", null, "Boosting"), 62 | 
React.createElement("code", null, "quick^2 fox"), React.createElement("br", null), 63 | React.createElement("code", null, "\"john smith\"^2"), React.createElement("br", null), 64 | React.createElement("code", null, "(foo bar)^4"), 65 | 66 | React.createElement("h3", null, "Boolean Operators"), 67 | React.createElement("code", null, "quick brown +fox -news"), React.createElement("br", null), 68 | React.createElement("code", null, "((quick AND fox) OR (brown AND fox) OR fox) AND NOT news"), 69 | 70 | React.createElement("h3", null, "Grouping"), 71 | React.createElement("code", null, "(quick OR brown) AND fox"), React.createElement("br", null), 72 | React.createElement("code", null, "status:(active OR pending) title:(full text search)^2"), 73 | 74 | React.createElement("h3", null, "Reserved Characters"), 75 | "Escape with backslash", React.createElement("br", null), 76 | "Example: ", React.createElement("code", null, "\\(1\\+1\\)\\=2"), " , finds (1+1)=2 ", React.createElement("br", null), 77 | "Characters: ", React.createElement("code", null, "+ - = && || > < ! ( ) ", "{", " ", "}", " [ ] ^ \" ~ * ? : \\ /"), 78 | 79 | React.createElement("h3", null, "Empty Query"), 80 | "Shows all results.", 81 | 82 | React.createElement("p", null, 83 | "For more details, see ", React.createElement("a", {href: "https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax", target: "_blank"}, "here"), "." 84 | ) 85 | ), 86 | React.createElement("div", {style: columnStyleBackground}, 87 | React.createElement("div", {style: {position:'absolute', borderLeft:'1px solid white', minHeight:'100%', width:'1px'}}) 88 | ) 89 | )) 90 | } 91 | }) 92 | 93 | module.exports = Help 94 | -------------------------------------------------------------------------------- /view/public/js/vis/AnnotationsSelector.js: -------------------------------------------------------------------------------- 1 | 2 | var AnnotationsSelector = React.createClass({displayName: "AnnotationsSelector", 3 | 4 | render: function() { 5 | var onLayerChange = this.props.onLayerChange 6 | 7 | var buttons = this.props.layers.map(function(result) { 8 | return ( 9 | React.createElement(AnnotationsSelectorButton, {data: result, 10 | onLayerChange: onLayerChange}) 11 | ); 12 | }); 13 | return (React.createElement("div", {className: "annotationsSelector"}, buttons)); 14 | } 15 | }); 16 | 17 | var AnnotationsSelectorButton = React.createClass({displayName: "AnnotationsSelectorButton", 18 | handleClick: function() { 19 | var active = !this.props.data.active; 20 | this.props.onLayerChange(this.props.data.name, active); 21 | }, 22 | render: function() { 23 | var classes = 'facet'; 24 | if (!this.props.data.active) 25 | classes += ' facet-inactive'; 26 | return (React.createElement("div", {style: {fontSize:'10pt'}, className: classes, onClick: this.handleClick}, 27 | React.createElement("div", {style: {display:'inline-block',width:'30px'}}, 28 | React.createElement("i", {className: "fa fa-check"}) 29 | ), this.props.data.name 30 | )) 31 | } 32 | }) 33 | 34 | module.exports = AnnotationsSelector -------------------------------------------------------------------------------- /view/public/js/vis/TextWithAnnotations.js: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | 3 | var SpansVisualization = require('./core/SpansVisualization.js') 4 | var TokenTagsVisualization = require('./core/TokenTagsVisualization.js') 5 | var EdgesVisualization = 
require('./core/EdgesVisualization.js') 6 | 7 | var TokensVisualization = function(element, source) { 8 | return SpansVisualization(element, source.tokenOffsets) 9 | } 10 | 11 | var SentencesVisualization = function(element, source) { 12 | return SpansVisualization(element, source.sentenceOffsets) 13 | } 14 | 15 | var PartOfSpeechVisualization = function(element, source) { 16 | return TokenTagsVisualization(element, source.tokenOffsets, source.poss) 17 | } 18 | 19 | var LemmasVisualization = function(element, source) { 20 | return TokenTagsVisualization(element, source.tokenOffsets, source.lemmas) 21 | } 22 | 23 | var DependenciesVisualization = function(element, source) { 24 | return EdgesVisualization(element, source.tokenOffsets, source.sentenceOffsets, source.sentenceTokenOffsets, source.sentenceDependencies) 25 | } 26 | 27 | var ExtractorsVisualization = function(element, source, annotations) { 28 | var sentenceTokenOffsets = source['sentenceTokenOffsets'] 29 | var tokenOffsets = source['tokenOffsets'] 30 | var extractorOffsets = [] 31 | 32 | $.each(annotations, function(i, a) { 33 | var sentNum = a.range.sentNum 34 | var sentenceBeginToken = sentenceTokenOffsets[sentNum][0] 35 | var tokenFrom = sentenceBeginToken + a.range.f 36 | var tokenTo = sentenceBeginToken + a.range.t 37 | var charFrom = tokenOffsets[tokenFrom][0] 38 | var charTo = tokenOffsets[tokenTo - 1][1] 39 | extractorOffsets.push([charFrom,charTo]) 40 | }) 41 | return SpansVisualization(element, extractorOffsets) 42 | } 43 | 44 | var TextWithAnnotations = React.createClass({displayName: "TextWithAnnotations", 45 | 46 | componentDidMount: function() { 47 | this.vis = {} 48 | this.buildCustomDom() 49 | }, 50 | componentDidUpdate: function() { 51 | this.buildCustomDom() 52 | }, 53 | buildCustomDom: function() { 54 | var div = React.findDOMNode(this) 55 | //cleanup existing visualizations 56 | $.each(this.vis, function(k,v) { v.destroy() }) 57 | 58 | this.vis = {} 59 | 60 | var annotations = this.props.data.annotations 61 | var sourceData = this.props.data._source 62 | var vis = this.vis 63 | 64 | $.each(this.props.layers, function(i, l) { 65 | if (vis && vis[l.name] && !l.active) { 66 | vis[l.name].destroy() 67 | delete vis[l.name] 68 | } 69 | if (vis && !vis[l.name] && l.active) { 70 | if (l.name == 'Tokens') 71 | vis[l.name] = new TokensVisualization(div, sourceData) 72 | if (l.name == 'Sentences') 73 | vis[l.name] = new SentencesVisualization(div, sourceData) 74 | if (l.name == 'Extractors') 75 | vis[l.name] = new ExtractorsVisualization(div, sourceData, annotations) 76 | if (l.name == 'Dependencies') 77 | vis[l.name] = new DependenciesVisualization(div, sourceData) 78 | if (l.name == 'Lemmas') 79 | vis[l.name] = new LemmasVisualization(div, sourceData) 80 | if (l.name == 'PartOfSpeech') 81 | vis[l.name] = new PartOfSpeechVisualization(div, sourceData) 82 | } 83 | }) 84 | }, 85 | isActive: function(name) { 86 | var isActive = false 87 | $.each(this.props.layers, function(i, l) { 88 | if (l.name == name) { isActive = l.active; return false } 89 | }) 90 | return isActive 91 | }, 92 | 93 | render: function() { 94 | content = this.props.data._source.content; 95 | // if we have field with keyword highlighting, take that 96 | if (this.props.data.highlight != null && 97 | this.props.data.highlight.content != null) { 98 | content = this.props.data.highlight.content[0]; 99 | } 100 | var details = [] 101 | if (this.isActive('Details')) { 102 | $.each(this.props.data.annotations, function(i, value) { 103 | 
details.push(React.createElement("div", {className: "extractionBlue"}, JSON.stringify(value), " ")); 104 | }) 105 | $.each(this.props.data._source, function(name, value) { 106 | if (name != 'content' && name != 'id') 107 | details.push (React.createElement("div", {className: "extraction"}, name, " : ", JSON.stringify(value), " ")); 108 | }) 109 | } 110 | 111 | var div = (React.createElement("div", null, React.createElement("span", {dangerouslySetInnerHTML: {__html: content}}), 112 | React.createElement("br", null), React.createElement("div", {style: {'color':'green'}}, this.props.data._id), 113 | details 114 | )) 115 | 116 | return div; 117 | } 118 | }); 119 | 120 | module.exports = TextWithAnnotations 121 | 122 | -------------------------------------------------------------------------------- /view/public/js/vis/core/CharOffsets.js: -------------------------------------------------------------------------------- 1 | var CharOffsets = (function() { 2 | var ELEMENT = 1; 3 | var TEXT = 3; 4 | 5 | var offsetComparator = function(e1, e2) { 6 | return e1.readrOffset - e2.readrOffset; 7 | }; 8 | 9 | var indexOffsets = function(node, offset) { 10 | node.readrOffset = offset; 11 | if (node.nodeType == TEXT) { 12 | node.readrLength = node.nodeValue.length; 13 | } else if (node.nodeType == ELEMENT) { 14 | // ignore if has class ignoreReadrLength 15 | if (goog.dom.classes.has(node, 'ignoreReadrLength')) { 16 | node.readrLength = 0; 17 | } else { 18 | // sum up lengths of children 19 | var l = 0; 20 | for (var i=0, ii = node.childNodes.length; i < ii; i++) { 21 | var child = node.childNodes[i]; 22 | indexOffsets(child, offset + l); 23 | l += child.readrLength; 24 | } 25 | node.readrLength = l; 26 | } 27 | } 28 | }; 29 | 30 | var getTextRangesToHighlightFromIndex = function(node, start, end) { 31 | var results = new Array(); 32 | recur(node, start, end, results); 33 | return results; 34 | }; 35 | 36 | var recur = function(node, start, end, results) { 37 | if (end - start <= 0) return; 38 | 39 | // we assume that start >= node.readrOffset and end <= node.readrOffset + node.readrLength 40 | if (node.nodeType == TEXT) { 41 | results.push([node, start - node.readrOffset, end - node.readrOffset, start, end]); 42 | return; 43 | } 44 | // binary search for start and end 45 | var ns = goog.array.binarySearch(node.childNodes, { readrOffset : start }, offsetComparator); 46 | var ne = goog.array.binarySearch(node.childNodes, { readrOffset : end }, offsetComparator); 47 | 48 | if (ns < 0) { ns = -ns-2; } 49 | if (ne < 0) { ne = -ne-1; } 50 | 51 | for (var i=ns; i < ne; i++) { 52 | var child = node.childNodes[i]; 53 | var s = (i==ns)? start : child.readrOffset; 54 | var e = (i==ne-1)? 
end : child.readrOffset + child.readrLength; 55 | 56 | recur(child, s, e, results); 57 | } 58 | }; 59 | 60 | var createMultiRangeSpans = function(element, tokenOffsets, renderedSpans, documentOffset) { 61 | if (!renderedSpans) 62 | renderedSpans = new Array(); 63 | if (!documentOffset) 64 | documentOffset = 0 65 | indexOffsets(element[0], documentOffset) 66 | for (var j=0, jj = tokenOffsets.length; j < jj; j++) { 67 | // token has offsets t.f, t.t 68 | var rs = createSingleRangeSpans(element, tokenOffsets[j]); 69 | renderedSpans.push(rs); 70 | } 71 | return renderedSpans; 72 | }; 73 | 74 | var FROM = 0 75 | var TO = 1 76 | 77 | // example tokenOffset: { f:12, t:23 } 78 | var createSingleRangeSpans = function(element, tokenOffset) { 79 | //if (!documentOffset) 80 | //documentOffset = 0 81 | var sels = new Array(); 82 | var todo = getTextRangesToHighlightFromIndex 83 | (element[0], tokenOffset[FROM], tokenOffset[TO]); 84 | for (var i=0, ii = todo.length; i < ii; i++) { 85 | var t = todo[i]; 86 | var range = goog.dom.Range.createFromNodes(t[0], t[1], t[0], t[2]); 87 | var el = goog.dom.createDom('span'); //, { 'style':'background-color:green'}); 88 | range.surroundContents(el); 89 | indexOffsets(t[0].parentNode, t[0].parentNode.readrOffset); 90 | sels.push(el); 91 | } 92 | return { sels:sels }; 93 | }; 94 | 95 | //note, the output of this function is a singleton 96 | return { 97 | indexOffsets: indexOffsets, 98 | getTextRangesToHighlightFromIndex: getTextRangesToHighlightFromIndex, 99 | createMultiRangeSpans: createMultiRangeSpans, 100 | createSingleRangeSpans: createSingleRangeSpans 101 | }; 102 | })() 103 | 104 | module.exports = CharOffsets -------------------------------------------------------------------------------- /view/public/js/vis/core/FramesVisualization.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | // 5 | // 6 | //var FramesVisualization = function(element, source) { 7 | // var state ={} 8 | // 9 | // var documentOffset = 0 10 | // 11 | // var msHeadSpans = new Array(); 12 | // CharOffsets.createMultiRangeSpans(textElement, msHeadOffsets, msHeadSpans, documentOffset) 13 | // 14 | // 15 | // 16 | //} -------------------------------------------------------------------------------- /view/public/js/vis/core/SpansVisualization.js: -------------------------------------------------------------------------------- 1 | /* TokensVisualization */ 2 | var CharOffsets = require('./CharOffsets.js') 3 | 4 | var Span = function(sels) { 5 | var state = {} 6 | 7 | var fragment = function(i, length) { 8 | var fragment = ''; 9 | if (i==0 && i < length-1) fragment = 'left'; 10 | else if (i==0 && i==length-1) fragment = 'leftright'; 11 | else if (i==length-1 && i > 0) fragment = 'right'; 12 | else if (i > 0 && i < length-1) fragment = 'inner'; 13 | return fragment; 14 | }; 15 | 16 | // initialize 17 | state.sels = sels 18 | state.color = 'red' 19 | if (!sels) return; 20 | var ii = sels.length; 21 | $.each(sels, function(i, sel) { 22 | $(sel).addClass('highlight_' + state.color); 23 | $(sel).addClass('highlight_' + fragment(i, ii)); 24 | //$(sel).on('click', function() { 25 | // console.log('clicked'); 26 | //}); 27 | }) 28 | 29 | state.destroy = function() { 30 | // unbind all handlers 31 | if (!state.sels) return; 32 | 33 | $.each(state.sels, function(sel) { 34 | //$(sel).unbind('click'); 35 | }); 36 | } 37 | return state 38 | } 39 | 40 | var SpansVisualization = function(element, spans) { 41 | var state = { 42 | renderedSpans: new Array(), 43 | 
destroyed: false 44 | }; 45 | 46 | //var documentOffset = scope.document.offset 47 | var documentOffset = 0 48 | 49 | CharOffsets.createMultiRangeSpans([element,this], spans, state.renderedSpans, documentOffset) 50 | 51 | $.each(state.renderedSpans, function(i, rs) { 52 | var span = new Span(rs.sels) 53 | }); 54 | 55 | state.destroy = function() { 56 | state.destroyed = true; 57 | $.each(state.renderedSpans, function(i, value) { 58 | // do bound listeners automatically get destroyed?? 59 | //value.element.remove(); 60 | //value.scope.$destroy(); 61 | 62 | //$.each(value.aux, function(j,n) { 63 | // goog.dom.removeNode(n); 64 | //}); 65 | $.each(value.sels, function(j,n) { 66 | goog.dom.flattenElement(n); 67 | }); 68 | value.sels = []; 69 | }); 70 | //element.remove(); 71 | //goog.editor.range.normalizeNode(element[0]); 72 | state.renderedSpans.length = 0; 73 | } 74 | return state 75 | } 76 | 77 | module.exports = SpansVisualization -------------------------------------------------------------------------------- /view/public/js/vis/core/TokenTagsVisualization.js: -------------------------------------------------------------------------------- 1 | /* TokenTagsVisualization */ 2 | 3 | var CharOffsets = require('./CharOffsets.js') 4 | 5 | var TokenTagsVisualization = function(element, tokenOffsets, tags) { 6 | var state = { 7 | renderedSpans: new Array(), 8 | destroyed: false 9 | }; 10 | 11 | //var documentOffset = scope.document.offset 12 | var documentOffset = 0 13 | 14 | // insert spans 15 | CharOffsets.createMultiRangeSpans([element,this], tokenOffsets, state.renderedSpans, documentOffset) 16 | 17 | $.each(state.renderedSpans, function(i, rs) { 18 | var firstSpan = rs.sels[0] 19 | var el = goog.dom.createDom('div', { 'style' : 20 | 'position:absolute;' + 21 | 'top:-15px;' + 22 | 'left:0px;right:0px;' + 23 | 'z-index:0;' + 24 | 'width:100px;' + //' + tokenWidth + 'px;' + 25 | 'height:20px;' + 26 | 'color:red;' + 27 | 'font-size:10px;' + 28 | 'font-family:helvetica,arial;' + 29 | 'font-stretch:semi-condensed;' + 30 | 'font-weight:500;'/* + 31 | 'background-color:white'*/ 32 | }) 33 | el.appendChild(goog.dom.createTextNode(tags[i])) 34 | // if you want all lines to be equal height, set marginTop as follows 35 | //var marginTop = (drawing.highestLevels[i]+1) * 15; 36 | // if you want to use inline rather than inline-block spans, use following line 37 | //$(firstSpan).attr('style', 'display:inline;line-height:' + (marginTop + 20) + 38 | // 'px;margin-top:' + marginTop + 'px;position:relative'); 39 | var marginTop = 10 40 | $(firstSpan).attr('style', 'display:inline-block;margin-top:' + marginTop + 'px;position:relative') 41 | firstSpan.appendChild(el) 42 | rs.aux = new Array() 43 | rs.aux.push(el) 44 | }) 45 | 46 | state.destroy = function() { 47 | state.destroyed = true; 48 | $.each(state.renderedSpans, function(i, value) { 49 | $.each(value.aux, function(j, n) { 50 | goog.dom.removeNode(n); 51 | }) 52 | $.each(value.sels, function(j, n) { 53 | goog.dom.flattenElement(n); 54 | }) 55 | value.sels = []; 56 | }); 57 | //element.remove(); 58 | //goog.editor.range.normalizeNode(element[0]); 59 | state.renderedSpans.length = 0; 60 | } 61 | return state 62 | } 63 | 64 | module.exports = TokenTagsVisualization 65 | -------------------------------------------------------------------------------- /view/public/js/vis/visframe.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | var FramesVisualization = function(element, source) { 7 | var state ={} 
8 | 9 | var documentOffset = 0 10 | 11 | var msHeadSpans = new Array(); 12 | CharOffsets.createMultiRangeSpans(textElement, msHeadOffsets, msHeadSpans, documentOffset) 13 | 14 | 15 | 16 | } -------------------------------------------------------------------------------- /view/public/js/vis/visspan.js: -------------------------------------------------------------------------------- 1 | /* TokensVisualization */ 2 | 3 | var Span = function(sels) { 4 | var state = {} 5 | 6 | var fragment = function(i, length) { 7 | var fragment = ''; 8 | if (i==0 && i < length-1) fragment = 'left'; 9 | else if (i==0 && i==length-1) fragment = 'leftright'; 10 | else if (i==length-1 && i > 0) fragment = 'right'; 11 | else if (i > 0 && i < length-1) fragment = 'inner'; 12 | return fragment; 13 | }; 14 | 15 | // initialize 16 | state.sels = sels 17 | state.color = 'red' 18 | if (!sels) return; 19 | var ii = sels.length; 20 | $.each(sels, function(i, sel) { 21 | $(sel).addClass('highlight_' + state.color); 22 | $(sel).addClass('highlight_' + fragment(i, ii)); 23 | //$(sel).on('click', function() { 24 | // console.log('clicked'); 25 | //}); 26 | }) 27 | 28 | state.destroy = function() { 29 | // unbind all handlers 30 | if (!state.sels) return; 31 | 32 | $.each(state.sels, function(sel) { 33 | //$(sel).unbind('click'); 34 | }); 35 | } 36 | return state 37 | } 38 | 39 | 40 | var SpansVisualization = function(element, spans) { 41 | var state = { 42 | renderedSpans: new Array(), 43 | destroyed: false 44 | }; 45 | 46 | //var documentOffset = scope.document.offset 47 | var documentOffset = 0 48 | 49 | CharOffsets.createMultiRangeSpans([element,this], spans, state.renderedSpans, documentOffset) 50 | 51 | $.each(state.renderedSpans, function(i, rs) { 52 | var span = new Span(rs.sels) 53 | }); 54 | 55 | state.destroy = function() { 56 | state.destroyed = true; 57 | $.each(state.renderedSpans, function(i, value) { 58 | // do bound listeners automatically get destroyed?? 
59 | //value.element.remove(); 60 | //value.scope.$destroy(); 61 | 62 | //$.each(value.aux, function(j,n) { 63 | // goog.dom.removeNode(n); 64 | //}); 65 | $.each(value.sels, function(j,n) { 66 | goog.dom.flattenElement(n); 67 | }); 68 | value.sels = []; 69 | }); 70 | //element.remove(); 71 | //goog.editor.range.normalizeNode(element[0]); 72 | state.renderedSpans.length = 0; 73 | } 74 | return state 75 | } 76 | 77 | 78 | 79 | 80 | var TokensVisualization = function(element, source) { 81 | return SpansVisualization(element, source.tokenOffsets) 82 | } 83 | 84 | var SentencesVisualization = function(element, source) { 85 | return SpansVisualization(element, source.sentenceOffsets) 86 | } 87 | -------------------------------------------------------------------------------- /view/public/js/vis/vistokentag.js: -------------------------------------------------------------------------------- 1 | /* TokenTagsVisualization */ 2 | 3 | var TokenTagsVisualization = function(element, tokenOffsets, tags) { 4 | var state = { 5 | renderedSpans: new Array(), 6 | destroyed: false 7 | }; 8 | 9 | //var documentOffset = scope.document.offset 10 | var documentOffset = 0 11 | 12 | // insert spans 13 | CharOffsets.createMultiRangeSpans([element,this], tokenOffsets, state.renderedSpans, documentOffset) 14 | 15 | $.each(state.renderedSpans, function(i, rs) { 16 | var firstSpan = rs.sels[0] 17 | var el = goog.dom.createDom('div', { 'style' : 18 | 'position:absolute;' + 19 | 'top:-15px;' + 20 | 'left:0px;right:0px;' + 21 | 'z-index:0;' + 22 | 'width:100px;' + //' + tokenWidth + 'px;' + 23 | 'height:20px;' + 24 | 'color:red;' + 25 | 'font-size:10px;' + 26 | 'font-family:helvetica,arial;' + 27 | 'font-stretch:semi-condensed;' + 28 | 'font-weight:500;'/* + 29 | 'background-color:white'*/ 30 | }) 31 | el.appendChild(goog.dom.createTextNode(tags[i])) 32 | // if you want all lines to be equal height, set marginTop as follows 33 | //var marginTop = (drawing.highestLevels[i]+1) * 15; 34 | // if you want to use inline rather than inline-block spans, use following line 35 | //$(firstSpan).attr('style', 'display:inline;line-height:' + (marginTop + 20) + 36 | // 'px;margin-top:' + marginTop + 'px;position:relative'); 37 | var marginTop = 10 38 | $(firstSpan).attr('style', 'display:inline-block;margin-top:' + marginTop + 'px;position:relative') 39 | firstSpan.appendChild(el) 40 | rs.aux = new Array() 41 | rs.aux.push(el) 42 | }) 43 | 44 | state.destroy = function() { 45 | state.destroyed = true; 46 | $.each(state.renderedSpans, function(i, value) { 47 | $.each(value.aux, function(j, n) { 48 | goog.dom.removeNode(n); 49 | }) 50 | $.each(value.sels, function(j, n) { 51 | goog.dom.flattenElement(n); 52 | }) 53 | value.sels = []; 54 | }); 55 | //element.remove(); 56 | //goog.editor.range.normalizeNode(element[0]); 57 | state.renderedSpans.length = 0; 58 | } 59 | return state 60 | } 61 | 62 | 63 | var PartOfSpeechVisualization = function(element, source) { 64 | return TokenTagsVisualization(element, source.tokenOffsets, source.poss) 65 | } 66 | 67 | var LemmasVisualization = function(element, source) { 68 | return TokenTagsVisualization(element, source.tokenOffsets, source.lemmas) 69 | } 70 | -------------------------------------------------------------------------------- /view/public/js/visframe.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | var FramesVisualization = function(element, source) { 7 | var state ={} 8 | 9 | var documentOffset = 0 10 | 11 | var msHeadSpans = 
new Array(); 12 | CharOffsets.createMultiRangeSpans(textElement, msHeadOffsets, msHeadSpans, documentOffset) 13 | 14 | 15 | 16 | } -------------------------------------------------------------------------------- /view/routes/index.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var router = express.Router(); 3 | var path = require('path'); 4 | 5 | var elasticsearch = require('elasticsearch'); 6 | var client = new elasticsearch.Client({ 7 | host: 'http://localhost:9200' 8 | }); 9 | 10 | router.get('/', function(req, res, next) { 11 | //res.render('index', { title: 'Express3' }); 12 | res.sendFile(path.join(__dirname + '/../public/index.html')); 13 | }); 14 | 15 | router.get('/search*', function(req, res, ext) { 16 | //res.render('index', { title: 'Express3' }); 17 | res.sendFile(path.join(__dirname + '/../public/index.html')); 18 | }); 19 | 20 | router.get('/annotators', function(req, res, next) { 21 | var index = req.query.index || 'view' 22 | client.search({ 23 | index: index, //process.env.INDEX_NAME, 24 | type: 'annotators', 25 | body: { 26 | query: { 27 | 'match_all': {} 28 | } 29 | } 30 | }).then(function(body) { 31 | var hits = body.hits.hits; 32 | res.send(hits); 33 | }, function (err) { 34 | console.trace(err.message); 35 | next(err); 36 | }); 37 | }); 38 | 39 | router.get('/annotations', function(req, res, next) { 40 | var doc_ids = [] 41 | var doc_ids_str = req.param('doc_ids') 42 | if (doc_ids_str) doc_ids = doc_ids_str.split(',') 43 | var index = req.query.index || 'view' 44 | 45 | var obj = { 46 | index: index, //process.env.INDEX_NAME, 47 | type: 'annotations', 48 | from: 0, 49 | size: 100000, 50 | body: { 51 | "query" : { 52 | "has_parent": { 53 | "type": "docs", 54 | "query": { 55 | "ids" : { 56 | "values" : doc_ids 57 | } 58 | } 59 | } 60 | } 61 | } 62 | } 63 | 64 | client.search( 65 | obj 66 | ).then(function (body) { 67 | var hits = body.hits.hits; 68 | res.send(hits) 69 | }, function (err) { 70 | console.trace(err.message); 71 | next(err) 72 | }); 73 | }); 74 | 75 | 76 | 77 | router.get('/docs', function(req, res, next) { 78 | var from = req.param('from', 0) 79 | var limit = req.param('limit', 100) 80 | var keywords = req.query.keywords || '' 81 | var facets = req.query.facets || '' 82 | var index = req.query.index || 'view' 83 | 84 | var obj = { 85 | index: index, //process.env.INDEX_NAME, 86 | type: 'docs', 87 | from: from, 88 | size: limit, 89 | body: { 90 | query: { 91 | "match_all" : {} 92 | }, 93 | highlight : { 94 | fields : { 95 | content : { "number_of_fragments" : 0 } 96 | } 97 | } 98 | } 99 | } 100 | 101 | if (keywords.length > 0) { 102 | obj.body.query = { 103 | query_string: { 104 | "default_field" : "content", 105 | "fields" : ["content", "_id", "id"], 106 | "query" : keywords 107 | } 108 | } 109 | } 110 | 111 | if (facets.length > 0) { 112 | var l = facets.split(',') 113 | 114 | var filters = [] 115 | for (var i=0; i < l.length; i++) 116 | filters.push({ 117 | //"exists" : { "field" : l[i] } 118 | "has_child" : { 119 | "type" : "annotations", 120 | "query" : { 121 | "term" : { 122 | "attribute" : l[i] 123 | } 124 | } 125 | } 126 | }); 127 | 128 | if (filters.length > 1) 129 | obj.body.filter = { 130 | "and" : filters 131 | } 132 | else 133 | obj.body.filter = filters[0] 134 | } 135 | 136 | client.search(obj).then(function (body) { 137 | var docs_context = body.hits 138 | var docs = body.hits.hits; 139 | 140 | // we now have the documents, run another query to get all 
annotations on 141 | // these documents 142 | var doc_ids = new Array(docs.length) 143 | for (var i=0, ii = docs.length; i < ii; i++) 144 | doc_ids[i] = docs[i]._id 145 | 146 | var obj = { 147 | index: index, //process.env.INDEX_NAME, 148 | type: 'annotations', 149 | from:0, 150 | size:100000, 151 | body: { 152 | "query" : { 153 | "has_parent": { 154 | "type": "docs", 155 | "query": { 156 | "ids" : { 157 | "values" : doc_ids 158 | } 159 | } 160 | } 161 | } 162 | } 163 | } 164 | client.search(obj).then(function(body) { 165 | var hits = body.hits.hits 166 | // build a little index of the annotations 167 | var id2ann = {} 168 | for (var i = 0, ii = hits.length; i < ii; i++) { 169 | var id = hits[i]._source.range.doc_id 170 | if (id in id2ann) 171 | id2ann[id].push(hits[i]._source) 172 | else 173 | id2ann[id] = [hits[i]._source] 174 | } 175 | // add to docs 176 | for (var i=0, ii = docs.length; i < ii; i++) { 177 | docs[i].annotations = id2ann[docs[i]._id] || [] 178 | } 179 | res.send(docs_context) 180 | }, function(err) { 181 | console.trace(err.message); 182 | next(err) 183 | }); 184 | 185 | //res.send(hits) 186 | 187 | }, function (err) { 188 | console.trace(err.message); 189 | next(err) 190 | }); 191 | }); 192 | 193 | 194 | module.exports = router; 195 | -------------------------------------------------------------------------------- /view/routes/users.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var router = express.Router(); 3 | 4 | /* GET users listing. */ 5 | router.get('/', function(req, res, next) { 6 | res.send('respond with a resource'); 7 | }); 8 | 9 | module.exports = router; 10 | -------------------------------------------------------------------------------- /view/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ $(uname) = Darwin ]] && 4 | osascript -e 'get path to application "iTerm"' &>/dev/null; then 5 | # On a Mac with iTerm.app, do something nice 6 | start() { ( source ./util/tab && tab "$@" ); } 7 | else 8 | # otherwise, just run the process 9 | start() { local title=$1; shift; "$@" & } 10 | trap wait EXIT 11 | fi 12 | 13 | # launch elasticsearch 14 | 15 | start "ElasticSearch" elasticsearch 16 | 17 | # launch nodejs 18 | start "Nodejs" npm start 19 | 20 | # launch react jsx watch 21 | start "Reactjs" jsx --watch view/ public/js 22 | 23 | -------------------------------------------------------------------------------- /view/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # install virtualenv 4 | command -v virtualenv >/dev/null 2>&1 || { 5 | echo >&2 "virtualenv required but not installed. Aborting."; 6 | echo >&2 "You can install virtualenv with:" 7 | echo >&2 " sudo pip install virtualenv" 8 | } 9 | 10 | virtualenv env 11 | source env/bin/activate 12 | 13 | # install python dependencies 14 | pip install elasticsearch 15 | pip install pyhocon 16 | pip install psycopg2 17 | 18 | # install node packages 19 | npm install 20 | 21 | # elasticsearch 22 | cd util 23 | ES_VER=elasticsearch-1.6.0 24 | if [ ! -f ${ES_VER}.tar.gz ]; then 25 | curl -L -O https://download.elastic.co/elasticsearch/elasticsearch/${ES_VER}.tar.gz 26 | tar xvzf ${ES_VER}.tar.gz 27 | fi 28 | # must add 29 | echo "script.disable_dynamic: false" >> ${ES_VER}/config/elasticsearch.yml 30 | cd .. 
31 | 32 | # for development, we would like to enable auto-reload 33 | npm install react-tools nodemon 34 | 35 | cd public 36 | git clone https://github.com/google/closure-library 37 | 38 | 39 | -------------------------------------------------------------------------------- /view/util/cat.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | curl localhost:9200/_cat/indices/?v 4 | -------------------------------------------------------------------------------- /view/util/create_index.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # exists 4 | HEAD=$(curl -s -XHEAD -i 'http://localhost:9200/view') 5 | [ "${HEAD:0:15}" == "HTTP/1.1 200 OK" ] && EXISTS=1 6 | if [ $EXISTS ]; then 7 | curl -XDELETE 'http://localhost:9200/view/' 8 | fi 9 | 10 | INDEX_NAME=view 11 | TYPE_DOCS_NAME=docs 12 | 13 | curl -XPOST localhost:9200/$INDEX_NAME -d '{ 14 | "settings" : { 15 | "index" : { 16 | "number_of_shards" : 1 17 | }, 18 | "analysis" : { 19 | "analyzer" : { 20 | "fulltext_analyzer" : { 21 | "type" : "custom", 22 | "tokenizer" : "whitespace", 23 | "filter" : [ 24 | "lowercase" 25 | ] 26 | } 27 | } 28 | } 29 | }, 30 | "mappings" : { 31 | "annotations" : { 32 | "_source" : { "enabled" : true }, 33 | "_parent" : { 34 | "type" : "docs" 35 | }, 36 | "properties" : {} 37 | }, 38 | "docs" : { 39 | "_source" : { "enabled" : true }, 40 | "properties" : { 41 | "id" : { 42 | "type" : "string" 43 | }, 44 | "content" : { 45 | "type" : "string", 46 | "term_vector" : "with_positions_offsets", 47 | "store" : false, 48 | "index_analyzer" : "fulltext_analyzer", 49 | "norms" : { 50 | "enabled" : false 51 | } 52 | }, 53 | "text" : { 54 | "type" : "string", 55 | "term_vector" : "with_positions_offsets", 56 | "index_analyzer" : "fulltext_analyzer" 57 | }, 58 | "extr1" : { 59 | "type" : "string", 60 | "index" : "not_analyzed" 61 | }, 62 | "extr1_meta" : { 63 | "type" : "string", 64 | "index" : "not_analyzed" 65 | } 66 | } 67 | } 68 | } 69 | }' 70 | -------------------------------------------------------------------------------- /view/util/fetch-annotations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pyhocon import ConfigFactory 4 | import json 5 | import psycopg2 6 | import psycopg2.extras 7 | import sys 8 | 9 | conf = ConfigFactory.parse_file('../view.conf') 10 | 11 | conf_annotations = conf.get_list('view.annotations') 12 | 13 | def write_annotations(): 14 | # write extractions to json file 15 | dbconf = conf.get('view.db.default') 16 | conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % ( 17 | dbconf.get('host'), 18 | dbconf.get('dbname'), 19 | dbconf.get('user'), 20 | dbconf.get('password')) 21 | conn = psycopg2.connect(conn_string) 22 | for ann in conf_annotations: 23 | with open('../' + ann.get('input'), 'w') as w: 24 | cursor = conn.cursor('ann_cursor', cursor_factory=psycopg2.extras.DictCursor) 25 | cursor.execute(ann.get('sql.query')) 26 | for row in cursor: 27 | #print(row) 28 | # TODO: must write into the following format 29 | # each row: 30 | # {"range":{"type":"sentenceTokenSpan","doc_id":"doc123","sentNum":0,"f":3,"t":4},"target":{"entity":"something"}} 31 | # save in file using w.write 32 | obj = {"id":row[0], "range":{"type":"sentenceTokenSpan","doc_id":row[1],"sentNum":0,"f":row[2],"t":int(row[3])},"target":{"entity":row[4]}} 33 | w.write(json.dumps(obj)) 34 | w.write('\n') 35 | 36 | 
write_annotations() 37 | -------------------------------------------------------------------------------- /view/util/fetch-sentences-table.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # Legacy support for sentences table in DeepDive. 4 | # The script reads the table from the database and stores it in the new column format. 5 | 6 | from pyhocon import ConfigFactory 7 | import json 8 | import psycopg2 9 | import psycopg2.extras 10 | import sys 11 | import pipe 12 | 13 | conf = ConfigFactory.parse_file('../view.conf') 14 | 15 | docs = conf.get('view.docs') 16 | 17 | def find_token_offsets(s): 18 | # split on whitespace 19 | pos = [ -1 ] + [ i for i, ltr in enumerate(s) if ltr == ' ' ] + [ len(s) ] 20 | offsets = [ [ pos[i] + 1, pos[i + 1] ] for i in range(0, len(pos) - 1) ] 21 | return offsets 22 | 23 | def write_docs(): 24 | # write extractions to json file 25 | dbconf = conf.get('view.db.default') 26 | conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % ( 27 | dbconf.get('host'), 28 | dbconf.get('dbname'), 29 | dbconf.get('user'), 30 | dbconf.get('password')) 31 | conn = psycopg2.connect(conn_string) 32 | cursor = conn.cursor('ann_cursor', cursor_factory=psycopg2.extras.DictCursor) 33 | cursor.execute(docs.get('sql.query')) 34 | 35 | with pipe.col_open_w('../data/sentences', [ 'id', 'text', 'tokenOffsets', 'sentenceTokenOffsets', 'sentenceOffsets', 'lemmas', 'poss' ]) as w: 36 | sent_num = 0 37 | prev_document_id = None 38 | for row in cursor: 39 | # id 40 | #document_id = str(row[0]) 41 | #if document_id != prev_document_id: 42 | # sent_num = 0 43 | #id = document_id + '@' + str(sent_num) 44 | id = row[0] 45 | 46 | text = row[1] 47 | token_offsets = find_token_offsets(text) 48 | sentence_token_offsets = [[0,len(token_offsets)]] 49 | sentence_offsets = [[0, len(text)]] 50 | lemmas = row[2] 51 | pos_tags = row[3] 52 | 53 | w.write([id, text, token_offsets, sentence_token_offsets, sentence_offsets, lemmas, pos_tags]) 54 | 55 | #prev_document_id = document_id 56 | sent_num = sent_num + 1 57 | 58 | write_docs() 59 | 60 | -------------------------------------------------------------------------------- /view/util/generate_sentence_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Author: Zifei Shan (zifeishan@gmail.com) 4 | 5 | ''' This file construct a sentence table from ann.* files generated from Pipe project. 6 | 7 | Example usage: 8 | 9 | python generate_sentence_table.py DIRECTORY/OF/ANN/ > output_sentences.tsv 10 | 11 | The generated sentence table follow the format below: 12 | 13 | CREATE TABLE sentences ( 14 | doc_id text, 15 | sent_id integer, 16 | wordidxs integer[], 17 | words text[], 18 | poses text[], 19 | ners text[], 20 | lemmas text[], 21 | dep_tuples text[], -- Triplet format. e.g.: "1 dep 0" 22 | sentence_id text 23 | ); 24 | 25 | ''' 26 | 27 | import sys, json 28 | 29 | # This file can accept an argument: the folder that contains ann.* 30 | # If not specified, use the current directory. 31 | if len(sys.argv) == 1: 32 | basedir = '' 33 | else: 34 | basedir = sys.argv[1] + '/' 35 | 36 | # Helper functions 37 | 38 | def list2TSVarray(a_list, quote=True): 39 | '''Convert a list to a string that can be used in a TSV column and intepreted as 40 | an array by the PostreSQL COPY FROM command. 
41 | If 'quote' is True, then double quote the string representation of the 42 | elements of the list, and escape double quotes and backslashes. 43 | ''' 44 | if a_list is None: 45 | return '\\N' 46 | 47 | if quote: 48 | for index in range(len(a_list)): 49 | if "\\" in unicode(a_list[index]): 50 | # Replace '\' with '\\\\"' to be accepted by COPY FROM 51 | a_list[index] = unicode(a_list[index]).replace("\\", "\\\\\\\\") 52 | # This must happen the previous substitution 53 | if "\"" in unicode(a_list[index]): 54 | # Replace '"' with '\\"' to be accepted by COPY FROM 55 | a_list[index] = unicode(a_list[index]).replace("\"", "\\\\\"") 56 | string = ",".join(list(map(lambda x: "\"" + unicode(x) + "\"", a_list))) 57 | else: 58 | string = ",".join(list(map(lambda x: unicode(x), a_list))) 59 | return "{" + string + "}" 60 | 61 | def open_file(fname): 62 | ''' 63 | Opens a file, if not found, return None. 64 | ''' 65 | try: 66 | return open(fname) 67 | except: 68 | return None 69 | 70 | def read_js_line(fp): 71 | ''' 72 | Return None if file is not open. Otherwise read a line from file. 73 | If '' returned, EOF is found. 74 | ''' 75 | if fp == None: 76 | return None 77 | s = fp.readline() 78 | if s == '': 79 | return '' 80 | else: 81 | return json.loads(s) 82 | 83 | def escape_none(s): 84 | ''' 85 | Just escaping a None into psql-friendly format 86 | ''' 87 | if s is None: 88 | return '\\N' 89 | return unicode(s).encode('utf-8') 90 | 91 | def findTokenOffset(token_offsets, sent_offset): 92 | ''' 93 | Construct sent_token_offsets 94 | ''' 95 | start = min(i for i in range(len(token_offsets)) if token_offsets[i][0] == sent_offset[0]) 96 | end = max(i for i in range(len(token_offsets)) if token_offsets[i][1] == sent_offset[1]) + 1 97 | return start, end 98 | 99 | # ----------- Main function ------------- 100 | 101 | # Assume fixed filenames 102 | fdoc_id = open_file(basedir + 'ann.id') 103 | flemma = open_file(basedir + 'ann.lemmas') 104 | fpos = open_file(basedir + 'ann.poss') 105 | fner = open_file(basedir + 'ann.nerTags') 106 | fsent_offset = open_file(basedir + 'ann.sentenceOffsets') 107 | fsent_token_offset = open_file(basedir + 'ann.sentenceTokenOffsets') 108 | ftext = open_file(basedir + 'ann.text') 109 | ftoken_offset = open_file(basedir + 'ann.tokenOffsets') 110 | fsent_deps = open_file(basedir + 'ann.sentenceDependencies') 111 | 112 | while True: 113 | doc_id = read_js_line(fdoc_id) 114 | lemmas = read_js_line(flemma) 115 | poss = read_js_line(fpos) 116 | ners = read_js_line(fner) 117 | sent_offsets = read_js_line(fsent_offset) 118 | # sent_token_offsets = read_js_line(fsent_token_offset) 119 | text = read_js_line(ftext) 120 | token_offsets = read_js_line(ftoken_offset) 121 | sent_deps = read_js_line(fsent_deps) 122 | 123 | if any(x == '' for x in [doc_id, lemmas, poss, sent_offsets, \ 124 | text, token_offsets]): 125 | break 126 | 127 | sent_token_offsets = [ findTokenOffset(token_offsets, x) for x in sent_offsets] 128 | 129 | # loop through each sentence 130 | sent_words = [text[o[0] : o[1]] for o in token_offsets] 131 | # print 'WORDS:', sent_words 132 | 133 | 134 | for sent_id in range(len(sent_token_offsets)): 135 | sent_from, sent_to = sent_token_offsets[sent_id] 136 | sentence_id = unicode(doc_id) + '_' + unicode(sent_id) 137 | if sent_deps is not None: 138 | # e.g.: [[{"name":"det","from":1,"to":0}],[{"name":"advmod","from":1,"to":0},{"name":"advmod","from":1,"to":2}]] 139 | this_sent_deps = ['%d %s %d' % (d['from'], d['name'], d['to']) for d in sent_deps[sent_id]] 140 | print 
'\t'.join([escape_none(x) for x in [ \ 141 | doc_id, \ 142 | sent_id, \ 143 | list2TSVarray([x for x in range(sent_to - sent_from)]), \ 144 | list2TSVarray( sent_words[ sent_from : sent_to] ) if sent_words is not None else None, \ 145 | list2TSVarray( poss[ sent_from : sent_to]) if poss is not None else None, \ 146 | list2TSVarray( ners[ sent_from : sent_to]) if ners is not None else None, \ 147 | list2TSVarray( lemmas[ sent_from : sent_to]) if lemmas is not None else None, \ 148 | list2TSVarray( this_sent_deps ) if sent_deps is not None else None, \ 149 | sentence_id \ 150 | ]]) 151 | -------------------------------------------------------------------------------- /view/util/get.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #curl -XGET 'http://localhost:9200/dd/docs/10.1371.journal.pone.0042439.Body__50' 4 | #curl -XGET 'http://localhost:9200/view/docs/doc123' 5 | curl -XGET 'http://localhost:9200/view/docs/132553@2' 6 | -------------------------------------------------------------------------------- /view/util/index_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from elasticsearch import Elasticsearch 4 | import json 5 | 6 | INPUT = "../data/sentences.json" 7 | ES_HOST = {"host" : "localhost", "port" : 9200} 8 | INDEX_NAME = 'dd' 9 | TYPE_NAME = 'docs' 10 | N = 1000 11 | 12 | es = Elasticsearch(hosts = [ES_HOST]) 13 | 14 | es.delete_by_query(index = INDEX_NAME, body = { 15 | "query": { 16 | "match_all": {} 17 | } 18 | }) 19 | 20 | with open(INPUT, 'r') as f: 21 | bulk_data = [] 22 | 23 | for line in f: 24 | src = json.loads(line) 25 | id = src['doc_id'] + '__' + src['sent_id'] 26 | content = ' '.join(src['words']) 27 | op_dict = { 28 | "index": { 29 | "_index": INDEX_NAME, 30 | "_type": TYPE_NAME, 31 | "_id": id 32 | } 33 | } 34 | data_dict = { 35 | "id": id, 36 | "content": content 37 | } 38 | bulk_data.append(op_dict) 39 | bulk_data.append(data_dict) 40 | if len(bulk_data) > N: 41 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 42 | bulk_data = [] 43 | 44 | if len(bulk_data) > 0: 45 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 46 | 47 | es.indices.refresh(index = INDEX_NAME) 48 | 49 | #if es.indices.exists(INDEX_NAME): 50 | # res = es.indices.delete(index = INDEX_NAME) 51 | # 52 | #request_body = { 53 | # "settings" : { 54 | # "number_of_shards": 1, 55 | # "number_of_replicas": 0 56 | # } 57 | #} 58 | # 59 | #print("creating '%s' index..." 
% (INDEX_NAME)) 60 | #res = es.indices.create(index = INDEX_NAME, body = request_body, ignore=400) 61 | 62 | #print("bulk indexing...") 63 | #res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = True) 64 | 65 | # sanity check 66 | #res = es.search(index = INDEX_NAME, size=2, body={"query": {"match_all": {}}}) 67 | #print(" response: '%s'" % (res)) 68 | 69 | #print("results:") 70 | #for hit in res['hits']['hits']: 71 | # print(hit["_source"]) 72 | 73 | -------------------------------------------------------------------------------- /view/util/index_extr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from elasticsearch import Elasticsearch 4 | import json 5 | 6 | EXTRACTOR='genepheno' 7 | INPUT='../data/genepheno_rel.json' 8 | ES_HOST = {"host" : "localhost", "port" : 9200} 9 | INDEX_NAME = 'dd' 10 | TYPE_NAME = 'docs' 11 | N = 1000 12 | 13 | es = Elasticsearch(hosts = [ES_HOST]) 14 | 15 | with open(INPUT, 'r') as f: 16 | bulk_data = [] 17 | 18 | for line in f: 19 | src = json.loads(line) 20 | id = src['doc_id'] + '__' + str(src['sent_id']) 21 | op_dict = { 22 | "update": { 23 | "_index": INDEX_NAME, 24 | "_type": TYPE_NAME, 25 | "_id": str(id) 26 | } 27 | } 28 | extr = ','.join(map(str, src['gene_wordidxs'])) + '-' + ','.join(map(str, src['pheno_wordidxs'])) 29 | script_dict = { 30 | "script" : "if (ctx._source.containsKey(\"" + EXTRACTOR + "\")) {ctx._source[\"" + EXTRACTOR + "\"] += ex;} else {ctx._source[\"" + EXTRACTOR + "\"] = [ex]}", 31 | "params" : { 32 | "ex" : extr 33 | } 34 | } 35 | bulk_data.append(op_dict) 36 | bulk_data.append(script_dict) 37 | if len(bulk_data) > N: 38 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 39 | bulk_data = [] 40 | 41 | if len(bulk_data) > 0: 42 | print('doing update') 43 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 44 | 45 | es.indices.refresh(index = INDEX_NAME) 46 | 47 | -------------------------------------------------------------------------------- /view/util/index_extrlist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from elasticsearch import Elasticsearch 4 | import json 5 | 6 | ES_HOST = {"host" : "localhost", "port" : 9200} 7 | INDEX_NAME = 'dd' 8 | TYPE_NAME = 'extractors' 9 | 10 | es = Elasticsearch(hosts = [ES_HOST]) 11 | 12 | es.delete_by_query(index = INDEX_NAME, doc_type = TYPE_NAME, body = { 13 | "query": { 14 | "match_all": {} 15 | } 16 | }) 17 | 18 | 19 | es.index(index = INDEX_NAME, doc_type = TYPE_NAME, body = { 20 | "name" : "genepheno" 21 | }, refresh = False) 22 | 23 | es.indices.refresh(index = INDEX_NAME) 24 | 25 | -------------------------------------------------------------------------------- /view/util/pipe.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | from os import listdir 4 | from os.path import isfile, join 5 | import json 6 | 7 | # column format reader 8 | 9 | def col_open(dir): 10 | return ColumnReaderAsSingleObj(dir) 11 | 12 | def col_open_arr(dir): 13 | return ColumnReader(dir) 14 | 15 | class ColumnReader(object): 16 | '''Reads Pipe's column format''' 17 | 18 | def __init__(self, dir): 19 | files = [ f for f in listdir(dir) if isfile(join(dir, f)) and not f == '.errors' ] 20 | self.types = [ f[f.rfind('.') + 1:] for f in files ] 21 | self.u_types = [ unicode(s, 'utf-8') for s in self.types ] 22 | self.handles = [ open(join(dir, f)) for f in files ] 23 | 24 | def __iter__(self): 25 | return self 26 | 27 | def next(self): 28 | row = [ h.readline() for h in self.handles ] 29 | for c in row: 30 | if c == '': 31 | self.close() 32 | raise StopIteration 33 | return [ json.loads(c.rstrip()) for c in row ] 34 | 35 | def close(self): 36 | for h in self.handles: 37 | if not h.closed: 38 | h.close() 39 | 40 | class ColumnReaderAsSingleObj(ColumnReader): 41 | 42 | def next(self): 43 | row = super(self.__class__, self).next() 44 | obj = {} 45 | for i in range(0, len(row)): 46 | obj[self.u_types[i]] = row[i] 47 | return obj 48 | 49 | # column format writer 50 | 51 | def col_open_w(dir, types): 52 | return ColumnWriter(dir, types) 53 | 54 | class ColumnWriter(object): 55 | '''Writes Pipe's column format''' 56 | 57 | def __init__(self, dir, types): 58 | self.types = types 59 | files = [ 'ann.' + t for t in types ] 60 | self.handles = [ open(join(dir, 'ann.' + t), 'w') for t in types ] 61 | 62 | def __enter__(self): 63 | return self 64 | 65 | def __exit__(self, type, value, traceback): 66 | self.close() 67 | 68 | def write(self, arr): 69 | for i, a in enumerate(arr): 70 | self.handles[i].write(json.dumps(a) + '\n') 71 | 72 | def close(self): 73 | for h in self.handles: 74 | if not h.closed: 75 | h.close() 76 | -------------------------------------------------------------------------------- /view/util/refresh-annotations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ES_HOST = {"host" : "localhost", "port" : 9200} 4 | INDEX_NAME = 'view' 5 | TYPE_ANNOTATORS_NAME = 'annotators' 6 | TYPE_ANNOTATIONS_NAME = 'annotations' 7 | N = 1000 8 | 9 | from pyhocon import ConfigFactory 10 | from elasticsearch import Elasticsearch 11 | import json 12 | import sys 13 | 14 | conf = ConfigFactory.parse_file('../view.conf') 15 | 16 | conf_annotations = conf.get_list('view.annotations') 17 | 18 | es = Elasticsearch(hosts = [ES_HOST]) 19 | 20 | # create a small table that only contains the names of all available extractors 21 | def index_annotators(): 22 | es.delete_by_query(index = INDEX_NAME, doc_type = TYPE_ANNOTATORS_NAME, body = { 23 | "query": { 24 | "match_all": {} 25 | } 26 | }) 27 | for ann in conf_annotations: 28 | es.index(index = INDEX_NAME, doc_type = TYPE_ANNOTATORS_NAME, body = { 29 | "name" : ann.get('name') 30 | }, refresh = False) 31 | es.indices.refresh(index = INDEX_NAME) 32 | 33 | # create a large table that contains all extractions 34 | def index_annotations(): 35 | es.delete_by_query(index = INDEX_NAME, doc_type = TYPE_ANNOTATIONS_NAME, body = { 36 | "query": { 37 | "match_all": {} 38 | } 39 | }) 40 | for ann in conf_annotations: 41 | # read from file 42 | 43 | # bulk index docs 44 | bulk_data = [] 45 | for l in open('../' + ann.get('input')): 46 | o = json.loads(l) 47 | # {"id": "12", 
"range":{"type":"sentenceTokenSpan","doc_id":"doc123","sentNum":0,"f":3,"t":4},"target":{"entity":"something"}} 48 | o['attribute'] = ann.get('name') 49 | op_dict = { 50 | "index": { 51 | "_index": INDEX_NAME, 52 | "_type": TYPE_ANNOTATIONS_NAME, 53 | "_id": o['id'], 54 | "_parent": o['range']['doc_id'] 55 | } 56 | } 57 | #data_dict = { 58 | # "id": id, 59 | # "content": content, 60 | # "tokenOffsets": tokenOffsets 61 | #} 62 | #o['content'] = o[u'text'] 63 | data_dict = o 64 | #print(op_dict) 65 | #print(data_dict) 66 | bulk_data.append(op_dict) 67 | bulk_data.append(data_dict) 68 | if len(bulk_data) > N: 69 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 70 | bulk_data = [] 71 | 72 | if len(bulk_data) > 0: 73 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 74 | 75 | es.indices.refresh(index = INDEX_NAME) 76 | 77 | index_annotators() 78 | index_annotations() 79 | 80 | -------------------------------------------------------------------------------- /view/util/refresh-documents.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pipe 4 | 5 | ES_HOST = {"host" : "localhost", "port" : 9200} 6 | INDEX_NAME = 'view' 7 | TYPE_NAME = 'docs' 8 | N = 1000 9 | 10 | from pyhocon import ConfigFactory 11 | from elasticsearch import Elasticsearch 12 | import json 13 | import sys 14 | 15 | conf = ConfigFactory.parse_file('../view.conf') 16 | 17 | docs_conf = conf.get('view.docs') 18 | 19 | es = Elasticsearch(hosts = [ES_HOST]) 20 | 21 | def index_docs(): 22 | 23 | # clear index 24 | es.delete_by_query(index = INDEX_NAME, doc_type = TYPE_NAME, body = { 25 | "query": { 26 | "match_all": {} 27 | } 28 | }) 29 | 30 | # bulk index docs 31 | bulk_data = [] 32 | for o in pipe.col_open('../' + docs_conf.get('input')): 33 | id = o[u'id'] 34 | content = o[u'text'] 35 | tokenOffsets = o[u'tokenOffsets'] 36 | 37 | op_dict = { 38 | "index": { 39 | "_index": INDEX_NAME, 40 | "_type": TYPE_NAME, 41 | "_id": id 42 | } 43 | } 44 | #data_dict = { 45 | # "id": id, 46 | # "content": content, 47 | # "tokenOffsets": tokenOffsets 48 | #} 49 | o['content'] = o[u'text'] 50 | data_dict = o 51 | bulk_data.append(op_dict) 52 | bulk_data.append(data_dict) 53 | if len(bulk_data) > N: 54 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 55 | bulk_data = [] 56 | 57 | if len(bulk_data) > 0: 58 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 59 | 60 | es.indices.refresh(index = INDEX_NAME) 61 | 62 | index_docs() 63 | -------------------------------------------------------------------------------- /view/util/search.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #curl -XGET 'http://localhost:9200/view/docs/_search?q=_id:10.1371.journal.pone.0042439.Body__50' 4 | curl -XGET 'http://localhost:9200/view/docs/_search?q=_id:doc123' 5 | curl -XGET 'http://localhost:9200/view/docs/_search?q=simple' 6 | -------------------------------------------------------------------------------- /view/util/tab: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | [ `uname -s` != "Darwin" ] && return 4 | 5 | function tab () { 6 | local name="$1" 7 | local cmd="" 8 | local cdto="$PWD" 9 | local args="${@:2}" 10 | 11 | echo "launching $args ..." 12 | 13 | if [ -n "$args" ]; then 14 | cmd="; $args" 15 | fi 16 | 17 | osascript &>/dev/null < 25 |
26 |   Query Examples
27 |
28 |   Words and Phrases
29 |     quick and "quick brown"
30 |
31 |   Field names
32 |     _id:4325235
33 |     title:(quick OR brown)
34 |     book.\*:(quick brown)
35 |     _missing_:title
36 |     _exists_:title
37 |
38 |   Wildcards
39 |     qu?ck bro*
40 |
41 |   Regular Expressions
42 |     name:/joh?n(ath[oa]n)/
43 |
44 |   Fuzziness
45 |     quikc~ brwn~ foks~
46 |     quikc~1
47 |
48 |   Proximity Searches
49 |     "fox quick"~5
50 |
51 |   Ranges
52 |     date:[2012-01-01 TO 2012-12-31]
53 |     count:[1 TO 5]
54 |     tag: {alpha TO omega}
55 |     count:[10 TO *]
56 |     date:{* TO 2012-01-01}
57 |     count:[1 TO 5}
58 |     age:>=10
59 |     age:(>=10 AND <20)
60 |
61 |   Boosting
62 |     quick^2 fox
63 |     "john smith"^2
64 |     (foo bar)^4
65 |
66 |   Boolean Operators
67 |     quick brown +fox -news
68 |     ((quick AND fox) OR (brown AND fox) OR fox) AND NOT news
69 |
70 |   Grouping
71 |     (quick OR brown) AND fox
72 |     status:(active OR pending) title:(full text search)^2
73 |
74 |   Reserved Characters
75 |     Escape with backslash
76 |     Example: \(1\+1\)\=2 , finds (1+1)=2
77 |     Characters: + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ /
78 |
79 |   Empty Query
80 |     Shows all results.
81 |
82 |
83 |   For more details, see here.
84 |
85 |
86 |
87 |
88 |
89 |
) 90 | } 91 | }) 92 | 93 | module.exports = Help 94 | -------------------------------------------------------------------------------- /view/view/vis/AnnotationsSelector.js: -------------------------------------------------------------------------------- 1 | 2 | var AnnotationsSelector = React.createClass({ 3 | 4 | render: function() { 5 | var onLayerChange = this.props.onLayerChange 6 | 7 | var buttons = this.props.layers.map(function(result) { 8 | return ( 9 | 11 | ); 12 | }); 13 | return (
{buttons} );
14 |   }
15 | });
16 |
17 | var AnnotationsSelectorButton = React.createClass({
18 |   handleClick: function() {
19 |     var active = !this.props.data.active;
20 |     this.props.onLayerChange(this.props.data.name, active);
21 |   },
22 |   render: function() {
23 |     var classes = 'facet';
24 |     if (!this.props.data.active)
25 |       classes += ' facet-inactive';
26 |     return (
27 |
28 |
29 |       {this.props.data.name}
30 |
) 31 | } 32 | }) 33 | 34 | module.exports = AnnotationsSelector -------------------------------------------------------------------------------- /view/view/vis/TextWithAnnotations.js: -------------------------------------------------------------------------------- 1 | var React = window.React = require('react'); 2 | 3 | var SpansVisualization = require('./core/SpansVisualization.js') 4 | var TokenTagsVisualization = require('./core/TokenTagsVisualization.js') 5 | var EdgesVisualization = require('./core/EdgesVisualization.js') 6 | var SentenceUtils = require('./core/SentenceUtils.js') 7 | 8 | var TokensVisualization = function(element, source) { 9 | return SpansVisualization(element, source.tokenOffsets) 10 | } 11 | 12 | var SentencesVisualization = function(element, source) { 13 | return SpansVisualization(element, source.sentenceOffsets) 14 | } 15 | 16 | var PartOfSpeechVisualization = function(element, source) { 17 | return TokenTagsVisualization(element, source.tokenOffsets, source.poss) 18 | } 19 | 20 | var LemmasVisualization = function(element, source) { 21 | return TokenTagsVisualization(element, source.tokenOffsets, source.lemmas) 22 | } 23 | 24 | var DependenciesVisualization = function(element, source) { 25 | // compute sentenceTokenOffsets 26 | var sentenceTokenOffsets = SentenceUtils.getSentenceTokenOffsets(source.tokenOffsets, source.sentenceOffsets) 27 | return EdgesVisualization(element, source.tokenOffsets, source.sentenceOffsets, sentenceTokenOffsets, source.sentenceDependencies) 28 | } 29 | 30 | var ExtractorsVisualization = function(element, source, annotations) { 31 | var sentenceTokenOffsets = source['sentenceTokenOffsets'] 32 | var tokenOffsets = source['tokenOffsets'] 33 | var extractorOffsets = [] 34 | 35 | $.each(annotations, function(i, a) { 36 | var sentNum = a.range.sentNum 37 | var sentenceBeginToken = sentenceTokenOffsets[sentNum][0] 38 | var tokenFrom = sentenceBeginToken + a.range.f 39 | var tokenTo = sentenceBeginToken + a.range.t 40 | var charFrom = tokenOffsets[tokenFrom][0] 41 | var charTo = tokenOffsets[tokenTo - 1][1] 42 | extractorOffsets.push([charFrom,charTo]) 43 | }) 44 | return SpansVisualization(element, extractorOffsets) 45 | } 46 | 47 | var TextWithAnnotations = React.createClass({ 48 | 49 | componentDidMount: function() { 50 | this.vis = {} 51 | this.buildCustomDom() 52 | }, 53 | componentDidUpdate: function() { 54 | this.buildCustomDom() 55 | }, 56 | buildCustomDom: function() { 57 | var div = React.findDOMNode(this) 58 | //cleanup existing visualizations 59 | $.each(this.vis, function(k,v) { v.destroy() }) 60 | 61 | this.vis = {} 62 | 63 | var annotations = this.props.data.annotations 64 | var sourceData = this.props.data._source 65 | var vis = this.vis 66 | 67 | $.each(this.props.layers, function(i, l) { 68 | if (vis && vis[l.name] && !l.active) { 69 | vis[l.name].destroy() 70 | delete vis[l.name] 71 | } 72 | if (vis && !vis[l.name] && l.active) { 73 | if (l.name == 'Tokens') 74 | vis[l.name] = new TokensVisualization(div, sourceData) 75 | if (l.name == 'Sentences') 76 | vis[l.name] = new SentencesVisualization(div, sourceData) 77 | if (l.name == 'Extractors') 78 | vis[l.name] = new ExtractorsVisualization(div, sourceData, annotations) 79 | if (l.name == 'Dependencies') 80 | vis[l.name] = new DependenciesVisualization(div, sourceData) 81 | if (l.name == 'Lemmas') 82 | vis[l.name] = new LemmasVisualization(div, sourceData) 83 | if (l.name == 'PartOfSpeech') 84 | vis[l.name] = new PartOfSpeechVisualization(div, sourceData) 85 | } 86 | }) 87 
| }, 88 | isActive: function(name) { 89 | var isActive = false 90 | $.each(this.props.layers, function(i, l) { 91 | if (l.name == name) { isActive = l.active; return false } 92 | }) 93 | return isActive 94 | }, 95 | 96 | render: function() { 97 | content = this.props.data._source.content; 98 | // if we have field with keyword highlighting, take that 99 | if (this.props.data.highlight != null && 100 | this.props.data.highlight.content != null) { 101 | content = this.props.data.highlight.content[0]; 102 | } 103 | var details = [] 104 | if (this.isActive('Details')) { 105 | $.each(this.props.data.annotations, function(i, value) { 106 | details.push(
{JSON.stringify(value)} );
107 |     })
108 |     $.each(this.props.data._source, function(name, value) {
109 |       if (name != 'content' && name != 'id')
110 |         details.push ( {name} : {JSON.stringify(value)} );
111 |     })
112 |   }
113 |   //style={{'white-space':'pre-wrap'}}
114 |   var div = (
115 |     {this.props.data._id}
116 |     {details}
117 |
) 118 | 119 | return div; 120 | } 121 | }); 122 | 123 | module.exports = TextWithAnnotations 124 | 125 | -------------------------------------------------------------------------------- /view/view/vis/core/CharOffsets.js: -------------------------------------------------------------------------------- 1 | var CharOffsets = (function() { 2 | var ELEMENT = 1; 3 | var TEXT = 3; 4 | 5 | var offsetComparator = function(e1, e2) { 6 | return e1.readrOffset - e2.readrOffset; 7 | }; 8 | 9 | var indexOffsets = function(node, offset) { 10 | node.readrOffset = offset; 11 | if (node.nodeType == TEXT) { 12 | node.readrLength = node.nodeValue.length; 13 | } else if (node.nodeType == ELEMENT) { 14 | // ignore if has class ignoreReadrLength 15 | if (goog.dom.classes.has(node, 'ignoreReadrLength')) { 16 | node.readrLength = 0; 17 | } else { 18 | // sum up lengths of children 19 | var l = 0; 20 | for (var i=0, ii = node.childNodes.length; i < ii; i++) { 21 | var child = node.childNodes[i]; 22 | indexOffsets(child, offset + l); 23 | l += child.readrLength; 24 | } 25 | node.readrLength = l; 26 | } 27 | } 28 | }; 29 | 30 | var getTextRangesToHighlightFromIndex = function(node, start, end) { 31 | var results = new Array(); 32 | recur(node, start, end, results); 33 | return results; 34 | }; 35 | 36 | var recur = function(node, start, end, results) { 37 | if (end - start <= 0) return; 38 | 39 | // we assume that start >= node.readrOffset and end <= node.readrOffset + node.readrLength 40 | if (node.nodeType == TEXT) { 41 | results.push([node, start - node.readrOffset, end - node.readrOffset, start, end]); 42 | return; 43 | } 44 | // binary search for start and end 45 | var ns = goog.array.binarySearch(node.childNodes, { readrOffset : start }, offsetComparator); 46 | var ne = goog.array.binarySearch(node.childNodes, { readrOffset : end }, offsetComparator); 47 | 48 | if (ns < 0) { ns = -ns-2; } 49 | if (ne < 0) { ne = -ne-1; } 50 | 51 | for (var i=ns; i < ne; i++) { 52 | var child = node.childNodes[i]; 53 | var s = (i==ns)? start : child.readrOffset; 54 | var e = (i==ne-1)? 
end : child.readrOffset + child.readrLength; 55 | 56 | recur(child, s, e, results); 57 | } 58 | }; 59 | 60 | var createMultiRangeSpans = function(element, tokenOffsets, renderedSpans, documentOffset) { 61 | if (!renderedSpans) 62 | renderedSpans = new Array(); 63 | if (!documentOffset) 64 | documentOffset = 0 65 | indexOffsets(element[0], documentOffset) 66 | for (var j=0, jj = tokenOffsets.length; j < jj; j++) { 67 | // token has offsets t.f, t.t 68 | var rs = createSingleRangeSpans(element, tokenOffsets[j]); 69 | renderedSpans.push(rs); 70 | } 71 | return renderedSpans; 72 | }; 73 | 74 | var FROM = 0 75 | var TO = 1 76 | 77 | // example tokenOffset: { f:12, t:23 } 78 | var createSingleRangeSpans = function(element, tokenOffset) { 79 | //if (!documentOffset) 80 | //documentOffset = 0 81 | var sels = new Array(); 82 | var todo = getTextRangesToHighlightFromIndex 83 | (element[0], tokenOffset[FROM], tokenOffset[TO]); 84 | for (var i=0, ii = todo.length; i < ii; i++) { 85 | var t = todo[i]; 86 | var range = goog.dom.Range.createFromNodes(t[0], t[1], t[0], t[2]); 87 | var parentNode = t[0].parentNode 88 | var parentNodeOffset = parentNode.readrOffset 89 | 90 | var el = goog.dom.createDom('span'); //, { 'style':'background-color:green'}); 91 | range.surroundContents(el); 92 | //indexOffsets(t[0].parentNode, t[0].parentNode.readrOffset); 93 | indexOffsets(parentNode, parentNodeOffset); 94 | sels.push(el); 95 | } 96 | return { sels:sels }; 97 | }; 98 | 99 | //note, the output of this function is a singleton 100 | return { 101 | indexOffsets: indexOffsets, 102 | getTextRangesToHighlightFromIndex: getTextRangesToHighlightFromIndex, 103 | createMultiRangeSpans: createMultiRangeSpans, 104 | createSingleRangeSpans: createSingleRangeSpans 105 | }; 106 | })() 107 | 108 | module.exports = CharOffsets 109 | -------------------------------------------------------------------------------- /view/view/vis/core/FramesVisualization.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | // 5 | // 6 | //var FramesVisualization = function(element, source) { 7 | // var state ={} 8 | // 9 | // var documentOffset = 0 10 | // 11 | // var msHeadSpans = new Array(); 12 | // CharOffsets.createMultiRangeSpans(textElement, msHeadOffsets, msHeadSpans, documentOffset) 13 | // 14 | // 15 | // 16 | //} -------------------------------------------------------------------------------- /view/view/vis/core/SentenceUtils.js: -------------------------------------------------------------------------------- 1 | var SentenceUtils = (function() { 2 | 3 | var FROM = 0 4 | var TO = 1 5 | 6 | var getSentenceTokenOffsets = function(tokenOffsets, sentenceOffsets) { 7 | var sentenceTokenOffsets = [] 8 | var tokBegin = 0 9 | var tokEnd = 0 10 | for (var si = 0; si < sentenceOffsets.length; si++) { 11 | // move start 12 | while (tokenOffsets[tokBegin][FROM] < sentenceOffsets[si][FROM]) tokBegin++ 13 | tokEnd = tokBegin 14 | while (tokEnd < tokenOffsets.length && tokenOffsets[tokEnd][FROM] <= sentenceOffsets[si][TO]) tokEnd++ 15 | 16 | // now we have (tokBegin,tokEnd) for sentence 17 | sentenceTokenOffsets.push([tokBegin, tokEnd]) 18 | tokBegin = tokEnd 19 | } 20 | return sentenceTokenOffsets 21 | } 22 | 23 | var findSentNumByTokenPos = function(pos, sentenceTokenOffsets) { 24 | var minIndex = 0; 25 | var maxIndex = sentenceTokenOffsets.length - 1; 26 | var currentIndex; 27 | var currentElement; 28 | 29 | while (minIndex <= maxIndex) { 30 | currentIndex = (minIndex + maxIndex) / 2 | 0; 31 | 
currentElement = sentenceTokenOffsets[currentIndex]; 32 | 33 | if (currentElement[TO] <= pos) { 34 | //if (currentElement < searchElement) { 35 | minIndex = currentIndex + 1; 36 | } 37 | else if (currentElement[FROM] > pos) { 38 | //if (currentElement > searchElement) { 39 | maxIndex = currentIndex - 1; 40 | } 41 | else { 42 | return currentIndex; 43 | } 44 | } 45 | } 46 | return { 47 | getSentenceTokenOffsets:getSentenceTokenOffsets, 48 | findSentNumByTokenPos:findSentNumByTokenPos 49 | }; 50 | })() 51 | 52 | module.exports = SentenceUtils -------------------------------------------------------------------------------- /view/view/vis/core/SpansVisualization.js: -------------------------------------------------------------------------------- 1 | /* TokensVisualization */ 2 | var CharOffsets = require('./CharOffsets.js') 3 | 4 | var Span = function(sels) { 5 | var state = {} 6 | 7 | var fragment = function(i, length) { 8 | var fragment = ''; 9 | if (i==0 && i < length-1) fragment = 'left'; 10 | else if (i==0 && i==length-1) fragment = 'leftright'; 11 | else if (i==length-1 && i > 0) fragment = 'right'; 12 | else if (i > 0 && i < length-1) fragment = 'inner'; 13 | return fragment; 14 | }; 15 | 16 | // initialize 17 | state.sels = sels 18 | state.color = 'red' 19 | if (!sels) return; 20 | var ii = sels.length; 21 | $.each(sels, function(i, sel) { 22 | $(sel).addClass('highlight_' + state.color); 23 | $(sel).addClass('highlight_' + fragment(i, ii)); 24 | //$(sel).on('click', function() { 25 | // console.log('clicked'); 26 | //}); 27 | }) 28 | 29 | state.destroy = function() { 30 | // unbind all handlers 31 | if (!state.sels) return; 32 | 33 | $.each(state.sels, function(sel) { 34 | //$(sel).unbind('click'); 35 | }); 36 | } 37 | return state 38 | } 39 | 40 | var SpansVisualization = function(element, spans) { 41 | var state = { 42 | renderedSpans: new Array(), 43 | destroyed: false 44 | }; 45 | 46 | //var documentOffset = scope.document.offset 47 | var documentOffset = 0 48 | 49 | CharOffsets.createMultiRangeSpans([element,this], spans, state.renderedSpans, documentOffset) 50 | 51 | $.each(state.renderedSpans, function(i, rs) { 52 | var span = new Span(rs.sels) 53 | }); 54 | 55 | state.destroy = function() { 56 | state.destroyed = true; 57 | $.each(state.renderedSpans, function(i, value) { 58 | // do bound listeners automatically get destroyed?? 
59 | //value.element.remove(); 60 | //value.scope.$destroy(); 61 | 62 | //$.each(value.aux, function(j,n) { 63 | // goog.dom.removeNode(n); 64 | //}); 65 | $.each(value.sels, function(j,n) { 66 | goog.dom.flattenElement(n); 67 | }); 68 | value.sels = []; 69 | }); 70 | //element.remove(); 71 | //goog.editor.range.normalizeNode(element[0]); 72 | state.renderedSpans.length = 0; 73 | } 74 | return state 75 | } 76 | 77 | module.exports = SpansVisualization -------------------------------------------------------------------------------- /view/view/vis/core/TokenTagsVisualization.js: -------------------------------------------------------------------------------- 1 | /* TokenTagsVisualization */ 2 | 3 | var CharOffsets = require('./CharOffsets.js') 4 | 5 | var TokenTagsVisualization = function(element, tokenOffsets, tags) { 6 | var state = { 7 | renderedSpans: new Array(), 8 | destroyed: false 9 | }; 10 | 11 | //var documentOffset = scope.document.offset 12 | var documentOffset = 0 13 | 14 | // insert spans 15 | CharOffsets.createMultiRangeSpans([element,this], tokenOffsets, state.renderedSpans, documentOffset) 16 | 17 | $.each(state.renderedSpans, function(i, rs) { 18 | var firstSpan = rs.sels[0] 19 | var el = goog.dom.createDom('div', { 'style' : 20 | 'position:absolute;' + 21 | 'top:-15px;' + 22 | 'left:0px;right:0px;' + 23 | 'z-index:0;' + 24 | 'width:100px;' + //' + tokenWidth + 'px;' + 25 | 'height:20px;' + 26 | 'color:red;' + 27 | 'font-size:10px;' + 28 | 'font-family:helvetica,arial;' + 29 | 'font-stretch:semi-condensed;' + 30 | 'font-weight:500;'/* + 31 | 'background-color:white'*/ 32 | }) 33 | el.appendChild(goog.dom.createTextNode(tags[i])) 34 | // if you want all lines to be equal height, set marginTop as follows 35 | //var marginTop = (drawing.highestLevels[i]+1) * 15; 36 | // if you want to use inline rather than inline-block spans, use following line 37 | //$(firstSpan).attr('style', 'display:inline;line-height:' + (marginTop + 20) + 38 | // 'px;margin-top:' + marginTop + 'px;position:relative'); 39 | var marginTop = 10 40 | $(firstSpan).attr('style', 'display:inline-block;margin-top:' + marginTop + 'px;position:relative') 41 | firstSpan.appendChild(el) 42 | rs.aux = new Array() 43 | rs.aux.push(el) 44 | }) 45 | 46 | state.destroy = function() { 47 | state.destroyed = true; 48 | $.each(state.renderedSpans, function(i, value) { 49 | $.each(value.aux, function(j, n) { 50 | goog.dom.removeNode(n); 51 | }) 52 | $.each(value.sels, function(j, n) { 53 | goog.dom.flattenElement(n); 54 | }) 55 | value.sels = []; 56 | }); 57 | //element.remove(); 58 | //goog.editor.range.normalizeNode(element[0]); 59 | state.renderedSpans.length = 0; 60 | } 61 | return state 62 | } 63 | 64 | module.exports = TokenTagsVisualization 65 | -------------------------------------------------------------------------------- /view/views/error.jade: -------------------------------------------------------------------------------- 1 | extends layout 2 | 3 | block content 4 | h1= message 5 | h2= error.status 6 | pre #{error.stack} 7 | -------------------------------------------------------------------------------- /view/views/layout.jade: -------------------------------------------------------------------------------- 1 | doctype html 2 | html 3 | head 4 | title= title 5 | link(rel='stylesheet', href='/stylesheets/style.css') 6 | body 7 | block content --------------------------------------------------------------------------------