├── .gitignore ├── README.md ├── condor ├── README.md └── shared │ ├── URLS │ ├── do.sh │ └── html2text.py ├── distribute ├── .gitignore ├── README.md ├── azure-client.py ├── ec2-client.py ├── env_local.sh ├── fabfile.py ├── generate-keys.sh ├── installer │ ├── .gitignore │ ├── build │ ├── decompress │ └── payload │ │ └── installer ├── setup.sh └── test │ └── input.json ├── parser ├── .gitignore ├── README.md ├── build.sbt ├── project │ └── plugins.sbt ├── run.sh ├── run_parallel.sh ├── sbt │ ├── sbt │ └── sbt-launch.jar ├── setup.sh ├── src │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── clearcut │ │ │ └── nlp │ │ │ ├── DocumentParseResult.scala │ │ │ ├── DocumentParser.scala │ │ │ ├── JSONReader.scala │ │ │ ├── Main.scala │ │ │ ├── Server.scala │ │ │ └── TSVReader.scala │ └── test │ │ ├── resources │ │ ├── input.json.txt │ │ ├── testdoc.html │ │ └── testdoc.txt │ │ └── scala │ │ └── DocumentParserSpec.scala └── test │ ├── input.json │ ├── input.tsv │ └── output.tsv ├── pipe ├── .gitignore ├── PLANS.md ├── README.md ├── build.sbt ├── config.properties.template ├── example │ ├── input.json │ └── parse.sh ├── project │ └── plugins.sbt ├── run.sh ├── run_parallel.sh ├── run_test.sh ├── sbt │ ├── sbt │ └── sbt-launch.jar ├── setup.sh └── src │ ├── main │ └── scala │ │ └── com │ │ └── clearcut │ │ └── pipe │ │ ├── Main.scala │ │ ├── Schema.scala │ │ ├── Server.scala │ │ ├── annotator │ │ ├── Annotator.scala │ │ ├── ExtendedCleanHtmlStanfordPipeline.scala │ │ ├── ExtendedHtmlStanfordPipeline.scala │ │ ├── ExtendedStanfordPipeline.scala │ │ ├── SimpleStanfordPipeline.scala │ │ ├── StanfordCoreferenceResolver.scala │ │ ├── StanfordDependencyExtractor.scala │ │ ├── StanfordLemmatizer.scala │ │ ├── StanfordNERTagger.scala │ │ ├── StanfordPOSTagger.scala │ │ ├── StanfordSRParser.scala │ │ ├── StanfordSentenceSplitter.scala │ │ ├── StanfordTokenizer.scala │ │ ├── StanfordTrueCaseAnnotator.scala │ │ └── StanfordUtil.scala │ │ ├── io │ │ ├── ColumnReader.scala │ │ ├── ColumnWriter.scala │ │ ├── JSONWriter.scala │ │ ├── Json.scala │ │ ├── JsonReader.scala │ │ ├── Reader.scala │ │ ├── TsvReader.scala │ │ ├── TsvWriter.scala │ │ └── Writer.scala │ │ └── model │ │ ├── Util.scala │ │ └── package.scala │ └── test │ ├── resources │ ├── testdoc.html │ ├── testdoc.json │ └── testdoc.txt │ └── scala │ └── BasicSpec.scala └── view ├── .gitignore ├── README.md ├── app.js ├── bin └── www ├── build.sh ├── env.sh ├── gulpfile.js ├── package.json ├── public ├── bundle.js ├── css │ └── main.css ├── index.html └── js │ ├── help │ └── Help.js │ ├── main.js │ ├── vis.js │ ├── vis │ ├── AnnotationsSelector.js │ ├── TextWithAnnotations.js │ ├── core │ │ ├── CharOffsets.js │ │ ├── EdgesVisualization.js │ │ ├── FramesVisualization.js │ │ ├── SpansVisualization.js │ │ └── TokenTagsVisualization.js │ ├── vis.js │ ├── visedge.js │ ├── visframe.js │ ├── visspan.js │ └── vistokentag.js │ ├── visedge.js │ └── visframe.js ├── routes ├── index.js └── users.js ├── run.sh ├── setup.sh ├── util ├── cat.sh ├── create_index.sh ├── fetch-annotations.py ├── fetch-sentences-table.py ├── generate_sentence_table.py ├── get.sh ├── index_docs.py ├── index_extr.py ├── index_extrlist.py ├── pipe.py ├── refresh-annotations.py ├── refresh-documents.py ├── search.sh └── tab ├── view.conf ├── view ├── help │ └── Help.js ├── main.js └── vis │ ├── AnnotationsSelector.js │ ├── TextWithAnnotations.js │ └── core │ ├── CharOffsets.js │ ├── EdgesVisualization.js │ ├── FramesVisualization.js │ ├── SentenceUtils.js │ ├── SpansVisualization.js │ └── 
TokenTagsVisualization.js └── views ├── error.jade └── layout.jade /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | parser/lib/ 3 | 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Bazaar 2 | ====== 3 | 4 | A collection of tools to generate input for DeepDive. 5 | 6 | ## [Parser](parser) 7 | 8 | Parser is a wrapper around Stanford CoreNLP that takes a simple JSON format as 9 | input and generates a TSV file that can be directly loaded into a database. 10 | 11 | There are five different ways in which the parser package is used. 12 | 13 | 1. `parser/run.sh` runs the parser as a single process. 14 | 2. `parser/run_parallel.sh` runs multiple instances of the parser on a single machine. 15 | 3. [Distribute](distribute) runs multiple instances of the parser on multiple machines. 16 | 4. [Condor](condor) contains instructions on how to run the parser on the Condor cluster. 17 | 5. `parser/run.sh -p 8080` runs the parser as a REST service. 18 | 19 | ## [XML](http://github.com/hazyresearch/dd-genomics) 20 | 21 | Many external datasets are in XML format. To consume these datasets with DeepDive, 22 | the XML has to be parsed into the simple JSON representation that the Parser package 23 | uses as input. 24 | 25 | An example of using an XML parser is contained in the dd-genomics project. 26 | 27 | ## [Distribute](distribute) 28 | 29 | It is often desirable to run the parser on multiple machines on EC2 or Azure. Distribute contains tools to automatically provision machines, distribute data, perform parsing, and collect results. 30 | 31 | -------------------------------------------------------------------------------- /condor/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Condor 3 | 4 | Use this folder together with the CHTC job wrapper: http://chtc.cs.wisc.edu/chtcjobwrapper.shtml 5 | 6 | Follow the instructions at http://chtc.cs.wisc.edu/chtcjobwrapper.shtml to start the job. 7 | 8 | ### Notes 9 | 10 | It is critically important to add the flags 11 | 12 | -Xmx4g -XX:CICompilerCount=1 -XX:ConcGCThreads=1 -XX:ParallelGCThreads=1 13 | 14 | whenever you start a JVM on Condor. Otherwise, Stanford CoreNLP will use more 15 | than one core. 16 | 17 | The file URLS lists two things: 18 | - jre-8u31-linux-x64.gz: pack the Oracle Java distribution into this tarball. When you 19 | need a JVM, start it as ./jre1.8.0_31/bin/java 20 | - nlp_2015_2.jar: put the jar you want to run here. Make sure it is compiled with JRE 8u31.
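For reference, the core of a Condor job then reduces to the following sketch, mirroring `shared/do.sh` below; `input.txt` is only a placeholder for whatever text file your job actually processes.

```bash
# Unpack the JRE shipped via URLS, then run the parser jar with the
# single-core JVM flags described above.
tar xf jre-8u31-linux-x64.gz
./jre1.8.0_31/bin/java -Xmx4g -XX:CICompilerCount=1 -XX:ConcGCThreads=1 \
    -XX:ParallelGCThreads=1 -jar nlp_2015_2.jar input.txt
```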
-------------------------------------------------------------------------------- /condor/shared/URLS: -------------------------------------------------------------------------------- 1 | /czhang/cde-package/jre-8u31-linux-x64.gz 2 | /czhang/cde-package/nlp_2015_2.jar 3 | -------------------------------------------------------------------------------- /condor/shared/do.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #### 4 | # First, log which machine is running this job 5 | #### 6 | uname -a 7 | 8 | tar xf jre-8u31-linux-x64.gz 9 | 10 | for l in `ls *.nxml` 11 | do 12 | sed -i 's/title>/p>/g' $l 13 | python html2text.py $l > $l.txt 14 | 15 | time ./jre1.8.0_31/bin/java -Xmx4g -XX:CICompilerCount=1 -XX:ConcGCThreads=1 -XX:ParallelGCThreads=1 -jar nlp_2015_2.jar $l.txt 16 | if [ -f $l.txt.nlp ] 17 | then 18 | echo "SUCCEED!" > SUCCEED.txt 19 | fi 20 | done 21 | 22 | rm nlp_2015_2.jar 23 | 24 | 25 | -------------------------------------------------------------------------------- /distribute/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | ssh 3 | conf/credentials.publishsettings 4 | .state 5 | env 6 | result 7 | segments 8 | -------------------------------------------------------------------------------- /distribute/README.md: -------------------------------------------------------------------------------- 1 | Distribute 2 | ========== 3 | 4 | Runs the [Parser](/parser) on multiple machines in parallel. Distribute provisions machines 5 | on EC2 or Azure, then processes chunks of your data on each machine, and 6 | finally terminates the machines. 7 | 8 | Before you begin, follow the instructions in [Setup](#setup) to install Distribute. 9 | 10 | 1. Launch instances on EC2 or Azure. **Note: For EC2, a General Purpose instance type is recommended (e.g. `m3.2xlarge`); instance types with less memory per core may cause the parser to abort.** 11 | 12 | ```bash 13 | fab launch:cloud=ec2,num=1 14 | ``` 15 | This will launch 1 instance on EC2. It will also put status information 16 | about the launched instance into `.state`. 17 | 18 | 2. Install dependencies on the remote machines: 19 | ```bash 20 | fab install > install.log 21 | ``` 22 | 23 | 3. Copy chunks to the remote machines, run the parser on them, and collect the results: 24 | ```bash 25 | time fab copy_parse_collect > parse.log 26 | ``` 27 | Tip: You can schedule the remote machines to be terminated automatically on task completion; note, though, that if the `parse` operation fails, nodes may not terminate: 28 | ```bash 29 | time fab copy_parse_collect terminate > parse.log 30 | ``` 31 | Tip: You can provide additional parameters to override defaults: 32 | ```bash 33 | time fab copy_parse_collect:input=test/input.json,batch_size=1000,parallelism=8,key_id='item_id',content_id='content' > parse.log 34 | ``` 35 | If `batch_size` is left unspecified, it will be computed automatically. Note that very large batch sizes may cause memory errors. See the [Parser](/parser) documentation for details on parser parameters. *Note also that commas need to be backslash-escaped when passed in as parameters.* 36 | 37 | 4. To check the global status of the distributed parse, run: 38 | ```bash 39 | fab get_status 40 | ``` 41 | 42 | 5. If the machines were not automatically terminated as above, or if an error occurred, terminate them: 43 | ```bash 44 | fab terminate 45 | ``` 46 | If termination is successful, the status information in `.state` will be deleted.
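Putting steps 1 through 5 together, a minimal end-to-end session with the defaults shown above looks roughly like this (a condensed sketch, not a verbatim transcript):

```bash
fab launch:cloud=ec2,num=1                          # provision one worker; state is recorded in .state
fab install > install.log                           # install the parser and its dependencies remotely
time fab copy_parse_collect terminate > parse.log   # distribute, parse, collect, then terminate
fab get_status                                      # run from another shell to check progress
```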
47 | 48 | Your parsed information should now be available as a tsv file named `result` in your working directory. 49 | 50 | ## Setup 51 | 52 | ### Dependencies 53 | 54 | If you have sudo rights, run `./setup.sh`. 55 | 56 | If you don't have sudo rights, follow these steps instead. These have been tested on raiders3 (Stanford): 57 | ``` 58 | cd bazaar 59 | wget https://raw.githubusercontent.com/pypa/virtualenv/develop/virtualenv.py 60 | python virtualenv.py env --no-setuptools 61 | source env/bin/activate 62 | wget https://raw.github.com/pypa/pip/master/contrib/get-pip.py 63 | python get-pip.py 64 | pip install fabric 65 | pip install urltools 66 | pip install azure 67 | pip install botocore 68 | ``` 69 | 70 | The fab command line tool should work now. 71 | 72 | Note that you will have to run `source env/bin/activate` after each login to initialize the environment. 73 | 74 | ### Generate SSH Keys 75 | 76 | Now, generate SSH keys. 77 | ``` 78 | ./generate-keys.sh 79 | ``` 80 | 81 | ### Build 82 | 83 | Finally, create a self-extracting installer that will be run on worker nodes. 84 | ``` 85 | cd installer 86 | ./build 87 | cd .. 88 | ``` 89 | 90 | ### Set EC2 or Azure credentials 91 | 92 | See variables in `env_local.sh` and override as needed. 93 | 94 | For ec2, we recommend storing credentials at `~/.aws/credentials` following this 95 | [documentation](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html). 96 | Make sure to `chmod 400 ~/.aws/credentials` and insert your access key and secret key: 97 | 98 | ``` 99 | [default] 100 | aws_access_key_id = 101 | aws_secret_access_key = 102 | ``` 103 | 104 | For azure, upload `ssh/mycert.cer` to the management portal via the "Upload" action of the "Settings" tab, and set the following variable in `env_local.sh`: 105 | ``` 106 | export AZURE_SUBSCRIPTION_ID= 107 | ``` 108 | 109 | ## Tips 110 | 111 | * You can log into any of the launched nodes on ec-2 or azure: 112 | ``` 113 | ssh -i ssh/bazaar.key -p PORT USER@HOST 114 | ``` 115 | where USER, HOST, PORT are contained in `.state/HOSTS`. 116 | 117 | * You can choose different instance types (see `env_local.sh`). 118 | 119 | * Test your distribution setup on smaller samples of your data, 120 | and more basic instance types (eg. Standard_D2 for azure). 121 | Then, when you are confident that everything works as expected, 122 | choose a more powerful instance type (eg. Standard_D14 on azure), 123 | and increase the parallelism in step 4 above (eg. 8 or 16). 124 | 125 | * By default, Azure only allows you to use a maximum of 20 cores 126 | in total. This means you can not launch more than one instance 127 | of type Standard_D14 (16 cores) at a time. You can submit a 128 | request to Microsoft to increase your quota of cores. 129 | 130 | * In case of errors, make sure you stop running VMs through the 131 | Azure management portal or AWS management console. You may have 132 | to `rm -r .state` to continue using Distribute. 133 | -------------------------------------------------------------------------------- /distribute/env_local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # AZURE SETTINGS 4 | 5 | # your azure subscription ID (look it up under 'Settings' in the management portal) 6 | # It has the following form '00000000-0000-0000-0000-000000000000' 7 | export AZURE_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID:-} 8 | 9 | # name for service (must be unique among all azure users), eg. 
'ddbazaa' 10 | export AZURE_SERVICE_NAME=${AZURE_SERVICE_NAME:-ddbazaa} 11 | 12 | # name for storage account (must be unique among all azure users) 13 | export AZURE_STORAGE_ACCOUNT=${AZURE_STORAGE_ACCOUNT:-ddbazaastore} 14 | 15 | # eg. 'Standard_D2', or 'Standard_D14' 16 | export AZURE_ROLE_SIZE=${AZURE_ROLE_SIZE:-Standard_D2} 17 | 18 | # EC2 SETTINGS 19 | 20 | # For ec-2, we recommend that you keep your AWS_ACCESS_KEY_ID and your 21 | # AWS_SECRET_ACCESS_KEY in ~/.aws/credentials. 22 | 23 | # eg. 'm3.large' 24 | #export EC2_INSTANCE_TYPE=${EC2_INSTANCE_TYPE:-r3.4xlarge} 25 | export EC2_INSTANCE_TYPE=m3.2xlarge 26 | -------------------------------------------------------------------------------- /distribute/generate-keys.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ -d "./ssh" ]; then 4 | echo 'Directory ./ssh exists already. Abort.' 5 | echo 'If you would like to re-generate the keys, please remove ./ssh and try again.' 6 | exit 1 7 | fi 8 | 9 | # install SSH keys 10 | echo "Creating SSH keys" 11 | rm -rf ./ssh 12 | mkdir ./ssh 13 | cd ./ssh 14 | 15 | # generate private/public key pair 16 | ssh-keygen -t rsa -b 2048 -f bazaar.key -N '' -C bazaar 17 | 18 | # generate azure pem file from openssh private key 19 | openssl req \ 20 | -x509 \ 21 | -days 365 \ 22 | -nodes \ 23 | -key bazaar.key \ 24 | -out bazaar.pem \ 25 | -newkey rsa:2048 \ 26 | -subj "/" 27 | 28 | # install (separate) management certificates for azure 29 | openssl req -x509 \ 30 | -nodes \ 31 | -days 365 \ 32 | -newkey rsa:1024 \ 33 | -keyout mycert.pem \ 34 | -out mycert.pem \ 35 | -subj "/" 36 | 37 | openssl x509 -inform pem -in mycert.pem -outform der -out mycert.cer 38 | 39 | echo 'All keys have been generated and placed into ./ssh.' 40 | echo ' ssh/bazaar.key is the private key used to log in to worker machines' 41 | echo ' ssh/bazaar.key.pub is the corresponding public key in OpenSSH format (ec2)' 42 | echo ' ssh/bazaar.pem is the corresponding public key in OpenSSL format (azure)' 43 | echo ' ssh/mycert.cer is a management certificate used for azure only' 44 | echo 'NOTE: If you would like to use Azure, you must upload ssh/mycert.cer via the "Upload" action of the "Settings" tab of the management portal.' 45 | 46 | -------------------------------------------------------------------------------- /distribute/installer/.gitignore: -------------------------------------------------------------------------------- 1 | install-parser 2 | payload.tar.gz 3 | -------------------------------------------------------------------------------- /distribute/installer/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Create parser package" 4 | 5 | INSTALLER_DIR="$(pwd)" 6 | PARSER_DIR=../../parser 7 | 8 | cd $PARSER_DIR 9 | tar cf $INSTALLER_DIR/payload/parser.tar \ 10 | build.sbt \ 11 | src \ 12 | run.sh \ 13 | run_parallel.sh \ 14 | project \ 15 | sbt \ 16 | setup.sh 17 | 18 | echo "Create self-extracting installer" 19 | 20 | cd $INSTALLER_DIR/payload 21 | tar cf ../payload.tar ./* 22 | cd .. 
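# Note: the self-extracting installer is assembled below by concatenating the
# 'decompress' stub with payload.tar.gz; at run time, 'decompress' untars
# everything after its __ARCHIVE_BELOW__ marker into a temp dir and runs
# ./installer from there.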
23 | 24 | if [ -e "payload.tar" ]; then 25 | gzip payload.tar 26 | 27 | if [ -e "payload.tar.gz" ]; then 28 | cat decompress payload.tar.gz > install-parser 29 | else 30 | echo "payload.tar.gz does not exist" 31 | exit 1 32 | fi 33 | else 34 | echo "payload.tar does not exist" 35 | exit 1 36 | fi 37 | 38 | chmod +x install-parser 39 | echo "install-parser created" 40 | exit 0 41 | -------------------------------------------------------------------------------- /distribute/installer/decompress: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "" 3 | echo "Self Extracting Installer" 4 | echo "" 5 | 6 | export TMPDIR=`mktemp -d /tmp/selfextract.XXXXXX` 7 | 8 | ARCHIVE=`awk '/^__ARCHIVE_BELOW__/ {print NR + 1; exit 0; }' $0` 9 | 10 | tail -n+$ARCHIVE $0 | tar xzv -C $TMPDIR 11 | 12 | CDIR=`pwd` 13 | cd $TMPDIR 14 | ./installer 15 | 16 | cd $CDIR 17 | rm -rf $TMPDIR 18 | 19 | exit 0 20 | 21 | __ARCHIVE_BELOW__ 22 | -------------------------------------------------------------------------------- /distribute/installer/payload/installer: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Running Installer" 3 | rm -rf $HOME/parser 4 | mkdir $HOME/parser 5 | tar xf ./parser.tar -C $HOME/parser 6 | 7 | # install java? 8 | if type -p java; then 9 | echo "Found java" 10 | else 11 | sys=`uname -s` 12 | if [ ! "$sys" == "Linux" ]; then 13 | echo "Only supporting Ubuntu" 14 | exit 1 15 | fi 16 | #echo "Getting openjdk" 17 | #mkdir /tmp/openjdk 18 | #cd /tmp/openjdk 19 | #apt-get -y --print-uris install openjdk-7-jdk | grep "http.*deb" -o | xargs -0 echo 20 | #apt-get -y --print-uris install openjdk-7-jdk | grep "http.*deb" -o | xargs -0 wget 21 | #for d in *.deb; do dpkg -x "$d" openjdk; done 22 | #rm *.deb 23 | #_java=/tmp/openjdk/TODO 24 | 25 | cd $HOME 26 | wget --no-check-certificate --no-cookies --header "Cookie: oraclelicense=accept-securebackup-cookie" http://download.oracle.com/otn-pub/java/jdk/8u45-b14/jdk-8u45-linux-x64.tar.gz 27 | tar xvzf jdk-8u45-linux-x64.tar.gz 28 | 29 | echo 'export PATH=~/jdk1.8.0_45/bin:$PATH' >> ~/.bashrc 30 | echo 'export JAVA_HOME=~/jdk1.8.0_45' >> ~/.bashrc 31 | export PATH=~/jdk1.8.0_45/bin:$PATH 32 | export JAVA_HOME=~/jdk1.8.0_45 33 | fi 34 | 35 | DIRNAME=$HOME/parser 36 | DESTDIR=$DIRNAME/lib 37 | FILENAME='stanford-srparser-2014-10-23-models.jar' 38 | if [ ! -e "$DESTDIR/$FILENAME" ]; then 39 | mkdir -p $DESTDIR 40 | wget -P $DESTDIR http://nlp.stanford.edu/software/stanford-srparser-2014-10-23-models.jar 41 | else 42 | echo "Skipping download: $DESTDIR/$FILENAME already exists" 43 | fi 44 | 45 | cd $HOME/parser 46 | which java 47 | sbt/sbt stage 48 | -------------------------------------------------------------------------------- /distribute/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # install virtualenv 4 | command -v virtualenv >/dev/null 2>&1 || { 5 | echo >&2 "virtualenv required but not installed. 
Aborting."; 6 | echo >&2 "You can install virtualenv with:" 7 | echo >&2 " sudo pip install virtualenv" 8 | } 9 | 10 | virtualenv env 11 | source env/bin/activate 12 | 13 | pip install azure 14 | pip install botocore 15 | pip install fabric 16 | pip install urltools 17 | 18 | -------------------------------------------------------------------------------- /parser/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | -------------------------------------------------------------------------------- /parser/README.md: -------------------------------------------------------------------------------- 1 | Parser 2 | ====== 3 | 4 | Run `setup.sh` to install dependencies and build the parser. 5 | 6 | We assume that your input has the following format. There's one line per document and each document is a JSON object with a key and content field. 7 | 8 | ```json 9 | { "item_id":"doc1", "content":"Here is the content of my document.\nAnd here's another line." } 10 | { "item_id":"doc2", "content":"Here's another document." } 11 | ``` 12 | 13 | You can run the NLP pipeline on 1 core as follows: 14 | 15 | ```bash 16 | cat input.json | ./run.sh -i json -k "item_id" -v "content" > output.tsv 17 | ``` 18 | 19 | You can run the NLP pipeline on 16 cores as follows: 20 | ```bash 21 | ./run_parallel.sh -in="input.json" --parallelism=16 -i json -k "item_id" -v "content" 22 | ``` 23 | 24 | You can run the NLP pipeline as a REST service as follows: 25 | 26 | ```bash 27 | ./run.sh -p 8080 28 | ``` 29 | 30 | The output will be files in tsv-format that you can directly load into the database. 31 | 32 | 33 | ## Setup 34 | 35 | This package requires Java 8. 36 | 37 | -------------------------------------------------------------------------------- /parser/build.sbt: -------------------------------------------------------------------------------- 1 | import com.typesafe.sbt.SbtStartScript 2 | 3 | name := "deepdive-nlp-parser" 4 | 5 | version := "0.1" 6 | 7 | scalaVersion := "2.10.3" 8 | 9 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 10 | 11 | resolvers += "Scalaz Bintray Repo" at "https://dl.bintray.com/scalaz/releases" 12 | 13 | libraryDependencies ++= List( 14 | "ch.qos.logback" % "logback-classic" % "1.0.7", 15 | "com.typesafe.play" %% "play-json" % "2.2.1", 16 | "com.github.scopt" %% "scopt" % "3.2.0", 17 | "edu.stanford.nlp" % "stanford-corenlp" % "3.5.1", 18 | "edu.stanford.nlp" % "stanford-corenlp" % "3.5.1" classifier "models", 19 | "org.scalatest" % "scalatest_2.10" % "2.0.RC2" % "test", 20 | "org.http4s" %% "http4s-dsl" % "0.7.0", 21 | "org.http4s" %% "http4s-jetty" % "0.7.0" 22 | ) 23 | 24 | unmanagedJars in Compile += file("lib/stanford-srparser-2014-10-23-models.jar") 25 | 26 | parallelExecution in Test := false 27 | 28 | test in assembly := {} 29 | 30 | seq(SbtStartScript.startScriptForClassesSettings: _*) 31 | 32 | -------------------------------------------------------------------------------- /parser/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0") 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") 4 | -------------------------------------------------------------------------------- /parser/run.sh: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env bash 2 | 3 | export JAVA_OPTS="-Xmx4g -Dfile.encoding=UTF-8" 4 | 5 | $(dirname $0)/target/start $@ 6 | -------------------------------------------------------------------------------- /parser/run_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Parse sentences in parallel 3 | 4 | set -eu 5 | 6 | # Usage: this_script input_file parallelism input_batch_size 7 | 8 | if [ "$#" -le 1 ]; then 9 | echo "Usage: $0 input_file parallelism [input_batch_size=1000] [sentence_words_limit=120]" 10 | exit 11 | fi 12 | 13 | for i in "$@" 14 | do 15 | case $i in 16 | -in=*|--input=*) 17 | INPUT_FILE="${i#*=}" 18 | shift 19 | ;; 20 | -p=*|--parallelism=*) 21 | PARALLELISM="${i#*=}" 22 | shift 23 | ;; 24 | -b=*|--batch-size=*) 25 | BATCH_SIZE="${i#*=}" 26 | shift 27 | ;; 28 | *) 29 | echo "NO MATCH" 30 | break 31 | ;; 32 | esac 33 | done 34 | 35 | if [ -z "$INPUT_FILE" ]; then 36 | echo "Usage: $0 -i=input_file [--parallelism=PARALLELISM] [--batch-size=BATCH_SIZE ] " 37 | exit 38 | fi 39 | 40 | PARALLELISM=${PARALLELISM:-2} 41 | BATCH_SIZE=${BATCH_SIZE:-1000} 42 | 43 | echo "parallelism = $PARALLELISM" 44 | echo "batch-size = $BATCH_SIZE" 45 | 46 | RUN_SCRIPT=`cd $(dirname $0)/; pwd`"/run.sh $@" 47 | echo $RUN_SCRIPT 48 | mkdir -p $INPUT_FILE.split 49 | rm -f $INPUT_FILE.split/* 50 | 51 | # Split the input file into subfiles 52 | split -a 10 -l $BATCH_SIZE $INPUT_FILE $INPUT_FILE.split/input- 53 | 54 | # Match all files in the split directory 55 | find $INPUT_FILE.split -name "input-*" 2>/dev/null -print0 | xargs -0 -P $PARALLELISM -L 1 bash -c "${RUN_SCRIPT}"' -f "$0"' 56 | 57 | echo "Output TSV files are in: $INPUT_FILE.split/*.parsed" 58 | echo "To load them into the databse, run: cat $INPUT_FILE.split/*.parsed | psql YOUR_DB_NAME -c "'"COPY sentences FROM STDIN"' 59 | -------------------------------------------------------------------------------- /parser/sbt/sbt: -------------------------------------------------------------------------------- 1 | java $SBT_OPTS -jar `dirname $0`/sbt-launch.jar "$@" -------------------------------------------------------------------------------- /parser/sbt/sbt-launch.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/bazaar/c09dce20f16a90c359f804f9e83d6107547d442c/parser/sbt/sbt-launch.jar -------------------------------------------------------------------------------- /parser/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | DIRNAME=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 6 | 7 | # fetch SR models 8 | DESTDIR="$DIRNAME"/lib 9 | FILENAME='stanford-srparser-2014-10-23-models.jar' 10 | if [ ! -e "$DESTDIR/$FILENAME" ]; then 11 | mkdir -p "$DESTDIR" 12 | url="http://nlp.stanford.edu/software/stanford-srparser-2014-10-23-models.jar" 13 | if type wget &>/dev/null; then 14 | wget -P "$DESTDIR" "$url" 15 | elif type curl &>/dev/null; then 16 | ( cd "$DESTDIR" && curl -LO "$url" ) 17 | else 18 | echo >&2 "Could not find curl or wget. 
Manually download $url to $DESTDIR/" 19 | false 20 | fi 21 | else 22 | echo "Skipping download: $DESTDIR/$FILENAME already exists" 23 | fi 24 | 25 | # java 26 | #sudo add-apt-repository -y ppa:openjdk-r/ppa 27 | #sudo apt-get update 28 | #sudo apt-get install -y openjdk-8-jdk 29 | 30 | # check if java -version >= 1.8 31 | javaVersion=$(java -version 2>&1 | sed -e '1!d; s/^java version "//; s/"$//') 32 | [[ ! $javaVersion < 1.8 ]] || { 33 | echo >&2 "java -version >= 1.8 required but found: $javaVersion" 34 | false 35 | } 36 | 37 | # build parser 38 | cd "$DIRNAME" 39 | sbt/sbt stage 40 | 41 | -------------------------------------------------------------------------------- /parser/src/main/scala/com/clearcut/nlp/DocumentParseResult.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.nlp 2 | 3 | case class SentenceParseResult( 4 | sentence: String, 5 | words: List[String], 6 | lemma: List[String], 7 | pos_tags: List[String], 8 | ner_tags: List[String], 9 | offsets: List[Int], 10 | dep_labels: List[String], 11 | dep_parents: List[Int], 12 | collapsed_deps: List[String] 13 | ) 14 | 15 | case class DocumentParseResult( 16 | sentences: List[SentenceParseResult] 17 | ) 18 | -------------------------------------------------------------------------------- /parser/src/main/scala/com/clearcut/nlp/DocumentParser.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.nlp 2 | 3 | import edu.stanford.nlp.ling.CoreAnnotations._ 4 | import edu.stanford.nlp.pipeline._ 5 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.{CollapsedCCProcessedDependenciesAnnotation, CollapsedDependenciesAnnotation} 6 | 7 | // import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation 8 | import java.util.Properties 9 | 10 | import scala.collection.JavaConversions._ 11 | 12 | 13 | class DocumentParser(props: Properties) { 14 | 15 | val pipeline = new StanfordCoreNLP(props) 16 | 17 | def parseDocumentString(doc: String) = { 18 | 19 | // Temporary fix for bug where brackets are being incorrectly treated as punct 20 | // and somehow this messes up the whole dep parse -> change them to round braces 21 | val doc2 = doc.replaceAll("""\[""", "(").replaceAll("""\]""", ")") 22 | 23 | val document = new Annotation(doc2) 24 | pipeline.annotate(document) 25 | // val dcoref = document.get(classOf[CorefChainAnnotation]) 26 | val sentences = document.get(classOf[SentencesAnnotation]) 27 | 28 | val sentenceResults = sentences.zipWithIndex.map { case(sentence, sentIdx) => 29 | val content = sentence.toString 30 | val tokens = sentence.get(classOf[TokensAnnotation]) 31 | val wordList = tokens.map(_.get(classOf[TextAnnotation])) 32 | val posList = tokens.map(_.get(classOf[PartOfSpeechAnnotation])) 33 | val nerList = tokens.map(_.get(classOf[NamedEntityTagAnnotation])) 34 | val lemmaList = tokens.map(_.get(classOf[LemmaAnnotation])) 35 | val offsetList = tokens.map(_.get(classOf[CharacterOffsetBeginAnnotation]).intValue) 36 | 37 | // This kind of dep paths seem to be a tree. Need CoreNLP guys to confirm. 38 | // Ce has been using this all along. 
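// Note: the arrays below are indexed by the dependent (target) token, using its
// 1-based CoreNLP index minus one; depParents holds the 1-based index of the
// governor, and tokens with no incoming collapsed edge (e.g. the root) keep 0.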
39 | val depCollapsedPaths = sentence.get(classOf[CollapsedDependenciesAnnotation]).edgeIterable 40 | val depLabels = Array.fill(tokens.size)("") 41 | val depParents = Array.fill(tokens.size)(0) 42 | for (path <- depCollapsedPaths) { 43 | depLabels(path.getTarget.index - 1) = path.getRelation.toString 44 | depParents(path.getTarget.index - 1) = path.getSource.index 45 | } 46 | 47 | // This kind of dep paths may have cycles. 48 | val depCCPPaths = sentence.get(classOf[CollapsedCCProcessedDependenciesAnnotation]).edgeIterable 49 | val ccpPathTriples = for(path <- depCCPPaths) yield 50 | List(path.getSource.index, path.getRelation, path.getTarget.index).mkString(",") 51 | 52 | SentenceParseResult( 53 | content, 54 | wordList.toList, 55 | lemmaList.toList, 56 | posList.toList, 57 | nerList.toList, 58 | offsetList.toList, 59 | depLabels.toList, 60 | depParents.toList, 61 | ccpPathTriples.toList 62 | ) 63 | } 64 | 65 | DocumentParseResult(sentenceResults.toList) 66 | } 67 | 68 | /** 69 | Construct a Postgres-acceptable array in the TSV format, from a list 70 | */ 71 | def list2TSVArray(arr: List[String]) : String = { 72 | return arr.map( x => 73 | // Replace '\' with '\\\\' to be accepted by COPY FROM 74 | // Replace '"' with '\\"' to be accepted by COPY FROM 75 | if (x.contains("\\")) 76 | "\"" + x.replace("\\", "\\\\\\\\").replace("\"", "\\\\\"") + "\"" 77 | else 78 | "\"" + x + "\"" 79 | ).mkString("{", ",", "}") 80 | } 81 | 82 | def intList2TSVArray(arr: List[Int]) : String = { 83 | return arr.map( x => 84 | "" + x 85 | ).mkString("{", ",", "}") 86 | } 87 | 88 | def string2TSVString(str: String) : String = { 89 | if (str.contains("\\")) 90 | str.replace("\\", "\\\\") 91 | else 92 | str 93 | } 94 | 95 | // NOTE: an alternative would be to quote the field correctly 96 | // http://stackoverflow.com/questions/3089077/new-lines-in-tab-delimited-or-comma-delimtted-output 97 | def replaceChars(str: String) : String = { 98 | str.replace("\n", " ").replace("\t", " ") 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /parser/src/main/scala/com/clearcut/nlp/JSONReader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.nlp 2 | 3 | import play.api.libs.json.{JsObject, JsString, JsValue, Json} 4 | 5 | import scala.io.Source 6 | 7 | class JSONReader(input:Source, docIdKeys:Array[String], documentKey:String) 8 | extends Iterator[(Array[String], String)] { 9 | 10 | var it = input.getLines.zipWithIndex 11 | var _next = fetchNext() 12 | 13 | override def hasNext: Boolean = 14 | _next != null 15 | 16 | override def next(): (Array[String], String) = { 17 | val n = _next 18 | _next = fetchNext() 19 | n 20 | } 21 | 22 | private def fetchNext(): (Array[String], String) = { 23 | var n:(Array[String], String) = null 24 | while (n == null && it.hasNext) { 25 | val (line, num) = it.next 26 | 27 | val jsObj = Json.parse(line).asInstanceOf[JsObject] 28 | 29 | val maybeDocumentIds = new Array[String](docIdKeys.length) 30 | docIdKeys.zipWithIndex.foreach { case (idk, i) => 31 | val maybeDocumentId = jsObj.value.get(idk); 32 | (maybeDocumentId) match { 33 | case (Some(documentId:JsString)) => 34 | maybeDocumentIds(i) = documentId.value 35 | case (_) => 36 | maybeDocumentIds(i) = "\\N" 37 | } 38 | } 39 | 40 | val maybeDocumentStr = jsObj.value.get(documentKey).map(_.asInstanceOf[JsString].value) 41 | 42 | (maybeDocumentIds, maybeDocumentStr) match { 43 | case (documentIds:Array[String], 
Some(documentStr:String)) => 44 | n = (documentIds, documentStr) 45 | //case (Array[None],_) => 46 | // System.err.println(s"Warning: skipped malformed line ${num}: ${line}") 47 | case (_, None) => 48 | System.err.println(s"Warning: skipped malformed line ${num}: ${line}") 49 | } 50 | } 51 | n 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /parser/src/main/scala/com/clearcut/nlp/Server.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.nlp 2 | 3 | import org.http4s._ 4 | import org.http4s.dsl._ 5 | import org.http4s.server.HttpService 6 | import org.http4s.server.jetty.JettyBuilder 7 | import scala.collection.mutable.ListBuffer 8 | import scalaz.stream.Process._ 9 | 10 | 11 | class Server(dp: DocumentParser, port: Integer) { 12 | 13 | val route = HttpService { 14 | case req @ GET -> Root => 15 | Ok("Hello. I can parse stuff. Just POST the text to me.\n") 16 | 17 | case req @ POST -> Root => 18 | // WARNING: when request body is empty, http4s seems to hang here 19 | val content = new String(req.body.runLog.run.reduce(_ ++ _).toArray, Charset.`UTF-8`.nioCharset) 20 | 21 | val lines = ListBuffer[String]() 22 | dp.parseDocumentString(content).sentences.zipWithIndex 23 | .foreach { case (sentenceResult, sentence_idx) => 24 | 25 | val outline = List( 26 | sentence_idx + 1, 27 | sentenceResult.sentence, 28 | dp.list2TSVArray(sentenceResult.words), 29 | dp.list2TSVArray(sentenceResult.lemma), 30 | dp.list2TSVArray(sentenceResult.pos_tags), 31 | dp.list2TSVArray(sentenceResult.ner_tags), 32 | dp.intList2TSVArray(sentenceResult.offsets), 33 | dp.list2TSVArray(sentenceResult.dep_labels), 34 | dp.intList2TSVArray(sentenceResult.dep_parents) 35 | // dp.list2TSVArray(sentenceResult.collapsed_deps) 36 | ).mkString("\t") 37 | 38 | lines += outline 39 | } 40 | Ok(lines.toList.mkString("\n") + "\n") 41 | } 42 | 43 | def run() = { 44 | JettyBuilder 45 | .mountService(route, "") 46 | .bindHttp(port) 47 | .run 48 | .awaitShutdown() 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /parser/src/main/scala/com/clearcut/nlp/TSVReader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.nlp 2 | 3 | import scala.io.BufferedSource 4 | 5 | class TSVReader(input:BufferedSource, 6 | idCols:Array[Int], documentCol:Int) 7 | extends Iterator[(Array[String], String)] { 8 | 9 | var it = input.getLines.zipWithIndex 10 | var _next = fetchNext() 11 | 12 | override def hasNext: Boolean = 13 | _next != null 14 | 15 | override def next(): (Array[String], String) = { 16 | val n = _next 17 | _next = fetchNext() 18 | n 19 | } 20 | 21 | private def fetchNext(): (Array[String], String) = { 22 | var n:(Array[String], String) = null 23 | while (n == null && it.hasNext) { 24 | val (line, num) = it.next 25 | val tsvArr = line.trim.split("\t") 26 | if (tsvArr.length >= idCols.length + 1) { 27 | val documentIds = idCols.map(idc => tsvArr(idc)) 28 | val documentStr = tsvArr(documentCol) 29 | n = (documentIds, documentStr) 30 | } else { 31 | System.err.println(s"Warning: skipped malformed line ${num}: ${line}") 32 | } 33 | } 34 | n 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /parser/src/test/resources/input.json.txt: -------------------------------------------------------------------------------- 1 | {"documents.id" : 5, "documents.text" : "I am document one. 
I am sentence twp, really. I am another sentence, called sentence three."} 2 | {"documents.id" : 7, "documents.text" : "John drove to Judy’s house and he made her dinner. This sentence should have some corefs."} -------------------------------------------------------------------------------- /parser/src/test/resources/testdoc.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |

Wrike has launched a new version of its project management platform with an emphasis on real-time analysis and new features such as syncing calendars to work projects. The new platform, Wrike Enterprise, gives the company a deeper focus on the corporate market for its collaboration-centered tools. It gives customers a way to crunch project management data in the order of a million updates per day. This is data around work items such as tasks completed, the original time planned for the project and the historical data that is associated with the project. The data is presented in “instant infographics,” that help people see the latest updates to projects, said Wrike CEO Andrew Filev in an email interview. Wrike-Enterprise-visual-reports Historically, project managers have done detailed plans that they then track. The manager periodically updates the projects and then compares the current state to the baseline established at the start of the project. With the Wrike platform, the data from every interaction is stored and then compared to historical data and then presented in a chart. A customer can see the state of the project from different dimensions such as the realistic amount of time a project will take to get done,  what requires immediate action and how performance of an employee has evolved over time.

5 |

A new  user group feature in Wrike Enterprise allows the project manager to include employees in multiple work groups by project, department, or any other ad hoc query. It can share the needed data with the whole group and keep permissions organized. This allows the manager to keep track of the overall project without hundreds of people making their own changes.  Wrike-Enterprise-user-groups

6 |
Wrike’s new “Custom Calendars,” syncs projects with the calendars of other members on the team. It allows the manager to track a colleague’s vacations, PTO and extra working days. It is designed to avoid schedule overlaps and build more accurate plans. Wrike-Enterprise-custom-calendars There are also new ways to integrate a company’s  identity into the service. Wrike has also added new security controls for larger customers. In October, Wrike raised $10 million in funding.  It was the first round since the company was originally founded seven years ago. The company has traditionally served the small business community but this release points to its additonal focus on the larger enterprise companies of the world. Wrike competes with the likes of Atlassian and Asana. But its advantage is in its crisp user interface which it can now leverage even more as it embraces data as a way for project managers to better keep track of their projects.  
7 |
8 |
Feature image courtesy of VFS Digital Design on Flickr via Creative Commons)
9 |
10 | 11 | 12 | 13 |
-------------------------------------------------------------------------------- /parser/src/test/resources/testdoc.txt: -------------------------------------------------------------------------------- 1 | In a decision that could have far-reaching consequences, the D.C. Circuit Court of Appeals today struck down the FCC’s Open Internet Order. That Order, put into force in 2010 by then-chairman Julius Genachowski, was designed to make it so that broadband service providers couldn’t meddle with traffic on the web based on its type – in other words, they couldn’t block certain kinds of online data transmission just because it didn’t align with their own goals and financial strategy. 2 | 3 | Media watchdog and advocacy agency Free Press released the following statement about the decision via President and CEO Craig Aaron, condemning it while also acknowledging that the Open Internet Order probably wasn’t the best possible solution for enforcing net neutrality: -------------------------------------------------------------------------------- /parser/src/test/scala/DocumentParserSpec.scala: -------------------------------------------------------------------------------- 1 | package org.deepdive.udf.nlp.test 2 | 3 | import java.io._ 4 | import com.clearcut.nlp.{Main, DocumentParser} 5 | import org.deepdive.udf.nlp._ 6 | import org.scalatest._ 7 | import play.api.libs.json._ 8 | import scala.io.Source 9 | import java.util.Properties 10 | 11 | class DocumentParserSpec extends FunSpec { 12 | 13 | describe("Parsing documents") { 14 | 15 | it("should work with plain text") { 16 | val inputFile = getClass.getResource("/testdoc.txt").getFile 17 | val documentStr = Source.fromFile(inputFile).mkString 18 | 19 | val props = new Properties() 20 | props.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, dcoref") 21 | val dp = new DocumentParser(props) 22 | 23 | val result = dp.parseDocumentString(documentStr) 24 | assert(result.sentences.size == 3) 25 | } 26 | 27 | it("should work with HTML documents") { 28 | val inputFile = getClass.getResource("/testdoc.html").getFile 29 | val documentStr = Source.fromFile(inputFile).mkString 30 | 31 | val props = new Properties() 32 | props.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, dcoref") 33 | val dp = new DocumentParser(props) 34 | 35 | val result = dp.parseDocumentString(documentStr) 36 | assert(result.sentences.size == 23) 37 | } 38 | 39 | } 40 | 41 | describe("Running the main method from the command line") { 42 | 43 | it("should work with valid JSON") { 44 | // Read stdin from file 45 | val inputFile = getClass.getResource("/input.json.txt").getFile 46 | val is = new FileInputStream(inputFile) 47 | System.setIn(is) 48 | 49 | // Execute the main method 50 | Main.main(Array("--valueKey", "documents.text", "--idKey", "documents.id")) 51 | } 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /parser/test/input.tsv: -------------------------------------------------------------------------------- 1 | 1 This is a test document. 2 | 2 This is another test document. It contains two sentences. 3 | 3 This is yet another one. It contains three sentences. The last sentence is, however, the longest. 4 | 4 Barack Obama, the current U.S. president, married to his wife Michelle several years ago. 5 | 5 In a decision that could have far-reaching consequences, the D.C. Circuit Court of Appeals today struck down the FCC’s Open Internet Order. 
That Order, put into force in 2010 by then-chairman Julius Genachowski, was designed to make it so that broadband service providers couldn’t meddle with traffic on the web based on its type – in other words, they couldn’t block certain kinds of online data transmission just because it didn’t align with their own goals and financial strategy. 6 | 7 | 6 Media watchdog and advocacy agency Free Press released the following statement about the decision via President and CEO Craig Aaron, condemning it while also acknowledging that the Open Internet Order probably wasn’t the best possible solution for enforcing net neutrality: 8 | 9 | 7 This is a "test document", with "quotation marks and slashes: \ \\ \\\ \\\\" 10 | 8 This sentence contains a mean token\. 11 | -------------------------------------------------------------------------------- /pipe/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | lib/ -------------------------------------------------------------------------------- /pipe/PLANS.md: -------------------------------------------------------------------------------- 1 | 2 | Plans 3 | ===== 4 | 5 | Change annotator schema to something like 6 | 7 | ``` 8 | class Annotator[A,B] {} 9 | 10 | class StanfordTokenizer[C <: HasText, D <: HasTokens with HasTokenOffsets] extends Annotator[C,D] {} 11 | ``` 12 | 13 | Then the readers can be type-safe, too: 14 | 15 | ``` 16 | val r = new ColumnReader[HasText with HasID] 17 | ``` 18 | 19 | Internally, the reader can look for the right files. 20 | 21 | And to build a concrete object: 22 | 23 | ``` 24 | trait HasInt { def getInt:Int } 25 | trait HasString { def getString:String } 26 | 27 | val obj:HasInt with HasString = 28 | new HasInt with HasString { 29 | def getInt = 12 30 | def getString = "hello" 31 | } 32 | ``` 33 | 34 | Open question: how to merge two objects? 35 | 36 | If we do multiple extractions, and then want to merge the results: 37 | 38 | val obj1:HasX with HasY 39 | val obj2:HasZ 40 | 41 | val obj = ???? 42 | 43 | -------------------------------------------------------------------------------- /pipe/README.md: -------------------------------------------------------------------------------- 1 | Pipe 2 | ==== 3 | 4 | Lightweight schemas and processing framework for NLP. 5 | 6 | Pipe addresses the following problems: 7 | 8 | * In many DeepDive applications, errors in pre-processing become more relevant as one tries to push up precision and recall. Often no further quality improvement is possible without targeting these errors. 9 | 10 | An example: 11 | ``` 12 | $300. 00 per hour 13 | ``` 14 | Our sentence splitter would break on the period and create two sentences. 15 | 16 | For some extractors we have tried work-arounds by adding complex rules to our extractors which target these errors. In fact, a significant portion of code in our 'rates' extractor is code to workaround this problem, but this code is complex and difficult to maintain. 17 | 18 | The right approach, of course, should be to fix the pre-processing components directly. Unfortunately, this is tricky because we treat all pre-processing as a black box, making changes nearly impossible. 19 | 20 | Pipe solves this problem by breaking up the preprocessing components. It is now easy to add your custom tokenization or sentence splitting rules. For almost any domain, we want to add a few such domain-specific rules to improve pre-processing. 21 | 22 | * We have a few problems with our current schemas for NLP. 23 | 1. 
Our NLP parser outputs a file in psql-specific format that no other application can read. 24 | 2. When running extractors, we manually serialize and deserialize using custom logic consisting of UDFs (array_to_string) and language-specific code (String.split). 25 | 3. Our sentences table is very wide, but most extractors only need 2 or 3 columns. This creates unnecessary I/O. 26 | 4. Our output is lossy, because we don't store the original text (only a tokenized version), and we have lost the mapping to the original characters. 27 | 5. It would be difficult to add coreference information to the sentences table, because it is not document-based. 28 | 29 | ## Schemas 30 | 31 | With Pipe, we create a set of *minimal* schemas for the different NLP annotations. There's one schema for each type of annotation, and we currently have 18 schemas in total. The schemas are in JSON, which makes it trivial to read from and write in any programming language. 32 | 33 | Examples: 34 | 35 | ann.id 36 | ``` 37 | "doc123" 38 | ``` 39 | 40 | ann.text 41 | ``` 42 | "This is a very simple text file.\nIt contains two sentences." 43 | ``` 44 | 45 | ann.poss 46 | ``` 47 | ["DT","VBZ","DT","RB","JJ","NN","NN",".","PRP","VBZ","CD","NNS","."] 48 | ``` 49 | 50 | ann.tokens 51 | ``` 52 | ["This","is","a","very","simple","text","file",".","It","contains","two","sentences","."] 53 | ``` 54 | ann.tokenOffsets 55 | ``` 56 | [[0,4],[5,7],[8,9],[10,14],[15,21],[22,26],[27,31],[31,32],[33,35],[36,44],[45,48],[49,58],[58,59]] 57 | ``` 58 | 59 | ann.sentenceOffsets 60 | ``` 61 | [[0,32],[33,59]] 62 | ``` 63 | 64 | ann.sentenceTokenOffsets 65 | ``` 66 | [[0,8],[8,13]] 67 | ``` 68 | 69 | ## Storage 70 | 71 | We propose to store these in column format, where there exists one file for each type of schema. 72 | Pipe contains readers and writers for column format in both [scala](src/main/scala/com/clearcut/pipe/io) and [python](../view/util/pipe.py). 73 | 74 | For compatibility reasons, Pipe also allows you to read and write as single JSON: 75 | ``` 76 | { 77 | "id": "doc123", 78 | "text": "This is a very simple text file.\nIt contains two sentences.", 79 | "poss": ["DT","VBZ","DT","RB","JJ","NN","NN",".","PRP","VBZ","CD","NNS","."], 80 | "tokens": ["This","is","a","very","simple","text","file",".","It","contains","two","sentences","."], 81 | ... 82 | } 83 | ``` 84 | And for backwards compatibility, Pipe also allows you to write in our psql-specific TSV. 85 | 86 | ## Framework 87 | 88 | The framework allows you to plug together different preprocessing components. Currently, Pipe contains wrappers for most components of Stanford CoreNLP, as well as a components that can run an entire Stanford pipeline. 89 | 90 | Since the components read and write our language-agnostic schemas, we can now plug together components in arbitrary programming languages including python, scala, julia. 91 | 92 | When working with Scala, you can choose to use static typing or not. If you use static typing, [our typedefs](src/main/scala/com/clearcut/pipe/model/package.scala) make code compact and clean: 93 | ``` 94 | type ID = String 95 | type Poss = Array[String] 96 | type Offsets = Array[Int] 97 | type SentenceDependencies = Array[Array[Dependency]] 98 | type SentenceOffsets = Array[Offsets] 99 | type SentenceTokenOffsets = Array[Offsets] 100 | type Text = String 101 | ... 102 | ``` 103 | An example is [here](src/test/scala/BasicSpec.scala). 104 | 105 | To build a custom tokenizer that solves the `$300. 
00` problem, you can write something like 106 | ``` 107 | import com.clearcut.pipe.annotator.Annotator 108 | import com.clearcut.pipe.model._ 109 | 110 | class MyTokenizer extends Annotator[Text,(TokenOffsets,Tokens)] { 111 | override def annotate(t:(Text)):(TokenOffsets, Tokens) = { 112 | // add custom logic here 113 | } 114 | } 115 | ``` 116 | 117 | ## Tip 118 | 119 | You can run Pipe in a regular scala REPL and manipulate your data or processing components interactively. 120 | 121 | You can also run our python readers and writers in a python REPL and create your own components there. 122 | 123 | ## Setup 124 | 125 | Run `setup.sh` to install dependencies and build the parser. Pipe requires Java 8. 126 | 127 | ## Usage 128 | 129 | Here are a few examples showing how to call Pipe with the provided launcher scripts. 130 | 131 | ``` 132 | ./run.sh -i INPUT.json --formatIn json --formatOut json -v content -k doc_id -a SimpleStanfordPipeline -o OUTPUT 133 | ``` 134 | Reads INPUT.json which contains json objects with fields "doc_id" and "content". Writes results as json objects to file OUTPUT. 135 | 136 | ``` 137 | ./run.sh -i INPUT.json --formatIn json --formatOut column -v content -k doc_id -a StanfordTokenizer,StanfordSentenceSplitter,StanfordPOSTagger,StanfordLemmatizer,StanfordNERTagger,StanfordSRParser -o test 138 | ``` 139 | Runs a custom set of annotators and stores results in column format. 140 | 141 | ``` 142 | /run_parallel.sh --input=INPUT.json --parallelism=10 '--formatIn json --formatOut column -v content -k doc_id -a ExtendedStanfordPipeline' 143 | ``` 144 | Splits the input file into segments and runs 10 parallel processes at a time. The ExtendedStanfordPipeline adds parse trees and true case annotations. 145 | 146 | -------------------------------------------------------------------------------- /pipe/build.sbt: -------------------------------------------------------------------------------- 1 | import com.typesafe.sbt.SbtStartScript 2 | 3 | organization := "com.clearcut" 4 | 5 | name := "pipe" 6 | 7 | version := "0.1-SNAPSHOT" 8 | 9 | scalaVersion := "2.11.7" 10 | 11 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 12 | 13 | resolvers += "Scalaz Bintray Repo" at "https://dl.bintray.com/scalaz/releases" 14 | 15 | libraryDependencies ++= List( 16 | "ch.qos.logback" % "logback-classic" % "1.0.7", 17 | "com.typesafe.play" %% "play-json" % "2.3.4", 18 | "com.github.scopt" %% "scopt" % "3.2.0", 19 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0", 20 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" classifier "models", 21 | "org.scalatest" % "scalatest_2.11" % "2.2.5" % "test", 22 | "org.http4s" %% "http4s-dsl" % "0.7.0", 23 | "org.http4s" %% "http4s-jetty" % "0.7.0", 24 | "org.json4s" %% "json4s-jackson" % "3.2.11", 25 | "org.jsoup" % "jsoup" % "1.8.3" 26 | ) 27 | 28 | unmanagedJars in Compile += file("lib/stanford-srparser-2014-10-23-models.jar") 29 | 30 | parallelExecution in Test := false 31 | 32 | test in assembly := {} 33 | 34 | seq(SbtStartScript.startScriptForClassesSettings: _*) 35 | 36 | -------------------------------------------------------------------------------- /pipe/config.properties.template: -------------------------------------------------------------------------------- 1 | tokenize.whitespace = true 2 | ssplit.eolonly = true 3 | -------------------------------------------------------------------------------- /pipe/example/input.json: -------------------------------------------------------------------------------- 1 | { 
"doc_id":"1", "content":"I was robbed by this girl. 
:(" } 2 | -------------------------------------------------------------------------------- /pipe/example/parse.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ../run.sh --formatIn json --formatOut json -v content -k doc_id -a ExtendedCleanHtmlStanfordPipeline -i input.json -o output.json 4 | -------------------------------------------------------------------------------- /pipe/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0") 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") 4 | -------------------------------------------------------------------------------- /pipe/run.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | export JAVA_OPTS="-Xmx4g -Dfile.encoding=UTF-8" 4 | 5 | $(dirname $0)/target/start $@ 6 | -------------------------------------------------------------------------------- /pipe/run_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Parses documents in parallel. 4 | # 5 | # Input is a single file that contains one JSON record per line. 6 | # Output is a single file that contains one JSON record per line. 7 | # 8 | # The number of records and their order is the same in input and output. 9 | # 10 | # Example: 11 | # ./run_parallel.sh --input=INPUT.json --output=OUTPUT.json \ 12 | # --params='-v content -k doc_id -a ExtendedCleanHtmlStanfordPipeline' 13 | # 14 | # The following environment variables are used when available. 15 | # PARALLELISM (default 2) 16 | # BATCH_SIZE (default 1000) 17 | 18 | set -eu 19 | 20 | for i in "$@" 21 | do 22 | case $i in 23 | -in=*|--input=*) 24 | INPUT_FILE="${i#*=}" 25 | shift 26 | ;; 27 | -out=*|--output=*) 28 | OUTPUT_FILE="${i#*=}" 29 | shift 30 | ;; 31 | -pa=*|--params=*) 32 | PARAMS="${i#*=}" 33 | shift 34 | ;; 35 | -p=*|--parallelism=*) 36 | PARALLELISM="${i#*=}" 37 | shift 38 | ;; 39 | -b=*|--batch-size=*) 40 | BATCH_SIZE="${i#*=}" 41 | shift 42 | ;; 43 | --keepsplit) 44 | KEEP_SPLIT=true 45 | shift 46 | ;; 47 | --compress) 48 | COMPRESS_OUTPUT=true 49 | shift 50 | ;; 51 | *) 52 | echo "Ignoring parameter: $i" 53 | break 54 | ;; 55 | esac 56 | done 57 | 58 | if [ -z "$INPUT_FILE" ]; then 59 | echo "Usage: $0 -in=INPUT.json [-out=OUTPUT.json] [--parallelism=PARALLELISM] \\" 60 | echo " [--batch-size=BATCH_SIZE ] --params=''" 61 | exit 62 | fi 63 | 64 | # Setting defaults 65 | PARALLELISM=${PARALLELISM:-2} 66 | BATCH_SIZE=${BATCH_SIZE:-1000} 67 | PARAMS=${PARAMS:-} 68 | KEEP_SPLIT=${KEEP_SPLIT:-false} 69 | COMPRESS_OUTPUT=${COMPRESS_OUTPUT:-false} 70 | if [ "$COMPRESS_OUTPUT" = false ]; then 71 | OUTPUT_FILE=${OUTPUT_FILE:-$INPUT_FILE.out} 72 | else 73 | OUTPUT_FILE=${OUTPUT_FILE:-$INPUT_FILE.out.gz} 74 | fi 75 | 76 | echo "parallelism = $PARALLELISM" 77 | echo "batch-size = $BATCH_SIZE" 78 | echo "compress = $COMPRESS_OUTPUT" 79 | 80 | # Fixed a bug when "config.properties" does not exists 81 | touch config.properties 82 | 83 | RUN_SCRIPT=`cd $(dirname $0)/; pwd`"/run.sh --formatIn json --formatOut json $PARAMS" 84 | echo $RUN_SCRIPT 85 | 86 | SPLIT_DIR=$INPUT_FILE.split 87 | mkdir -p $SPLIT_DIR 88 | rm -rf $SPLIT_DIR/* 89 | 90 | # Split the input file into subfiles 91 | split -a 10 -l $BATCH_SIZE $INPUT_FILE $SPLIT_DIR/input- 92 | 93 | # Match all files in the split directory 94 | find 
$INPUT_FILE.split -name "input-*" 2>/dev/null -print0 | xargs -0 -P $PARALLELISM -L 1 bash -c "${RUN_SCRIPT}"' -i "$0" -o "$0.out"' 95 | 96 | function merge_json_format { 97 | SPLIT_DIR=$1 98 | OUTPUT_FILE=$2 99 | # merging json files 100 | for file in $SPLIT_DIR/*.out 101 | do 102 | if [ "$COMPRESS_OUTPUT" = false ]; then 103 | cat $file >> $OUTPUT_FILE 104 | else 105 | cat $file | gzip >> $OUTPUT_FILE 106 | fi 107 | done 108 | } 109 | 110 | 111 | function merge_column_format { 112 | SPLIT_DIR=$1 113 | OUTPUT_FILE=$2 114 | # merging column format segments 115 | 116 | OUTDIR=$INPUT_FILE.out 117 | if [ -d "$OUTDIR" ]; then 118 | echo "$OUTDIR already exists. Aborting." 119 | exit 1 120 | fi 121 | mkdir $OUTDIR 122 | 123 | # first we determine the different annotators by looking at only one segment 124 | annotations=() 125 | for file in $SPLIT_DIR/* 126 | do 127 | if [[ -d $file ]]; then 128 | for ann in $file/* 129 | do 130 | annotations+=("${ann##*.}") 131 | done 132 | break 133 | fi 134 | done 135 | 136 | # now cat them all together 137 | for file in $SPLIT_DIR/* 138 | do 139 | if [[ -d $file ]]; then 140 | for ann in "${annotations[@]}" 141 | do 142 | cat $file/ann.$ann >> $OUTDIR/ann.$ann 143 | done 144 | fi 145 | done 146 | } 147 | 148 | merge_json_format $SPLIT_DIR $OUTPUT_FILE 149 | 150 | # remove split dir 151 | if [ "$KEEP_SPLIT" = false ]; then 152 | rm -rf $SPLIT_DIR 153 | fi 154 | 155 | echo "The output is in $OUTPUT_FILE" 156 | -------------------------------------------------------------------------------- /pipe/run_test.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | ./run.sh --formatIn tsv --tsvValue 2 --tsvKey 0 -i test/input.tsv -o test/out 4 | 5 | -------------------------------------------------------------------------------- /pipe/sbt/sbt: -------------------------------------------------------------------------------- 1 | java $SBT_OPTS -jar `dirname $0`/sbt-launch.jar "$@" -------------------------------------------------------------------------------- /pipe/sbt/sbt-launch.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/bazaar/c09dce20f16a90c359f804f9e83d6107547d442c/pipe/sbt/sbt-launch.jar -------------------------------------------------------------------------------- /pipe/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIRNAME=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 4 | 5 | # fetch SR models 6 | DESTDIR=$DIRNAME/lib 7 | FILENAME='stanford-srparser-2014-10-23-models.jar' 8 | if [ ! 
-e "$DESTDIR/$FILENAME" ]; then 9 | mkdir -p $DESTDIR 10 | wget -P $DESTDIR http://nlp.stanford.edu/software/stanford-srparser-2014-10-23-models.jar 11 | else 12 | echo "Skipping download: $DESTDIR/$FILENAME already exists" 13 | fi 14 | 15 | # On Ubuntu, install java 8 16 | #sudo add-apt-repository -y ppa:openjdk-r/ppa 17 | #sudo apt-get update 18 | #sudo apt-get install -y openjdk-8-jdk 19 | 20 | # build parser 21 | cd $DIRNAME 22 | sbt/sbt stage 23 | 24 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/Main.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe 2 | 3 | import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter} 4 | 5 | import com.clearcut.pipe.annotator.Annotator 6 | import com.clearcut.pipe.io._ 7 | 8 | object Main extends App { 9 | 10 | // Parse command line options 11 | case class Config(serverPort: Integer = null, 12 | in: String = null, 13 | out: String = null, 14 | formatIn: String = "column", 15 | formatOut: String = "column", 16 | documentKey: String = "text", 17 | idKey: String = "id", 18 | documentCol: Int = 1, 19 | idCol: Int = 0, 20 | annotators: String = "SimpleStanfordPipeline") 21 | 22 | val optionsParser = new scopt.OptionParser[Config]("Pipe") { 23 | head("Run CoreNLP annotators and read/write column/json/tsv formats", "0.1") 24 | head("Input: column dir, json file, or tsv file") 25 | head("Output: column files, json file, or tsv file") 26 | opt[String]("formatIn") action { (x, c) => 27 | c.copy(formatIn = x) 28 | } text("column, json or tsv") 29 | opt[String]("formatOut") action { (x, c) => 30 | c.copy(formatOut = x) 31 | } text("column, json or tsv") 32 | opt[String]('v', "jsonValue") action { (x, c) => 33 | c.copy(documentKey = x) 34 | } text("JSON key that contains the document content, for example \"documents.text\"") 35 | opt[String]('k', "jsonKey") action { (x, c) => 36 | c.copy(idKey = x) 37 | } text("JSON key that contains the document id, for example \"documents.id\"") 38 | opt[Int]("tsvValue") action { (x, c) => 39 | c.copy(documentCol = x) 40 | } text("Column number that contains the document content, for example 1") 41 | opt[Int]("tsvKey") action { (x, c) => 42 | c.copy(idCol = x) 43 | } text("Column number that contains the document id, for example 0") 44 | opt[String]('i', "input") action { (x, c) => 45 | c.copy(in = x) 46 | } text("Input dir (column) or file (json, tsv)") 47 | opt[String]('o', "output") action { (x, c) => 48 | c.copy(out = x) 49 | } text("Output dir (column) or file (json, tsv)") 50 | opt[String]('a', "annotators") action { (x, c) => 51 | c.copy(annotators = x) 52 | } text("Comma-separated list of annotators. Default: SimpleStanfordPipeline") 53 | opt[Int]('p', "serverPort") action { (x, c) => 54 | c.copy(serverPort = x) 55 | } text("Run as an HTTP service") 56 | } 57 | 58 | val conf = optionsParser.parse(args, Config()) getOrElse { 59 | throw new IllegalArgumentException 60 | } 61 | 62 | if (conf.serverPort != null) { 63 | Console.println("Listening on port " + conf.serverPort + "...") 64 | new Server(conf.serverPort).run() 65 | System.exit(0) 66 | } 67 | 68 | val errors = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(conf.out + ".errors"))) 69 | 70 | val annotators:Array[Annotator[_,_]] = conf.annotators.split(",").map (s => 71 | Class.forName("com.clearcut.pipe.annotator." 
+ s.trim).newInstance().asInstanceOf[Annotator[_,_]]) 72 | 73 | // load configuration properties from properties file 74 | if (new java.io.File("config.properties").exists) { 75 | println("config.properties exists") 76 | val prop = new java.util.Properties() 77 | val fromFile = new java.io.FileReader("config.properties") 78 | prop.load(fromFile) 79 | fromFile.close 80 | for (ann <- annotators) 81 | ann.setProperties(prop) 82 | } 83 | 84 | val reader:Reader = conf.formatIn match { 85 | case "column" => new ColumnReader(conf.in) 86 | case "json" => new JsonReader(conf.in, conf.idKey, conf.documentKey) 87 | case "tsv" => new TsvReader(conf.in, conf.idCol, conf.documentCol) 88 | } 89 | 90 | val writer:Writer = conf.formatOut match { 91 | case "column" => new ColumnWriter(conf.out) 92 | case "json" => new JsonWriter(conf.out) 93 | case "tsv" => new TsvWriter(conf.out) 94 | } 95 | 96 | run(annotators, reader, writer, errors) 97 | 98 | writer.close 99 | reader.close 100 | errors.close 101 | 102 | def run(annotators:Array[Annotator[_,_]], reader:Reader, writer:Writer, errors:BufferedWriter) = { 103 | val schema = Schema.extendSchema(reader.getSchema, annotators) 104 | val indices = annotators.map(a => Schema.defaultAnnotationIndices(schema, a.requires)) 105 | writer.setSchema(schema) 106 | 107 | for (t <- reader) { 108 | var all = t 109 | for ((a, i) <- annotators.zip(indices)) { 110 | val input = i.map(index => all(index)) 111 | all = all ++ a.annotateUnsafe(input:_*) 112 | } 113 | writer.write(all) 114 | } 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/Schema.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe 2 | 3 | import com.clearcut.pipe.annotator.Annotator 4 | import scala.collection.mutable.Map 5 | 6 | case class Schema 7 | ( 8 | annTyps: Array[String] = Array(), 9 | defaults: Map[String, Int] = Map(), 10 | provenance: Array[String] = Array() 11 | ) 12 | 13 | 14 | object Schema { 15 | 16 | def defaultAnnotations(schema: Schema, needed: Seq[String], all: Seq[String]): Seq[AnyRef] = { 17 | defaultAnnotationIndices(schema, needed).map(all(_)) 18 | } 19 | 20 | def defaultAnnotationIndices(schema: Schema, needed: Seq[String]): Seq[Int] = { 21 | needed.map(schema.defaults(_)) 22 | } 23 | 24 | def extendSchema(before: Schema, annotators: Array[Annotator[_,_]]): Schema = { 25 | val annTyps = Array.concat(before.annTyps, annotators.flatMap(_.generates)) 26 | val defaults = Map[String, Int]() 27 | defaults ++= before.defaults 28 | annTyps.zipWithIndex.foreach { case (c, i) => if (!defaults.contains(c)) defaults += (c -> i) } 29 | val provenance = Array.concat(before.provenance, annotators.flatMap(_.generates)) 30 | new Schema(annTyps, defaults, provenance) 31 | } 32 | 33 | def createSchema(annTyps: String*): Schema = { 34 | val defaults = Map[String, Int]() 35 | annTyps.zipWithIndex.foreach { case (c, i) => if (!defaults.contains(c)) defaults += (c -> i) } 36 | val provenance = annTyps.toArray.map(_ => "provided") 37 | new Schema(annTyps.toArray, defaults, provenance) 38 | } 39 | 40 | def prettyPrint(s:Schema) = { 41 | s.annTyps.map(println(_)) 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/Server.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe 2 | 3 | import java.io._ 4 | 5 
| import com.clearcut.pipe.annotator._ 6 | import com.clearcut.pipe.io.{TsvWriter, TsvReader, ColumnWriter, ColumnReader} 7 | import org.http4s._ 8 | import org.http4s.dsl._ 9 | import org.http4s.server.HttpService 10 | import org.http4s.server.jetty.JettyBuilder 11 | import scala.collection.mutable.ListBuffer 12 | import scala.io.Source 13 | 14 | class Server(port: Integer) { 15 | 16 | val route = HttpService { 17 | case req @ GET -> Root => 18 | Ok("Hello. I can parse stuff. Just POST the text to me.\n") 19 | 20 | case req @ POST -> Root => 21 | // WARNING: when request body is empty, http4s seems to hang here 22 | val content = new String(req.body.runLog.run.reduce(_ ++ _).toArray, Charset.`UTF-8`.nioCharset) 23 | 24 | val lines = ListBuffer[String]() 25 | 26 | val annotators:Array[Annotator[_,_]] = Array(new SimpleStanfordPipeline) 27 | 28 | val reader = new TsvReader(inSource = Source.fromString("id\t" + content.replace("\t", " ").replace("\n", " ") + "\n")) 29 | val baos = new ByteArrayOutputStream 30 | val writer = new TsvWriter(outWriter = new BufferedWriter(new OutputStreamWriter(baos, "utf-8"))) 31 | val errors = new BufferedWriter(new PrintWriter(new OutputStreamWriter(System.err, "utf-8"))) 32 | 33 | Main.run(annotators, reader, writer, errors) 34 | 35 | reader.close 36 | writer.close 37 | 38 | Ok(baos.toString("utf-8")) 39 | } 40 | 41 | def run() = { 42 | JettyBuilder 43 | .mountService(route, "") 44 | .bindHttp(port) 45 | .run 46 | .awaitShutdown() 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/Annotator.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import java.util.Properties 4 | import scala.reflect.runtime.universe._ 5 | import com.clearcut.pipe.model._ 6 | 7 | abstract class Annotator[In,Out](implicit inTag:TypeTag[In], outTag:TypeTag[Out]) 8 | extends java.io.Serializable { 9 | 10 | var properties = new Properties() 11 | 12 | def setProperties(p:java.util.Properties) = { 13 | properties = p 14 | } 15 | 16 | def annotate(in:In):Out 17 | 18 | def init = {} 19 | 20 | def close = {} 21 | 22 | def requires = inTypes 23 | 24 | def generates = outTypes 25 | 26 | val inTypes:Seq[String] = toTypes(inTag) 27 | val outTypes:Seq[String] = toTypes(outTag) 28 | 29 | private def toTypes[A](tag:TypeTag[A]):Seq[String] = { 30 | if (tag.tpe <:< typeOf[Product]) 31 | tag.tpe.typeArgs.map(t => { 32 | val s = t.toString 33 | lowerFirst(s.substring(s.lastIndexOf(".") + 1)) 34 | }) else { 35 | val s = tag.tpe.toString 36 | Array(lowerFirst(s.substring(s.lastIndexOf(".") + 1))) 37 | } 38 | } 39 | 40 | val inClazz = inTag.mirror.runtimeClass(inTag.tpe.typeSymbol.asClass) 41 | 42 | def annotateUnsafe(in:AnyRef*):Seq[AnyRef] = { 43 | var outTuple:Out = if (inTypes.size == 1) { 44 | val inTuple = in(0).asInstanceOf[In] 45 | annotate(inTuple) 46 | } else { 47 | val inTuple = inClazz.getConstructors.apply(0).newInstance(in:_*).asInstanceOf[In] 48 | annotate(inTuple) 49 | } 50 | var outSeq = if (outTypes.size == 1) 51 | Seq(outTuple.asInstanceOf[AnyRef]) 52 | else 53 | outTuple.asInstanceOf[Product].productIterator.toSeq.asInstanceOf[Seq[AnyRef]] 54 | outSeq 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/ExtendedCleanHtmlStanfordPipeline.scala: 
-------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import scala.collection.JavaConversions._ 4 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation 5 | import edu.stanford.nlp.pipeline.{StanfordCoreNLP, Annotation} 6 | import java.util.Properties 7 | import com.clearcut.pipe.model._ 8 | import java.util.regex._ 9 | import org.jsoup.Jsoup 10 | import org.jsoup.safety._ 11 | 12 | class ExtendedCleanHtmlStanfordPipeline extends Annotator[(Text), (Html, SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, 13 | SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets)] { 14 | 15 | override def setProperties(p:Properties) { 16 | super.setProperties(p) 17 | properties.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, truecase") 18 | properties.put("clean.xmltags", ".*") 19 | properties.put("parse.maxlen", "100") 20 | properties.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz") 21 | properties.put("truecase.model", "edu/stanford/nlp/models/truecase/truecasing.fast.qn.ser.gz") 22 | properties.put("threads", "1") // Should use extractor-level parallelism 23 | properties.put("clean.allowflawedxml", "true") 24 | properties.put("clean.sentenceendingtags", "p|br|div|li|ul|ol|h1|h2|h3|h4|h5|blockquote|section|article") 25 | } 26 | 27 | @transient lazy val pipeline = new StanfordCoreNLP(properties) 28 | 29 | val stripHtml = Pattern.compile("<\\/?a|A[^>]*>") 30 | 31 | override def annotate(t:Text):(Html, SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets) = { 32 | 33 | // clean up Html 34 | var text = extractCleanHtml(t) 35 | 36 | // Temporary fix for bug where brackets are being incorrectly treated as punct 37 | // and somehow this messes up the whole dep parse -> change them to round braces 38 | text = text.replaceAll( """\[""", "(").replaceAll( """\]""", ")") 39 | 40 | var stanAnn = new Annotation(text) 41 | try { 42 | pipeline.annotate(stanAnn) 43 | 44 | } catch { 45 | // If our pipeline still fails on this input, we return an empty tuple. 
46 | case e:Exception => 47 | System.err.println(text) 48 | e.printStackTrace(System.err) 49 | System.err.flush() 50 | return (text, Array[Offsets](), Array[Offsets](), Array[String](), Array[String](), Array[String](), Array[String](), Array[Array[Dependency]](), Array[String](), Array[String](), Array[Offsets]()) 51 | } 52 | 53 | val (toa, to) = StanfordTokenizer.fromStanford(stanAnn) 54 | val poss = StanfordPOSTagger.fromStanford(stanAnn) 55 | val nertags = StanfordNERTagger.fromStanford(stanAnn) 56 | val lemmas = StanfordLemmatizer.fromStanford(stanAnn) 57 | val deps = StanfordDependencyExtractor.fromStanford(stanAnn) 58 | val (so, sto) = StanfordSentenceSplitter.fromStanford(stanAnn) 59 | val pa = StanfordSRParser.fromStanford(stanAnn) 60 | val tcs = StanfordTrueCaseAnnotator.fromStanford(stanAnn) 61 | 62 | (text, so, toa, to, poss, nertags, lemmas, deps, pa, tcs, sto) 63 | } 64 | 65 | def extractCleanHtml(html:String):String = { 66 | val doc = Jsoup.parseBodyFragment(html).body() 67 | doc.html() 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/ExtendedHtmlStanfordPipeline.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import scala.collection.JavaConversions._ 4 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation 5 | import edu.stanford.nlp.pipeline.{StanfordCoreNLP, Annotation} 6 | import java.util.Properties 7 | import com.clearcut.pipe.model._ 8 | import java.util.regex._ 9 | import org.jsoup.Jsoup 10 | import org.jsoup.safety._ 11 | 12 | class ExtendedHtmlStanfordPipeline extends Annotator[(Text), (Html, SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, 13 | SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets)] { 14 | 15 | override def setProperties(p:Properties) { 16 | super.setProperties(p) 17 | properties.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, truecase") 18 | properties.put("clean.xmltags", ".*") 19 | properties.put("parse.maxlen", "100") 20 | properties.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz") 21 | properties.put("truecase.model", "edu/stanford/nlp/models/truecase/truecasing.fast.qn.ser.gz") 22 | properties.put("threads", "1") // Should use extractor-level parallelism 23 | properties.put("clean.allowflawedxml", "true") 24 | properties.put("clean.sentenceendingtags", "p|br|div|li|ul|ol|h1|h2|h3|h4|h5|blockquote|section|article") 25 | } 26 | 27 | @transient lazy val pipeline = new StanfordCoreNLP(properties) 28 | 29 | val stripHtml = Pattern.compile("<\\/?a|A[^>]*>") 30 | 31 | override def annotate(t:Text):(Html, SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets) = { 32 | 33 | // clean up Html 34 | //var text = extractCleanHtml(t) 35 | var text = t 36 | 37 | // Temporary fix for bug where brackets are being incorrectly treated as punct 38 | // and somehow this messes up the whole dep parse -> change them to round braces 39 | //text = text.replaceAll( """\[""", "(").replaceAll( """\]""", ")") 40 | 41 | var stanAnn = new Annotation(text) 42 | try { 43 | pipeline.annotate(stanAnn) 44 | 45 | } catch { 46 | // If our pipeline still fails on this input, we return an empty tuple. 
47 | case e:Exception => 48 | System.err.println(text) 49 | e.printStackTrace(System.err) 50 | System.err.flush() 51 | return (text, Array[Offsets](), Array[Offsets](), Array[String](), Array[String](), Array[String](), Array[String](), Array[Array[Dependency]](), Array[String](), Array[String](), Array[Offsets]()) 52 | } 53 | 54 | val (toa, to) = StanfordTokenizer.fromStanford(stanAnn) 55 | val poss = StanfordPOSTagger.fromStanford(stanAnn) 56 | val nertags = StanfordNERTagger.fromStanford(stanAnn) 57 | val lemmas = StanfordLemmatizer.fromStanford(stanAnn) 58 | val deps = StanfordDependencyExtractor.fromStanford(stanAnn) 59 | val (so, sto) = StanfordSentenceSplitter.fromStanford(stanAnn) 60 | val pa = StanfordSRParser.fromStanford(stanAnn) 61 | val tcs = StanfordTrueCaseAnnotator.fromStanford(stanAnn) 62 | 63 | (text, so, toa, to, poss, nertags, lemmas, deps, pa, tcs, sto) 64 | } 65 | 66 | def extractCleanHtml(html:String):String = { 67 | val doc = Jsoup.parseBodyFragment(html).body() 68 | doc.html() 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/ExtendedStanfordPipeline.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import scala.collection.JavaConversions._ 4 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation 5 | import edu.stanford.nlp.pipeline.{StanfordCoreNLP, Annotation} 6 | import java.util.Properties 7 | import com.clearcut.pipe.model._ 8 | 9 | class ExtendedStanfordPipeline extends Annotator[(Text), (SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, 10 | SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets)] { 11 | 12 | override def setProperties(p:Properties) { 13 | super.setProperties(p) 14 | properties.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, truecase") 15 | properties.put("parse.maxlen", "100") 16 | properties.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz") 17 | properties.put("threads", "1") // Should use extractor-level parallelism 18 | properties.put("clean.allowflawedxml", "true") 19 | properties.put("clean.sentenceendingtags", "p|br|div|li|ul|ol|h1|h2|h3|h4|h5|blockquote|section|article") 20 | } 21 | 22 | @transient lazy val pipeline = new StanfordCoreNLP(properties) 23 | 24 | override def annotate(t:Text):(SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, SentenceDependencies, Parses, TrueCases, SentenceTokenOffsets) = { 25 | // Temporary fix for bug where brackets are being incorrectly treated as punct 26 | // and somehow this messes up the whole dep parse -> change them to round braces 27 | // val text = t.replaceAll( """\[""", "(").replaceAll( """\]""", ")") 28 | val text = t 29 | 30 | val stanAnn = new Annotation(text) 31 | pipeline.annotate(stanAnn) 32 | 33 | val (toa, to) = StanfordTokenizer.fromStanford(stanAnn) 34 | val poss = StanfordPOSTagger.fromStanford(stanAnn) 35 | val nertags = StanfordNERTagger.fromStanford(stanAnn) 36 | val lemmas = StanfordLemmatizer.fromStanford(stanAnn) 37 | val deps = StanfordDependencyExtractor.fromStanford(stanAnn) 38 | val (so, sto) = StanfordSentenceSplitter.fromStanford(stanAnn) 39 | val pa = StanfordSRParser.fromStanford(stanAnn) 40 | val tcs = StanfordTrueCaseAnnotator.fromStanford(stanAnn) 41 | 42 | (so, toa, to, poss, nertags, lemmas, deps, pa, tcs, sto) 43 | } 44 | } 45 | 46 | 
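The three Extended*StanfordPipeline classes above, and the SimpleStanfordPipeline that follows, are ordinary Annotator[In, Out] subclasses: the element types of their In and Out tuples double as annotation-type names, and Schema.extendSchema plus Main.run use those names to wire each annotator's inputs to the outputs of earlier annotators. Below is a minimal, self-contained sketch of that requires/generates bookkeeping; MiniAnnotator, Tokenize, Shout and ChainingSketch are illustrative names that do not exist in the pipe code, no CoreNLP classes are involved, and the real implementation derives the names reflectively from the tuple types rather than declaring them by hand.

    // Sketch of the schema-driven chaining used by Main.run (simplified, hypothetical types).
    object ChainingSketch extends App {

      trait MiniAnnotator {
        def requires: Seq[String]   // annotation types read from the row
        def generates: Seq[String]  // annotation types appended to the row
        def annotate(in: Seq[AnyRef]): Seq[AnyRef]
      }

      // Stands in for StanfordTokenizer: turns "text" into "tokens".
      object Tokenize extends MiniAnnotator {
        val requires = Seq("text")
        val generates = Seq("tokens")
        def annotate(in: Seq[AnyRef]) =
          Seq(in(0).asInstanceOf[String].split("\\s+").toVector)
      }

      // Stands in for any downstream annotator that consumes "tokens".
      object Shout extends MiniAnnotator {
        val requires = Seq("tokens")
        val generates = Seq("shouted")
        def annotate(in: Seq[AnyRef]) =
          Seq(in(0).asInstanceOf[Vector[String]].map(_.toUpperCase))
      }

      // Grows a name -> column-index map as annotators are applied in order,
      // mirroring Schema.extendSchema plus the loop in Main.run.
      def run(annotators: Seq[MiniAnnotator], row: Seq[AnyRef], schema: Seq[String]): Seq[AnyRef] = {
        var defaults = schema.zipWithIndex.toMap
        var all = row
        for (a <- annotators) {
          val inputs = a.requires.map(n => all(defaults(n)))
          val outputs = a.annotate(inputs)
          a.generates.zipWithIndex.foreach { case (n, i) =>
            if (!defaults.contains(n)) defaults += (n -> (all.size + i))
          }
          all = all ++ outputs
        }
        all
      }

      println(run(Seq(Tokenize, Shout), Seq("1", "hello parser world"), Seq("id", "text")))
      // List(1, hello parser world, Vector(hello, parser, world), Vector(HELLO, PARSER, WORLD))
    }

Keeping the schema as a plain name-to-index map is what allows annotator lists, readers and writers to be mixed freely through the -a, --formatIn and --formatOut options handled in Main.scala.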
-------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/SimpleStanfordPipeline.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import edu.stanford.nlp.pipeline.{StanfordCoreNLP, Annotation} 4 | import java.util.Properties 5 | import com.clearcut.pipe.model._ 6 | 7 | class SimpleStanfordPipeline extends Annotator[(Text), (SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, 8 | SentenceDependencies)] { 9 | 10 | //val props = new Properties() 11 | override def setProperties(p:Properties) { 12 | super.setProperties(p) 13 | properties.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse") 14 | properties.put("parse.maxlen", "100") 15 | properties.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz") 16 | properties.put("threads", "1") // Should use extractor-level parallelism 17 | properties.put("clean.allowflawedxml", "true") 18 | properties.put("clean.sentenceendingtags", "p|br|div|li|ul|ol|h1|h2|h3|h4|h5|blockquote|section|article") 19 | } 20 | 21 | @transient lazy val pipeline = new StanfordCoreNLP(properties) 22 | 23 | override def annotate(t:Text):(SentenceOffsets, TokenOffsets, Tokens, Poss, NerTags, Lemmas, SentenceDependencies) = { 24 | // Temporary fix for bug where brackets are being incorrectly treated as punct 25 | // and somehow this messes up the whole dep parse -> change them to round braces 26 | val text = t.replaceAll( """\[""", "(").replaceAll( """\]""", ")") 27 | 28 | val stanAnn = new Annotation(text) 29 | pipeline.annotate(stanAnn) 30 | 31 | val (toa, to) = StanfordTokenizer.fromStanford(stanAnn) 32 | val poss = StanfordPOSTagger.fromStanford(stanAnn) 33 | val nertags = StanfordNERTagger.fromStanford(stanAnn) 34 | val lemmas = StanfordLemmatizer.fromStanford(stanAnn) 35 | val deps = StanfordDependencyExtractor.fromStanford(stanAnn) 36 | val (so, sto) = StanfordSentenceSplitter.fromStanford(stanAnn) 37 | 38 | (so, toa, to, poss, nertags, lemmas, deps) 39 | } 40 | } 41 | 42 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordCoreferenceResolver.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import java.util 4 | import java.util.{Properties, Set} 5 | 6 | import com.clearcut.pipe.model._ 7 | import edu.stanford.nlp.dcoref.CorefChain.{CorefMention => StCorefMention} 8 | import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation 9 | import edu.stanford.nlp.dcoref.{CorefChain => StCorefChain, Dictionaries} 10 | import edu.stanford.nlp.ling.CoreAnnotations.{SentencesAnnotation, TokenBeginAnnotation} 11 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 12 | import edu.stanford.nlp.util.{CoreMap, IntPair, IntTuple} 13 | 14 | import scala.collection.JavaConversions.{asScalaBuffer, collectionAsScalaIterable} 15 | import scala.collection.mutable.ArrayBuffer 16 | 17 | class StanfordCoreferenceResolver extends Annotator[(Text,TokenOffsets,Tokens,SentenceOffsets,SentenceTokenOffsets, 18 | Poss,NerTags,Parses,SentenceDependencies),(Mentions,Coreferences)] { 19 | 20 | // make sure StanfordCoreNLP has parse annotator, which is needed by dcoref 21 | @transient lazy val stanfordAnnotator = 22 | AnnotatorFactories.coref(properties, 
StanfordUtil.annotatorImplementations).create() 23 | 24 | override def annotate(in:(Text,TokenOffsets,Tokens,SentenceOffsets,SentenceTokenOffsets,Poss,NerTags,Parses, 25 | SentenceDependencies)):(Mentions, Coreferences) = { 26 | val (t, toa, to, soa, stoa, posa, nerta, pa, sda) = in 27 | val stanAnn = new StAnnotation(t) 28 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 29 | StanfordSentenceSplitter.toStanford(soa, stoa, stanAnn) 30 | StanfordPOSTagger.toStanford(posa, stanAnn) 31 | StanfordNERTagger.toStanford(nerta, stanAnn) 32 | StanfordSRParser.toStanford(pa, stanAnn) 33 | StanfordDependencyExtractor.toStanford("DepCollapsed", sda, stanAnn) 34 | 35 | stanfordAnnotator.annotate(stanAnn) 36 | 37 | StanfordCoreferenceResolver.fromStanford(stanAnn) 38 | } 39 | } 40 | 41 | object StanfordCoreferenceResolver { 42 | 43 | def toStanford(fromT:Text, fromO:TokenOffsets, fromS:SentenceTokenOffsets, 44 | fromM:Mentions, fromC:Coreferences, to:StAnnotation):Unit = { 45 | val cm = new java.util.HashMap[Integer, StCorefChain]() 46 | val mentions = fromM 47 | for (c <- fromC) { 48 | 49 | val mentionMap = new java.util.HashMap[IntPair, Set[StCorefMention]]() 50 | var representative:StCorefMention = null 51 | 52 | for (mentionNum <- c.mentionNums) { 53 | val m = mentions(mentionNum) 54 | 55 | // determine sentNum and sentHead 56 | var sentNum = 0 57 | var sentHead = -1 58 | while (sentHead == -1 && sentNum < fromS.size) { 59 | if (fromS(sentNum)(FROM) <= m.head && m.head < fromS(sentNum)(TO)) { 60 | sentHead = m.head - fromS(sentNum)(FROM) 61 | } else 62 | sentNum += 1 63 | } 64 | val mentionSpan = fromT.substring(fromO(m.tokenOffsets(FROM))(FROM), fromO(m.tokenOffsets(TO) - 1)(TO)) 65 | sentNum += 1 66 | 67 | val com = new StCorefMention( 68 | Dictionaries.MentionType.valueOf(Mention.typeFromByte(m.mentionTyp)), 69 | Dictionaries.Number.valueOf(Mention.numberFromByte(m.number)), 70 | Dictionaries.Gender.valueOf(Mention.genderFromByte(m.gender)), 71 | Dictionaries.Animacy.valueOf(Mention.animacyFromByte(m.animacy)), 72 | m.tokenOffsets(FROM) - fromS(sentNum)(FROM) +1, 73 | m.tokenOffsets(FROM) - fromS(sentNum)(FROM) +1, // -1?? 
74 | sentHead, 75 | c.chainNum, 76 | mentionNum, 77 | sentNum, 78 | // the arguments here are probably sentNum and headIndex, TODO: verify 79 | new IntTuple(Array[Int](sentNum, sentHead)), 80 | //new IntTuple(Array[Int](m.positionFrom, m.positionTo)), 81 | mentionSpan 82 | ) 83 | val pos = new IntPair(sentNum, sentHead) 84 | if (!mentionMap.containsKey(pos)) 85 | mentionMap.put(pos, new java.util.HashSet[StCorefMention]()) 86 | mentionMap.get(pos).add(com) 87 | 88 | if (c.representativeMentionNum == mentionNum) 89 | representative = com 90 | } 91 | 92 | val cc = new StCorefChain(c.chainNum, mentionMap, representative) 93 | cm.put(c.chainNum, cc) 94 | } 95 | to.set(classOf[CorefChainAnnotation], cm) 96 | } 97 | 98 | def fromStanford(from:StAnnotation):(Mentions,Coreferences) = { 99 | val ms = new ArrayBuffer[Mention]() 100 | val cl = new ArrayBuffer[CoreferenceChain]() 101 | try { 102 | val cca:java.util.Map[Integer,StCorefChain] = from.get(classOf[CorefChainAnnotation]) 103 | 104 | val sents: util.List[CoreMap] = from.get(classOf[SentencesAnnotation]) 105 | 106 | var chainNum = 0 107 | var mentionNum = 0 108 | for (cc <- cca.values) { 109 | val l = cc.getMentionsInTextualOrder 110 | //val lp = new ArrayBuffer[CMention](l.size) 111 | 112 | var representativeMentionNum = -1 113 | val chainMentions = new ArrayBuffer[Int]() 114 | for (m <- l) { 115 | 116 | // val cpm = CMention( 117 | // m.mentionType.name, 118 | // m.number.name, 119 | // m.gender.name, 120 | // m.animacy.name, 121 | // m.startIndex, 122 | // m.endIndex, 123 | // m.headIndex, 124 | // m.corefClusterID, 125 | // m.mentionID, 126 | // m.sentNum, 127 | // m.position.get(0), 128 | // m.position.get(1), 129 | // m.mentionSpan) 130 | // 131 | // lp += cpm 132 | val sentTokenBegin: Integer = sents(m.sentNum-1).get(classOf[TokenBeginAnnotation]) 133 | 134 | ms += Mention(mentionNum, 135 | sentTokenBegin + m.headIndex-1, 136 | Array(sentTokenBegin + m.startIndex-1, sentTokenBegin + m.endIndex-1), 137 | Mention.typeToByte(m.mentionType.name), 138 | Mention.numberToByte(m.number.name), 139 | Mention.genderToByte(m.gender.name), 140 | Mention.animacyToByte(m.animacy.name)) 141 | 142 | chainMentions += mentionNum 143 | 144 | if (cc.getRepresentativeMention == m) 145 | representativeMentionNum = mentionNum 146 | 147 | mentionNum += 1 148 | } 149 | 150 | cl += CoreferenceChain(chainNum, representativeMentionNum, chainMentions.toArray) 151 | 152 | chainNum += 1 153 | // val cpc = CorefChain(cc.getChainID, 154 | // cc.getRepresentativeMention().mentionID, 155 | // lp.toArray) 156 | // cl += cpc 157 | } 158 | } catch { 159 | case e:Exception => 160 | e.printStackTrace() 161 | println("error in fromStanf") 162 | } 163 | (ms.toArray, cl.toArray) 164 | } 165 | } 166 | 167 | 168 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordDependencyExtractor.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import java.util.List 4 | 5 | import com.clearcut.pipe.model.Dependency 6 | import com.clearcut.pipe.model.SentenceDependencies 7 | import edu.stanford.nlp.ling.CoreAnnotations._ 8 | import edu.stanford.nlp.ling.{IndexedWord, CoreLabel} 9 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation} 10 | import edu.stanford.nlp.semgraph.SemanticGraph 11 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.{BasicDependenciesAnnotation, 12 | 
CollapsedCCProcessedDependenciesAnnotation, CollapsedDependenciesAnnotation} 13 | import edu.stanford.nlp.trees.GrammaticalRelation 14 | 15 | import scala.collection.JavaConversions.asScalaBuffer 16 | import scala.collection.mutable.ArrayBuffer 17 | 18 | object StanfordDependencyExtractor { 19 | 20 | val DEFAULT_DEP_TYPE = "DepCCProcessed" 21 | 22 | val depTypes = Array("DepCollapsed", "DepUncollapsed", "DepCCProcessed") 23 | 24 | def fromStanford(from:StAnnotation, depTyp:String = DEFAULT_DEP_TYPE):SentenceDependencies = { 25 | val sentences = from.get(classOf[SentencesAnnotation]) 26 | val psl = new ArrayBuffer[Array[Dependency]](sentences.size) 27 | for (sentence <- sentences) { 28 | val deps = depTyp match { 29 | case "DepCollapsed" => 30 | sentence.get(classOf[CollapsedDependenciesAnnotation]) 31 | case "DepUncollapsed" => 32 | sentence.get(classOf[BasicDependenciesAnnotation]) 33 | case "DepCCProcessed" => 34 | sentence.get(classOf[CollapsedCCProcessedDependenciesAnnotation]) 35 | } 36 | 37 | if (deps != null) { 38 | val edgeSet = deps.edgeListSorted 39 | val pl = for (e <- edgeSet) yield { 40 | Dependency(e.getRelation.toString, e.getGovernor.index - 1, e.getDependent.index - 1) 41 | } 42 | psl += pl.toArray 43 | } 44 | } 45 | psl.toArray 46 | } 47 | 48 | def toStanford(depTyp:String, from:SentenceDependencies, to:StAnnotation):Unit = { 49 | val toks = to.get(classOf[TokensAnnotation]) 50 | val l = to.get(classOf[SentencesAnnotation]) 51 | for (i <- 0 until l.size) { 52 | val fromIndex = l.get(i).get(classOf[TokenBeginAnnotation]) 53 | val toIndex = l.get(i).get(classOf[TokenEndAnnotation]) 54 | val sntToks = toks.subList(fromIndex, toIndex) 55 | 56 | val sg = toSemanticGraph(sntToks, from(i)) 57 | 58 | depTyp match { 59 | case "DepCollapsed" => 60 | l.get(i).set(classOf[CollapsedDependenciesAnnotation], sg) 61 | case "DepUncollapsed" => 62 | l.get(i).set(classOf[BasicDependenciesAnnotation], sg) 63 | case "DepCCProcessed" => 64 | l.get(i).set(classOf[CollapsedCCProcessedDependenciesAnnotation], sg) 65 | } 66 | } 67 | } 68 | 69 | def toSemanticGraph(tokens:List[CoreLabel], deps:Array[Dependency]):SemanticGraph = { 70 | val sg = new SemanticGraph() 71 | for (i <- 0 until tokens.size) { 72 | val index = i+1 73 | val word = tokens.get(i).value() //getValue(); 74 | 75 | //TODO: not setting root 76 | //(are roots those nodes that have 0 incoming edges) 77 | 78 | val ifl = new IndexedWord(null, 0, index); 79 | // condition added by me, after "/" as token caused IndexOutOfBounds, maybe TokensAnnotation in wrong token format? 80 | val wordAndTag = if (word.length > 1) word.split("/") else Array(word) 81 | ifl.set(classOf[TextAnnotation], wordAndTag(0)) 82 | if (wordAndTag.length > 1) { 83 | ifl.set(classOf[PartOfSpeechAnnotation], wordAndTag(1)) 84 | } 85 | sg.addVertex(ifl) 86 | } 87 | val vertices = sg.vertexListSorted() 88 | 89 | for (d <- deps) { 90 | val govId = d.from 91 | val reln = d.name 92 | val depId = d.to 93 | val gov = vertices.get(govId) 94 | val dep = vertices.get(depId) 95 | val isExtra = false; //? 
96 | sg.addEdge(gov, dep, GrammaticalRelation.valueOf(reln), 97 | java.lang.Double.NEGATIVE_INFINITY, isExtra) 98 | } 99 | sg 100 | } 101 | } -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordLemmatizer.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import java.util.Properties 4 | import com.clearcut.pipe.model._ 5 | import scala.collection.JavaConversions.asScalaBuffer 6 | import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation 7 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 8 | 9 | /** Wraps CoreNLP Lemmatizer as an Annotator. */ 10 | class StanfordLemmatizer extends Annotator[(Text, Poss, SentenceOffsets, TokenOffsets, Tokens), (Lemmas)] { 11 | 12 | @transient lazy val stanfordAnnotator = 13 | AnnotatorFactories.lemma(properties, StanfordUtil.annotatorImplementations).create() 14 | 15 | override def annotate(in:(Text, Poss, SentenceOffsets, TokenOffsets, Tokens)):Lemmas = { 16 | val (t, poa, soa, toa, to) = in 17 | val stanAnn = new StAnnotation(t) 18 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 19 | StanfordSentenceSplitter.toStanford(soa, null, stanAnn) 20 | StanfordPOSTagger.toStanford(poa, stanAnn) 21 | 22 | stanfordAnnotator.annotate(stanAnn) 23 | 24 | StanfordLemmatizer.fromStanford(stanAnn) 25 | } 26 | } 27 | 28 | /** Stanford model mappings for lemmas. */ 29 | object StanfordLemmatizer { 30 | def toStanford(from:Lemmas, to:StAnnotation):Unit = { 31 | val li = to.get(classOf[TokensAnnotation]) 32 | for (i <- 0 until from.size) { 33 | val lemma = from(i) 34 | li.get(i).setLemma(lemma) 35 | } 36 | } 37 | 38 | def fromStanford(from:StAnnotation):Lemmas = { 39 | val tokens = from.get(classOf[TokensAnnotation]) 40 | val li = for (cl <- tokens) yield { 41 | // there may be *NL* tokens outside sentences; the lemmatizer didn't reach 42 | // these, so set these manually to *NL*, so that serialization is OK 43 | var l = cl.lemma() 44 | if (l == null) l = "*NL*" 45 | l 46 | } 47 | li.toArray 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordNERTagger.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import com.clearcut.pipe.model._ 4 | import scala.collection.JavaConversions._ 5 | import edu.stanford.nlp.ling.CoreAnnotations 6 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 7 | import com.clearcut.pipe.model._ 8 | import java.util._ 9 | 10 | /** Wraps CoreNLP NER Tagger as an Annotator. 
*/ 11 | class StanfordNERTagger extends Annotator[(Text,TokenOffsets,Tokens,SentenceOffsets,Lemmas,Poss), (NerTags)] { 12 | 13 | @transient lazy val stanfordAnnotator = 14 | AnnotatorFactories.nerTag(properties, StanfordUtil.annotatorImplementations).create() 15 | 16 | override def annotate(in:(Text,TokenOffsets,Tokens,SentenceOffsets,Lemmas,Poss)): NerTags = { 17 | val (t, toa, to, soa, la, pa) = in 18 | val stanAnn = new StAnnotation(t) 19 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 20 | StanfordSentenceSplitter.toStanford(soa, null, stanAnn) 21 | StanfordPOSTagger.toStanford(pa, stanAnn) 22 | StanfordLemmatizer.toStanford(la, stanAnn) 23 | 24 | stanfordAnnotator.annotate(stanAnn) 25 | 26 | StanfordNERTagger.fromStanford(stanAnn) 27 | } 28 | } 29 | 30 | /** Stanford model mappings for NER. */ 31 | object StanfordNERTagger { 32 | def toStanford(from:NerTags, to:StAnnotation):Unit = { 33 | val li = to.get(classOf[CoreAnnotations.TokensAnnotation]) 34 | for (i <- 0 until li.size) { 35 | val ner = from(i) 36 | li.get(i).setNER(ner) 37 | } 38 | } 39 | 40 | def fromStanford(from:StAnnotation):NerTags = { 41 | val tokens = from.get(classOf[CoreAnnotations.TokensAnnotation]) 42 | val li = for (cl <- tokens) yield { 43 | // there may be *NL* tokens outside sentences that the NER tagger didn't reach; 44 | // default their tag to "O" so that serialization is OK 45 | val n = cl.ner 46 | if (n != null) n else "O" 47 | } 48 | li.toArray 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordPOSTagger.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import com.clearcut.pipe.model._ 4 | import scala.collection.JavaConversions._ 5 | import edu.stanford.nlp.ling.CoreAnnotations 6 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 7 | import java.util._ 8 | 9 | /** Wraps CoreNLP POS Tagger as an Annotator. */ 10 | class StanfordPOSTagger extends Annotator[(Text,TokenOffsets,Tokens,SentenceOffsets),(Poss)] { 11 | 12 | @transient lazy val stanfordAnnotator = 13 | AnnotatorFactories.posTag(properties, StanfordUtil.annotatorImplementations).create() 14 | 15 | override def annotate(in:(Text,TokenOffsets,Tokens,SentenceOffsets)):Poss = { 16 | val (t, toa, to, soa) = in 17 | val stanAnn = new edu.stanford.nlp.pipeline.Annotation(t) 18 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 19 | StanfordSentenceSplitter.toStanford(soa, null, stanAnn) 20 | 21 | stanfordAnnotator.annotate(stanAnn) 22 | 23 | StanfordPOSTagger.fromStanford(stanAnn) 24 | } 25 | } 26 | 27 | /** Stanford model mappings for POS tags.
*/ 28 | object StanfordPOSTagger { 29 | def toStanford(from:Poss, to:StAnnotation):Unit = { 30 | val li = to.get(classOf[CoreAnnotations.TokensAnnotation]) 31 | for (i <- 0 until li.size) { 32 | val pos = from(i) 33 | li.get(i).set(classOf[CoreAnnotations.PartOfSpeechAnnotation], pos) 34 | } 35 | } 36 | 37 | def fromStanford(from:StAnnotation):Poss = { 38 | val tokens = from.get(classOf[CoreAnnotations.TokensAnnotation]) 39 | tokens.map(_.getString(classOf[CoreAnnotations.PartOfSpeechAnnotation])).toArray 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordSRParser.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | // StanfordSRParser is very fast, but needs A LOT of memory 4 | // ~ 4GB per thread 5 | // with less memory it becomes very slow 6 | 7 | import java.util.Properties 8 | 9 | import com.clearcut.pipe.model._ 10 | import edu.stanford.nlp.ling.CoreAnnotations.{SentenceIndexAnnotation, SentencesAnnotation} 11 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 12 | import edu.stanford.nlp.trees.Tree 13 | import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation 14 | 15 | import scala.collection.JavaConversions._ 16 | 17 | class StanfordSRParser extends Annotator[(Text,SentenceOffsets,SentenceTokenOffsets,TokenOffsets,Tokens,Poss), 18 | (Parses,SentenceDependencies)] { 19 | 20 | override def setProperties(p:Properties) { 21 | super.setProperties(p) 22 | p.setProperty("annotators", "tokenize,ssplit") 23 | p.put("parse.maxlen", "100") 24 | p.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz") 25 | p.put("threads", "1") // Should use extractor-level parallelism 26 | } 27 | 28 | @transient lazy val stanfordAnnotator = 29 | AnnotatorFactories.parse(properties, StanfordUtil.annotatorImplementations).create() 30 | 31 | override def annotate(in:(Text,SentenceOffsets,SentenceTokenOffsets,TokenOffsets,Tokens,Poss)): 32 | (Parses,SentenceDependencies) = { 33 | val (t,soa,stoa,toa,to,poa) = in 34 | val stanAnn = new StAnnotation(t) 35 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 36 | StanfordSentenceSplitter.toStanford(soa, null, stanAnn) 37 | StanfordPOSTagger.toStanford(poa, stanAnn) 38 | 39 | // NOTE: stanford parser may take too long for all sentences of a document 40 | // if we run this on Hadoop/Spark, we must parse sentence by sentence and 41 | // then report progress using 42 | //if (reporter != null) reporter.incrementCounter(); 43 | 44 | stanfordAnnotator.annotate(stanAnn) 45 | 46 | val pa = StanfordSRParser.fromStanford(stanAnn) 47 | val da = StanfordDependencyExtractor.fromStanford(stanAnn) 48 | (pa, da) 49 | } 50 | } 51 | 52 | object StanfordSRParser { 53 | def toStanford(from:Parses, to:StAnnotation):Unit = { 54 | val l = from 55 | val sentences = to.get(classOf[SentencesAnnotation]) 56 | for (i <- 0 until l.size) { 57 | var tree:Tree = null 58 | if (l(i) != null) 59 | tree = Tree.valueOf(l(i)) 60 | sentences.get(i).set(classOf[TreeAnnotation], tree) 61 | sentences.get(i).set(classOf[SentenceIndexAnnotation], i.asInstanceOf[Integer]) 62 | } 63 | } 64 | 65 | def fromStanford(from:StAnnotation):Parses = { 66 | val sentences = from.get(classOf[SentencesAnnotation]) 67 | val l = for (sentence <- sentences) yield { 68 | val tree = sentence.get(classOf[TreeAnnotation]) 69 | if (tree != null) tree.pennString else null 70 | } 71 | 
l.toArray 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordTokenizer.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import java.util.{ArrayList, Properties} 4 | import com.clearcut.pipe.model.{Offsets, Text, Tokens, TokenOffsets} 5 | import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} 6 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 7 | import scala.collection.JavaConversions._ 8 | import scala.collection.JavaConverters._ 9 | 10 | /** Wraps CoreNLP Tokenizer as an Annotator. */ 11 | class StanfordTokenizer extends Annotator[Text,(TokenOffsets,Tokens)] { 12 | 13 | @transient lazy val stanfordAnnotator = 14 | AnnotatorFactories.tokenize(properties, StanfordUtil.annotatorImplementations).create() 15 | 16 | override def annotate(t:(Text)):(TokenOffsets, Tokens) = { 17 | val stanAnn = new StAnnotation(t) 18 | stanfordAnnotator.annotate(stanAnn) 19 | StanfordTokenizer.fromStanford(stanAnn) 20 | } 21 | } 22 | 23 | /** Stanford model mappings for tokens. */ 24 | object StanfordTokenizer { 25 | def toStanford(text:Text, tokenOffsets:TokenOffsets, tokens:Tokens, to:StAnnotation):Unit = { 26 | val li = for (i <- 0 until tokens.size) yield { 27 | val to = tokenOffsets(i) 28 | val cl = new CoreLabel 29 | cl.setValue(tokens(i)) 30 | cl.setWord(tokens(i)) 31 | cl.setOriginalText(text.substring(to(0), to(1))) 32 | cl.set(classOf[CoreAnnotations.CharacterOffsetBeginAnnotation], to(0).asInstanceOf[Integer]) 33 | cl.set(classOf[CoreAnnotations.CharacterOffsetEndAnnotation], to(1).asInstanceOf[Integer]) 34 | cl 35 | } 36 | to.set(classOf[CoreAnnotations.TokensAnnotation], li.asJava) 37 | } 38 | 39 | def fromStanford(from:StAnnotation):(TokenOffsets, Tokens) = { 40 | val tokens = from.get(classOf[CoreAnnotations.TokensAnnotation]) 41 | val li = tokens.map(cl => Array(cl.beginPosition, cl.endPosition)) 42 | val ti = tokens.map(_.word) 43 | (li.toArray, ti.toArray) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordTrueCaseAnnotator.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import com.clearcut.pipe.model._ 4 | import scala.collection.JavaConversions._ 5 | import edu.stanford.nlp.ling.CoreAnnotations 6 | import edu.stanford.nlp.pipeline.{Annotation => StAnnotation, AnnotatorFactories} 7 | import java.util._ 8 | 9 | /** Wraps CoreNLP TrueCaseAnnotator as an Annotator. */ 10 | class StanfordTrueCaseAnnotator extends Annotator[(Text,TokenOffsets,Tokens,SentenceOffsets),(TrueCases)] { 11 | 12 | @transient lazy val stanfordAnnotator = 13 | AnnotatorFactories.truecase(properties, StanfordUtil.annotatorImplementations).create() 14 | 15 | override def annotate(in:(Text,TokenOffsets,Tokens,SentenceOffsets)):TrueCases = { 16 | val (t, toa, to, soa) = in 17 | val stanAnn = new edu.stanford.nlp.pipeline.Annotation(t) 18 | StanfordTokenizer.toStanford(t, toa, to, stanAnn) 19 | StanfordSentenceSplitter.toStanford(soa, null, stanAnn) 20 | 21 | stanfordAnnotator.annotate(stanAnn) 22 | 23 | StanfordTrueCaseAnnotator.fromStanford(stanAnn) 24 | } 25 | } 26 | 27 | /** Stanford model mappings for true cases.
*/ 28 | object StanfordTrueCaseAnnotator { 29 | def toStanford(from:TrueCases, to:StAnnotation):Unit = { 30 | val li = to.get(classOf[CoreAnnotations.TokensAnnotation]) 31 | for (i <- 0 until li.size) { 32 | val tc = from(i) 33 | li.get(i).set(classOf[CoreAnnotations.TrueCaseAnnotation], tc) 34 | } 35 | } 36 | 37 | def fromStanford(from:StAnnotation):TrueCases = { 38 | val tokens = from.get(classOf[CoreAnnotations.TokensAnnotation]) 39 | tokens.map(_.getString(classOf[CoreAnnotations.TrueCaseAnnotation])).toArray 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/annotator/StanfordUtil.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.annotator 2 | 3 | import edu.stanford.nlp.pipeline.AnnotatorImplementations 4 | 5 | object StanfordUtil { 6 | 7 | lazy val annotatorImplementations = 8 | new AnnotatorImplementations 9 | 10 | } 11 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/ColumnReader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.io._ 4 | import com.clearcut.pipe.Schema 5 | import com.clearcut.pipe.model._ 6 | 7 | class ColumnReader(dir:String) extends Reader with Iterator[Array[AnyRef]] { 8 | 9 | val schema = Schema.createSchema( 10 | // The schema is determined based on file name suffixes 11 | new File(dir).list.map(n => n.substring(n.lastIndexOf(".") + 1)).map(lowerFirst(_)):_* 12 | ) 13 | 14 | val readers = new File(dir).listFiles.map(f => new BufferedReader 15 | (new InputStreamReader(new FileInputStream(f)))) 16 | 17 | var _next = fetchNext() 18 | 19 | def getSchema(): Schema = schema 20 | 21 | override def hasNext: Boolean = 22 | _next != null 23 | 24 | override def next():Array[AnyRef] = { 25 | val n = _next 26 | _next = fetchNext() 27 | n 28 | } 29 | 30 | private def fetchNext(): Array[AnyRef] = { 31 | readers.zip(schema.annTyps).map { case (r,t) => { 32 | val line = r.readLine 33 | if (line == null) 34 | return null 35 | 36 | Json.read[AnyRef](line, Util.name2clazz(t)) 37 | }} 38 | } 39 | 40 | def close = { 41 | readers.map(_.close) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/ColumnWriter.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.io.{File, OutputStreamWriter, FileOutputStream, BufferedWriter} 4 | import com.clearcut.pipe.model._ 5 | 6 | import com.clearcut.pipe.Schema 7 | 8 | class ColumnWriter(dir:String) extends Writer { 9 | val BUFFER_SIZE = 10 * 1024 * 1024 10 | 11 | var writers:Array[BufferedWriter] = null 12 | 13 | def setSchema(schema:Schema): Unit = { 14 | if (! new File(dir).exists) 15 | new File(dir).mkdirs() 16 | writers = schema.annTyps.map(t => { 17 | val name = dir + "/ann." 
+ lowerFirst(t) 18 | if (new File(name).exists) 19 | null 20 | else 21 | new BufferedWriter( 22 | new OutputStreamWriter(new FileOutputStream(name)), BUFFER_SIZE) 23 | }) 24 | } 25 | 26 | def write(annotations:Seq[AnyRef]) = { 27 | for (i <- 0 until writers.length) { 28 | if (writers(i) != null) { 29 | val json = Json.write(annotations(i)) 30 | writers(i).write(json) 31 | writers(i).newLine() 32 | } 33 | } 34 | } 35 | 36 | def close = 37 | for (w <- writers) 38 | if (w != null) w.close() 39 | } 40 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/JSONWriter.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.io.{OutputStreamWriter, FileOutputStream, BufferedWriter} 4 | import com.clearcut.pipe.model._ 5 | 6 | import com.clearcut.pipe.Schema 7 | import org.json4s._ 8 | import org.json4s.JsonDSL._ 9 | import org.json4s.jackson.JsonMethods._ 10 | 11 | class JsonWriter(out:String) extends Writer { 12 | 13 | implicit val formats = DefaultFormats 14 | 15 | val writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out))) 16 | var names:Seq[String] = null 17 | 18 | def setSchema(schema:Schema): Unit = { 19 | names = schema.annTyps.map(t => lowerFirst(t)) 20 | } 21 | 22 | def write(annotations:Seq[AnyRef]) = { 23 | val arr:Seq[JObject] = annotations.zip(names).map { case (x,n) => JObject(JField(n, Extraction.decompose(x)))} 24 | var o:JObject = arr(0) 25 | for (i <- 1 until arr.length) 26 | o = o merge arr(i) 27 | 28 | writer.write(compact(render(o))) 29 | writer.newLine 30 | } 31 | 32 | def close = 33 | writer.close 34 | 35 | } 36 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/Json.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import org.json4s.Extraction._ 4 | import org.json4s.NoTypeHints 5 | import org.json4s.jackson.JsonMethods._ 6 | import org.json4s.jackson.Serialization 7 | import org.json4s.reflect.Reflector 8 | 9 | object Json { 10 | 11 | implicit val formats = Serialization.formats(NoTypeHints) 12 | 13 | def write[A <: AnyRef](o:A)(implicit m:Manifest[A]):String = 14 | Serialization.write[A](o) 15 | 16 | def read[AnyRef](s:String, t:Class[_]):AnyRef = { 17 | val json = parse(s) 18 | extract(json, Reflector.scalaTypeOf(t)).asInstanceOf[AnyRef] 19 | } 20 | 21 | def read[A](s:String)(implicit m:Manifest[A]):A = 22 | Serialization.read[A](s) 23 | } 24 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/JsonReader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.nio.charset.CodingErrorAction 4 | 5 | import com.clearcut.pipe.Schema 6 | import com.clearcut.pipe.model.{Id, Text} 7 | 8 | import org.json4s._ 9 | import org.json4s.jackson.JsonMethods._ 10 | import org.json4s.jackson.Serialization 11 | 12 | import scala.io.Source 13 | 14 | class JsonReader(in:String, 15 | idKey:String, documentKey:String) 16 | extends Reader with Iterator[Array[AnyRef]] { 17 | val BUFFER_SIZE = 10 * 1024 * 1024 18 | 19 | implicit val codec = new scala.io.Codec( 20 | java.nio.charset.Charset.forName("utf-8")) 21 | codec.onMalformedInput(CodingErrorAction.IGNORE) 22 | 
codec.onUnmappableCharacter(CodingErrorAction.IGNORE) 23 | 24 | val reader = Source.fromFile(new java.io.File(in), BUFFER_SIZE) 25 | 26 | var it = reader.getLines.zipWithIndex 27 | var _next = fetchNext() 28 | 29 | override def getSchema:Schema = 30 | Schema.createSchema("id", "text") 31 | 32 | override def hasNext: Boolean = 33 | _next != null 34 | 35 | override def next(): Array[AnyRef] = { 36 | val n = _next 37 | _next = fetchNext() 38 | n 39 | } 40 | 41 | private def fetchNext(): Array[AnyRef] = { 42 | var n:Array[AnyRef] = null 43 | while (n == null && it.hasNext) { 44 | val (line, num) = it.next 45 | 46 | val json = parse(line) 47 | 48 | implicit val formats = DefaultFormats 49 | 50 | try { 51 | val documentId = (json \ idKey).extract[String] 52 | val documentStr = (json \ documentKey).extract[String] 53 | 54 | n = Array(documentId, documentStr) 55 | 56 | } catch { 57 | case e:Exception => 58 | System.err.println(s"Warning: skipped malformed line ${num}: ${line}") 59 | } 60 | } 61 | n 62 | } 63 | 64 | def close = 65 | reader.close 66 | } 67 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/Reader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import com.clearcut.pipe.Schema 4 | 5 | trait Reader extends Iterator[Array[AnyRef]] { 6 | def getSchema:Schema 7 | def close 8 | } 9 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/TsvReader.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.nio.charset.CodingErrorAction 4 | 5 | import com.clearcut.pipe.Schema 6 | import com.clearcut.pipe.model.{Text, Id} 7 | 8 | import scala.io.{Source, BufferedSource} 9 | 10 | class TsvReader(in:String = null, 11 | idCol:Int = 0, documentCol:Int = 1, 12 | inSource:Source = null) 13 | extends Reader with Iterator[Array[AnyRef]] { 14 | 15 | implicit val codec = new scala.io.Codec( 16 | java.nio.charset.Charset.forName("utf-8")) 17 | codec.onMalformedInput(CodingErrorAction.IGNORE) 18 | codec.onUnmappableCharacter(CodingErrorAction.IGNORE) 19 | 20 | val reader = if (inSource != null) inSource else Source.fromFile(in) 21 | 22 | var it = reader.getLines.zipWithIndex 23 | var _next = fetchNext() 24 | 25 | override def getSchema:Schema = 26 | Schema.createSchema("id", "text") 27 | 28 | override def hasNext: Boolean = 29 | _next != null 30 | 31 | override def next(): Array[AnyRef] = { 32 | val n = _next 33 | _next = fetchNext() 34 | n 35 | } 36 | 37 | // should unescape \, \r, \n, \t 38 | private def fetchNext(): Array[AnyRef] = { 39 | var n:Array[AnyRef] = null 40 | while (n == null && it.hasNext) { 41 | val (line, num) = it.next 42 | val tsvArr = line.trim.split("\t") 43 | if (tsvArr.length >= 2) { 44 | val documentId = tsvArr(idCol) 45 | val documentStr = unescape(tsvArr(documentCol)) 46 | n = Array(documentId, documentStr) 47 | } else { 48 | System.err.println(s"Warning: skipped malformed line ${num}: ${line}") 49 | } 50 | } 51 | n 52 | } 53 | 54 | private def unescape(s:String):String = { 55 | val sb = new StringBuilder() 56 | val NORMAL = 0 57 | val ESCAPE = 1 58 | 59 | var state = NORMAL 60 | 61 | for (i <- 0 until s.length) { 62 | val c = s.charAt(i) 63 | //val l = if (i == s.length - 1) Character.UNASSIGNED else s.charAt(i+1) 64 | state match { 65 | case NORMAL => 66 | c match { 67 | case 
'\\' => state = ESCAPE 68 | case _ => sb.append(c) 69 | } 70 | case ESCAPE => 71 | c match { 72 | case 'r' => sb.append('\r'); state = NORMAL 73 | case 'n' => sb.append('\n'); state = NORMAL 74 | case 't' => sb.append('\t'); state = NORMAL 75 | case '\\' => sb.append('\\'); state = NORMAL 76 | case _ => 77 | println("ERROR") 78 | } 79 | } 80 | } 81 | return sb.toString 82 | } 83 | 84 | def close = 85 | reader.close 86 | } -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/TsvWriter.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import java.io.{FileOutputStream, OutputStreamWriter, BufferedWriter} 4 | 5 | import com.clearcut.pipe.Schema 6 | import com.clearcut.pipe.model._ 7 | 8 | /** Legacy writer for psql readable TSV table. 9 | * 10 | * Example output: 11 | * 12 1 This is a simple example. {"This","is","a","simple","example","."} 12 | * {"this","be","a","simple","example","."} {"DT","VBZ","DT","JJ","NN","."} 13 | * {"O","O","O","O","O","O"} {0,5,8,10,17,24} 14 | * {"nsubj","cop","det","amod","",""} {5,5,5,5,0,0} 15 | */ 16 | class TsvWriter(out:String = null, outWriter:BufferedWriter = null) extends Writer { 17 | 18 | val writer = if (outWriter != null) outWriter else 19 | new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "utf-8")) 20 | 21 | var indices:Seq[Int] = null 22 | 23 | def setSchema(schema:Schema) = { 24 | indices = Schema.defaultAnnotationIndices(schema, Seq("Id", "Text", "SentenceOffsets", 25 | "SentenceTokenOffsets", "Tokens", "TokenOffsets", "Lemmas", "Poss", 26 | "NerTags", "SentenceDependencies")) 27 | } 28 | 29 | def write(annotations:Seq[AnyRef]) = { 30 | val is = indices.map(annotations(_)) 31 | val id = is(0).asInstanceOf[Id] 32 | val ta = is(1).asInstanceOf[Text] 33 | val soa = is(2).asInstanceOf[SentenceOffsets] 34 | val stoa = is(3).asInstanceOf[SentenceTokenOffsets] 35 | val toka = is(4).asInstanceOf[Tokens] 36 | val toa = is(5).asInstanceOf[TokenOffsets] 37 | val la = is(6).asInstanceOf[Lemmas] 38 | val posa = is(7).asInstanceOf[Poss] 39 | val nertaga = is(8).asInstanceOf[NerTags] 40 | val sdepa = is(9).asInstanceOf[SentenceDependencies] 41 | 42 | for (sentNum <- 0 until soa.size) { 43 | var columns = new Array[String](10) 44 | 45 | val s_stoa = stoa(sentNum) 46 | 47 | val outline = List( 48 | id, 49 | sentNum.toString, 50 | ta.substring(soa(sentNum)(FROM), soa(sentNum)(TO)), 51 | list2TSVArray(toka.slice(s_stoa(FROM), s_stoa(TO)).toList), 52 | list2TSVArray(la.slice(s_stoa(FROM), s_stoa(TO)).toList), 53 | list2TSVArray(posa.slice(s_stoa(FROM), s_stoa(TO)).toList), 54 | list2TSVArray(nertaga.slice(s_stoa(FROM), s_stoa(TO)).toList), 55 | intList2TSVArray(toa.slice(s_stoa(FROM), s_stoa(TO)).map {_(FROM) - soa(sentNum)(FROM) }.toList), 56 | list2TSVArray(sdepa(sentNum).map(_.name).toList), 57 | intList2TSVArray(sdepa(sentNum).map(_.from).toList) 58 | ) 59 | writer.append(outline.mkString("\t")) 60 | writer.newLine() 61 | } 62 | } 63 | 64 | /** Construct a Postgres-acceptable array in the TSV format, from a list */ 65 | def list2TSVArray(arr: List[String]) : String = { 66 | return arr.map( x => 67 | // Replace '\' with '\\\\' to be accepted by COPY FROM 68 | // Replace '"' with '\\"' to be accepted by COPY FROM 69 | if (x.contains("\\")) 70 | "\"" + x.replace("\\", "\\\\\\\\").replace("\"", "\\\\\"") + "\"" 71 | else 72 | "\"" + x + "\"" 73 | ).mkString("{", ",", "}") 74 | } 75 | 76 | def 
intList2TSVArray(arr: List[Int]) : String = { 77 | return arr.map( x => 78 | "" + x 79 | ).mkString("{", ",", "}") 80 | } 81 | 82 | def string2TSVString(str: String) : String = { 83 | if (str.contains("\\")) 84 | str.replace("\\", "\\\\") 85 | else 86 | str 87 | } 88 | 89 | def close = 90 | writer.close 91 | } -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/io/Writer.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.io 2 | 3 | import com.clearcut.pipe.Schema 4 | 5 | trait Writer { 6 | def setSchema(s:Schema) 7 | def write(annotations:Seq[AnyRef]) 8 | def close 9 | } 10 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/model/Util.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe.model 2 | 3 | object Util { 4 | 5 | val types:Array[Class[_ <: AnyRef]] = Array( 6 | classOf[Coreferences], 7 | classOf[Dependencies], 8 | classOf[Lemmas], 9 | classOf[Mentions], 10 | classOf[Ners], 11 | classOf[NerTags], 12 | classOf[Offsets], 13 | classOf[Parses], 14 | classOf[Poss], 15 | classOf[SentenceDependencies], 16 | classOf[SentenceOffsets], 17 | classOf[SentenceTokenOffsets], 18 | classOf[Text], 19 | classOf[TextFragments], 20 | classOf[TextMappings], 21 | classOf[TokenOffsets], 22 | classOf[Tokens], 23 | classOf[TrueCases] 24 | ) 25 | 26 | // val name2clazz = 27 | // Map(types.map(t => lowerFirst(t.getSimpleName) -> t):_*) 28 | 29 | // val clazz2name:Map[Class[_ <: AnyRef], String] = 30 | // name2clazz.map(_.swap) 31 | 32 | 33 | val name2clazz = Map( 34 | "coreferences" -> classOf[Coreferences], 35 | "dependencies" -> classOf[Dependencies], 36 | "lemmas" -> classOf[Lemmas], 37 | "mentions" -> classOf[Mentions], 38 | "ners" -> classOf[Ners], 39 | "nerTags" -> classOf[NerTags], 40 | "parses" -> classOf[Parses], 41 | "poss" -> classOf[Poss], 42 | "sentenceDependencies" -> classOf[SentenceDependencies], 43 | "sentenceOffsets" -> classOf[SentenceOffsets], 44 | "sentenceTokenOffsets" -> classOf[SentenceTokenOffsets], 45 | "text" -> classOf[Text], 46 | "textFragments" -> classOf[TextFragments], 47 | "textMappings" -> classOf[TextMappings], 48 | "tokenOffsets" -> classOf[TokenOffsets], 49 | "tokens" -> classOf[Tokens], 50 | "trueCases" -> classOf[TrueCases] 51 | ) 52 | 53 | 54 | 55 | 56 | 57 | def lowerFirst(s:String) = 58 | if (s == null || s.length < 1) s 59 | else s.charAt(0).toLower + s.substring(1) 60 | } 61 | -------------------------------------------------------------------------------- /pipe/src/main/scala/com/clearcut/pipe/model/package.scala: -------------------------------------------------------------------------------- 1 | package com.clearcut.pipe 2 | 3 | import com.clearcut.pipe.io.Json 4 | 5 | /** Set of our cross-language, minimalist schema */ 6 | package object model { 7 | type Html = String 8 | type Coreferences = Array[CoreferenceChain] 9 | type Dependencies = Array[Dependency] 10 | type Id = String 11 | type Lemmas = Array[String] 12 | type Mentions = Array[Mention] 13 | type Ners = Array[NamedEntity] 14 | type NerTags = Array[String] 15 | type Offsets = Array[Int] 16 | type Parses = Array[String] 17 | type Poss = Array[String] 18 | type SentenceDependencies = Array[Array[Dependency]] 19 | type SentenceOffsets = Array[Offsets] 20 | type SentenceTokenOffsets = Array[Offsets] 21 | type Text = String 22 | type 
TextFragments = Array[TextFragment] 23 | type TextMappings = Array[TextMapping] 24 | type TokenOffsets = Array[Offsets] 25 | type Tokens = Array[String] 26 | type TrueCases = Array[String] 27 | 28 | /* Constants used for offsets */ 29 | val FROM = 0 30 | val TO = 1 31 | 32 | def print(s: Schema, arr: AnyRef*) = 33 | for ((name, ann) <- s.annTyps.zip(arr)) 34 | println(name + " : " + Json.write(arr)) 35 | 36 | def lowerFirst(s:String) = 37 | if (s == null || s.length < 1) s 38 | else s.charAt(0).toLower + s.substring(1) 39 | 40 | def upperFirst(s:String) = 41 | if (s == null || s.length < 1) s 42 | else s.charAt(0).toUpper + s.substring(1) 43 | 44 | 45 | /* Auxiliary sub-types used above */ 46 | 47 | case class CoreferenceChain 48 | ( 49 | chainNum: Int = -1, 50 | representativeMentionNum: Int = -1, 51 | mentionNums: Array[Int] = Array() 52 | ) 53 | 54 | case class Dependency 55 | ( 56 | name: String, 57 | from: Int, 58 | to: Int 59 | ) 60 | 61 | case class NamedEntity 62 | ( 63 | typ:String, 64 | offsets:Offsets, 65 | head:Int = -1 66 | ) 67 | 68 | 69 | case class Mention 70 | ( 71 | mentionNum:Int = -1, 72 | head:Int = -1, // token offset from begin of document 73 | tokenOffsets:Offsets, 74 | mentionTyp:Byte = -1, //PRONOMINAL, NOMINAL, PROPER, UNKNOWN 75 | number:Byte = -1, //SINGULAR, PLURAL, UNKNOWN 76 | gender:Byte = -1, //MALE, FEMALE, NEUTRAL, UNKNOWN 77 | animacy:Byte = -1 //ANIMATE, INANIMATE, UNKNOWN 78 | ) 79 | 80 | object Mention { 81 | val UNKNOWN = -1.toByte 82 | 83 | // mention types 84 | val PRONOMINAL = 0.toByte 85 | val NOMINAL = 1.toByte 86 | val PROPER = 2.toByte 87 | val LIST = 3.toByte 88 | 89 | // numbers 90 | val SINGULAR = 0.toByte 91 | val PLURAL = 1.toByte 92 | 93 | // genders 94 | val MALE = 0.toByte 95 | val FEMALE = 1.toByte 96 | val NEUTRAL = 2.toByte 97 | 98 | // animacy 99 | val ANIMATE = 0.toByte 100 | val INANIMATE = 1.toByte 101 | 102 | // need bidirectional mappings for stanford conversions 103 | 104 | def typeToByte(s:String) = s match { 105 | case "PRONOMINAL" => PRONOMINAL 106 | case "NOMINAL" => NOMINAL 107 | case "PROPER" => PROPER 108 | case "LIST" => LIST 109 | case "UNKNOWN" => UNKNOWN 110 | } 111 | 112 | def typeFromByte(b:Byte) = b match { 113 | case PRONOMINAL => "PRONOMINAL" 114 | case NOMINAL => "NOMINAL" 115 | case PROPER => "PROPER" 116 | case LIST => "LIST" 117 | case UNKNOWN => "UNKNOWN" 118 | } 119 | 120 | def numberToByte(s:String) = s match { 121 | case "SINGULAR" => SINGULAR 122 | case "PLURAL" => PLURAL 123 | case "UNKNOWN" => UNKNOWN 124 | } 125 | 126 | def numberFromByte(b:Byte) = b match { 127 | case SINGULAR => "SINGULAR" 128 | case PLURAL => "PLURAL" 129 | case UNKNOWN => "UNKNOWN" 130 | } 131 | 132 | def genderToByte(s:String) = s match { 133 | case "MALE" => MALE 134 | case "FEMALE" => FEMALE 135 | case "NEUTRAL" => NEUTRAL 136 | case "UNKNOWN" => UNKNOWN 137 | } 138 | 139 | def genderFromByte(b:Byte) = b match { 140 | case MALE => "MALE" 141 | case FEMALE => "FEMALE" 142 | case NEUTRAL => "NEUTRAL" 143 | case UNKNOWN => "UNKNOWN" 144 | } 145 | 146 | def animacyToByte(s:String) = s match { 147 | case "ANIMATE" => ANIMATE 148 | case "INANIMATE" => INANIMATE 149 | case "UNKNOWN" => UNKNOWN 150 | } 151 | 152 | def animacyFromByte(b:Byte) = b match { 153 | case ANIMATE => "ANIMATE" 154 | case INANIMATE => "INANIMATE" 155 | case UNKNOWN => "UNKNOWN" 156 | } 157 | } 158 | 159 | case class TextFragment 160 | ( 161 | typ:String, 162 | offsets:Offsets, 163 | extract:Boolean 164 | ) 165 | 166 | case class TextMapping 167 | ( 168 | 
documentID:Int, 169 | beginText:Int, 170 | beginSource:Int, 171 | length:Int 172 | ) 173 | } 174 | -------------------------------------------------------------------------------- /pipe/src/test/resources/testdoc.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |

Wrike has launched a new version of its project management platform with an emphasis on real-time analysis and new features such as syncing calendars to work projects. The new platform, Wrike Enterprise, gives the company a deeper focus on the corporate market for its collaboration-centered tools. It gives customers a way to crunch project management data in the order of a million updates per day. This is data around work items such as tasks completed, the original time planned for the project and the historical data that is associated with the project. The data is presented in “instant infographics,” that help people see the latest updates to projects, said Wrike CEO Andrew Filev in an email interview. Wrike-Enterprise-visual-reports Historically, project managers have done detailed plans that they then track. The manager periodically updates the projects and then compares the current state to the baseline established at the start of the project. With the Wrike platform, the data from every interaction is stored and then compared to historical data and then presented in a chart. A customer can see the state of the project from different dimensions such as the realistic amount of time a project will take to get done,  what requires immediate action and how performance of an employee has evolved over time.

5 |

A new  user group feature in Wrike Enterprise allows the project manager to include employees in multiple work groups by project, department, or any other ad hoc query. It can share the needed data with the whole group and keep permissions organized. This allows the manager to keep track of the overall project without hundreds of people making their own changes.  Wrike-Enterprise-user-groups

6 |
Wrike’s new “Custom Calendars,” syncs projects with the calendars of other members on the team. It allows the manager to track a colleague’s vacations, PTO and extra working days. It is designed to avoid schedule overlaps and build more accurate plans. Wrike-Enterprise-custom-calendars There are also new ways to integrate a company’s  identity into the service. Wrike has also added new security controls for larger customers. In October, Wrike raised $10 million in funding.  It was the first round since the company was originally founded seven years ago. The company has traditionally served the small business community but this release points to its additonal focus on the larger enterprise companies of the world. Wrike competes with the likes of Atlassian and Asana. But its advantage is in its crisp user interface which it can now leverage even more as it embraces data as a way for project managers to better keep track of their projects.  
7 |
8 |
Feature image courtesy of VFS Digital Design on Flickr via Creative Commons)
9 |
10 | 11 | 12 | 13 |
-------------------------------------------------------------------------------- /pipe/src/test/resources/testdoc.json: -------------------------------------------------------------------------------- 1 | {"documents.id" : 5, "documents.text" : "I am document one. I am sentence twp, really. I am another sentence, called sentence three."} 2 | {"documents.id" : 7, "documents.text" : "John drove to Judy’s house and he made her dinner. This sentence should have some corefs."} -------------------------------------------------------------------------------- /pipe/src/test/resources/testdoc.txt: -------------------------------------------------------------------------------- 1 | In a decision that could have far-reaching consequences, the D.C. Circuit Court of Appeals today struck down the FCC’s Open Internet Order. That Order, put into force in 2010 by then-chairman Julius Genachowski, was designed to make it so that broadband service providers couldn’t meddle with traffic on the web based on its type – in other words, they couldn’t block certain kinds of online data transmission just because it didn’t align with their own goals and financial strategy. 2 | 3 | Media watchdog and advocacy agency Free Press released the following statement about the decision via President and CEO Craig Aaron, condemning it while also acknowledging that the Open Internet Order probably wasn’t the best possible solution for enforcing net neutrality: -------------------------------------------------------------------------------- /pipe/src/test/scala/BasicSpec.scala: -------------------------------------------------------------------------------- 1 | import java.io.{BufferedWriter, OutputStreamWriter, FileOutputStream} 2 | import java.util.Properties 3 | import javax.swing.text.html.parser.DocumentParser 4 | 5 | import com.clearcut.pipe.annotator._ 6 | import com.clearcut.pipe.{Schema, Main} 7 | import com.clearcut.pipe.io.{ColumnWriter, ColumnReader, Json} 8 | import com.clearcut.pipe.model.Text 9 | import org.scalatest.{Matchers, FlatSpec} 10 | 11 | /** 12 | * 13 | * Note: SRParser needs a lot of memory. 
You have to run the test like this: 14 | * sbt -mem 4096 test 15 | * 16 | */ 17 | class BasicSpec extends FlatSpec with Matchers { 18 | 19 | def createTextFile(dir:String) = { 20 | val w = new BufferedWriter(new OutputStreamWriter 21 | (new FileOutputStream(dir + "/ann.text"))) 22 | w.write(Json.write("This is a very simple text file.\nIt contains two sentences.")) 23 | w.close 24 | } 25 | 26 | "ColumnReader and ColumnWriter" should "work" in { 27 | import java.nio.file.{Path, Paths, Files} 28 | val folderPath: Path = Paths.get(System.getProperty("java.io.tmpdir")) 29 | var dir: Path = Files.createTempDirectory(folderPath, "pipe") 30 | 31 | println(dir.toString) 32 | 33 | createTextFile(dir.toString) 34 | 35 | val annotators:Array[Annotator[_,_]] = Array( 36 | new StanfordTokenizer, 37 | new StanfordSentenceSplitter, 38 | new StanfordPOSTagger 39 | //new StanfordLemmatizer 40 | ) 41 | 42 | val r = new ColumnReader(dir.toString) 43 | val w = new ColumnWriter(dir.toString) 44 | val e = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dir + "/.errors"))) 45 | 46 | Main.run(annotators, r, w, e) 47 | 48 | r.close 49 | w.close 50 | e.close 51 | } 52 | 53 | 54 | } 55 | -------------------------------------------------------------------------------- /view/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | /env 4 | /node_modules 5 | /util/elasticsearch-* 6 | /public/closure-library 7 | /public/js/help 8 | /public/js/vis 9 | /public/js/.module-cache 10 | -------------------------------------------------------------------------------- /view/README.md: -------------------------------------------------------------------------------- 1 | View 2 | ==== 3 | 4 | View visualizations of extractions and NLP annotations. Search by keywords. 5 | 6 | 7 | ## Installation 8 | 9 | Run `./setup.sh` to install dependencies. 10 | 11 | Make sure you run `source env.sh` each time you run view. 12 | 13 | You can use `./run.sh` to run the two servers (elasticsearch and nodejs). 14 | 15 | ## How to index your data 16 | 17 | * To update view's index, adjust `view.conf` and run the tools in `./util`. 18 | 19 | * The documents should be in [Pipe](../pipe)'s column format. We have included the tool `./fetch-sentences-table.py`, which dumps the sentences table from DeepDive and converts it into column format. This tool has been tested with DeepDive's spouse example, so it assumes that the sentences table has that schema. 20 | 21 | * Then fetch extractor output by running `./fetch-annotations.py`. This tool dumps a candidate or inference table from DeepDive and converts it into the right format. 22 | 23 | * Create the elasticsearch indexes by running: 24 | 25 | ``` 26 | ./create_index.sh 27 | ./refresh-documents.py 28 | ./refresh-annotations.py 29 | ``` 30 | 31 | * Visit `http://localhost:3000`. 32 | 33 | View actually uses two elasticsearch indexes: one containing all documents and their NLP annotations, the other containing all extractions. Typically, the documents index is very large and the extractions index relatively small. By separating the two it is now possible to update the extractions index extremely quickly. This is great for extractor development, since an update to an extractor doesn't require rebuilding the documents index. On the spouse example, updating the extractions index now takes only about 5 seconds.
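As a rough illustration of how the two indexes are tied together (see the note on the Parent-Child mapping below), the child type declares its parent in the index mapping at creation time. This is only a sketch: the type names `docs` and `annotations` match the ones queried in `routes/index.js`, but the authoritative mapping is whatever `./create_index.sh` sets up.

```
# Sketch only -- create_index.sh is the source of truth for the real mapping.
# In elasticsearch 1.x, the child type points at its parent type via `_parent`.
curl -XPUT 'http://localhost:9200/view' -d '{
  "mappings": {
    "docs":        { "properties": { "content": { "type": "string" } } },
    "annotations": { "_parent": { "type": "docs" } }
  }
}'
```

With such a mapping, elasticsearch can resolve the `has_parent` and `has_child` queries issued by `routes/index.js` using its in-memory parent/child ID map.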
34 | 35 | To make sure that retrieval of documents and their extractions remains very fast, the two indexes are linked through elasticsearch's Parent-Child mapping. Each document (parent) has a mapping to a set of extractions (children). This mapping is represented as a hashmap over IDs and is cached in memory while elasticsearch is running. 36 | 37 | -------------------------------------------------------------------------------- /view/app.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var path = require('path'); 3 | var favicon = require('serve-favicon'); 4 | var logger = require('morgan'); 5 | var cookieParser = require('cookie-parser'); 6 | var bodyParser = require('body-parser'); 7 | 8 | var routes = require('./routes/index'); 9 | var users = require('./routes/users'); 10 | 11 | var app = express(); 12 | 13 | // view engine setup 14 | app.set('views', path.join(__dirname, 'views')); 15 | app.set('view engine', 'jade'); 16 | 17 | // uncomment after placing your favicon in /public 18 | //app.use(favicon(__dirname + '/public/favicon.ico')); 19 | app.use(logger('dev')); 20 | app.use(bodyParser.json()); 21 | app.use(bodyParser.urlencoded({ extended: false })); 22 | app.use(cookieParser()); 23 | app.use(express.static(path.join(__dirname, 'public'))); 24 | 25 | app.use('/', routes); 26 | app.use('/users', users); 27 | 28 | // catch 404 and forward to error handler 29 | app.use(function(req, res, next) { 30 | var err = new Error('Not Found'); 31 | err.status = 404; 32 | next(err); 33 | }); 34 | 35 | // error handlers 36 | 37 | // development error handler 38 | // will print stacktrace 39 | if (app.get('env') === 'development') { 40 | app.use(function(err, req, res, next) { 41 | res.status(err.status || 500); 42 | res.render('error', { 43 | message: err.message, 44 | error: err 45 | }); 46 | }); 47 | } 48 | 49 | // production error handler 50 | // no stacktraces leaked to user 51 | app.use(function(err, req, res, next) { 52 | res.status(err.status || 500); 53 | res.render('error', { 54 | message: err.message, 55 | error: {} 56 | }); 57 | }); 58 | 59 | 60 | module.exports = app; 61 | -------------------------------------------------------------------------------- /view/bin/www: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /** 4 | * Module dependencies. 5 | */ 6 | 7 | var app = require('../app'); 8 | var debug = require('debug')('view:server'); 9 | var http = require('http'); 10 | 11 | /** 12 | * Get port from environment and store in Express. 13 | */ 14 | 15 | var port = normalizePort(process.env.PORT || '3000'); 16 | app.set('port', port); 17 | 18 | /** 19 | * Create HTTP server. 20 | */ 21 | 22 | var server = http.createServer(app); 23 | 24 | /** 25 | * Listen on provided port, on all network interfaces. 26 | */ 27 | 28 | server.listen(port); 29 | server.on('error', onError); 30 | server.on('listening', onListening); 31 | 32 | /** 33 | * Normalize a port into a number, string, or false. 34 | */ 35 | 36 | function normalizePort(val) { 37 | var port = parseInt(val, 10); 38 | 39 | if (isNaN(port)) { 40 | // named pipe 41 | return val; 42 | } 43 | 44 | if (port >= 0) { 45 | // port number 46 | return port; 47 | } 48 | 49 | return false; 50 | } 51 | 52 | /** 53 | * Event listener for HTTP server "error" event.
54 | */ 55 | 56 | function onError(error) { 57 | if (error.syscall !== 'listen') { 58 | throw error; 59 | } 60 | 61 | var bind = typeof port === 'string' 62 | ? 'Pipe ' + port 63 | : 'Port ' + port; 64 | 65 | // handle specific listen errors with friendly messages 66 | switch (error.code) { 67 | case 'EACCES': 68 | console.error(bind + ' requires elevated privileges'); 69 | process.exit(1); 70 | break; 71 | case 'EADDRINUSE': 72 | console.error(bind + ' is already in use'); 73 | process.exit(1); 74 | break; 75 | default: 76 | throw error; 77 | } 78 | } 79 | 80 | /** 81 | * Event listener for HTTP server "listening" event. 82 | */ 83 | 84 | function onListening() { 85 | var addr = server.address(); 86 | var bind = typeof addr === 'string' 87 | ? 'pipe ' + addr 88 | : 'port ' + addr.port; 89 | debug('Listening on ' + bind); 90 | } 91 | -------------------------------------------------------------------------------- /view/build.sh: -------------------------------------------------------------------------------- 1 | #jsx view/ public/js 2 | #browserify -t reactify public/js/main.js -o public/bundle.js 3 | #browserify public/js/main.js -o public/bundle.js 4 | 5 | browserify -t [ reactify --es6 ] view/main.jsx -o public/bundle.js 6 | -------------------------------------------------------------------------------- /view/env.sh: -------------------------------------------------------------------------------- 1 | # elasticsearch 2 | export INDEX_NAME=view 3 | 4 | # database 5 | export PGPORT=5432 6 | export PGHOST=localhost 7 | export DBNAME=deepdive_spouse_tsv 8 | export PGUSER=raphael 9 | export PGPASSWORD= 10 | 11 | source env/bin/activate 12 | 13 | PATH="$PWD/node_modules/.bin:$PATH" 14 | PATH="$PWD/util/elasticsearch-1.6.0/bin:$PATH" 15 | -------------------------------------------------------------------------------- /view/gulpfile.js: -------------------------------------------------------------------------------- 1 | var browserify = require('browserify'); 2 | var gulp = require('gulp'); 3 | var gutil = require('gulp-util'); 4 | var source = require("vinyl-source-stream"); 5 | var reactify = require('reactify'); 6 | var es6ify = require('es6ify'); 7 | var watchify = require('watchify'); 8 | 9 | //var requireFiles = ['./node_modules/react/react.js'] 10 | var requireFiles = 'react-router' 11 | var rename = require('gulp-rename'); 12 | 13 | function compileScripts(watch) { 14 | gutil.log('Starting browserify'); 15 | 16 | var entryFile = './view/main.js'; 17 | es6ify.traceurOverrides = {experimental: true}; 18 | 19 | var bundler = browserify({entries: entryFile, debug: true}); 20 | 21 | bundler.require(requireFiles); 22 | bundler.transform(reactify, {es6: true}); 23 | bundler.transform(es6ify.configure(/.jsx/)); 24 | 25 | var rebundle = function () { 26 | var stream = bundler.bundle(); 27 | 28 | stream.on('error', function (err) { console.error(err) }); 29 | stream = stream.pipe(source(entryFile)); 30 | 31 | stream.pipe(rename('bundle.js')); 32 | stream.pipe(gulp.dest('public')); 33 | } 34 | bundler.on('update', rebundle); 35 | return rebundle(); 36 | } 37 | 38 | gulp.task('default', [], function () { 39 | compileScripts(true); 40 | }); -------------------------------------------------------------------------------- /view/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "view", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | "start": "node ./bin/www" 7 | }, 8 | "dependencies": { 9 | "body-parser": "~1.12.4", 10 | 
"cookie-parser": "~1.3.5", 11 | "react-router": "v1.0.0-beta1", 12 | "debug": "~2.2.0", 13 | "elasticsearch": "^5.0.0", 14 | "express": "~4.12.4", 15 | "jade": "~1.9.2", 16 | "morgan": "~1.5.3", 17 | "nodemon": "^1.3.7", 18 | "serve-favicon": "~2.2.1" 19 | }, 20 | "devDependencies": { 21 | "gulp": "^3.9.0", 22 | "gulp-util": "^3.0.6", 23 | "vinyl-source-stream": "^1.1.0", 24 | "reactify": "^1.1.1", 25 | "es6ify": "^1.6.0", 26 | "watchify": "^3.3.0", 27 | "gulp-rename": "^1.2.2" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /view/public/css/main.css: -------------------------------------------------------------------------------- 1 | /** normalize **/ 2 | *, body, button, input, textarea, select { 3 | text-rendering: optimizeLegibility; 4 | font-size:100%; 5 | } 6 | 7 | body,div,dl,dt,dd,ul,ol,li,h1,h2,h3,h4,h5,h6,pre,form,fieldset,input,textarea,p,blockquote,th,td { 8 | margin:0; 9 | padding:0; 10 | } 11 | table { 12 | border-collapse:collapse; 13 | border-spacing:0; 14 | } 15 | fieldset,img { 16 | border:0; 17 | } 18 | address,caption,cite,code,dfn,em,strong,th,var { 19 | font-style:normal; 20 | font-weight:normal; 21 | } 22 | ol,ul { 23 | list-style:none; 24 | } 25 | caption,th { 26 | text-align:left; 27 | } 28 | h1,h2,h3,h4,h5,h6 { 29 | font-size:100%; 30 | font-weight:normal; 31 | } 32 | q:before,q:after { 33 | content:''; 34 | } 35 | abbr,acronym { border:0;} 36 | /* end normalize.css */ 37 | 38 | body { 39 | font-family: "Helvetica", "Arial", "FreeSans", "Verdana", "Tahoma", "Lucida Sans", "Lucida Sans Unicode", "Luxi Sans", sans-serif; 40 | font-size: 14px; 41 | } 42 | 43 | *, html, body { 44 | -webkit-font-smoothing: antialiased; 45 | } 46 | 47 | input:focus {outline:none;} 48 | 49 | .header input:focus { border:1px solid #AAF !important; } 50 | 51 | 52 | 53 | body { 54 | overflow-x:hidden; 55 | overflow-y:visible; 56 | height:100%; 57 | font: normal 13px arial,sans-serif; 58 | } 59 | 60 | html, body { 61 | height: 100%; 62 | } 63 | 64 | .unselectable { 65 | -webkit-user-select: none; 66 | -khtml-user-select: none; 67 | -moz-user-select: none; 68 | -o-user-select: none; 69 | user-select: none; 70 | } 71 | 72 | /** header **/ 73 | 74 | .header { 75 | z-index:1001; /* now beats that of bootstrap dropdown-menu */ 76 | position: absolute; 77 | white-space: nowrap; 78 | color: black; 79 | top:0; 80 | left:0; 81 | width:100%; 82 | margin:0px; 83 | padding:0px; 84 | height:54px; 85 | background-image:none; 86 | background-color:white; 87 | border-bottom:1px solid rgba(0,0,0,0.175); 88 | box-shadow:0 3px 12px rgba(0,0,0,0.175); 89 | } 90 | 91 | .header input { 92 | font: 16px arial,sans-serif; 93 | line-height: 1.2em !important; 94 | height: 1.2em !important; 95 | width:400px; 96 | padding:5px; 97 | margin-top:10px; 98 | border-radius:3px; 99 | border:1px solid #CCC; 100 | } 101 | 102 | .content { 103 | position:absolute; 104 | width:100%; 105 | margin:0; 106 | top:54px; 107 | bottom:0px; 108 | overflow-x:hidden; 109 | background-color:rgb(247, 247, 247); 110 | } 111 | 112 | .help { 113 | position:absolute; 114 | top:10px; 115 | right:0; 116 | min-height:100%; 117 | overflow-x:hidden; 118 | transition:width .25s; 119 | -webkit-transition:width .25s; 120 | } 121 | 122 | .leftmenu { 123 | position:fixed; 124 | width:200px; 125 | top:54px; 126 | bottom:0px; 127 | } 128 | 129 | .result { 130 | background-color:white; 131 | margin-top:0px; 132 | margin-bottom:10px; 133 | margin-left:0px; 134 | padding:4px; 135 | border:1px solid #DDD; 136 | 
position:relative; 137 | 138 | color: rgb(84, 84, 84); 139 | font-family: arial, sans-serif; 140 | font-size: 13px; 141 | font-weight: normal; 142 | line-height: 18.2px; 143 | } 144 | 145 | .result em { 146 | background-color:yellow; 147 | } 148 | 149 | .result * { 150 | background-color:transparent; 151 | } 152 | 153 | .facet { 154 | cursor:pointer; 155 | padding-left:10px; 156 | font-size:18px; 157 | } 158 | 159 | .facet:hover { 160 | color:#555; 161 | } 162 | 163 | 164 | .facet-inactive div { 165 | visibility:hidden 166 | } 167 | 168 | .extraction { 169 | color:red 170 | } 171 | 172 | .extractionBlue { 173 | color:blue 174 | } 175 | 176 | /* Highlights */ 177 | .highlight_left { 178 | border-top-left-radius:3px; 179 | border-bottom-left-radius:3px; 180 | border-bottom:1px solid; 181 | border-left:1px solid; 182 | border-top:1px solid; 183 | box-sizing:border-box; 184 | margin-left:-1px; 185 | margin-right:0px; 186 | /* 187 | box-shadow: inset 8px 0px 8px -8px red, 188 | inset 0px 8px 8px -8px red, 189 | inset 0px -8px 8px -8px red; 190 | */ 191 | } 192 | 193 | .highlight_right { 194 | border-top-right-radius:3px; 195 | border-bottom-right-radius:3px; 196 | border-bottom:1px solid; 197 | border-right:1px solid; 198 | border-top:1px solid; 199 | box-sizing:border-box; 200 | margin-right:-1px; 201 | margin-left:0px; 202 | /* 203 | box-shadow: inset -8px 0px 8px -8px red, 204 | inset 0px 8px 8px -8px red, 205 | inset 0px -8px 8px -8px red; 206 | */ 207 | } 208 | 209 | .highlight_leftright { 210 | border-radius:3px; 211 | border:1px solid transparent; 212 | box-sizing:border-box; 213 | -webkit-box-sizing:border-box; 214 | margin-left:-1px; 215 | margin-right:-1px; 216 | /* 217 | box-shadow: inset 0px 0px 8px 0px red; 218 | */ 219 | } 220 | 221 | .highlight_inner { 222 | border-top:1px solid; 223 | border-bottom:1px solid; 224 | box-sizing:border-box; 225 | padding:0px; 226 | margin:0px; 227 | /* 228 | box-shadow: inset 0px 8px 8px -8px red, 229 | inset 0px -8px 8px -8px red; 230 | */ 231 | } 232 | 233 | .highlight_red { 234 | /*background-color:rgba(255,0,0,0.3);*/ 235 | background-color:rgba(255,0,0,0.6); 236 | border-color:rgba(255,0,0,0.4); 237 | color:white; 238 | } 239 | 240 | .highlight_strongred { 241 | /* background-color:rgba(255,0,0,0.6) !important; 242 | border-color:rgba(255,0,0,0.4) !important; */ 243 | background-color:rgba(255,0,0,1) !important; 244 | border-color:rgba(255,0,0,1) !important; 245 | color:white !important; 246 | } 247 | 248 | .highlight_grey { 249 | /*background-color:rgba(255,0,0,0.3);*/ 250 | background-color:rgba(200,200,200,0.6); 251 | border-color:rgba(200,200,200,0.4); 252 | color:black; 253 | } 254 | 255 | .highlight_yellow { 256 | background-color:rgba(255,255,0,0.6); 257 | border-color:rgba(255,255,0,0.4); 258 | color:black; 259 | } 260 | 261 | .annotationsSelector { 262 | border-top:1px solid #EEE; 263 | visibility: hidden; 264 | background-color:transparent; 265 | position:absolute; 266 | padding:10px; 267 | top:0px; 268 | right:-170px; 269 | width:150px; 270 | } 271 | 272 | .result:hover .annotationsSelector { 273 | visibility: visible 274 | } 275 | 276 | .help h1 { 277 | font-size:16px; 278 | margin:0px; 279 | } 280 | 281 | .help h3 { 282 | margin:0px; 283 | margin-top:20px; 284 | margin-bottom:5px; 285 | font-weight:bold; 286 | } 287 | 288 | 289 | -------------------------------------------------------------------------------- /view/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
DeepDive 5 | 6 | 7 | 11 | 12 | 13 | 21 | 22 | 23 | 24 | 25 |
26 | 27 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /view/public/js/help/Help.js: -------------------------------------------------------------------------------- 1 | 2 | var Help = React.createClass({displayName: "Help", 3 | 4 | 5 | render: function() { 6 | var show = this.props.isHelp 7 | 8 | var wrapperStyle = {position:'fixed', top: '0px', right:0, minHeight:'100%', overflowX:'hidden', transition:'width .25s', 9 | WebkitTransition:'width .25s', backgroundColor: 'rgb(71, 71, 71)'} 10 | var columnStyle = {position:'absolute', top:'50px', paddingTop:'10px', paddingBottom:'10px', paddingLeft:'10px', paddingRight:'10px', 11 | minHeight:'100%', width:'280px', color:'white', zIndex:3} 12 | 13 | var columnStyleBackground = {} //{position:'fixed', boxSizing:'borderBox', MozBoxSizing:'border-box', WebkitBoxSizing:'border-box', 14 | //top:0, right:0, minHeight:'100%', backgroundColor:'rgba(71,71,71,1)', transition:'width .25s', WebkitTransition:'width .25s', 15 | //zIndex:1} 16 | if (show) { 17 | columnStyleBackground.width = '300px' 18 | wrapperStyle.width = '300px' 19 | } else { 20 | wrapperStyle.width = '0px' 21 | columnStyleBackground.width = '0px'; 22 | } 23 | 24 | return (React.createElement("div", {style: wrapperStyle}, 25 | React.createElement("div", {className: "help", style: columnStyle}, 26 | React.createElement("h1", null, "Query Examples"), 27 | 28 | React.createElement("h3", null, "Words and Phrases"), 29 | React.createElement("code", null, "quick"), " and ", React.createElement("code", null, "\"quick brown\""), 30 | 31 | React.createElement("h3", null, "Field names"), 32 | React.createElement("code", null, "_id:4325235"), React.createElement("br", null), 33 | React.createElement("code", null, "title:(quick OR brown)"), React.createElement("br", null), 34 | React.createElement("code", null, "book.\\*:(quick brown)"), React.createElement("br", null), 35 | React.createElement("code", null, "_missing_:title"), React.createElement("br", null), 36 | React.createElement("code", null, "_exists_:title"), 37 | 38 | React.createElement("h3", null, "Wildcards"), 39 | React.createElement("code", null, "qu?ck bro*"), 40 | 41 | React.createElement("h3", null, "Regular Expressions"), 42 | React.createElement("code", null, "name:/joh?n(ath[oa]n)/"), 43 | 44 | React.createElement("h3", null, "Fuzziness"), 45 | React.createElement("code", null, "quikc~ brwn~ foks~"), React.createElement("br", null), 46 | React.createElement("code", null, "quikc~1"), 47 | 48 | React.createElement("h3", null, "Proximity Searches"), 49 | React.createElement("code", null, "\"fox quick\"~5"), 50 | 51 | React.createElement("h3", null, "Ranges"), 52 | React.createElement("code", null, "date:[2012-01-01 TO 2012-12-31]"), React.createElement("br", null), 53 | React.createElement("code", null, "count:[1 TO 5]"), React.createElement("br", null), 54 | React.createElement("code", null, "tag: ", "{", "alpha TO omega", "}"), React.createElement("br", null), 55 | React.createElement("code", null, "count:[10 TO *]"), React.createElement("br", null), 56 | React.createElement("code", null, "date:", "{", "* TO 2012-01-01", "}"), React.createElement("br", null), 57 | React.createElement("code", null, "count:[1 TO 5", "}"), React.createElement("br", null), 58 | React.createElement("code", null, "age:>=10"), React.createElement("br", null), 59 | React.createElement("code", null, "age:(>=10 AND <20)"), 60 | 61 | React.createElement("h3", null, "Boosting"), 62 | 
React.createElement("code", null, "quick^2 fox"), React.createElement("br", null), 63 | React.createElement("code", null, "\"john smith\"^2"), React.createElement("br", null), 64 | React.createElement("code", null, "(foo bar)^4"), 65 | 66 | React.createElement("h3", null, "Boolean Operators"), 67 | React.createElement("code", null, "quick brown +fox -news"), React.createElement("br", null), 68 | React.createElement("code", null, "((quick AND fox) OR (brown AND fox) OR fox) AND NOT news"), 69 | 70 | React.createElement("h3", null, "Grouping"), 71 | React.createElement("code", null, "(quick OR brown) AND fox"), React.createElement("br", null), 72 | React.createElement("code", null, "status:(active OR pending) title:(full text search)^2"), 73 | 74 | React.createElement("h3", null, "Reserved Characters"), 75 | "Escape with backslash", React.createElement("br", null), 76 | "Example: ", React.createElement("code", null, "\\(1\\+1\\)\\=2"), " , finds (1+1)=2 ", React.createElement("br", null), 77 | "Characters: ", React.createElement("code", null, "+ - = && || > < ! ( ) ", "{", " ", "}", " [ ] ^ \" ~ * ? : \\ /"), 78 | 79 | React.createElement("h3", null, "Empty Query"), 80 | "Shows all results.", 81 | 82 | React.createElement("p", null, 83 | "For more details, see ", React.createElement("a", {href: "https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax", target: "_blank"}, "here"), "." 84 | ) 85 | ), 86 | React.createElement("div", {style: columnStyleBackground}, 87 | React.createElement("div", {style: {position:'absolute', borderLeft:'1px solid white', minHeight:'100%', width:'1px'}}) 88 | ) 89 | )) 90 | } 91 | }) 92 | 93 | module.exports = Help 94 | -------------------------------------------------------------------------------- /view/public/js/vis/AnnotationsSelector.js: -------------------------------------------------------------------------------- 1 | 2 | var AnnotationsSelector = React.createClass({displayName: "AnnotationsSelector", 3 | 4 | render: function() { 5 | var onLayerChange = this.props.onLayerChange 6 | 7 | var buttons = this.props.layers.map(function(result) { 8 | return ( 9 | React.createElement(AnnotationsSelectorButton, {data: result, 10 | onLayerChange: onLayerChange}) 11 | ); 12 | }); 13 | return (React.createElement("div", {className: "annotationsSelector"}, buttons)); 14 | } 15 | }); 16 | 17 | var AnnotationsSelectorButton = React.createClass({displayName: "AnnotationsSelectorButton", 18 | handleClick: function() { 19 | var active = !this.props.data.active; 20 | this.props.onLayerChange(this.props.data.name, active); 21 | }, 22 | render: function() { 23 | var classes = 'facet'; 24 | if (!this.props.data.active) 25 | classes += ' facet-inactive'; 26 | return (React.createElement("div", {style: {fontSize:'10pt'}, className: classes, onClick: this.handleClick}, 27 | React.createElement("div", {style: {display:'inline-block',width:'30px'}}, 28 | React.createElement("i", {className: "fa fa-check"}) 29 | ), this.props.data.name 30 | )) 31 | } 32 | }) 33 | 34 | module.exports = AnnotationsSelector -------------------------------------------------------------------------------- /view/public/js/vis/TextWithAnnotations.js: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | 3 | var SpansVisualization = require('./core/SpansVisualization.js') 4 | var TokenTagsVisualization = require('./core/TokenTagsVisualization.js') 5 | var EdgesVisualization = 
require('./core/EdgesVisualization.js') 6 | 7 | var TokensVisualization = function(element, source) { 8 | return SpansVisualization(element, source.tokenOffsets) 9 | } 10 | 11 | var SentencesVisualization = function(element, source) { 12 | return SpansVisualization(element, source.sentenceOffsets) 13 | } 14 | 15 | var PartOfSpeechVisualization = function(element, source) { 16 | return TokenTagsVisualization(element, source.tokenOffsets, source.poss) 17 | } 18 | 19 | var LemmasVisualization = function(element, source) { 20 | return TokenTagsVisualization(element, source.tokenOffsets, source.lemmas) 21 | } 22 | 23 | var DependenciesVisualization = function(element, source) { 24 | return EdgesVisualization(element, source.tokenOffsets, source.sentenceOffsets, source.sentenceTokenOffsets, source.sentenceDependencies) 25 | } 26 | 27 | var ExtractorsVisualization = function(element, source, annotations) { 28 | var sentenceTokenOffsets = source['sentenceTokenOffsets'] 29 | var tokenOffsets = source['tokenOffsets'] 30 | var extractorOffsets = [] 31 | 32 | $.each(annotations, function(i, a) { 33 | var sentNum = a.range.sentNum 34 | var sentenceBeginToken = sentenceTokenOffsets[sentNum][0] 35 | var tokenFrom = sentenceBeginToken + a.range.f 36 | var tokenTo = sentenceBeginToken + a.range.t 37 | var charFrom = tokenOffsets[tokenFrom][0] 38 | var charTo = tokenOffsets[tokenTo - 1][1] 39 | extractorOffsets.push([charFrom,charTo]) 40 | }) 41 | return SpansVisualization(element, extractorOffsets) 42 | } 43 | 44 | var TextWithAnnotations = React.createClass({displayName: "TextWithAnnotations", 45 | 46 | componentDidMount: function() { 47 | this.vis = {} 48 | this.buildCustomDom() 49 | }, 50 | componentDidUpdate: function() { 51 | this.buildCustomDom() 52 | }, 53 | buildCustomDom: function() { 54 | var div = React.findDOMNode(this) 55 | //cleanup existing visualizations 56 | $.each(this.vis, function(k,v) { v.destroy() }) 57 | 58 | this.vis = {} 59 | 60 | var annotations = this.props.data.annotations 61 | var sourceData = this.props.data._source 62 | var vis = this.vis 63 | 64 | $.each(this.props.layers, function(i, l) { 65 | if (vis && vis[l.name] && !l.active) { 66 | vis[l.name].destroy() 67 | delete vis[l.name] 68 | } 69 | if (vis && !vis[l.name] && l.active) { 70 | if (l.name == 'Tokens') 71 | vis[l.name] = new TokensVisualization(div, sourceData) 72 | if (l.name == 'Sentences') 73 | vis[l.name] = new SentencesVisualization(div, sourceData) 74 | if (l.name == 'Extractors') 75 | vis[l.name] = new ExtractorsVisualization(div, sourceData, annotations) 76 | if (l.name == 'Dependencies') 77 | vis[l.name] = new DependenciesVisualization(div, sourceData) 78 | if (l.name == 'Lemmas') 79 | vis[l.name] = new LemmasVisualization(div, sourceData) 80 | if (l.name == 'PartOfSpeech') 81 | vis[l.name] = new PartOfSpeechVisualization(div, sourceData) 82 | } 83 | }) 84 | }, 85 | isActive: function(name) { 86 | var isActive = false 87 | $.each(this.props.layers, function(i, l) { 88 | if (l.name == name) { isActive = l.active; return false } 89 | }) 90 | return isActive 91 | }, 92 | 93 | render: function() { 94 | content = this.props.data._source.content; 95 | // if we have field with keyword highlighting, take that 96 | if (this.props.data.highlight != null && 97 | this.props.data.highlight.content != null) { 98 | content = this.props.data.highlight.content[0]; 99 | } 100 | var details = [] 101 | if (this.isActive('Details')) { 102 | $.each(this.props.data.annotations, function(i, value) { 103 | 
details.push(React.createElement("div", {className: "extractionBlue"}, JSON.stringify(value), " ")); 104 | }) 105 | $.each(this.props.data._source, function(name, value) { 106 | if (name != 'content' && name != 'id') 107 | details.push (React.createElement("div", {className: "extraction"}, name, " : ", JSON.stringify(value), " ")); 108 | }) 109 | } 110 | 111 | var div = (React.createElement("div", null, React.createElement("span", {dangerouslySetInnerHTML: {__html: content}}), 112 | React.createElement("br", null), React.createElement("div", {style: {'color':'green'}}, this.props.data._id), 113 | details 114 | )) 115 | 116 | return div; 117 | } 118 | }); 119 | 120 | module.exports = TextWithAnnotations 121 | 122 | -------------------------------------------------------------------------------- /view/public/js/vis/core/CharOffsets.js: -------------------------------------------------------------------------------- 1 | var CharOffsets = (function() { 2 | var ELEMENT = 1; 3 | var TEXT = 3; 4 | 5 | var offsetComparator = function(e1, e2) { 6 | return e1.readrOffset - e2.readrOffset; 7 | }; 8 | 9 | var indexOffsets = function(node, offset) { 10 | node.readrOffset = offset; 11 | if (node.nodeType == TEXT) { 12 | node.readrLength = node.nodeValue.length; 13 | } else if (node.nodeType == ELEMENT) { 14 | // ignore if has class ignoreReadrLength 15 | if (goog.dom.classes.has(node, 'ignoreReadrLength')) { 16 | node.readrLength = 0; 17 | } else { 18 | // sum up lengths of children 19 | var l = 0; 20 | for (var i=0, ii = node.childNodes.length; i < ii; i++) { 21 | var child = node.childNodes[i]; 22 | indexOffsets(child, offset + l); 23 | l += child.readrLength; 24 | } 25 | node.readrLength = l; 26 | } 27 | } 28 | }; 29 | 30 | var getTextRangesToHighlightFromIndex = function(node, start, end) { 31 | var results = new Array(); 32 | recur(node, start, end, results); 33 | return results; 34 | }; 35 | 36 | var recur = function(node, start, end, results) { 37 | if (end - start <= 0) return; 38 | 39 | // we assume that start >= node.readrOffset and end <= node.readrOffset + node.readrLength 40 | if (node.nodeType == TEXT) { 41 | results.push([node, start - node.readrOffset, end - node.readrOffset, start, end]); 42 | return; 43 | } 44 | // binary search for start and end 45 | var ns = goog.array.binarySearch(node.childNodes, { readrOffset : start }, offsetComparator); 46 | var ne = goog.array.binarySearch(node.childNodes, { readrOffset : end }, offsetComparator); 47 | 48 | if (ns < 0) { ns = -ns-2; } 49 | if (ne < 0) { ne = -ne-1; } 50 | 51 | for (var i=ns; i < ne; i++) { 52 | var child = node.childNodes[i]; 53 | var s = (i==ns)? start : child.readrOffset; 54 | var e = (i==ne-1)? 
end : child.readrOffset + child.readrLength; 55 | 56 | recur(child, s, e, results); 57 | } 58 | }; 59 | 60 | var createMultiRangeSpans = function(element, tokenOffsets, renderedSpans, documentOffset) { 61 | if (!renderedSpans) 62 | renderedSpans = new Array(); 63 | if (!documentOffset) 64 | documentOffset = 0 65 | indexOffsets(element[0], documentOffset) 66 | for (var j=0, jj = tokenOffsets.length; j < jj; j++) { 67 | // token has offsets t.f, t.t 68 | var rs = createSingleRangeSpans(element, tokenOffsets[j]); 69 | renderedSpans.push(rs); 70 | } 71 | return renderedSpans; 72 | }; 73 | 74 | var FROM = 0 75 | var TO = 1 76 | 77 | // example tokenOffset: { f:12, t:23 } 78 | var createSingleRangeSpans = function(element, tokenOffset) { 79 | //if (!documentOffset) 80 | //documentOffset = 0 81 | var sels = new Array(); 82 | var todo = getTextRangesToHighlightFromIndex 83 | (element[0], tokenOffset[FROM], tokenOffset[TO]); 84 | for (var i=0, ii = todo.length; i < ii; i++) { 85 | var t = todo[i]; 86 | var range = goog.dom.Range.createFromNodes(t[0], t[1], t[0], t[2]); 87 | var el = goog.dom.createDom('span'); //, { 'style':'background-color:green'}); 88 | range.surroundContents(el); 89 | indexOffsets(t[0].parentNode, t[0].parentNode.readrOffset); 90 | sels.push(el); 91 | } 92 | return { sels:sels }; 93 | }; 94 | 95 | //note, the output of this function is a singleton 96 | return { 97 | indexOffsets: indexOffsets, 98 | getTextRangesToHighlightFromIndex: getTextRangesToHighlightFromIndex, 99 | createMultiRangeSpans: createMultiRangeSpans, 100 | createSingleRangeSpans: createSingleRangeSpans 101 | }; 102 | })() 103 | 104 | module.exports = CharOffsets -------------------------------------------------------------------------------- /view/public/js/vis/core/FramesVisualization.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | // 5 | // 6 | //var FramesVisualization = function(element, source) { 7 | // var state ={} 8 | // 9 | // var documentOffset = 0 10 | // 11 | // var msHeadSpans = new Array(); 12 | // CharOffsets.createMultiRangeSpans(textElement, msHeadOffsets, msHeadSpans, documentOffset) 13 | // 14 | // 15 | // 16 | //} -------------------------------------------------------------------------------- /view/public/js/vis/core/SpansVisualization.js: -------------------------------------------------------------------------------- 1 | /* TokensVisualization */ 2 | var CharOffsets = require('./CharOffsets.js') 3 | 4 | var Span = function(sels) { 5 | var state = {} 6 | 7 | var fragment = function(i, length) { 8 | var fragment = ''; 9 | if (i==0 && i < length-1) fragment = 'left'; 10 | else if (i==0 && i==length-1) fragment = 'leftright'; 11 | else if (i==length-1 && i > 0) fragment = 'right'; 12 | else if (i > 0 && i < length-1) fragment = 'inner'; 13 | return fragment; 14 | }; 15 | 16 | // initialize 17 | state.sels = sels 18 | state.color = 'red' 19 | if (!sels) return; 20 | var ii = sels.length; 21 | $.each(sels, function(i, sel) { 22 | $(sel).addClass('highlight_' + state.color); 23 | $(sel).addClass('highlight_' + fragment(i, ii)); 24 | //$(sel).on('click', function() { 25 | // console.log('clicked'); 26 | //}); 27 | }) 28 | 29 | state.destroy = function() { 30 | // unbind all handlers 31 | if (!state.sels) return; 32 | 33 | $.each(state.sels, function(sel) { 34 | //$(sel).unbind('click'); 35 | }); 36 | } 37 | return state 38 | } 39 | 40 | var SpansVisualization = function(element, spans) { 41 | var state = { 42 | renderedSpans: new Array(), 43 | 
destroyed: false 44 | }; 45 | 46 | //var documentOffset = scope.document.offset 47 | var documentOffset = 0 48 | 49 | CharOffsets.createMultiRangeSpans([element,this], spans, state.renderedSpans, documentOffset) 50 | 51 | $.each(state.renderedSpans, function(i, rs) { 52 | var span = new Span(rs.sels) 53 | }); 54 | 55 | state.destroy = function() { 56 | state.destroyed = true; 57 | $.each(state.renderedSpans, function(i, value) { 58 | // do bound listeners automatically get destroyed?? 59 | //value.element.remove(); 60 | //value.scope.$destroy(); 61 | 62 | //$.each(value.aux, function(j,n) { 63 | // goog.dom.removeNode(n); 64 | //}); 65 | $.each(value.sels, function(j,n) { 66 | goog.dom.flattenElement(n); 67 | }); 68 | value.sels = []; 69 | }); 70 | //element.remove(); 71 | //goog.editor.range.normalizeNode(element[0]); 72 | state.renderedSpans.length = 0; 73 | } 74 | return state 75 | } 76 | 77 | module.exports = SpansVisualization -------------------------------------------------------------------------------- /view/public/js/vis/core/TokenTagsVisualization.js: -------------------------------------------------------------------------------- 1 | /* TokenTagsVisualization */ 2 | 3 | var CharOffsets = require('./CharOffsets.js') 4 | 5 | var TokenTagsVisualization = function(element, tokenOffsets, tags) { 6 | var state = { 7 | renderedSpans: new Array(), 8 | destroyed: false 9 | }; 10 | 11 | //var documentOffset = scope.document.offset 12 | var documentOffset = 0 13 | 14 | // insert spans 15 | CharOffsets.createMultiRangeSpans([element,this], tokenOffsets, state.renderedSpans, documentOffset) 16 | 17 | $.each(state.renderedSpans, function(i, rs) { 18 | var firstSpan = rs.sels[0] 19 | var el = goog.dom.createDom('div', { 'style' : 20 | 'position:absolute;' + 21 | 'top:-15px;' + 22 | 'left:0px;right:0px;' + 23 | 'z-index:0;' + 24 | 'width:100px;' + //' + tokenWidth + 'px;' + 25 | 'height:20px;' + 26 | 'color:red;' + 27 | 'font-size:10px;' + 28 | 'font-family:helvetica,arial;' + 29 | 'font-stretch:semi-condensed;' + 30 | 'font-weight:500;'/* + 31 | 'background-color:white'*/ 32 | }) 33 | el.appendChild(goog.dom.createTextNode(tags[i])) 34 | // if you want all lines to be equal height, set marginTop as follows 35 | //var marginTop = (drawing.highestLevels[i]+1) * 15; 36 | // if you want to use inline rather than inline-block spans, use following line 37 | //$(firstSpan).attr('style', 'display:inline;line-height:' + (marginTop + 20) + 38 | // 'px;margin-top:' + marginTop + 'px;position:relative'); 39 | var marginTop = 10 40 | $(firstSpan).attr('style', 'display:inline-block;margin-top:' + marginTop + 'px;position:relative') 41 | firstSpan.appendChild(el) 42 | rs.aux = new Array() 43 | rs.aux.push(el) 44 | }) 45 | 46 | state.destroy = function() { 47 | state.destroyed = true; 48 | $.each(state.renderedSpans, function(i, value) { 49 | $.each(value.aux, function(j, n) { 50 | goog.dom.removeNode(n); 51 | }) 52 | $.each(value.sels, function(j, n) { 53 | goog.dom.flattenElement(n); 54 | }) 55 | value.sels = []; 56 | }); 57 | //element.remove(); 58 | //goog.editor.range.normalizeNode(element[0]); 59 | state.renderedSpans.length = 0; 60 | } 61 | return state 62 | } 63 | 64 | module.exports = TokenTagsVisualization 65 | -------------------------------------------------------------------------------- /view/public/js/vis/visframe.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | var FramesVisualization = function(element, source) { 7 | var state ={} 
8 | 9 | var documentOffset = 0 10 | 11 | var msHeadSpans = new Array(); 12 | CharOffsets.createMultiRangeSpans(textElement, msHeadOffsets, msHeadSpans, documentOffset) 13 | 14 | 15 | 16 | } -------------------------------------------------------------------------------- /view/public/js/vis/visspan.js: -------------------------------------------------------------------------------- 1 | /* TokensVisualization */ 2 | 3 | var Span = function(sels) { 4 | var state = {} 5 | 6 | var fragment = function(i, length) { 7 | var fragment = ''; 8 | if (i==0 && i < length-1) fragment = 'left'; 9 | else if (i==0 && i==length-1) fragment = 'leftright'; 10 | else if (i==length-1 && i > 0) fragment = 'right'; 11 | else if (i > 0 && i < length-1) fragment = 'inner'; 12 | return fragment; 13 | }; 14 | 15 | // initialize 16 | state.sels = sels 17 | state.color = 'red' 18 | if (!sels) return; 19 | var ii = sels.length; 20 | $.each(sels, function(i, sel) { 21 | $(sel).addClass('highlight_' + state.color); 22 | $(sel).addClass('highlight_' + fragment(i, ii)); 23 | //$(sel).on('click', function() { 24 | // console.log('clicked'); 25 | //}); 26 | }) 27 | 28 | state.destroy = function() { 29 | // unbind all handlers 30 | if (!state.sels) return; 31 | 32 | $.each(state.sels, function(sel) { 33 | //$(sel).unbind('click'); 34 | }); 35 | } 36 | return state 37 | } 38 | 39 | 40 | var SpansVisualization = function(element, spans) { 41 | var state = { 42 | renderedSpans: new Array(), 43 | destroyed: false 44 | }; 45 | 46 | //var documentOffset = scope.document.offset 47 | var documentOffset = 0 48 | 49 | CharOffsets.createMultiRangeSpans([element,this], spans, state.renderedSpans, documentOffset) 50 | 51 | $.each(state.renderedSpans, function(i, rs) { 52 | var span = new Span(rs.sels) 53 | }); 54 | 55 | state.destroy = function() { 56 | state.destroyed = true; 57 | $.each(state.renderedSpans, function(i, value) { 58 | // do bound listeners automatically get destroyed?? 
59 | //value.element.remove(); 60 | //value.scope.$destroy(); 61 | 62 | //$.each(value.aux, function(j,n) { 63 | // goog.dom.removeNode(n); 64 | //}); 65 | $.each(value.sels, function(j,n) { 66 | goog.dom.flattenElement(n); 67 | }); 68 | value.sels = []; 69 | }); 70 | //element.remove(); 71 | //goog.editor.range.normalizeNode(element[0]); 72 | state.renderedSpans.length = 0; 73 | } 74 | return state 75 | } 76 | 77 | 78 | 79 | 80 | var TokensVisualization = function(element, source) { 81 | return SpansVisualization(element, source.tokenOffsets) 82 | } 83 | 84 | var SentencesVisualization = function(element, source) { 85 | return SpansVisualization(element, source.sentenceOffsets) 86 | } 87 | -------------------------------------------------------------------------------- /view/public/js/vis/vistokentag.js: -------------------------------------------------------------------------------- 1 | /* TokenTagsVisualization */ 2 | 3 | var TokenTagsVisualization = function(element, tokenOffsets, tags) { 4 | var state = { 5 | renderedSpans: new Array(), 6 | destroyed: false 7 | }; 8 | 9 | //var documentOffset = scope.document.offset 10 | var documentOffset = 0 11 | 12 | // insert spans 13 | CharOffsets.createMultiRangeSpans([element,this], tokenOffsets, state.renderedSpans, documentOffset) 14 | 15 | $.each(state.renderedSpans, function(i, rs) { 16 | var firstSpan = rs.sels[0] 17 | var el = goog.dom.createDom('div', { 'style' : 18 | 'position:absolute;' + 19 | 'top:-15px;' + 20 | 'left:0px;right:0px;' + 21 | 'z-index:0;' + 22 | 'width:100px;' + //' + tokenWidth + 'px;' + 23 | 'height:20px;' + 24 | 'color:red;' + 25 | 'font-size:10px;' + 26 | 'font-family:helvetica,arial;' + 27 | 'font-stretch:semi-condensed;' + 28 | 'font-weight:500;'/* + 29 | 'background-color:white'*/ 30 | }) 31 | el.appendChild(goog.dom.createTextNode(tags[i])) 32 | // if you want all lines to be equal height, set marginTop as follows 33 | //var marginTop = (drawing.highestLevels[i]+1) * 15; 34 | // if you want to use inline rather than inline-block spans, use following line 35 | //$(firstSpan).attr('style', 'display:inline;line-height:' + (marginTop + 20) + 36 | // 'px;margin-top:' + marginTop + 'px;position:relative'); 37 | var marginTop = 10 38 | $(firstSpan).attr('style', 'display:inline-block;margin-top:' + marginTop + 'px;position:relative') 39 | firstSpan.appendChild(el) 40 | rs.aux = new Array() 41 | rs.aux.push(el) 42 | }) 43 | 44 | state.destroy = function() { 45 | state.destroyed = true; 46 | $.each(state.renderedSpans, function(i, value) { 47 | $.each(value.aux, function(j, n) { 48 | goog.dom.removeNode(n); 49 | }) 50 | $.each(value.sels, function(j, n) { 51 | goog.dom.flattenElement(n); 52 | }) 53 | value.sels = []; 54 | }); 55 | //element.remove(); 56 | //goog.editor.range.normalizeNode(element[0]); 57 | state.renderedSpans.length = 0; 58 | } 59 | return state 60 | } 61 | 62 | 63 | var PartOfSpeechVisualization = function(element, source) { 64 | return TokenTagsVisualization(element, source.tokenOffsets, source.poss) 65 | } 66 | 67 | var LemmasVisualization = function(element, source) { 68 | return TokenTagsVisualization(element, source.tokenOffsets, source.lemmas) 69 | } 70 | -------------------------------------------------------------------------------- /view/public/js/visframe.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | var FramesVisualization = function(element, source) { 7 | var state ={} 8 | 9 | var documentOffset = 0 10 | 11 | var msHeadSpans = 
new Array(); 12 | CharOffsets.createMultiRangeSpans(textElement, msHeadOffsets, msHeadSpans, documentOffset) 13 | 14 | 15 | 16 | } -------------------------------------------------------------------------------- /view/routes/index.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var router = express.Router(); 3 | var path = require('path'); 4 | 5 | var elasticsearch = require('elasticsearch'); 6 | var client = new elasticsearch.Client({ 7 | host: 'http://localhost:9200' 8 | }); 9 | 10 | router.get('/', function(req, res, next) { 11 | //res.render('index', { title: 'Express3' }); 12 | res.sendFile(path.join(__dirname + '/../public/index.html')); 13 | }); 14 | 15 | router.get('/search*', function(req, res, ext) { 16 | //res.render('index', { title: 'Express3' }); 17 | res.sendFile(path.join(__dirname + '/../public/index.html')); 18 | }); 19 | 20 | router.get('/annotators', function(req, res, next) { 21 | var index = req.query.index || 'view' 22 | client.search({ 23 | index: index, //process.env.INDEX_NAME, 24 | type: 'annotators', 25 | body: { 26 | query: { 27 | 'match_all': {} 28 | } 29 | } 30 | }).then(function(body) { 31 | var hits = body.hits.hits; 32 | res.send(hits); 33 | }, function (err) { 34 | console.trace(err.message); 35 | next(err); 36 | }); 37 | }); 38 | 39 | router.get('/annotations', function(req, res, next) { 40 | var doc_ids = [] 41 | var doc_ids_str = req.param('doc_ids') 42 | if (doc_ids_str) doc_ids = doc_ids_str.split(',') 43 | var index = req.query.index || 'view' 44 | 45 | var obj = { 46 | index: index, //process.env.INDEX_NAME, 47 | type: 'annotations', 48 | from: 0, 49 | size: 100000, 50 | body: { 51 | "query" : { 52 | "has_parent": { 53 | "type": "docs", 54 | "query": { 55 | "ids" : { 56 | "values" : doc_ids 57 | } 58 | } 59 | } 60 | } 61 | } 62 | } 63 | 64 | client.search( 65 | obj 66 | ).then(function (body) { 67 | var hits = body.hits.hits; 68 | res.send(hits) 69 | }, function (err) { 70 | console.trace(err.message); 71 | next(err) 72 | }); 73 | }); 74 | 75 | 76 | 77 | router.get('/docs', function(req, res, next) { 78 | var from = req.param('from', 0) 79 | var limit = req.param('limit', 100) 80 | var keywords = req.query.keywords || '' 81 | var facets = req.query.facets || '' 82 | var index = req.query.index || 'view' 83 | 84 | var obj = { 85 | index: index, //process.env.INDEX_NAME, 86 | type: 'docs', 87 | from: from, 88 | size: limit, 89 | body: { 90 | query: { 91 | "match_all" : {} 92 | }, 93 | highlight : { 94 | fields : { 95 | content : { "number_of_fragments" : 0 } 96 | } 97 | } 98 | } 99 | } 100 | 101 | if (keywords.length > 0) { 102 | obj.body.query = { 103 | query_string: { 104 | "default_field" : "content", 105 | "fields" : ["content", "_id", "id"], 106 | "query" : keywords 107 | } 108 | } 109 | } 110 | 111 | if (facets.length > 0) { 112 | var l = facets.split(',') 113 | 114 | var filters = [] 115 | for (var i=0; i < l.length; i++) 116 | filters.push({ 117 | //"exists" : { "field" : l[i] } 118 | "has_child" : { 119 | "type" : "annotations", 120 | "query" : { 121 | "term" : { 122 | "attribute" : l[i] 123 | } 124 | } 125 | } 126 | }); 127 | 128 | if (filters.length > 1) 129 | obj.body.filter = { 130 | "and" : filters 131 | } 132 | else 133 | obj.body.filter = filters[0] 134 | } 135 | 136 | client.search(obj).then(function (body) { 137 | var docs_context = body.hits 138 | var docs = body.hits.hits; 139 | 140 | // we now have the documents, run another query to get all 
annotations on 141 | // these documents 142 | var doc_ids = new Array(docs.length) 143 | for (var i=0, ii = docs.length; i < ii; i++) 144 | doc_ids[i] = docs[i]._id 145 | 146 | var obj = { 147 | index: index, //process.env.INDEX_NAME, 148 | type: 'annotations', 149 | from:0, 150 | size:100000, 151 | body: { 152 | "query" : { 153 | "has_parent": { 154 | "type": "docs", 155 | "query": { 156 | "ids" : { 157 | "values" : doc_ids 158 | } 159 | } 160 | } 161 | } 162 | } 163 | } 164 | client.search(obj).then(function(body) { 165 | var hits = body.hits.hits 166 | // build a little index of the annotations 167 | var id2ann = {} 168 | for (var i = 0, ii = hits.length; i < ii; i++) { 169 | var id = hits[i]._source.range.doc_id 170 | if (id in id2ann) 171 | id2ann[id].push(hits[i]._source) 172 | else 173 | id2ann[id] = [hits[i]._source] 174 | } 175 | // add to docs 176 | for (var i=0, ii = docs.length; i < ii; i++) { 177 | docs[i].annotations = id2ann[docs[i]._id] || [] 178 | } 179 | res.send(docs_context) 180 | }, function(err) { 181 | console.trace(err.message); 182 | next(err) 183 | }); 184 | 185 | //res.send(hits) 186 | 187 | }, function (err) { 188 | console.trace(err.message); 189 | next(err) 190 | }); 191 | }); 192 | 193 | 194 | module.exports = router; 195 | -------------------------------------------------------------------------------- /view/routes/users.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var router = express.Router(); 3 | 4 | /* GET users listing. */ 5 | router.get('/', function(req, res, next) { 6 | res.send('respond with a resource'); 7 | }); 8 | 9 | module.exports = router; 10 | -------------------------------------------------------------------------------- /view/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ $(uname) = Darwin ]] && 4 | osascript -e 'get path to application "iTerm"' &>/dev/null; then 5 | # On a Mac with iTerm.app, do something nice 6 | start() { ( source ./util/tab && tab "$@" ); } 7 | else 8 | # otherwise, just run the process 9 | start() { local title=$1; shift; "$@" & } 10 | trap wait EXIT 11 | fi 12 | 13 | # launch elasticsearch 14 | 15 | start "ElasticSearch" elasticsearch 16 | 17 | # launch nodejs 18 | start "Nodejs" npm start 19 | 20 | # launch react jsx watch 21 | start "Reactjs" jsx --watch view/ public/js 22 | 23 | -------------------------------------------------------------------------------- /view/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # install virtualenv 4 | command -v virtualenv >/dev/null 2>&1 || { 5 | echo >&2 "virtualenv required but not installed. Aborting."; 6 | echo >&2 "You can install virtualenv with:" 7 | echo >&2 " sudo pip install virtualenv" 8 | } 9 | 10 | virtualenv env 11 | source env/bin/activate 12 | 13 | # install python dependencies 14 | pip install elasticsearch 15 | pip install pyhocon 16 | pip install psycopg2 17 | 18 | # install node packages 19 | npm install 20 | 21 | # elasticsearch 22 | cd util 23 | ES_VER=elasticsearch-1.6.0 24 | if [ ! -f ${ES_VER}.tar.gz ]; then 25 | curl -L -O https://download.elastic.co/elasticsearch/elasticsearch/${ES_VER}.tar.gz 26 | tar xvzf ${ES_VER}.tar.gz 27 | fi 28 | # must add 29 | echo "script.disable_dynamic: false" >> ${ES_VER}/config/elasticsearch.yml 30 | cd .. 
31 | 32 | # for development, we would like to enable auto-reload 33 | npm install react-tools nodemon 34 | 35 | cd public 36 | git clone https://github.com/google/closure-library 37 | 38 | 39 | -------------------------------------------------------------------------------- /view/util/cat.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | curl localhost:9200/_cat/indices/?v 4 | -------------------------------------------------------------------------------- /view/util/create_index.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # exists 4 | HEAD=$(curl -s -XHEAD -i 'http://localhost:9200/view') 5 | [ "${HEAD:0:15}" == "HTTP/1.1 200 OK" ] && EXISTS=1 6 | if [ $EXISTS ]; then 7 | curl -XDELETE 'http://localhost:9200/view/' 8 | fi 9 | 10 | INDEX_NAME=view 11 | TYPE_DOCS_NAME=docs 12 | 13 | curl -XPOST localhost:9200/$INDEX_NAME -d '{ 14 | "settings" : { 15 | "index" : { 16 | "number_of_shards" : 1 17 | }, 18 | "analysis" : { 19 | "analyzer" : { 20 | "fulltext_analyzer" : { 21 | "type" : "custom", 22 | "tokenizer" : "whitespace", 23 | "filter" : [ 24 | "lowercase" 25 | ] 26 | } 27 | } 28 | } 29 | }, 30 | "mappings" : { 31 | "annotations" : { 32 | "_source" : { "enabled" : true }, 33 | "_parent" : { 34 | "type" : "docs" 35 | }, 36 | "properties" : {} 37 | }, 38 | "docs" : { 39 | "_source" : { "enabled" : true }, 40 | "properties" : { 41 | "id" : { 42 | "type" : "string" 43 | }, 44 | "content" : { 45 | "type" : "string", 46 | "term_vector" : "with_positions_offsets", 47 | "store" : false, 48 | "index_analyzer" : "fulltext_analyzer", 49 | "norms" : { 50 | "enabled" : false 51 | } 52 | }, 53 | "text" : { 54 | "type" : "string", 55 | "term_vector" : "with_positions_offsets", 56 | "index_analyzer" : "fulltext_analyzer" 57 | }, 58 | "extr1" : { 59 | "type" : "string", 60 | "index" : "not_analyzed" 61 | }, 62 | "extr1_meta" : { 63 | "type" : "string", 64 | "index" : "not_analyzed" 65 | } 66 | } 67 | } 68 | } 69 | }' 70 | -------------------------------------------------------------------------------- /view/util/fetch-annotations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pyhocon import ConfigFactory 4 | import json 5 | import psycopg2 6 | import psycopg2.extras 7 | import sys 8 | 9 | conf = ConfigFactory.parse_file('../view.conf') 10 | 11 | conf_annotations = conf.get_list('view.annotations') 12 | 13 | def write_annotations(): 14 | # write extractions to json file 15 | dbconf = conf.get('view.db.default') 16 | conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % ( 17 | dbconf.get('host'), 18 | dbconf.get('dbname'), 19 | dbconf.get('user'), 20 | dbconf.get('password')) 21 | conn = psycopg2.connect(conn_string) 22 | for ann in conf_annotations: 23 | with open('../' + ann.get('input'), 'w') as w: 24 | cursor = conn.cursor('ann_cursor', cursor_factory=psycopg2.extras.DictCursor) 25 | cursor.execute(ann.get('sql.query')) 26 | for row in cursor: 27 | #print(row) 28 | # TODO: must write into the following format 29 | # each row: 30 | # {"range":{"type":"sentenceTokenSpan","doc_id":"doc123","sentNum":0,"f":3,"t":4},"target":{"entity":"something"}} 31 | # save in file using w.write 32 | obj = {"id":row[0], "range":{"type":"sentenceTokenSpan","doc_id":row[1],"sentNum":0,"f":row[2],"t":int(row[3])},"target":{"entity":row[4]}} 33 | w.write(json.dumps(obj)) 34 | w.write('\n') 35 | 36 | 
write_annotations() 37 | -------------------------------------------------------------------------------- /view/util/fetch-sentences-table.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # Legacy support for sentences table in DeepDive. 4 | # The script reads the table from the database and stores it in the new column format. 5 | 6 | from pyhocon import ConfigFactory 7 | import json 8 | import psycopg2 9 | import psycopg2.extras 10 | import sys 11 | import pipe 12 | 13 | conf = ConfigFactory.parse_file('../view.conf') 14 | 15 | docs = conf.get('view.docs') 16 | 17 | def find_token_offsets(s): 18 | # split on whitespace 19 | pos = [ -1 ] + [ i for i, ltr in enumerate(s) if ltr == ' ' ] + [ len(s) ] 20 | offsets = [ [ pos[i] + 1, pos[i + 1] ] for i in range(0, len(pos) - 1) ] 21 | return offsets 22 | 23 | def write_docs(): 24 | # write extractions to json file 25 | dbconf = conf.get('view.db.default') 26 | conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % ( 27 | dbconf.get('host'), 28 | dbconf.get('dbname'), 29 | dbconf.get('user'), 30 | dbconf.get('password')) 31 | conn = psycopg2.connect(conn_string) 32 | cursor = conn.cursor('ann_cursor', cursor_factory=psycopg2.extras.DictCursor) 33 | cursor.execute(docs.get('sql.query')) 34 | 35 | with pipe.col_open_w('../data/sentences', [ 'id', 'text', 'tokenOffsets', 'sentenceTokenOffsets', 'sentenceOffsets', 'lemmas', 'poss' ]) as w: 36 | sent_num = 0 37 | prev_document_id = None 38 | for row in cursor: 39 | # id 40 | #document_id = str(row[0]) 41 | #if document_id != prev_document_id: 42 | # sent_num = 0 43 | #id = document_id + '@' + str(sent_num) 44 | id = row[0] 45 | 46 | text = row[1] 47 | token_offsets = find_token_offsets(text) 48 | sentence_token_offsets = [[0,len(token_offsets)]] 49 | sentence_offsets = [[0, len(text)]] 50 | lemmas = row[2] 51 | pos_tags = row[3] 52 | 53 | w.write([id, text, token_offsets, sentence_token_offsets, sentence_offsets, lemmas, pos_tags]) 54 | 55 | #prev_document_id = document_id 56 | sent_num = sent_num + 1 57 | 58 | write_docs() 59 | 60 | -------------------------------------------------------------------------------- /view/util/generate_sentence_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Author: Zifei Shan (zifeishan@gmail.com) 4 | 5 | ''' This file construct a sentence table from ann.* files generated from Pipe project. 6 | 7 | Example usage: 8 | 9 | python generate_sentence_table.py DIRECTORY/OF/ANN/ > output_sentences.tsv 10 | 11 | The generated sentence table follow the format below: 12 | 13 | CREATE TABLE sentences ( 14 | doc_id text, 15 | sent_id integer, 16 | wordidxs integer[], 17 | words text[], 18 | poses text[], 19 | ners text[], 20 | lemmas text[], 21 | dep_tuples text[], -- Triplet format. e.g.: "1 dep 0" 22 | sentence_id text 23 | ); 24 | 25 | ''' 26 | 27 | import sys, json 28 | 29 | # This file can accept an argument: the folder that contains ann.* 30 | # If not specified, use the current directory. 31 | if len(sys.argv) == 1: 32 | basedir = '' 33 | else: 34 | basedir = sys.argv[1] + '/' 35 | 36 | # Helper functions 37 | 38 | def list2TSVarray(a_list, quote=True): 39 | '''Convert a list to a string that can be used in a TSV column and intepreted as 40 | an array by the PostreSQL COPY FROM command. 
41 | If 'quote' is True, then double quote the string representation of the 42 | elements of the list, and escape double quotes and backslashes. 43 | ''' 44 | if a_list is None: 45 | return '\\N' 46 | 47 | if quote: 48 | for index in range(len(a_list)): 49 | if "\\" in unicode(a_list[index]): 50 | # Replace '\' with '\\\\"' to be accepted by COPY FROM 51 | a_list[index] = unicode(a_list[index]).replace("\\", "\\\\\\\\") 52 | # This must happen the previous substitution 53 | if "\"" in unicode(a_list[index]): 54 | # Replace '"' with '\\"' to be accepted by COPY FROM 55 | a_list[index] = unicode(a_list[index]).replace("\"", "\\\\\"") 56 | string = ",".join(list(map(lambda x: "\"" + unicode(x) + "\"", a_list))) 57 | else: 58 | string = ",".join(list(map(lambda x: unicode(x), a_list))) 59 | return "{" + string + "}" 60 | 61 | def open_file(fname): 62 | ''' 63 | Opens a file, if not found, return None. 64 | ''' 65 | try: 66 | return open(fname) 67 | except: 68 | return None 69 | 70 | def read_js_line(fp): 71 | ''' 72 | Return None if file is not open. Otherwise read a line from file. 73 | If '' returned, EOF is found. 74 | ''' 75 | if fp == None: 76 | return None 77 | s = fp.readline() 78 | if s == '': 79 | return '' 80 | else: 81 | return json.loads(s) 82 | 83 | def escape_none(s): 84 | ''' 85 | Just escaping a None into psql-friendly format 86 | ''' 87 | if s is None: 88 | return '\\N' 89 | return unicode(s).encode('utf-8') 90 | 91 | def findTokenOffset(token_offsets, sent_offset): 92 | ''' 93 | Construct sent_token_offsets 94 | ''' 95 | start = min(i for i in range(len(token_offsets)) if token_offsets[i][0] == sent_offset[0]) 96 | end = max(i for i in range(len(token_offsets)) if token_offsets[i][1] == sent_offset[1]) + 1 97 | return start, end 98 | 99 | # ----------- Main function ------------- 100 | 101 | # Assume fixed filenames 102 | fdoc_id = open_file(basedir + 'ann.id') 103 | flemma = open_file(basedir + 'ann.lemmas') 104 | fpos = open_file(basedir + 'ann.poss') 105 | fner = open_file(basedir + 'ann.nerTags') 106 | fsent_offset = open_file(basedir + 'ann.sentenceOffsets') 107 | fsent_token_offset = open_file(basedir + 'ann.sentenceTokenOffsets') 108 | ftext = open_file(basedir + 'ann.text') 109 | ftoken_offset = open_file(basedir + 'ann.tokenOffsets') 110 | fsent_deps = open_file(basedir + 'ann.sentenceDependencies') 111 | 112 | while True: 113 | doc_id = read_js_line(fdoc_id) 114 | lemmas = read_js_line(flemma) 115 | poss = read_js_line(fpos) 116 | ners = read_js_line(fner) 117 | sent_offsets = read_js_line(fsent_offset) 118 | # sent_token_offsets = read_js_line(fsent_token_offset) 119 | text = read_js_line(ftext) 120 | token_offsets = read_js_line(ftoken_offset) 121 | sent_deps = read_js_line(fsent_deps) 122 | 123 | if any(x == '' for x in [doc_id, lemmas, poss, sent_offsets, \ 124 | text, token_offsets]): 125 | break 126 | 127 | sent_token_offsets = [ findTokenOffset(token_offsets, x) for x in sent_offsets] 128 | 129 | # loop through each sentence 130 | sent_words = [text[o[0] : o[1]] for o in token_offsets] 131 | # print 'WORDS:', sent_words 132 | 133 | 134 | for sent_id in range(len(sent_token_offsets)): 135 | sent_from, sent_to = sent_token_offsets[sent_id] 136 | sentence_id = unicode(doc_id) + '_' + unicode(sent_id) 137 | if sent_deps is not None: 138 | # e.g.: [[{"name":"det","from":1,"to":0}],[{"name":"advmod","from":1,"to":0},{"name":"advmod","from":1,"to":2}]] 139 | this_sent_deps = ['%d %s %d' % (d['from'], d['name'], d['to']) for d in sent_deps[sent_id]] 140 | print 
'\t'.join([escape_none(x) for x in [ \ 141 | doc_id, \ 142 | sent_id, \ 143 | list2TSVarray([x for x in range(sent_to - sent_from)]), \ 144 | list2TSVarray( sent_words[ sent_from : sent_to] ) if sent_words is not None else None, \ 145 | list2TSVarray( poss[ sent_from : sent_to]) if poss is not None else None, \ 146 | list2TSVarray( ners[ sent_from : sent_to]) if ners is not None else None, \ 147 | list2TSVarray( lemmas[ sent_from : sent_to]) if lemmas is not None else None, \ 148 | list2TSVarray( this_sent_deps ) if sent_deps is not None else None, \ 149 | sentence_id \ 150 | ]]) 151 | -------------------------------------------------------------------------------- /view/util/get.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #curl -XGET 'http://localhost:9200/dd/docs/10.1371.journal.pone.0042439.Body__50' 4 | #curl -XGET 'http://localhost:9200/view/docs/doc123' 5 | curl -XGET 'http://localhost:9200/view/docs/132553@2' 6 | -------------------------------------------------------------------------------- /view/util/index_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from elasticsearch import Elasticsearch 4 | import json 5 | 6 | INPUT = "../data/sentences.json" 7 | ES_HOST = {"host" : "localhost", "port" : 9200} 8 | INDEX_NAME = 'dd' 9 | TYPE_NAME = 'docs' 10 | N = 1000 11 | 12 | es = Elasticsearch(hosts = [ES_HOST]) 13 | 14 | es.delete_by_query(index = INDEX_NAME, body = { 15 | "query": { 16 | "match_all": {} 17 | } 18 | }) 19 | 20 | with open(INPUT, 'r') as f: 21 | bulk_data = [] 22 | 23 | for line in f: 24 | src = json.loads(line) 25 | id = src['doc_id'] + '__' + src['sent_id'] 26 | content = ' '.join(src['words']) 27 | op_dict = { 28 | "index": { 29 | "_index": INDEX_NAME, 30 | "_type": TYPE_NAME, 31 | "_id": id 32 | } 33 | } 34 | data_dict = { 35 | "id": id, 36 | "content": content 37 | } 38 | bulk_data.append(op_dict) 39 | bulk_data.append(data_dict) 40 | if len(bulk_data) > N: 41 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 42 | bulk_data = [] 43 | 44 | if len(bulk_data) > 0: 45 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 46 | 47 | es.indices.refresh(index = INDEX_NAME) 48 | 49 | #if es.indices.exists(INDEX_NAME): 50 | # res = es.indices.delete(index = INDEX_NAME) 51 | # 52 | #request_body = { 53 | # "settings" : { 54 | # "number_of_shards": 1, 55 | # "number_of_replicas": 0 56 | # } 57 | #} 58 | # 59 | #print("creating '%s' index..." 
% (INDEX_NAME)) 60 | #res = es.indices.create(index = INDEX_NAME, body = request_body, ignore=400) 61 | 62 | #print("bulk indexing...") 63 | #res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = True) 64 | 65 | # sanity check 66 | #res = es.search(index = INDEX_NAME, size=2, body={"query": {"match_all": {}}}) 67 | #print(" response: '%s'" % (res)) 68 | 69 | #print("results:") 70 | #for hit in res['hits']['hits']: 71 | # print(hit["_source"]) 72 | 73 | -------------------------------------------------------------------------------- /view/util/index_extr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from elasticsearch import Elasticsearch 4 | import json 5 | 6 | EXTRACTOR='genepheno' 7 | INPUT='../data/genepheno_rel.json' 8 | ES_HOST = {"host" : "localhost", "port" : 9200} 9 | INDEX_NAME = 'dd' 10 | TYPE_NAME = 'docs' 11 | N = 1000 12 | 13 | es = Elasticsearch(hosts = [ES_HOST]) 14 | 15 | with open(INPUT, 'r') as f: 16 | bulk_data = [] 17 | 18 | for line in f: 19 | src = json.loads(line) 20 | id = src['doc_id'] + '__' + str(src['sent_id']) 21 | op_dict = { 22 | "update": { 23 | "_index": INDEX_NAME, 24 | "_type": TYPE_NAME, 25 | "_id": str(id) 26 | } 27 | } 28 | extr = ','.join(map(str, src['gene_wordidxs'])) + '-' + ','.join(map(str, src['pheno_wordidxs'])) 29 | script_dict = { 30 | "script" : "if (ctx._source.containsKey(\"" + EXTRACTOR + "\")) {ctx._source[\"" + EXTRACTOR + "\"] += ex;} else {ctx._source[\"" + EXTRACTOR + "\"] = [ex]}", 31 | "params" : { 32 | "ex" : extr 33 | } 34 | } 35 | bulk_data.append(op_dict) 36 | bulk_data.append(script_dict) 37 | if len(bulk_data) > N: 38 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 39 | bulk_data = [] 40 | 41 | if len(bulk_data) > 0: 42 | print('doing update') 43 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 44 | 45 | es.indices.refresh(index = INDEX_NAME) 46 | 47 | -------------------------------------------------------------------------------- /view/util/index_extrlist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from elasticsearch import Elasticsearch 4 | import json 5 | 6 | ES_HOST = {"host" : "localhost", "port" : 9200} 7 | INDEX_NAME = 'dd' 8 | TYPE_NAME = 'extractors' 9 | 10 | es = Elasticsearch(hosts = [ES_HOST]) 11 | 12 | es.delete_by_query(index = INDEX_NAME, doc_type = TYPE_NAME, body = { 13 | "query": { 14 | "match_all": {} 15 | } 16 | }) 17 | 18 | 19 | es.index(index = INDEX_NAME, doc_type = TYPE_NAME, body = { 20 | "name" : "genepheno" 21 | }, refresh = False) 22 | 23 | es.indices.refresh(index = INDEX_NAME) 24 | 25 | -------------------------------------------------------------------------------- /view/util/pipe.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | from os import listdir 4 | from os.path import isfile, join 5 | import json 6 | 7 | # column format reader 8 | 9 | def col_open(dir): 10 | return ColumnReaderAsSingleObj(dir) 11 | 12 | def col_open_arr(dir): 13 | return ColumnReader(dir) 14 | 15 | class ColumnReader(object): 16 | '''Reads Pipe's column format''' 17 | 18 | def __init__(self, dir): 19 | files = [ f for f in listdir(dir) if isfile(join(dir, f)) and not f == '.errors' ] 20 | self.types = [ f[f.rfind('.') + 1:] for f in files ] 21 | self.u_types = [ unicode(s, 'utf-8') for s in self.types ] 22 | self.handles = [ open(join(dir, f)) for f in files ] 23 | 24 | def __iter__(self): 25 | return self 26 | 27 | def next(self): 28 | row = [ h.readline() for h in self.handles ] 29 | for c in row: 30 | if c == '': 31 | self.close() 32 | raise StopIteration 33 | return [ json.loads(c.rstrip()) for c in row ] 34 | 35 | def close(self): 36 | for h in self.handles: 37 | if not h.closed: 38 | h.close() 39 | 40 | class ColumnReaderAsSingleObj(ColumnReader): 41 | 42 | def next(self): 43 | row = super(self.__class__, self).next() 44 | obj = {} 45 | for i in range(0, len(row)): 46 | obj[self.u_types[i]] = row[i] 47 | return obj 48 | 49 | # column format writer 50 | 51 | def col_open_w(dir, types): 52 | return ColumnWriter(dir, types) 53 | 54 | class ColumnWriter(object): 55 | '''Writes Pipe's column format''' 56 | 57 | def __init__(self, dir, types): 58 | self.types = types 59 | files = [ 'ann.' + t for t in types ] 60 | self.handles = [ open(join(dir, 'ann.' + t), 'w') for t in types ] 61 | 62 | def __enter__(self): 63 | return self 64 | 65 | def __exit__(self, type, value, traceback): 66 | self.close() 67 | 68 | def write(self, arr): 69 | for i, a in enumerate(arr): 70 | self.handles[i].write(json.dumps(a) + '\n') 71 | 72 | def close(self): 73 | for h in self.handles: 74 | if not h.closed: 75 | h.close() 76 | -------------------------------------------------------------------------------- /view/util/refresh-annotations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ES_HOST = {"host" : "localhost", "port" : 9200} 4 | INDEX_NAME = 'view' 5 | TYPE_ANNOTATORS_NAME = 'annotators' 6 | TYPE_ANNOTATIONS_NAME = 'annotations' 7 | N = 1000 8 | 9 | from pyhocon import ConfigFactory 10 | from elasticsearch import Elasticsearch 11 | import json 12 | import sys 13 | 14 | conf = ConfigFactory.parse_file('../view.conf') 15 | 16 | conf_annotations = conf.get_list('view.annotations') 17 | 18 | es = Elasticsearch(hosts = [ES_HOST]) 19 | 20 | # create a small table that only contains the names of all available extractors 21 | def index_annotators(): 22 | es.delete_by_query(index = INDEX_NAME, doc_type = TYPE_ANNOTATORS_NAME, body = { 23 | "query": { 24 | "match_all": {} 25 | } 26 | }) 27 | for ann in conf_annotations: 28 | es.index(index = INDEX_NAME, doc_type = TYPE_ANNOTATORS_NAME, body = { 29 | "name" : ann.get('name') 30 | }, refresh = False) 31 | es.indices.refresh(index = INDEX_NAME) 32 | 33 | # create a large table that contains all extractions 34 | def index_annotations(): 35 | es.delete_by_query(index = INDEX_NAME, doc_type = TYPE_ANNOTATIONS_NAME, body = { 36 | "query": { 37 | "match_all": {} 38 | } 39 | }) 40 | for ann in conf_annotations: 41 | # read from file 42 | 43 | # bulk index docs 44 | bulk_data = [] 45 | for l in open('../' + ann.get('input')): 46 | o = json.loads(l) 47 | # {"id": "12", 
"range":{"type":"sentenceTokenSpan","doc_id":"doc123","sentNum":0,"f":3,"t":4},"target":{"entity":"something"}} 48 | o['attribute'] = ann.get('name') 49 | op_dict = { 50 | "index": { 51 | "_index": INDEX_NAME, 52 | "_type": TYPE_ANNOTATIONS_NAME, 53 | "_id": o['id'], 54 | "_parent": o['range']['doc_id'] 55 | } 56 | } 57 | #data_dict = { 58 | # "id": id, 59 | # "content": content, 60 | # "tokenOffsets": tokenOffsets 61 | #} 62 | #o['content'] = o[u'text'] 63 | data_dict = o 64 | #print(op_dict) 65 | #print(data_dict) 66 | bulk_data.append(op_dict) 67 | bulk_data.append(data_dict) 68 | if len(bulk_data) > N: 69 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 70 | bulk_data = [] 71 | 72 | if len(bulk_data) > 0: 73 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 74 | 75 | es.indices.refresh(index = INDEX_NAME) 76 | 77 | index_annotators() 78 | index_annotations() 79 | 80 | -------------------------------------------------------------------------------- /view/util/refresh-documents.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pipe 4 | 5 | ES_HOST = {"host" : "localhost", "port" : 9200} 6 | INDEX_NAME = 'view' 7 | TYPE_NAME = 'docs' 8 | N = 1000 9 | 10 | from pyhocon import ConfigFactory 11 | from elasticsearch import Elasticsearch 12 | import json 13 | import sys 14 | 15 | conf = ConfigFactory.parse_file('../view.conf') 16 | 17 | docs_conf = conf.get('view.docs') 18 | 19 | es = Elasticsearch(hosts = [ES_HOST]) 20 | 21 | def index_docs(): 22 | 23 | # clear index 24 | es.delete_by_query(index = INDEX_NAME, doc_type = TYPE_NAME, body = { 25 | "query": { 26 | "match_all": {} 27 | } 28 | }) 29 | 30 | # bulk index docs 31 | bulk_data = [] 32 | for o in pipe.col_open('../' + docs_conf.get('input')): 33 | id = o[u'id'] 34 | content = o[u'text'] 35 | tokenOffsets = o[u'tokenOffsets'] 36 | 37 | op_dict = { 38 | "index": { 39 | "_index": INDEX_NAME, 40 | "_type": TYPE_NAME, 41 | "_id": id 42 | } 43 | } 44 | #data_dict = { 45 | # "id": id, 46 | # "content": content, 47 | # "tokenOffsets": tokenOffsets 48 | #} 49 | o['content'] = o[u'text'] 50 | data_dict = o 51 | bulk_data.append(op_dict) 52 | bulk_data.append(data_dict) 53 | if len(bulk_data) > N: 54 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 55 | bulk_data = [] 56 | 57 | if len(bulk_data) > 0: 58 | res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = False) 59 | 60 | es.indices.refresh(index = INDEX_NAME) 61 | 62 | index_docs() 63 | -------------------------------------------------------------------------------- /view/util/search.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #curl -XGET 'http://localhost:9200/view/docs/_search?q=_id:10.1371.journal.pone.0042439.Body__50' 4 | curl -XGET 'http://localhost:9200/view/docs/_search?q=_id:doc123' 5 | curl -XGET 'http://localhost:9200/view/docs/_search?q=simple' 6 | -------------------------------------------------------------------------------- /view/util/tab: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | [ `uname -s` != "Darwin" ] && return 4 | 5 | function tab () { 6 | local name="$1" 7 | local cmd="" 8 | local cdto="$PWD" 9 | local args="${@:2}" 10 | 11 | echo "launching $args ..." 12 | 13 | if [ -n "$args" ]; then 14 | cmd="; $args" 15 | fi 16 | 17 | osascript &>/dev/null < 25 |
26 |   Query Examples
27 |
28 |   Words and Phrases
29 |     quick and "quick brown"
30 |
31 |   Field names
32 |     _id:4325235
33 |     title:(quick OR brown)
34 |     book.\*:(quick brown)
35 |     _missing_:title
36 |     _exists_:title
37 |
38 |   Wildcards
39 |     qu?ck bro*
40 |
41 |   Regular Expressions
42 |     name:/joh?n(ath[oa]n)/
43 |
44 |   Fuzziness
45 |     quikc~ brwn~ foks~
46 |     quikc~1
47 |
48 |   Proximity Searches
49 |     "fox quick"~5
50 |
51 |   Ranges
52 |     date:[2012-01-01 TO 2012-12-31]
53 |     count:[1 TO 5]
54 |     tag: {alpha TO omega}
55 |     count:[10 TO *]
56 |     date:{* TO 2012-01-01}
57 |     count:[1 TO 5}
58 |     age:>=10
59 |     age:(>=10 AND <20)
60 |
61 |   Boosting
62 |     quick^2 fox
63 |     "john smith"^2
64 |     (foo bar)^4
65 |
66 |   Boolean Operators
67 |     quick brown +fox -news
68 |     ((quick AND fox) OR (brown AND fox) OR fox) AND NOT news
69 |
70 |   Grouping
71 |     (quick OR brown) AND fox
72 |     status:(active OR pending) title:(full text search)^2
73 |
74 |   Reserved Characters
75 |     Escape with backslash
76 |     Example: \(1\+1\)\=2 , finds (1+1)=2
77 |     Characters: + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ /
78 |
79 |   Empty Query
80 |     Shows all results.
81 |
82 |
83 |   For more details, see here.
84 |
85 |
86 |
87 |
88 |
89 |
) 90 | } 91 | }) 92 | 93 | module.exports = Help 94 | -------------------------------------------------------------------------------- /view/view/vis/AnnotationsSelector.js: -------------------------------------------------------------------------------- 1 | 2 | var AnnotationsSelector = React.createClass({ 3 | 4 | render: function() { 5 | var onLayerChange = this.props.onLayerChange 6 | 7 | var buttons = this.props.layers.map(function(result) { 8 | return ( 9 | 11 | ); 12 | }); 13 | return (
{buttons} );
14 |   }
15 | });
16 |
17 | var AnnotationsSelectorButton = React.createClass({
18 |   handleClick: function() {
19 |     var active = !this.props.data.active;
20 |     this.props.onLayerChange(this.props.data.name, active);
21 |   },
22 |   render: function() {
23 |     var classes = 'facet';
24 |     if (!this.props.data.active)
25 |       classes += ' facet-inactive';
26 |     return (
27 |
28 |
29 |       {this.props.data.name}
30 |
) 31 | } 32 | }) 33 | 34 | module.exports = AnnotationsSelector -------------------------------------------------------------------------------- /view/view/vis/TextWithAnnotations.js: -------------------------------------------------------------------------------- 1 | var React = window.React = require('react'); 2 | 3 | var SpansVisualization = require('./core/SpansVisualization.js') 4 | var TokenTagsVisualization = require('./core/TokenTagsVisualization.js') 5 | var EdgesVisualization = require('./core/EdgesVisualization.js') 6 | var SentenceUtils = require('./core/SentenceUtils.js') 7 | 8 | var TokensVisualization = function(element, source) { 9 | return SpansVisualization(element, source.tokenOffsets) 10 | } 11 | 12 | var SentencesVisualization = function(element, source) { 13 | return SpansVisualization(element, source.sentenceOffsets) 14 | } 15 | 16 | var PartOfSpeechVisualization = function(element, source) { 17 | return TokenTagsVisualization(element, source.tokenOffsets, source.poss) 18 | } 19 | 20 | var LemmasVisualization = function(element, source) { 21 | return TokenTagsVisualization(element, source.tokenOffsets, source.lemmas) 22 | } 23 | 24 | var DependenciesVisualization = function(element, source) { 25 | // compute sentenceTokenOffsets 26 | var sentenceTokenOffsets = SentenceUtils.getSentenceTokenOffsets(source.tokenOffsets, source.sentenceOffsets) 27 | return EdgesVisualization(element, source.tokenOffsets, source.sentenceOffsets, sentenceTokenOffsets, source.sentenceDependencies) 28 | } 29 | 30 | var ExtractorsVisualization = function(element, source, annotations) { 31 | var sentenceTokenOffsets = source['sentenceTokenOffsets'] 32 | var tokenOffsets = source['tokenOffsets'] 33 | var extractorOffsets = [] 34 | 35 | $.each(annotations, function(i, a) { 36 | var sentNum = a.range.sentNum 37 | var sentenceBeginToken = sentenceTokenOffsets[sentNum][0] 38 | var tokenFrom = sentenceBeginToken + a.range.f 39 | var tokenTo = sentenceBeginToken + a.range.t 40 | var charFrom = tokenOffsets[tokenFrom][0] 41 | var charTo = tokenOffsets[tokenTo - 1][1] 42 | extractorOffsets.push([charFrom,charTo]) 43 | }) 44 | return SpansVisualization(element, extractorOffsets) 45 | } 46 | 47 | var TextWithAnnotations = React.createClass({ 48 | 49 | componentDidMount: function() { 50 | this.vis = {} 51 | this.buildCustomDom() 52 | }, 53 | componentDidUpdate: function() { 54 | this.buildCustomDom() 55 | }, 56 | buildCustomDom: function() { 57 | var div = React.findDOMNode(this) 58 | //cleanup existing visualizations 59 | $.each(this.vis, function(k,v) { v.destroy() }) 60 | 61 | this.vis = {} 62 | 63 | var annotations = this.props.data.annotations 64 | var sourceData = this.props.data._source 65 | var vis = this.vis 66 | 67 | $.each(this.props.layers, function(i, l) { 68 | if (vis && vis[l.name] && !l.active) { 69 | vis[l.name].destroy() 70 | delete vis[l.name] 71 | } 72 | if (vis && !vis[l.name] && l.active) { 73 | if (l.name == 'Tokens') 74 | vis[l.name] = new TokensVisualization(div, sourceData) 75 | if (l.name == 'Sentences') 76 | vis[l.name] = new SentencesVisualization(div, sourceData) 77 | if (l.name == 'Extractors') 78 | vis[l.name] = new ExtractorsVisualization(div, sourceData, annotations) 79 | if (l.name == 'Dependencies') 80 | vis[l.name] = new DependenciesVisualization(div, sourceData) 81 | if (l.name == 'Lemmas') 82 | vis[l.name] = new LemmasVisualization(div, sourceData) 83 | if (l.name == 'PartOfSpeech') 84 | vis[l.name] = new PartOfSpeechVisualization(div, sourceData) 85 | } 86 | }) 87 
| }, 88 | isActive: function(name) { 89 | var isActive = false 90 | $.each(this.props.layers, function(i, l) { 91 | if (l.name == name) { isActive = l.active; return false } 92 | }) 93 | return isActive 94 | }, 95 | 96 | render: function() { 97 | content = this.props.data._source.content; 98 | // if we have field with keyword highlighting, take that 99 | if (this.props.data.highlight != null && 100 | this.props.data.highlight.content != null) { 101 | content = this.props.data.highlight.content[0]; 102 | } 103 | var details = [] 104 | if (this.isActive('Details')) { 105 | $.each(this.props.data.annotations, function(i, value) { 106 | details.push(
{JSON.stringify(value)} );
107 |     })
108 |     $.each(this.props.data._source, function(name, value) {
109 |       if (name != 'content' && name != 'id')
110 |         details.push ( {name} : {JSON.stringify(value)} );
111 |     })
112 |   }
113 |   //style={{'white-space':'pre-wrap'}}
114 |   var div = (
115 |     {this.props.data._id}
116 |     {details}
117 |
) 118 | 119 | return div; 120 | } 121 | }); 122 | 123 | module.exports = TextWithAnnotations 124 | 125 | -------------------------------------------------------------------------------- /view/view/vis/core/CharOffsets.js: -------------------------------------------------------------------------------- 1 | var CharOffsets = (function() { 2 | var ELEMENT = 1; 3 | var TEXT = 3; 4 | 5 | var offsetComparator = function(e1, e2) { 6 | return e1.readrOffset - e2.readrOffset; 7 | }; 8 | 9 | var indexOffsets = function(node, offset) { 10 | node.readrOffset = offset; 11 | if (node.nodeType == TEXT) { 12 | node.readrLength = node.nodeValue.length; 13 | } else if (node.nodeType == ELEMENT) { 14 | // ignore if has class ignoreReadrLength 15 | if (goog.dom.classes.has(node, 'ignoreReadrLength')) { 16 | node.readrLength = 0; 17 | } else { 18 | // sum up lengths of children 19 | var l = 0; 20 | for (var i=0, ii = node.childNodes.length; i < ii; i++) { 21 | var child = node.childNodes[i]; 22 | indexOffsets(child, offset + l); 23 | l += child.readrLength; 24 | } 25 | node.readrLength = l; 26 | } 27 | } 28 | }; 29 | 30 | var getTextRangesToHighlightFromIndex = function(node, start, end) { 31 | var results = new Array(); 32 | recur(node, start, end, results); 33 | return results; 34 | }; 35 | 36 | var recur = function(node, start, end, results) { 37 | if (end - start <= 0) return; 38 | 39 | // we assume that start >= node.readrOffset and end <= node.readrOffset + node.readrLength 40 | if (node.nodeType == TEXT) { 41 | results.push([node, start - node.readrOffset, end - node.readrOffset, start, end]); 42 | return; 43 | } 44 | // binary search for start and end 45 | var ns = goog.array.binarySearch(node.childNodes, { readrOffset : start }, offsetComparator); 46 | var ne = goog.array.binarySearch(node.childNodes, { readrOffset : end }, offsetComparator); 47 | 48 | if (ns < 0) { ns = -ns-2; } 49 | if (ne < 0) { ne = -ne-1; } 50 | 51 | for (var i=ns; i < ne; i++) { 52 | var child = node.childNodes[i]; 53 | var s = (i==ns)? start : child.readrOffset; 54 | var e = (i==ne-1)? 
end : child.readrOffset + child.readrLength; 55 | 56 | recur(child, s, e, results); 57 | } 58 | }; 59 | 60 | var createMultiRangeSpans = function(element, tokenOffsets, renderedSpans, documentOffset) { 61 | if (!renderedSpans) 62 | renderedSpans = new Array(); 63 | if (!documentOffset) 64 | documentOffset = 0 65 | indexOffsets(element[0], documentOffset) 66 | for (var j=0, jj = tokenOffsets.length; j < jj; j++) { 67 | // token has offsets t.f, t.t 68 | var rs = createSingleRangeSpans(element, tokenOffsets[j]); 69 | renderedSpans.push(rs); 70 | } 71 | return renderedSpans; 72 | }; 73 | 74 | var FROM = 0 75 | var TO = 1 76 | 77 | // example tokenOffset: { f:12, t:23 } 78 | var createSingleRangeSpans = function(element, tokenOffset) { 79 | //if (!documentOffset) 80 | //documentOffset = 0 81 | var sels = new Array(); 82 | var todo = getTextRangesToHighlightFromIndex 83 | (element[0], tokenOffset[FROM], tokenOffset[TO]); 84 | for (var i=0, ii = todo.length; i < ii; i++) { 85 | var t = todo[i]; 86 | var range = goog.dom.Range.createFromNodes(t[0], t[1], t[0], t[2]); 87 | var parentNode = t[0].parentNode 88 | var parentNodeOffset = parentNode.readrOffset 89 | 90 | var el = goog.dom.createDom('span'); //, { 'style':'background-color:green'}); 91 | range.surroundContents(el); 92 | //indexOffsets(t[0].parentNode, t[0].parentNode.readrOffset); 93 | indexOffsets(parentNode, parentNodeOffset); 94 | sels.push(el); 95 | } 96 | return { sels:sels }; 97 | }; 98 | 99 | //note, the output of this function is a singleton 100 | return { 101 | indexOffsets: indexOffsets, 102 | getTextRangesToHighlightFromIndex: getTextRangesToHighlightFromIndex, 103 | createMultiRangeSpans: createMultiRangeSpans, 104 | createSingleRangeSpans: createSingleRangeSpans 105 | }; 106 | })() 107 | 108 | module.exports = CharOffsets 109 | -------------------------------------------------------------------------------- /view/view/vis/core/FramesVisualization.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | // 5 | // 6 | //var FramesVisualization = function(element, source) { 7 | // var state ={} 8 | // 9 | // var documentOffset = 0 10 | // 11 | // var msHeadSpans = new Array(); 12 | // CharOffsets.createMultiRangeSpans(textElement, msHeadOffsets, msHeadSpans, documentOffset) 13 | // 14 | // 15 | // 16 | //} -------------------------------------------------------------------------------- /view/view/vis/core/SentenceUtils.js: -------------------------------------------------------------------------------- 1 | var SentenceUtils = (function() { 2 | 3 | var FROM = 0 4 | var TO = 1 5 | 6 | var getSentenceTokenOffsets = function(tokenOffsets, sentenceOffsets) { 7 | var sentenceTokenOffsets = [] 8 | var tokBegin = 0 9 | var tokEnd = 0 10 | for (var si = 0; si < sentenceOffsets.length; si++) { 11 | // move start 12 | while (tokenOffsets[tokBegin][FROM] < sentenceOffsets[si][FROM]) tokBegin++ 13 | tokEnd = tokBegin 14 | while (tokEnd < tokenOffsets.length && tokenOffsets[tokEnd][FROM] <= sentenceOffsets[si][TO]) tokEnd++ 15 | 16 | // now we have (tokBegin,tokEnd) for sentence 17 | sentenceTokenOffsets.push([tokBegin, tokEnd]) 18 | tokBegin = tokEnd 19 | } 20 | return sentenceTokenOffsets 21 | } 22 | 23 | var findSentNumByTokenPos = function(pos, sentenceTokenOffsets) { 24 | var minIndex = 0; 25 | var maxIndex = sentenceTokenOffsets.length - 1; 26 | var currentIndex; 27 | var currentElement; 28 | 29 | while (minIndex <= maxIndex) { 30 | currentIndex = (minIndex + maxIndex) / 2 | 0; 31 | 
currentElement = sentenceTokenOffsets[currentIndex]; 32 | 33 | if (currentElement[TO] <= pos) { 34 | //if (currentElement < searchElement) { 35 | minIndex = currentIndex + 1; 36 | } 37 | else if (currentElement[FROM] > pos) { 38 | //if (currentElement > searchElement) { 39 | maxIndex = currentIndex - 1; 40 | } 41 | else { 42 | return currentIndex; 43 | } 44 | } 45 | } 46 | return { 47 | getSentenceTokenOffsets:getSentenceTokenOffsets, 48 | findSentNumByTokenPos:findSentNumByTokenPos 49 | }; 50 | })() 51 | 52 | module.exports = SentenceUtils -------------------------------------------------------------------------------- /view/view/vis/core/SpansVisualization.js: -------------------------------------------------------------------------------- 1 | /* TokensVisualization */ 2 | var CharOffsets = require('./CharOffsets.js') 3 | 4 | var Span = function(sels) { 5 | var state = {} 6 | 7 | var fragment = function(i, length) { 8 | var fragment = ''; 9 | if (i==0 && i < length-1) fragment = 'left'; 10 | else if (i==0 && i==length-1) fragment = 'leftright'; 11 | else if (i==length-1 && i > 0) fragment = 'right'; 12 | else if (i > 0 && i < length-1) fragment = 'inner'; 13 | return fragment; 14 | }; 15 | 16 | // initialize 17 | state.sels = sels 18 | state.color = 'red' 19 | if (!sels) return; 20 | var ii = sels.length; 21 | $.each(sels, function(i, sel) { 22 | $(sel).addClass('highlight_' + state.color); 23 | $(sel).addClass('highlight_' + fragment(i, ii)); 24 | //$(sel).on('click', function() { 25 | // console.log('clicked'); 26 | //}); 27 | }) 28 | 29 | state.destroy = function() { 30 | // unbind all handlers 31 | if (!state.sels) return; 32 | 33 | $.each(state.sels, function(sel) { 34 | //$(sel).unbind('click'); 35 | }); 36 | } 37 | return state 38 | } 39 | 40 | var SpansVisualization = function(element, spans) { 41 | var state = { 42 | renderedSpans: new Array(), 43 | destroyed: false 44 | }; 45 | 46 | //var documentOffset = scope.document.offset 47 | var documentOffset = 0 48 | 49 | CharOffsets.createMultiRangeSpans([element,this], spans, state.renderedSpans, documentOffset) 50 | 51 | $.each(state.renderedSpans, function(i, rs) { 52 | var span = new Span(rs.sels) 53 | }); 54 | 55 | state.destroy = function() { 56 | state.destroyed = true; 57 | $.each(state.renderedSpans, function(i, value) { 58 | // do bound listeners automatically get destroyed?? 
59 | //value.element.remove(); 60 | //value.scope.$destroy(); 61 | 62 | //$.each(value.aux, function(j,n) { 63 | // goog.dom.removeNode(n); 64 | //}); 65 | $.each(value.sels, function(j,n) { 66 | goog.dom.flattenElement(n); 67 | }); 68 | value.sels = []; 69 | }); 70 | //element.remove(); 71 | //goog.editor.range.normalizeNode(element[0]); 72 | state.renderedSpans.length = 0; 73 | } 74 | return state 75 | } 76 | 77 | module.exports = SpansVisualization -------------------------------------------------------------------------------- /view/view/vis/core/TokenTagsVisualization.js: -------------------------------------------------------------------------------- 1 | /* TokenTagsVisualization */ 2 | 3 | var CharOffsets = require('./CharOffsets.js') 4 | 5 | var TokenTagsVisualization = function(element, tokenOffsets, tags) { 6 | var state = { 7 | renderedSpans: new Array(), 8 | destroyed: false 9 | }; 10 | 11 | //var documentOffset = scope.document.offset 12 | var documentOffset = 0 13 | 14 | // insert spans 15 | CharOffsets.createMultiRangeSpans([element,this], tokenOffsets, state.renderedSpans, documentOffset) 16 | 17 | $.each(state.renderedSpans, function(i, rs) { 18 | var firstSpan = rs.sels[0] 19 | var el = goog.dom.createDom('div', { 'style' : 20 | 'position:absolute;' + 21 | 'top:-15px;' + 22 | 'left:0px;right:0px;' + 23 | 'z-index:0;' + 24 | 'width:100px;' + //' + tokenWidth + 'px;' + 25 | 'height:20px;' + 26 | 'color:red;' + 27 | 'font-size:10px;' + 28 | 'font-family:helvetica,arial;' + 29 | 'font-stretch:semi-condensed;' + 30 | 'font-weight:500;'/* + 31 | 'background-color:white'*/ 32 | }) 33 | el.appendChild(goog.dom.createTextNode(tags[i])) 34 | // if you want all lines to be equal height, set marginTop as follows 35 | //var marginTop = (drawing.highestLevels[i]+1) * 15; 36 | // if you want to use inline rather than inline-block spans, use following line 37 | //$(firstSpan).attr('style', 'display:inline;line-height:' + (marginTop + 20) + 38 | // 'px;margin-top:' + marginTop + 'px;position:relative'); 39 | var marginTop = 10 40 | $(firstSpan).attr('style', 'display:inline-block;margin-top:' + marginTop + 'px;position:relative') 41 | firstSpan.appendChild(el) 42 | rs.aux = new Array() 43 | rs.aux.push(el) 44 | }) 45 | 46 | state.destroy = function() { 47 | state.destroyed = true; 48 | $.each(state.renderedSpans, function(i, value) { 49 | $.each(value.aux, function(j, n) { 50 | goog.dom.removeNode(n); 51 | }) 52 | $.each(value.sels, function(j, n) { 53 | goog.dom.flattenElement(n); 54 | }) 55 | value.sels = []; 56 | }); 57 | //element.remove(); 58 | //goog.editor.range.normalizeNode(element[0]); 59 | state.renderedSpans.length = 0; 60 | } 61 | return state 62 | } 63 | 64 | module.exports = TokenTagsVisualization 65 | -------------------------------------------------------------------------------- /view/views/error.jade: -------------------------------------------------------------------------------- 1 | extends layout 2 | 3 | block content 4 | h1= message 5 | h2= error.status 6 | pre #{error.stack} 7 | -------------------------------------------------------------------------------- /view/views/layout.jade: -------------------------------------------------------------------------------- 1 | doctype html 2 | html 3 | head 4 | title= title 5 | link(rel='stylesheet', href='/stylesheets/style.css') 6 | body 7 | block content --------------------------------------------------------------------------------