├── .gitignore ├── DISCLAIMER.txt ├── LICENSE.txt ├── README.md ├── conf.sample ├── app.yaml └── logging.yaml ├── models └── aspell.en.dict ├── packages ├── apputils │ ├── README.md │ ├── setup.py │ └── src │ │ └── apputils │ │ ├── __init__.py │ │ ├── config.py │ │ ├── fileop.py │ │ └── pickling.py ├── denoiser │ ├── README.md │ ├── setup.py │ └── src │ │ └── denoiser │ │ ├── __init__.py │ │ ├── models │ │ ├── __init__.py │ │ ├── indicators │ │ │ ├── __init__.py │ │ │ └── lists.py │ │ ├── inline │ │ │ ├── __init__.py │ │ │ ├── hashing.py │ │ │ ├── ranking.py │ │ │ └── utils.py │ │ └── machine_learning.py │ │ └── text │ │ ├── __init__.py │ │ └── stats.py └── pipeline │ ├── README.md │ ├── setup.py │ └── src │ └── pipeline │ ├── __init__.py │ ├── actors.py │ ├── command.py │ ├── commands │ ├── __init__.py │ ├── pdfconverter.py │ ├── pngreader.py │ └── txtdenoiser.py │ ├── files.py │ ├── logger.py │ ├── queue.py │ ├── threads.py │ └── utils.py ├── ui.py ├── ui.sh └── utils ├── auth.sh ├── check ├── ocropy.sh ├── packages.py ├── python.sh ├── redis.sh └── xserver.sh ├── config.py ├── env.sh ├── install.sh ├── prefix.py ├── run-wrapper.sh ├── run.py └── run.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | .pynative 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | 60 | # IDE folders 61 | /.project 62 | /.idea 63 | 64 | # App directories 65 | conf 66 | logs 67 | tmp 68 | -------------------------------------------------------------------------------- /DISCLAIMER.txt: -------------------------------------------------------------------------------- 1 | Certain commercial firms and trade names are identified in this document in order to specify the installation and usage 2 | procedures adequately. Such identification is not intended to imply recommendation or endorsement by the National 3 | Institute of Standards and Technology, nor is it intended to imply that related products are necessarily the best 4 | available for the purpose. 5 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | This software was developed at the National Institute of Standards and Technology by employees of the Federal Government 2 | in the course of their official duties. Pursuant to title 17 Section 105 of the United States Code this software is not 3 | subject to copyright protection and is in the public domain. 4 | 5 | This software is an experimental system. 
NIST assumes no responsibility whatsoever for its use by other parties, and 6 | makes no guarantees, expressed or implied, about its quality, reliability, or any other characteristic. We would 7 | appreciate acknowledgement if the software is used. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OCR Pipeline 2 | 3 | **Author:** Philippe Dessauw, philippe.dessauw@nist.gov 4 | 5 | **Contact:** Alden Dima, alden.dima@nist.gov 6 | 7 | [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/usnistgov/ocr-pipeline/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/usnistgov/ocr-pipeline/?branch=master) 8 | 9 | ----- 10 | 11 | ## Description 12 | 13 | The **OCR Pipeline** (referred to hereafter as the "pipeline") is designed to convert PDF files to clean TXT files in 3 steps: 14 | 15 | 1. PDF to PNG conversion with *PythonMagick* (the Python binding for ImageMagick), 16 | 2. PNG to TXT conversion using *Ocropy*, 17 | 3. TXT cleaning in order to remove all traces of garbage strings. 18 | 19 | The pipeline runs on a distributed master/worker architecture with a *Redis* queue as the communication layer. 20 | 21 | * One master server reads the input content to build the job queue, 22 | * Workers pop jobs from that queue and process them. 23 | 24 | The software is developed by the National Institute of Standards and Technology (NIST). 25 | 26 | *N.B.:* This software has been designed exclusively to run on **Linux servers**. Execution on Mac and Windows has not 27 | been tested. 28 | 29 | 30 | ## Prerequisites 31 | 32 | ### ImageMagick and Ghostscript 33 | 34 | The pipeline uses ImageMagick and Ghostscript to interact with the PDF files and the generated images. Both are available 35 | through the package manager of your operating system. 36 | 37 | ### Python 38 | 39 | The pipeline is developed in *Python2* (>=2.7). You can check your version using: 40 | 41 | ```bash 42 | $ python2 --version 43 | ``` 44 | 45 | *Warning:* The pipeline is not designed to work with Python3. Make sure your path points to a Python2 installation. 46 | 47 | #### Virtual environment 48 | 49 | Using a Python virtual environment is recommended to ensure proper operation of the pipeline. Make sure the environment 50 | is activated at installation time. 51 | 52 | #### Packages 53 | 54 | Two packages must be installed before the pipeline itself: **pip** and **PythonMagick**. 55 | 56 | ##### pip 57 | 58 | This package will be used to install the packages bundled in this repository and their dependencies. No manual action is 59 | required to install dependencies. On Ubuntu 16.04 LTS, it is provided by the `python-pip` package. 60 | 61 | ##### PythonMagick 62 | 63 | This package must be installed manually. Its version depends heavily on your **ImageMagick** version. Please 64 | visit http://www.imagemagick.org for more information. On Ubuntu 16.04 LTS, the latest supported version is **0.9.11**.
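Because the PythonMagick release has to match the installed ImageMagick release, it can help to check which ImageMagick version your system provides before picking an archive. A quick check, assuming the ImageMagick binaries are already installed and on your `PATH` (the `dpkg` query applies to Debian/Ubuntu only):

```bash
$ convert --version                # prints the ImageMagick version string
$ dpkg -l | grep -i imagemagick    # lists the installed ImageMagick packages
```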
65 | 66 | The following commands download and install PythonMagick on Ubuntu 16.04 LTS: 67 | 68 | ```bash 69 | $ PYTHON_MAGICK="PythonMagick-0.9.11" 70 | $ wget https://www.imagemagick.org/download/python/releases/${PYTHON_MAGICK}.tar.xz 71 | $ tar xf ${PYTHON_MAGICK}.tar.xz 72 | $ cd ${PYTHON_MAGICK} 73 | 74 | $ PYPREFIX=`python -c "import sys; print sys.prefix"` 75 | $ ./configure --prefix ${PYPREFIX} 76 | $ make && make check && make install 77 | ``` 78 | 79 | ### Redis 80 | 81 | Redis needs to be installed on the master server. The Redis version should be **>= 2.7**. Follow the Redis installation steps at 82 | http://redis.io/download#installation. 83 | 84 | ### Ocropy 85 | 86 | Ocropy is required to convert images to text files. The code is available at https://github.com/tmbdev/ocropy. Make sure 87 | it is downloaded and can be launched on all your workers. 88 | 89 | ### XServer 90 | 91 | The command `xvfb-run` should be available to run the scripts. Depending on your operating system, it is not always 92 | provided by the same package. On Ubuntu 16.04, the package is named **xvfb**. 93 | 94 | ### NLTK 95 | 96 | In order for NLTK to run properly, you need to download the **English tokenizer**. The following Python code will check 97 | your NLTK installation and get the tokenizer if it is not present: 98 | 99 | ```python 100 | import nltk 101 | 102 | try: 103 | nltk.data.find('tokenizers/punkt') 104 | except LookupError: 105 | nltk.download('punkt') 106 | ``` 107 | 108 | 109 | ## Downloading the project 110 | 111 | Once all the prerequisites are met, download the project: 112 | 113 | 1. Get the source code on GitHub: 114 | 115 | ```bash 116 | $ cd /path/to/workspace 117 | $ git clone https://github.com/usnistgov/ocr-pipeline.git 118 | ``` 119 | 120 | 2. Configure the application: 121 | 122 | ```bash 123 | $ cd ocr-pipeline 124 | $ cp -r conf.sample conf 125 | ``` 126 | 127 | 128 | ## Configuration 129 | 130 | All the configuration files should be put in the *conf* folder. 131 | 132 | ### app.yaml 133 | 134 | #### root 135 | 136 | Absolute path to the pipeline code. The project will be copied to this location when you install and run the pipeline. 137 | 138 | #### use_sudo 139 | 140 | Defines whether the script needs to use sudo to install the pipeline. 141 | 142 | #### commands / list # PNGReader / ocropy / location 143 | 144 | Path where you have downloaded Ocropy. 145 | 146 | #### commands / list # PNGReader / ocropy / model 147 | 148 | Path where you have downloaded the Ocropy model (*en-default.pyrnn.gz*). 149 | 150 | 151 | ## Installation 152 | 153 | Follow these steps to install the pipeline on the machines of your architecture. 154 | 155 | 1. Initialize the application on your first machine 156 | 157 | ```bash 158 | $ cd /path/to/ocr-pipeline 159 | $ ./utils/install.sh 160 | $ ./ui.sh init 161 | ``` 162 | 163 | 2. Create data models 164 | 165 | ```bash 166 | $ ./ui.sh create_models /path/to/training_set 167 | ``` 168 | 169 | *N.B.:* Depending on your training set, this step could take some time to complete. 170 | 171 | 3. Check that everything is installed on all the machines 172 | 173 | ```bash 174 | $ ./ui.sh -r check 175 | ``` 176 | 177 | ## Running the pipeline 178 | 179 | ### Incoming data 180 | 181 | To start converting a corpus of PDF files, place the files in the input directory. By default, 182 | this directory is named *data.in*.
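For example, assuming the default configuration and a pipeline root of `/path/to/pipeline` (the placeholder value of `root` in `conf/app.yaml`), and given a hypothetical folder `/path/to/corpus` holding your PDF files, queueing the corpus could look like this:

```bash
$ cp /path/to/corpus/*.pdf /path/to/pipeline/data.in/
```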
183 | 184 | ### Starting the pipeline 185 | 186 | To start the pipeline, you just have to run `./ui.sh -r start_pipeline`. It will remotely start all the workers and the 187 | master. 188 | 189 | ### Output 190 | 191 | Each time a new file has been processed, it will be put in the output directory of the master server. By default, this 192 | directory is named *data.out*. 193 | 194 | ## Contact 195 | 196 | If you encouter any issue or bug with this software please use the [issue tracker](https://github.com/usnistgov/ocr-pipeline/issues). 197 | If you want to make some enhancement, feel free to fork this repository and submit a pull request once your new feature 198 | is ready. 199 | 200 | If you have any questions, comments or suggestions about this repository, please send an e-mail to Alden Dima 201 | (alden.dima@nist.gov). 202 | -------------------------------------------------------------------------------- /conf.sample/app.yaml: -------------------------------------------------------------------------------- 1 | root: /path/to/pipeline 2 | use_sudo: false 3 | dirs: 4 | input: data.in 5 | output: data.out 6 | temp: tmp 7 | logs: logs 8 | models_root: models 9 | models: 10 | learning: machine_learning 11 | inline: inline_correction 12 | log_conf: logging.yaml 13 | sleep: 14 | master: 60 15 | worker: 5 16 | job: 5 17 | commands: 18 | tries: 3 19 | list: 20 | - PDFConverter: 21 | density: 300 22 | depth: 8 23 | quality: 100 24 | - PNGReader: 25 | ocropy: 26 | location: /path/to/ocropy 27 | model: models/en-default.pyrnn.gz 28 | #commands: 29 | # - ocropus-nlbin 30 | # - ocropus-gpageseg 31 | # - ocropus-rpred 32 | - TXTDenoiser 33 | redis: 34 | host: 127.0.0.1 35 | port: 6379 36 | exts: 37 | tmp: .tmp 38 | garbage: .grbge.txt 39 | clean: .clean.txt 40 | unclassified: .noclass.txt 41 | models: 42 | aspell_dict: aspell.en.dict 43 | hashes: hash_list.bin 44 | learning: 45 | training_set: training.bin 46 | classifier: model.bin 47 | inline: 48 | unigrams: unigrams.bin 49 | bigrams: bigrams.bin 50 | altcase: altcase.bin 51 | ocr_keys: ocrkeys.bin 52 | anagrams: anagrams.bin 53 | dictionary: dictionary.bin 54 | -------------------------------------------------------------------------------- /conf.sample/logging.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | default: 4 | format: "[%(asctime)s][%(levelname)s][%(module)s] %(message)s" 5 | datefmt: "%Y-%m-%d %H:%M:%S" 6 | handlers: 7 | app_file: 8 | class: logging.handlers.RotatingFileHandler 9 | formatter: default 10 | level: DEBUG 11 | filename: app.log 12 | maxBytes: 5242880 # 5M 13 | backupCount: 9 14 | local_file: 15 | class: logging.handlers.RotatingFileHandler 16 | formatter: default 17 | level: DEBUG 18 | filename: local.log 19 | maxBytes: 5242880 # 5M 20 | backupCount: 9 21 | console: 22 | class: logging.StreamHandler 23 | formatter: default 24 | level: DEBUG 25 | stream: ext://sys.stdout 26 | loggers: 27 | app: 28 | level: DEBUG 29 | handlers: [console, app_file] 30 | propagate: false 31 | local: 32 | level: DEBUG 33 | handlers: [local_file] 34 | propagate: false -------------------------------------------------------------------------------- /packages/apputils/README.md: -------------------------------------------------------------------------------- 1 | # apputils 2 | 3 | **Author:** Philippe Dessauw, philippe.dessauw@nist.gov 4 | 5 | **Contact:** Alden Dima, alden.dima@nist.gov 6 | 7 | ----- 8 | 9 | This Python package contains useful functions that can be easily 
reused inside a Python projects. It offers: 10 | 11 | * App Configuration using YAML files, 12 | * Zipping and unzipping directories on the fly, 13 | * Storing and retrieving Python objects. 14 | 15 | ## Installation 16 | 17 | ### Packaging source files 18 | 19 | $> cd /path/to/apputils 20 | $> python setup.py sdist 21 | 22 | You should now see **dist** package in the main directory. 23 | 24 | ### Installing the package 25 | 26 | $> cd path/to/apputils/dist 27 | $> pip install apputils-*version*.tar.gz 28 | 29 | This package is now ready to use! 30 | 31 | ## Contact 32 | 33 | If you have any questions, comments or suggestions about this repository, please send an e-mail to Alden Dima 34 | (alden.dima@nist.gov). 35 | -------------------------------------------------------------------------------- /packages/apputils/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="apputils", 5 | version="0.1.0", 6 | 7 | description=("A set of useful functions for your Python software",), 8 | 9 | author="Philippe Dessauw", 10 | author_email="philippe.dessauw@nist.gov", 11 | 12 | packages=find_packages('src'), 13 | package_dir={'': 'src'}, 14 | 15 | install_requires=[ 16 | 'pyyaml', 17 | ], 18 | ) 19 | -------------------------------------------------------------------------------- /packages/apputils/src/apputils/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package containing several utilities for Python application. 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | -------------------------------------------------------------------------------- /packages/apputils/src/apputils/config.py: -------------------------------------------------------------------------------- 1 | """Configuration package. Contains functions to access YAML configuration file. 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | import logging.config 17 | from os import makedirs 18 | from os.path import exists, join, isfile, splitext, dirname 19 | import yaml 20 | 21 | app_config = None 22 | """dict: Configuration of the overall application. 23 | """ 24 | 25 | 26 | def load_config(filename, root_directory): 27 | """Load a YAML configuration file. All data is stored in the variable :attr:`app_config`. 28 | 29 | Parameters: 30 | filename (str): YAML configuration file. 31 | root_directory (str): Installation directory of the app. 
32 | """ 33 | global app_config 34 | 35 | with open(filename, "r") as conf_file: 36 | app_config = yaml.load(conf_file) 37 | 38 | install_dir = dirname(filename) 39 | 40 | # Logging configuration 41 | if "log_conf" in app_config.keys(): 42 | log_conf_name = join(install_dir, app_config["log_conf"]) 43 | with open(log_conf_name, "r") as log_conf_file: 44 | log_conf = yaml.load(log_conf_file) 45 | 46 | # Create logs directory if it does not exist 47 | log_directory = join(root_directory, app_config["dirs"]["logs"]) 48 | if not exists(log_directory): 49 | makedirs(log_directory) 50 | 51 | # Append the log folder to the log filename 52 | for handler in log_conf["handlers"].values(): 53 | if "filename" in handler.keys(): 54 | handler["filename"] = join(log_directory, handler["filename"]) 55 | 56 | logging.config.dictConfig(log_conf) 57 | 58 | del app_config["log_conf"] # Information is no longer needed 59 | 60 | # Import other YAML configuration file 61 | for key, value in app_config.items(): 62 | if type(value) == str and isfile(join(install_dir, value)) and splitext(value)[1] == ".yaml": 63 | with open(join(install_dir, value), "r") as subconf_file: 64 | app_config[key] = yaml.load(subconf_file) 65 | 66 | 67 | def get_config(key): 68 | """Return value of a given key hash. 69 | 70 | Hashes are formatted using '/' to define parent-child relationship and '#' to define a list element. 71 | 72 | Example: 73 | Given the following YAML file (already loaded):: 74 | 75 | app: 76 | root: /path/to/root 77 | conf: 78 | - dev: conf/dev.conf 79 | - test: conf/test.conf 80 | - prod: conf/prod.conf 81 | 82 | In order to get the path of the test configuration file you would type:: 83 | 84 | >>> get_config('app/conf#2/test') 85 | 86 | 87 | Parameters: 88 | key (str): Key hash of the value to return. 89 | 90 | Returns: 91 | str: Value for the given key if it exists. 92 | 93 | Raises: 94 | ValueError: App config has not been loaded. 95 | KeyError: Key hash has not been found. 96 | """ 97 | if app_config is None: 98 | raise ValueError("App config not loaded") 99 | 100 | try: 101 | if '/' in key: 102 | keys = key.split('/') 103 | 104 | tmp_keys = [] 105 | for k in keys: 106 | sp = k.split('#') 107 | 108 | if len(sp) != 1: 109 | sp[1] = int(sp[1]) 110 | 111 | tmp_keys += sp 112 | 113 | keys = tmp_keys 114 | config_data = app_config 115 | for k in keys: 116 | config_data = config_data[k] 117 | 118 | return config_data 119 | else: 120 | return app_config[key] 121 | except: 122 | raise KeyError("Key "+str(key)+" not present in config file") 123 | -------------------------------------------------------------------------------- /packages/apputils/src/apputils/fileop.py: -------------------------------------------------------------------------------- 1 | """File operations packages. Provide functions to interact with the filesystem. 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from zipfile import ZipFile 17 | from os import walk, remove, makedirs 18 | from os.path import join, splitext, exists 19 | from shutil import rmtree 20 | from hashlib import sha256 21 | 22 | module_conf = { 23 | "zip_ext": ".zip" 24 | } 25 | 26 | 27 | def zip_directory(directory): 28 | """Zip a directory 29 | 30 | Parameters: 31 | directory (str): Path to the directory to zip. 32 | 33 | Returns: 34 | str: Path to the archive. 35 | """ 36 | archive_name = directory + module_conf["zip_ext"] 37 | zip_dir = ZipFile(archive_name, "w") 38 | 39 | for root, folders, files in walk(directory): 40 | for item in folders+files: 41 | orig_path = join(root, item) 42 | dest_path = orig_path[len(directory):] 43 | 44 | zip_dir.write(orig_path, dest_path) 45 | 46 | rmtree(directory) # Clean directory 47 | return archive_name 48 | 49 | 50 | def unzip_directory(archive): 51 | """Unzip an archive. 52 | 53 | Parameters: 54 | archive (str): Path to the archive to unzip. 55 | 56 | Returns: 57 | str: Path to the directory. 58 | """ 59 | zip_dir = ZipFile(archive, "r") 60 | directory = splitext(archive)[0] 61 | 62 | zip_dir.extractall(directory) 63 | remove(archive) 64 | 65 | return directory 66 | 67 | 68 | def create_directories(dir_conf, prefix=None): 69 | """Create application directories and subdirectories given a configuration dictionary. 70 | 71 | Parameters: 72 | dir_conf (dict): List of directories to create. 73 | prefix (str): Root directory for the directories to create. Default to `None` (directories will be built in 74 | the current directory). 75 | 76 | Raises: 77 | ValueError: If there is a subdirectory with no root or if the subdirectory key is not a dictionary. 78 | """ 79 | dirnames = [d for d in dir_conf.values() if isinstance(d, str)] 80 | 81 | for dirname in dirnames: 82 | dirpath = join(prefix, dirname) if prefix is not None else dirname 83 | 84 | if not exists(dirpath): 85 | makedirs(dirpath) 86 | 87 | dir_keys = dir_conf.keys() 88 | roots = [d for d in dir_keys if d.endswith("_root") and d.split("_root")[0] in dir_keys] 89 | dir_dicts = [d for d in dir_conf.values() if not isinstance(d, str)] 90 | 91 | # More dictionaries than roots 92 | if len(roots) < len(dir_dicts): 93 | raise TypeError("All subdirectory must have a _root key") 94 | 95 | for r in roots: 96 | key = r.split("_root")[0] 97 | subfolders = dir_conf[key] 98 | 99 | if not isinstance(subfolders, dict): 100 | raise TypeError("Expecting dict, got "+str(type(subfolders))) 101 | 102 | if prefix is not None: 103 | prefix = join(prefix, dir_conf[r]) 104 | else: 105 | prefix = dir_conf[r] 106 | 107 | create_directories(subfolders, prefix) 108 | 109 | 110 | def file_checksum(filename): 111 | """Return the sha256 digest of a file. 112 | 113 | Parameters: 114 | filename (str): The file to hash. 115 | 116 | Returns: 117 | str: Hash of the file. 118 | """ 119 | return sha256(open(filename).read()).hexdigest() 120 | 121 | -------------------------------------------------------------------------------- /packages/apputils/src/apputils/pickling.py: -------------------------------------------------------------------------------- 1 | """Persistance package for Python objects. 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | import cPickle 17 | import pickle 18 | 19 | 20 | def save(obj, filename): 21 | """Dump an object to a given file. 22 | 23 | Parameters: 24 | obj: The Python object that you want to store. 25 | filename (str): Path to the file storing the object. 26 | """ 27 | with open(filename, "wb") as pickle_file: 28 | cPickle.dump(obj, pickle_file, pickle.HIGHEST_PROTOCOL) 29 | 30 | 31 | def load(filename): 32 | """Load an object from a filename. 33 | 34 | Parameters: 35 | filename (str): Path to the file storing the object. 36 | 37 | Returns: 38 | Object contained within the file. 39 | """ 40 | with open(filename, "rb") as pickle_file: 41 | obj = cPickle.load(pickle_file) 42 | 43 | return obj 44 | -------------------------------------------------------------------------------- /packages/denoiser/README.md: -------------------------------------------------------------------------------- 1 | # denoiser 2 | 3 | **Author:** Philippe Dessauw, philippe.dessauw@nist.gov 4 | 5 | **Contact:** Alden Dima, alden.dima@nist.gov 6 | 7 | ----- 8 | 9 | The denoiser is a Python package able to **detect and remove garbage strings** from a text file. Garbage strings are 10 | generated during the transformation of PDF documents to text files via OCR: the conversion of images, charts and tables 11 | often creates spurious characters within the resulting file. 12 | 13 | ## Installation 14 | 15 | ### Packaging source files 16 | 17 | $> cd /path/to/denoiser 18 | $> python setup.py sdist 19 | 20 | You should now see **dist** package in the main directory. 21 | 22 | ### Installing the package 23 | 24 | $> cd path/to/denoiser/dist 25 | $> pip install denoiser-*version*.tar.gz 26 | 27 | This package is now ready to use! 28 | 29 | ## Contact 30 | 31 | If you have any questions, comments or suggestions about this repository, please send an e-mail to Alden Dima 32 | (alden.dima@nist.gov). 33 | -------------------------------------------------------------------------------- /packages/denoiser/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="denoiser", 5 | version="1.0.0", 6 | 7 | description=("Provide objects to curate text files from garbage strings",), 8 | 9 | author="Philippe Dessauw", 10 | author_email="philippe.dessauw@nist.gov", 11 | 12 | packages=find_packages('src'), 13 | package_dir={ 14 | '': 'src', 15 | }, 16 | 17 | install_requires=[ 18 | 'apputils', 19 | 'nltk', 20 | 'numpy', 21 | 'unidecode', 22 | 'scikit-learn==0.15.2', 23 | 'scipy', 24 | ], 25 | ) 26 | -------------------------------------------------------------------------------- /packages/denoiser/src/denoiser/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing all the functions and classes needed to clean a file. 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from __future__ import division 17 | import logging 18 | from denoiser.models import InlineModel, IndicatorModel, MachineLearningModel 19 | from denoiser.text import Text 20 | 21 | 22 | class Denoiser(object): 23 | """Denoiser object, able to clean a file and train related models 24 | """ 25 | 26 | def __init__(self, app_config): 27 | self.config = app_config 28 | self.logger = logging.getLogger('local') 29 | 30 | self.inline_model = InlineModel(self.config) 31 | self.indicator_model = IndicatorModel(self.config) 32 | self.learning_model = MachineLearningModel(self.config) 33 | 34 | self.logger.info("Denoiser initialized") 35 | 36 | def cleanse(self, filename, is_csv=False): 37 | """Cleanse a file given its name 38 | 39 | Parameters: 40 | filename (str): Path of the file to cleanse 41 | is_csv (bool): Specifies if the file is a CSV 42 | 43 | Returns: 44 | `Text`: Text data 45 | """ 46 | self.logger.debug("Cleaning "+filename+"...") 47 | text_data = Text(filename) 48 | 49 | # Parse the proper format 50 | if is_csv: 51 | text_data.read_csv() 52 | else: 53 | text_data.read_txt() 54 | 55 | # Clean the text 56 | self.inline_model.load(text_data) 57 | self.inline_model.correct(text_data) 58 | 59 | self.indicator_model.load(text_data) 60 | self.indicator_model.correct(text_data) 61 | 62 | self.learning_model.load(text_data) 63 | self.learning_model.correct(text_data) 64 | 65 | return text_data 66 | 67 | def train(self, dataset): 68 | """ Train the denoiser with a set of files 69 | 70 | Parameters 71 | dataset (list): List of files 72 | """ 73 | self.logger.debug("Training denoiser...") 74 | 75 | # Generate datastructures from dataset 76 | text_dataset = [Text(f) for f in dataset] 77 | 78 | # Create datastructures for the whole dataset 79 | for text_data in text_dataset: 80 | self.logger.debug("Preprocessing "+text_data.filename) 81 | text_data.read_csv() 82 | 83 | # print "Loading "+text.filename+"..." 84 | self.inline_model.load(text_data) 85 | self.inline_model.correct(text_data) 86 | 87 | self.indicator_model.load(text_data) 88 | self.indicator_model.correct(text_data) 89 | 90 | # Load existing training data 91 | self.logger.debug("Training learning model...") 92 | self.learning_model.train(text_dataset) 93 | 94 | self.logger.info("Machine learning model trained") 95 | 96 | def generate_models(self, dataset): 97 | """ Generates the datastructures given a set of files 98 | 99 | Parameters 100 | dataset (list): List of files 101 | """ 102 | self.logger.debug("Generating datastructures...") 103 | text_dataset = [Text(f) for f in dataset] 104 | 105 | for text_data in text_dataset: 106 | self.logger.debug("Processing "+text_data.filename+"...") 107 | 108 | text_data.read_csv() 109 | self.inline_model.load(text_data) 110 | 111 | self.logger.info("Datastructure generated") 112 | return 0 113 | 114 | -------------------------------------------------------------------------------- /packages/denoiser/src/denoiser/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Models package 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from __future__ import division 17 | from os import unlink 18 | from sklearn.linear_model.stochastic_gradient import SGDClassifier 19 | from denoiser.models.inline import Unigrams, Dictionary, Bigrams, AltCaseMap, OcrKeyMap, AnagramMap 20 | from denoiser.models.inline.ranking import rate_corrections 21 | from denoiser.models.inline.utils import init_correction_map, select_anagrams, select_ocrsims, build_candidates_list, \ 22 | correct_case, apply_bigram_boost, select_correction, extract_paragraph_bigrams, select_lower_edit_distance, \ 23 | select_best_alphabetical_word 24 | from denoiser.models.machine_learning import MachineLearningFeatures, MachineLearningAlgorithm 25 | import logging 26 | from os.path import exists, join 27 | from denoiser.models.indicators.lists import StrongIndicatorList, CleanIndicatorList 28 | from apputils.fileop import file_checksum 29 | from apputils.pickling import save, load 30 | 31 | 32 | class AbstractModel(object): 33 | """Abstract model, contains main functions 34 | """ 35 | 36 | def __init__(self, app_config): 37 | self.config = app_config 38 | self.logger = logging.getLogger('local') 39 | 40 | self.hash_filename = join(app_config["dirs"]["models_root"], app_config["models"]["hashes"]) 41 | self.hash_list = [] 42 | 43 | if exists(self.hash_filename): 44 | self.hash_list = load(self.hash_filename) 45 | 46 | def is_preprocessed(self, filename): 47 | """Determine if the given file has already been preprocessed (its data added to the models) 48 | 49 | Args: 50 | filename (str): Path of the given file 51 | 52 | Returns: 53 | int: 0 if not preprocess, 1 otherwise 54 | """ 55 | text_id = file_checksum(filename) 56 | 57 | if text_id not in self.hash_list: 58 | self.hash_list.append(text_id) 59 | save(self.hash_list, self.hash_filename) 60 | return 0 61 | 62 | return 1 63 | 64 | def load(self, text_data): 65 | """Load text data to the model 66 | 67 | Args: 68 | text_data (dict): Text data 69 | 70 | Raise: 71 | NotImplementedError: Not yet implemented 72 | """ 73 | raise NotImplementedError() 74 | 75 | def correct(self, text_data): 76 | """Save text data to the model 77 | 78 | Args: 79 | text_data (dict): Text data 80 | 81 | Raise: 82 | NotImplementedError: Not yet implemented 83 | """ 84 | raise NotImplementedError() 85 | 86 | 87 | class InlineModel(AbstractModel): 88 | """Model for inline data structures 89 | """ 90 | 91 | def __init__(self, app_config): 92 | super(InlineModel, self).__init__(app_config) 93 | 94 | inline_models_dir = join( 95 | app_config["root"], 96 | app_config["dirs"]["models_root"], 97 | app_config["dirs"]["models"]["inline"] 98 | ) 99 | inline_models_key = app_config["models"]["inline"] 100 | 101 | self.dictionary = Dictionary(join(inline_models_dir, inline_models_key["dictionary"])) 102 | 103 | self.unigrams = Unigrams(join(inline_models_dir, inline_models_key["unigrams"])) 104 | self.tmp_unigrams_filename = self.unigrams.filename + app_config["exts"]["tmp"] 105 | 106 | self.bigrams = Bigrams(join(inline_models_dir, inline_models_key["bigrams"])) 107 | 108 | self.altcase_map = AltCaseMap(join(inline_models_dir, inline_models_key["altcase"])) 109 | self.tmp_altcase_filename = self.altcase_map.filename + app_config["exts"]["tmp"] 110 | 111 | self.ocrkey_map = OcrKeyMap(join(inline_models_dir, 
inline_models_key["ocr_keys"])) 112 | self.anagram_map = AnagramMap(join(inline_models_dir, inline_models_key["anagrams"])) 113 | 114 | def load(self, text_data): 115 | """Load text data to the model 116 | 117 | Args: 118 | text_data (`Text`): Text data 119 | """ 120 | if self.is_preprocessed(text_data.filename) != 0: 121 | self.logger.debug(text_data.filename+" already loaded: skipping it.") 122 | return 123 | 124 | tmp_u = Unigrams(self.tmp_unigrams_filename) 125 | word_list = tmp_u.append_data(text_data) 126 | 127 | self.bigrams.append_data(word_list) 128 | 129 | tmp_ac = AltCaseMap(self.tmp_altcase_filename) 130 | tmp_ac.append_data(tmp_u.raw_unigrams) 131 | 132 | tmp_u.generate_low_case(tmp_ac.altcase_map) 133 | 134 | self.ocrkey_map.append_data(tmp_u.raw_unigrams) 135 | 136 | # Updating data 137 | self.unigrams.raw_unigrams += tmp_u.raw_unigrams 138 | self.unigrams.ngrams += tmp_u.ngrams 139 | self.unigrams.prune(0.7) 140 | self.unigrams.save() 141 | 142 | combine_struct = {key: set() for key in tmp_ac.altcase_map.keys() + self.altcase_map.altcase_map.keys()} 143 | for key, value in tmp_ac.altcase_map.items() + self.altcase_map.altcase_map.items(): 144 | combine_struct[key] = combine_struct[key].union(value) 145 | 146 | self.altcase_map.altcase_map = combine_struct 147 | self.altcase_map.prune(self.unigrams.ngrams_pruned) 148 | self.altcase_map.save() 149 | 150 | unlink(self.tmp_unigrams_filename) 151 | unlink(self.tmp_altcase_filename) 152 | 153 | self.anagram_map.append_data(self.bigrams.ngrams_pruned, self.unigrams.ngrams_pruned) 154 | self.dictionary.append_data(self.unigrams.ngrams_pruned) 155 | 156 | self.logger.info(text_data.filename+"'s datastructures loaded") 157 | 158 | def correct(self, text_data): 159 | """Correct text data 160 | 161 | Args: 162 | text_data (`Text`): Text data 163 | """ 164 | correction_data = self.correction_data() 165 | 166 | for paragraph in text_data.text: 167 | for line in paragraph: 168 | for token in line.tokens: 169 | token[2] = init_correction_map(token[1], correction_data["dictionary"]) 170 | 171 | # Skip some correction steps if the token is too short, in the dictionary or already identified as 172 | # garbage 173 | if not token[2] is None and len(token[2]) == 0: 174 | anagrams = select_anagrams(token[1], correction_data) 175 | ocr_sims = select_ocrsims(token[1], correction_data) 176 | 177 | token[2] = build_candidates_list(token[1], anagrams, ocr_sims, correction_data) 178 | token[2] = correct_case(token[1], token[2], correction_data) 179 | 180 | token[2] = rate_corrections(token[2]) 181 | 182 | if len(token[2]) == 0: # No correction has been found 183 | token[2] = None 184 | 185 | # Applying the bigram boost to the tokens 186 | bigrams = extract_paragraph_bigrams(paragraph) 187 | apply_bigram_boost(paragraph, bigrams, correction_data["occurence_map"]) 188 | 189 | # Select the appropriate correction 190 | for line in paragraph: 191 | for token in line.tokens: 192 | token[2] = select_correction(token[1], token[2]) 193 | 194 | if token[2] is not None and len(token[2]) > 1: 195 | tkn_list = [tkn for tkn, sc in token[2].items() if sc == max(token[2].values())] 196 | 197 | if len(tkn_list) != 1: 198 | tkn_list = select_lower_edit_distance(token[1], {tkn: token[2][tkn] for tkn in tkn_list}) 199 | 200 | if len(tkn_list) != 1: 201 | tkn_list = [select_best_alphabetical_word(token[1], tkn_list)] 202 | 203 | token[2] = {tkn: token[2][tkn] for tkn in tkn_list} 204 | 205 | def correction_data(self): 206 | """Get the correction data 207 | 208 | Returns: 
209 | dict: Correction data 210 | """ 211 | return { 212 | "occurence_map": self.unigrams.ngrams + self.bigrams.ngrams, 213 | "altcase": self.altcase_map.altcase_map, 214 | "ocrkeys": self.ocrkey_map.ocrkey_map, 215 | "anagrams": self.anagram_map.anagram_hashmap, 216 | "alphabet": self.anagram_map.anagram_alphabet, 217 | "dictionary": self.dictionary.dictionary 218 | } 219 | 220 | 221 | class IndicatorModel(AbstractModel): 222 | """Model for garbage strings indicators 223 | """ 224 | 225 | def __init__(self, app_config): 226 | super(IndicatorModel, self).__init__(app_config) 227 | 228 | self.model = { 229 | "strong": StrongIndicatorList(), 230 | "clean": CleanIndicatorList() 231 | } 232 | 233 | def load(self, text_data): 234 | """Load text data to the model 235 | 236 | Args: 237 | text_data (`Text`): Text data 238 | """ 239 | for indicator_list in self.model.values(): 240 | indicator_list.set_stats(text_data.stats) 241 | 242 | def correct(self, text_data): 243 | """Correct text data 244 | 245 | Args: 246 | text_data (`Text`): Text data 247 | """ 248 | # ======================= 249 | # Strong indicators 250 | # ======================= 251 | lines = [line for paragraph in text_data.text for line in paragraph 252 | if line.grade != 0 and self.model["strong"].match(line)] 253 | 254 | for line in lines: 255 | line.set_garbage() 256 | 257 | # ======================= 258 | # Clean indicators 259 | # ======================= 260 | lines = [line for paragraph in text_data.text for line in paragraph 261 | if line.grade != 0 and self.model["clean"].match(line)] 262 | 263 | for line in lines: 264 | line.set_clean() 265 | 266 | # ======================= 267 | # Post processing 268 | # ======================= 269 | lines = [line for paragraph in text_data.text for line in paragraph] 270 | previous_line = None 271 | 272 | # Smoothing function 273 | for line in lines: 274 | # Decrease grade if previous line is a garbage string 275 | if previous_line is not None and previous_line.grade == 0 and line.grade != 5: 276 | line.decrease_grade() 277 | 278 | # Decrease grade of previous line 279 | if line.grade == 0 and previous_line is not None and previous_line.grade != 5: 280 | previous_line.decrease_grade() 281 | 282 | previous_line = line 283 | 284 | 285 | class MachineLearningModel(AbstractModel): 286 | """Model storing all machine learning data 287 | """ 288 | 289 | def __init__(self, app_config): 290 | super(MachineLearningModel, self).__init__(app_config) 291 | 292 | self.model = { 293 | "algo": MachineLearningAlgorithm(), 294 | "features": MachineLearningFeatures() 295 | } 296 | 297 | def train(self, dataset): 298 | """Train the model with a dataset 299 | 300 | Args: 301 | dataset (list): List of training files 302 | """ 303 | # Get the original training set 304 | training_set = self.model["algo"].training_set 305 | 306 | # Append the new data to it 307 | for text in dataset: 308 | self.logger.debug("Processing "+text.filename+"...") 309 | unigrams = Unigrams(join(self.config["root"], 310 | self.config["dirs"]["models_root"], 311 | self.config["dirs"]["models"]["inline"], 312 | self.config["models"]["inline"]["unigrams"],)) 313 | 314 | for p in text.text: 315 | for line in p: 316 | if line.grade % 5 != 0: # Unclassified lines are useless for the training 317 | continue 318 | 319 | f = MachineLearningFeatures() 320 | features = f.extract_features(line, unigrams.ngrams, text.stats) 321 | result = int(line.grade / 5) 322 | 323 | training_set["features"].append(features) 324 | 
training_set["results"].append(result) 325 | 326 | self.logger.debug("Saving training set...") 327 | save(training_set, join(self.config["dirs"]["models_root"], 328 | self.config["dirs"]["models"]["learning"], 329 | self.config["models"]["learning"]["training_set"])) 330 | 331 | self.logger.debug("Training model...") 332 | ml_classifier = SGDClassifier(loss="log", class_weight="auto") 333 | self.model["algo"].set_classifier(ml_classifier) 334 | self.model["algo"].set_training_set(training_set["features"], training_set["results"]) 335 | self.model["algo"].train() 336 | 337 | save(self.model["algo"].classifier, join(self.config["dirs"]["models_root"], 338 | self.config["dirs"]["models"]["learning"], 339 | self.config["models"]["learning"]["classifier"])) 340 | 341 | def load(self, text_data): 342 | """Load text data to the model 343 | 344 | Args: 345 | text_data (`Text`): Text data 346 | """ 347 | pass 348 | 349 | def correct(self, text_data): 350 | """Correct text data 351 | 352 | Args: 353 | text_data (`Text`): Text data 354 | """ 355 | unigrams = Unigrams(join(self.config["root"], 356 | self.config["dirs"]["models_root"], 357 | self.config["dirs"]["models"]["inline"], 358 | self.config["models"]["inline"]["unigrams"],)) 359 | 360 | ml_classifier = load(join(self.config["dirs"]["models_root"], 361 | self.config["dirs"]["models"]["learning"], 362 | self.config["models"]["learning"]["classifier"])) 363 | 364 | if ml_classifier is None: 365 | return 366 | 367 | self.model["algo"].set_classifier(ml_classifier) 368 | 369 | for paragraph in text_data.text: 370 | for line in paragraph: 371 | if line.grade % 5 == 0: 372 | continue 373 | 374 | f = MachineLearningFeatures() 375 | features = f.extract_features(line, unigrams.ngrams, text_data.stats) 376 | line.grade = self.model["algo"].classify(features) * 5 377 | -------------------------------------------------------------------------------- /packages/denoiser/src/denoiser/models/indicators/__init__.py: -------------------------------------------------------------------------------- 1 | """List of all the different indicators used to clean a text 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from __future__ import division 17 | import re 18 | 19 | 20 | class StatsIndicator(object): 21 | """Indicator based on statistics (match the line depending on the stats) 22 | """ 23 | 24 | def __init__(self, text_stats=None): 25 | self.stats = text_stats 26 | 27 | def set_stats(self, text_stats): 28 | """Set statistics of the indicator based on text statistics 29 | 30 | Args: 31 | text_stats (`Statistics`): Text statistics 32 | """ 33 | self.stats = text_stats 34 | 35 | def match(self, line): 36 | """Define if a line is matching the rules 37 | 38 | Args: 39 | line (Line): Input line 40 | 41 | Returns: 42 | bool: True 43 | """ 44 | return True 45 | 46 | 47 | class RegexIndicator(object): 48 | """Indicator based on a regexp (match the line with the given regexp) 49 | """ 50 | 51 | def __init__(self, regexp): 52 | self.regexp = '^'+regexp+'$' 53 | 54 | def match(self, line): 55 | """Define if a line is matching the rules 56 | 57 | Args: 58 | line (Line): Input line 59 | 60 | Returns: 61 | bool: True if line match the RegExp, false otherwise 62 | """ 63 | return re.match(self.regexp, line.get_clean_line()) 64 | 65 | 66 | # ========================================== 67 | # STRONG INDICATORS 68 | # ========================================== 69 | 70 | class AlphaNumIndicator(StatsIndicator): 71 | """Indicator detecting a high number of special chars 72 | """ 73 | 74 | def __init__(self, stats=None): 75 | self.spchar_rate = 0.6 76 | super(AlphaNumIndicator, self).__init__(stats) 77 | 78 | def match(self, line): 79 | return True if len(line) == 0 else line.get_clean_stats().get_stat('sp_char') / len(line) > self.spchar_rate 80 | 81 | 82 | class CardinalNumberIndicator(RegexIndicator): 83 | """Indicator detecting cardinal numbers 84 | """ 85 | 86 | def __init__(self): 87 | super(CardinalNumberIndicator, self).__init__("[0-9efEaAoOsSt.,= \\-]+") 88 | 89 | 90 | # ========================================== 91 | # CLEAN INDICATORS 92 | # ========================================== 93 | 94 | class CleanTextIndicator(StatsIndicator): 95 | """Indicator detecting a clean line 96 | """ 97 | 98 | def __init__(self, stats=None): 99 | self.max_length_rate = 0.5 100 | self.char_rate = 0.6 101 | super(CleanTextIndicator, self).__init__(stats) 102 | 103 | def match(self, line): 104 | if len(line) == 0: 105 | return False 106 | 107 | return float(len(line)) >= self.stats.get_stat("line_avg_length") * self.max_length_rate \ 108 | and (line.get_clean_stats().get_stat('lw_char') / len(line) > self.char_rate 109 | or line.get_clean_stats().get_stat('up_char') / len(line) > self.char_rate) 110 | 111 | 112 | class TitleIndicator(RegexIndicator): 113 | """Indicator matching a title. A title is a line beginning with an upper char and followed by lower chars or space 114 | """ 115 | 116 | def __init__(self): 117 | super(TitleIndicator, self).__init__("[A-Z][a-z ]+") 118 | -------------------------------------------------------------------------------- /packages/denoiser/src/denoiser/models/indicators/lists.py: -------------------------------------------------------------------------------- 1 | """Package containing indicators lists 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from __future__ import division 17 | from denoiser.models.indicators import * 18 | 19 | 20 | class IndicatorsList(object): 21 | """Object handling a list of indicator of a same purpose 22 | """ 23 | 24 | def __init__(self): 25 | self.indicators = [] 26 | 27 | def add_indicator(self, indicator): 28 | """Add an indicator to the list 29 | 30 | Args: 31 | indicator (Indicator): Indicator to add to the list 32 | """ 33 | self.indicators.append(indicator) 34 | 35 | def set_stats(self, stats): 36 | """Set stats for all the StatsIndicator 37 | 38 | Args: 39 | stats (`Statistics`): Text statistics to setup 40 | """ 41 | for indicator in self.indicators: 42 | if indicator.__class__.__base__ == StatsIndicator: 43 | indicator.set_stats(stats) 44 | 45 | def match(self, line): 46 | """Define if a line is matching the indicators 47 | 48 | Args: 49 | line (`Line`): Input line 50 | 51 | Returns: 52 | bool: True if line match at least one indicator 53 | """ 54 | return self.match_rate(line) > 0 55 | 56 | def match_rate(self, line): 57 | """Get the ratio of match of a line 58 | 59 | Args: 60 | line (Line): Input line 61 | 62 | Returns: 63 | float: Ratio of match / number of indicators 64 | """ 65 | total_ind = len(self.indicators) 66 | matching_ind = 0 67 | 68 | for indicator in self.indicators: 69 | if indicator.match(line): 70 | matching_ind += 1 71 | 72 | return matching_ind / total_ind 73 | 74 | 75 | class StrongIndicatorList(IndicatorsList): 76 | """List of strong indicator (detecting garbage strings) 77 | """ 78 | 79 | def __init__(self): 80 | super(StrongIndicatorList, self).__init__() 81 | 82 | self.add_indicator(AlphaNumIndicator()) 83 | self.add_indicator(CardinalNumberIndicator()) 84 | 85 | 86 | class CleanIndicatorList(IndicatorsList): 87 | """List detecting clean lines 88 | """ 89 | 90 | def __init__(self): 91 | super(CleanIndicatorList, self).__init__() 92 | 93 | self.add_indicator(CleanTextIndicator()) 94 | self.add_indicator(TitleIndicator()) 95 | -------------------------------------------------------------------------------- /packages/denoiser/src/denoiser/models/inline/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing all the main inline structures 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from __future__ import division 17 | from math import floor 18 | from numpy.lib.function_base import median 19 | from collections import Counter 20 | import inspect 21 | from os.path import exists 22 | import operator 23 | from nltk.util import ngrams as nltk_ngrams 24 | from denoiser.models.inline.hashing import ocr_key_list_to_str, ocr_key_hash, anagram_hash 25 | from apputils.pickling import load, save 26 | import re 27 | from operator import add 28 | 29 | 30 | def truncate_map(occurence_map): 31 | """Truncate an occurence map by removing uncommon iteration 32 | 33 | Parameters: 34 | occurence_map (dict): Dictionary containing word as key and occurence as value 35 | 36 | Returns: 37 | dict: Truncated map 38 | """ 39 | # Get occurences distribution 40 | distribution = Counter(occurence_map.values()) 41 | dist_median = median(distribution.values()) 42 | 43 | # Compute upper bound 44 | limit = 0.99 45 | dist_upper_median = sorted([v for v in distribution.values() if v > dist_median]) 46 | dist_upper_bound = int(floor(len(dist_upper_median) * limit)) 47 | 48 | # Compute new distribution 49 | min_dist_value = dist_upper_median[dist_upper_bound - 1] 50 | distribution = {k: v for k, v in distribution.items() if v <= min_dist_value} 51 | 52 | # Return new occurence map 53 | return {k: v for k, v in occurence_map.items() if v in distribution.keys()} 54 | 55 | 56 | class InlineStructure(object): 57 | """Abstract inline structure 58 | """ 59 | 60 | def __init__(self, filename): 61 | self.filename = filename 62 | 63 | if exists(self.filename): 64 | self.load() 65 | 66 | def append_data(self, **kwargs): 67 | """Append data to the structure 68 | 69 | Args: 70 | **kwargs: Arbitrary keyword arguments 71 | 72 | Raise: 73 | NotImplementedError: Not yet implemented 74 | """ 75 | raise NotImplementedError("Function "+inspect.stack()[0][3]+" has not been implemented") 76 | 77 | def load(self): 78 | """Load the structure from the file if it exists 79 | """ 80 | if not exists(self.filename): 81 | return 82 | 83 | def save(self): 84 | """Save the structure to the file 85 | 86 | Raise: 87 | NotImplementedError: Not yet implemented 88 | """ 89 | raise NotImplementedError("Function "+inspect.stack()[0][3]+" has not been implemented") 90 | 91 | 92 | class NGramsStructure(InlineStructure): 93 | """Abstract n-gram structure 94 | """ 95 | 96 | def __init__(self, filename): 97 | self.ngrams = Counter() 98 | self.ngrams_pruned = Counter() 99 | 100 | super(NGramsStructure, self).__init__(filename) 101 | 102 | def append_data(self, **kwargs): 103 | raise NotImplementedError("Function "+inspect.stack()[0][3]+" has not been implemented") 104 | 105 | def prune(self, rate): 106 | """Prune ngrams list given the rate of data to keep 107 | 108 | Args: 109 | rate (float): Limit rate of data to keep 110 | """ 111 | if rate >= 1: 112 | self.ngrams_pruned = self.ngrams 113 | return 114 | 115 | pruned_target = {} 116 | 117 | truncated_target = truncate_map(self.ngrams) 118 | sorted_target = sorted(truncated_target.iteritems(), key=operator.itemgetter(1), reverse=True) 119 | 120 | total = len(sorted_target) 121 | registered = 0 122 | current_occ = 0 123 | for (data, occurence) in sorted_target: 124 | if registered / total >= rate and occurence != current_occ: 125 | break 126 | 127 | 
current_occ = occurence 128 | pruned_target[data] = occurence 129 | registered += 1 130 | 131 | self.ngrams_pruned = Counter(pruned_target) 132 | 133 | def load(self): 134 | super(NGramsStructure, self).load() 135 | 136 | def save(self): 137 | super(NGramsStructure, self).save() 138 | 139 | 140 | class Dictionary(InlineStructure): 141 | """Dictionary 142 | """ 143 | 144 | def __init__(self, filename): 145 | self.dictionary = list() 146 | 147 | super(Dictionary, self).__init__(filename) 148 | 149 | def append_data(self, unigrams): 150 | word_list = [] 151 | 152 | aspell_dict = "models/aspell.en.dict" 153 | with open(aspell_dict, "r") as f: 154 | for line in f: 155 | word_list.append(line.strip("\r\n")) 156 | 157 | plc_set = set(unigrams) 158 | word_set = set(word_list) 159 | 160 | self.dictionary = list(plc_set.intersection(word_set)) 161 | self.save() 162 | 163 | def load(self): 164 | super(Dictionary, self).load() 165 | 166 | self.dictionary = load(self.filename) 167 | 168 | def save(self): 169 | save(self.dictionary, self.filename) 170 | 171 | 172 | class Unigrams(NGramsStructure): 173 | """Unigrams list 174 | """ 175 | 176 | def __init__(self, filename): 177 | self.raw_unigrams = Counter() # Unigrams not submitted to case modification 178 | 179 | super(Unigrams, self).__init__(filename) 180 | 181 | def append_data(self, text_data): 182 | unigrams = [token[1] for paragraph in text_data.text for line in paragraph for token in line.tokens 183 | if line.grade != 0 and not token[1] is None and len(token[1]) > 1] 184 | 185 | unigrams_counter = Counter(unigrams) 186 | self.raw_unigrams += unigrams_counter 187 | 188 | self.save() 189 | return unigrams 190 | 191 | def generate_low_case(self, altcase_map): 192 | """Generate lower case unigrams 193 | 194 | Args: 195 | altcase_map (dict): List of alternative case word for a given lowercase word 196 | """ 197 | low_unigrams = {key: 0 for key in altcase_map.keys()} 198 | 199 | for unigram, alt_case_list in altcase_map.items(): 200 | low_unigrams[unigram] = sum([self.raw_unigrams[alt_case] for alt_case in alt_case_list]) 201 | 202 | self.ngrams = Counter(low_unigrams) 203 | self.save() 204 | 205 | def load(self): 206 | super(Unigrams, self).load() 207 | 208 | data = load(self.filename) 209 | 210 | self.raw_unigrams = data["raw_unigrams"] 211 | self.ngrams = data["unigrams"] 212 | self.ngrams_pruned = data["unigrams_pruned"] 213 | 214 | def save(self): 215 | data = { 216 | "raw_unigrams": self.raw_unigrams, 217 | "unigrams": self.ngrams, 218 | "unigrams_pruned": self.ngrams_pruned 219 | } 220 | 221 | save(data, self.filename) 222 | 223 | 224 | class Bigrams(NGramsStructure): 225 | """Bigrams list 226 | """ 227 | 228 | def __init__(self, filename): 229 | super(Bigrams, self).__init__(filename) 230 | 231 | def append_data(self, unigrams): 232 | bigrams = [bigram[0].lower()+" "+bigram[1].lower() for bigram in nltk_ngrams(unigrams, 2) 233 | if len(bigram[0]) > 1 and len(bigram[1]) > 1] 234 | 235 | self.ngrams += Counter(bigrams) 236 | self.prune(0.35) 237 | 238 | self.save() 239 | 240 | def load(self): 241 | super(Bigrams, self).load() 242 | 243 | data = load(self.filename) 244 | 245 | self.ngrams = data["bigrams"] 246 | self.ngrams_pruned = data["bigrams_pruned"] 247 | 248 | def save(self): 249 | data = { 250 | "bigrams": self.ngrams, 251 | "bigrams_pruned": self.ngrams_pruned 252 | } 253 | 254 | save(data, self.filename) 255 | 256 | 257 | class AltCaseMap(InlineStructure): 258 | """Alternative case map 259 | """ 260 | 261 | def __init__(self, filename): 
262 | self.altcase_map = {} 263 | self.altcase_pruned_map = {} 264 | 265 | super(AltCaseMap, self).__init__(filename) 266 | 267 | def append_data(self, unigrams): 268 | _altcase_map = {unigram.lower(): set() for unigram in unigrams.keys()} 269 | 270 | for unigram in unigrams.keys(): 271 | _altcase_map[unigram.lower()].add(unigram) 272 | 273 | self.altcase_map = {key: set(value) for key, value in _altcase_map.items()} 274 | self.save() 275 | 276 | def prune(self, unigrams_pruned): 277 | """Prume the map given selected unigrams 278 | 279 | Args: 280 | unigrams_pruned (dict): List of unigrams to keep in the final list 281 | """ 282 | self.altcase_pruned_map = {unigram: self.altcase_map[unigram] for unigram in unigrams_pruned.keys()} 283 | self.save() 284 | 285 | def load(self): 286 | super(AltCaseMap, self).load() 287 | 288 | data = load(self.filename) 289 | 290 | self.altcase_map = data["altcase"] 291 | self.altcase_pruned_map = data["altcase_pruned"] 292 | 293 | def save(self): 294 | data = { 295 | "altcase": self.altcase_map, 296 | "altcase_pruned": self.altcase_pruned_map 297 | } 298 | 299 | save(data, self.filename) 300 | 301 | 302 | class OcrKeyMap(InlineStructure): 303 | """OCR Key map 304 | """ 305 | 306 | def __init__(self, filename): 307 | self.ocrkey_map = {} 308 | 309 | super(OcrKeyMap, self).__init__(filename) 310 | 311 | def append_data(self, unigrams): 312 | word_list = [] 313 | 314 | aspell_dict = "models/aspell.en.dict" 315 | with open(aspell_dict, "r") as f: 316 | for line in f: 317 | word_list.append(line.strip("\r\n")) 318 | 319 | word_set = set(word_list) 320 | unigram_set = set(unigrams.keys()) 321 | 322 | ocr_key_map = {ocr_key_list_to_str(ocr_key_hash(word)): set() for word in unigram_set.intersection(word_set)} 323 | 324 | # Every word contained in the mixed case map and the dictionary 325 | for word in unigram_set.intersection(word_set): 326 | h_list = ocr_key_hash(word) 327 | h_str = ocr_key_list_to_str(h_list) 328 | 329 | ocr_key_map[h_str].add(word) # Add the word to the tab 330 | 331 | combine_struct = {key: set() for key in self.ocrkey_map.keys() + ocr_key_map.keys()} 332 | 333 | for key, value in self.ocrkey_map.items() + ocr_key_map.items(): 334 | combine_struct[key] = combine_struct[key].union(value) 335 | 336 | self.ocrkey_map = combine_struct 337 | self.save() 338 | 339 | def load(self): 340 | super(OcrKeyMap, self).load() 341 | 342 | self.ocrkey_map = load(self.filename) 343 | 344 | def save(self): 345 | save(self.ocrkey_map, self.filename) 346 | 347 | 348 | class AnagramMap(InlineStructure): 349 | """Anagram map 350 | """ 351 | 352 | def __init__(self, filename): 353 | self.anagram_hashmap = {} 354 | self.anagram_alphabet = {} 355 | 356 | super(AnagramMap, self).__init__(filename) 357 | 358 | def append_data(self, bigrams, unigrams): 359 | anaghash_map = {anagram_hash(word): set() for word in bigrams.keys() + unigrams.keys()} 360 | 361 | for word in bigrams.keys() + unigrams.keys(): 362 | anaghash_map[anagram_hash(word)].add(word) 363 | 364 | self.anagram_hashmap = anaghash_map 365 | 366 | clean_word = re.compile(r"^[a-zA-Z '-]+$") 367 | alphabet = set() 368 | 369 | for word in unigrams: 370 | word = " "+word+" " 371 | chars = [char for char in word] # Getting letters from the word 372 | chars += map(add, chars[:-1], chars[1:]) # Adding bigrams to the list 373 | 374 | alphabet = alphabet.union([anagram_hash(char) for char in set(chars) 375 | if not clean_word.match(char) is None]) 376 | 377 | alphabet.add(0) 378 | 379 | self.anagram_alphabet = alphabet 
380 | self.save() 381 | 382 | def load(self): 383 | super(AnagramMap, self).load() 384 | 385 | data = load(self.filename) 386 | 387 | self.anagram_hashmap = data["hashmap"] 388 | self.anagram_alphabet = data["alphabet"] 389 | 390 | def save(self): 391 | data = { 392 | "hashmap": self.anagram_hashmap, 393 | "alphabet": self.anagram_alphabet 394 | } 395 | 396 | save(data, self.filename) 397 | -------------------------------------------------------------------------------- /packages/denoiser/src/denoiser/models/inline/hashing.py: -------------------------------------------------------------------------------- 1 | """Hashing functions use for anagrams and ocr keys 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | 17 | ocr_key_class_map = { 18 | # Lower case 19 | "a": ("o", 1), "b": ("o", 1), "c": ("c", 1), "d": ("o", 1), "e": ("c", 1), "f": ("i", 1), 20 | "g": ("o", 1), "h": ("i", 2), "i": ("i", 1), "j": ("i", 1), "k": ("i", 1), "l": ("i", 1), 21 | "m": ("i", 3), "n": ("i", 2), "o": ("o", 1), "p": ("o", 1), "q": ("o", 1), "r": ("i", 1), 22 | "s": ("s", 1), "t": ("i", 1), "u": ("i", 2), "v": ("v", 1), "w": ("v", 2), "x": ("v", 1), 23 | "y": ("v", 1), "z": ("z", 1), 24 | 25 | # Upper case 26 | "A": ("a", 1), "B": ("i", 1), "C": ("c", 1), "D": ("i", 1), "E": ("i", 1), "F": ("i", 1), 27 | "G": ("c", 1), "H": ("i", 2), "I": ("i", 1), "J": ("i", 1), "K": ("i", 1), "L": ("i", 1), 28 | "M": ("i", 3), "N": ("i", 2), "O": ("o", 1), "P": ("i", 1), "Q": ("o", 1), "R": ("i", 1), 29 | "S": ("s", 1), "T": ("i", 1), "U": ("i", 2), "V": ("v", 1), "W": ("v", 2), "X": ("v", 1), 30 | "Y": ("v", 1), "Z": ("z", 1), 31 | 32 | # Numbers and special chars 33 | "0": ("o", 1), "1": ("i", 1), "5": ("s", 1), "6": ("o", 1), "9": ("o", 1), "!": ("i", 1), 34 | "'": ("'", 1), "-": ("-", 1) 35 | } 36 | """dict: Mapping of the letters to OCR key values 37 | """ 38 | 39 | 40 | def anagram_hash(word): 41 | """Compute anagram hash of a word 42 | 43 | Parameters: 44 | word (:func:`str`): Word that needs to be hashed 45 | 46 | Returns: 47 | int: Anagram representation of the word 48 | """ 49 | anag_hash = sum([pow(ord(char), 5) for char in word]) 50 | 51 | return anag_hash 52 | 53 | 54 | def ocr_key_hash(word): 55 | """Generate OCR key hash from a word 56 | 57 | Parameters: 58 | word (:func:`str`): Word that needs to be hashed 59 | 60 | Returns: 61 | list: OCR key hash of the word 62 | """ 63 | ocrk_hash = [] 64 | 65 | for char in word: 66 | if char not in ocr_key_class_map: 67 | char_class = ("#", 1) 68 | else: 69 | char_class = ocr_key_class_map[char] 70 | 71 | if len(ocrk_hash) > 0 and ocrk_hash[-1][0] == char_class[0]: 72 | ocrk_hash[-1] = (char_class[0], ocrk_hash[-1][1] + char_class[1]) 73 | else: 74 | ocrk_hash.append(char_class) 75 | 76 | return ocrk_hash 77 | 78 | 79 | def ocr_key_list_to_str(ocr_key_list): 80 | """Generate OCR key string from the list 81 | 82 | Parameters: 83 | ocr_key_list (list): OCR key hash of a word 84 | 85 | Returns: 86 | :func:`str`: OCR key string 87 | """ 88 | ocr_key_str = "" 89 | 90 | for key_tuple in ocr_key_list: 91 | ocr_key_str += key_tuple[0] + str(key_tuple[1]) 92 | 93 | return ocr_key_str 94 | -------------------------------------------------------------------------------- 
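The two hash functions above drive the candidate lookup used later in `utils.py` (`select_anagrams` and `select_ocrsims`): the anagram hash is order-independent, and the OCR key collapses visually confusable character shapes into shape classes. The snippet below is a minimal illustration and is not part of the repository; it assumes the `denoiser` package has been installed so the module is importable, and the expected key string follows from the `ocr_key_class_map` defined above.

```python
# Illustrative only -- assumes the denoiser package is installed.
from denoiser.models.inline.hashing import (
    anagram_hash,
    ocr_key_hash,
    ocr_key_list_to_str,
)

# The anagram hash sums ord(char)**5, so it ignores character order:
# permutations of the same letters collide by design.
assert anagram_hash("abc") == anagram_hash("cab")

# OCR keys merge runs of the same shape class. A classic "m" -> "rn"
# misread therefore yields the same key, which is what makes the
# OCR-key lookup tolerant to this kind of error.
assert ocr_key_list_to_str(ocr_key_hash("company")) == "c1o1i3o2i2v1"
assert ocr_key_list_to_str(ocr_key_hash("cornpany")) == "c1o1i3o2i2v1"
```

Because `company` and `cornpany` share an OCR key, the denoiser can later propose the dictionary word as a correction for the misread token.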
/packages/denoiser/src/denoiser/models/inline/ranking.py: -------------------------------------------------------------------------------- 1 | """Ranking functions used across the denoiser 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from math import log 17 | from nltk.metrics.distance import edit_distance 18 | 19 | 20 | def rate_anagram(freq_map, word, anagram, int_retrievals): 21 | """Rate an anagram 22 | 23 | Parameters: 24 | freq_map (dict): Occurence map 25 | word (:func:`str`): Word to evaluate 26 | anagram (:func:`str`): Possible anagram 27 | int_retrievals (int): Internal retrievals 28 | 29 | Returns: 30 | float: OCR key score 31 | """ 32 | score = len(word) - edit_distance(word, anagram) 33 | score *= int_retrievals * log(freq_map[anagram.lower()]) 34 | 35 | return score 36 | 37 | 38 | def rate_ocr_key(freq_map, word, ocr_sim, cardinality): 39 | """Rate an OCR key 40 | 41 | Parameters: 42 | freq_map (dict): Occurence map 43 | word (:func:`str`): Word to evaluate 44 | ocr_sim (:func:`str`): Possible OCR key 45 | cardinality (int): Cardinality 46 | 47 | Returns: 48 | float: OCR key score 49 | """ 50 | score = len(word) - edit_distance(word, ocr_sim) - cardinality 51 | score *= log(freq_map[ocr_sim.lower()]) 52 | 53 | return score 54 | 55 | 56 | def rate_bigram(correction, previous_tokens, next_tokens, occurence_map): 57 | """Get the bigram boost of a given token 58 | 59 | Parameters: 60 | correction (:func:`str`): Correction word to evaluate 61 | previous_tokens (list): Possible previous tokens 62 | next_tokens (list): Possible next tokens 63 | occurence_map (dict): Occurence map 64 | 65 | Returns: 66 | float: Bigram score 67 | """ 68 | min_score = 2 69 | total_score = 0 70 | 71 | bigrams = [previous_w+" "+correction for previous_w in previous_tokens] + \ 72 | [correction+" "+next_w for next_w in next_tokens] 73 | 74 | for bigram in bigrams: 75 | total_score += occurence_map[bigram] 76 | 77 | return log(max(min_score, total_score)) 78 | 79 | 80 | def rate_corrections(correction_list): 81 | """Bring the score between 0 and 1 82 | 83 | Parameters: 84 | correction_list (dict): The list of correction 85 | Returns: 86 | list: Correction list with corrected scores 87 | """ 88 | if len(correction_list) == 1: 89 | key = correction_list.keys()[0] 90 | correction_list[key] = 1 91 | 92 | return correction_list 93 | 94 | total_score = sum(correction_list.values()) 95 | 96 | return {correction: score/total_score for correction, score in correction_list.items()} 97 | -------------------------------------------------------------------------------- /packages/denoiser/src/denoiser/models/inline/utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for inline denoising 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from hashlib import md5 17 | from collections import Counter 18 | from copy import deepcopy 19 | from math import log 20 | from operator import add 21 | from nltk.metrics.distance import edit_distance 22 | import operator 23 | from denoiser.models.inline.hashing import anagram_hash, ocr_key_hash 24 | from denoiser.models.inline.ranking import rate_anagram, rate_ocr_key, rate_bigram 25 | 26 | 27 | def init_correction_map(token, dictionary): 28 | """Initialize the correction dictionary 29 | 30 | Parameters: 31 | token (:func:`str`): Cleaned token 32 | dictionary (:func:`dict`): Dictionary structure 33 | 34 | Returns: 35 | :func:`dict` or :data:`None` - Correction map 36 | """ 37 | if token is None: 38 | return None 39 | 40 | if len(token) <= 2 or token.lower() in dictionary: 41 | return {token: 1} 42 | 43 | return {} 44 | 45 | 46 | def generate_alphabet_from_word(word): 47 | """Generate anagram hash for all chars in a word 48 | 49 | Parameters: 50 | word (:func:`str`): Word to generate hash 51 | Returns: 52 | set - Set of hashes 53 | """ 54 | word = " "+word+" " 55 | chars = [char for char in word] # Getting letters from the word 56 | chars += map(add, chars[:-1], chars[1:]) # Adding bigrams to the list 57 | 58 | # Computing hash of items and add 0 to the list 59 | return set([0] + [anagram_hash(c) for c in set(chars)]) 60 | 61 | 62 | def select_anagrams(token, structures): 63 | """Select possible anagrams for a given token 64 | 65 | Parameters: 66 | token (:func:`str`): Cleaned token 67 | structures (:func:`dict`): Datastructures from file 68 | 69 | Returns: 70 | :func:`dict` - Possible anagrams (keys) along with their score (values) 71 | """ 72 | anagrams = {} 73 | focus_alphabet = generate_alphabet_from_word(token[1]) 74 | token_hash = anagram_hash(token) 75 | 76 | hash_list = [] 77 | for c in structures["alphabet"]: 78 | for f in focus_alphabet: 79 | hash_list.append(token_hash + c - f) 80 | 81 | hash_counter = Counter(hash_list) # Counting retrieval occurence 82 | 83 | for h in set(hash_counter.keys()).intersection(set(structures["anagrams"].keys())): 84 | count = hash_counter[h] 85 | anag_list = [anag for anag in structures["anagrams"][h] if edit_distance(anag, token) <= 3] 86 | 87 | for anag in anag_list: 88 | anag_score = rate_anagram(structures["occurence_map"], token, anag, count) 89 | 90 | if anag_score > 0: 91 | anagrams[anag] = anag_score 92 | 93 | return anagrams 94 | 95 | 96 | def select_ocrsims(token, structures): 97 | """Select similar words for a given token 98 | 99 | Parameters: 100 | token (:func:`str`): Cleaned token 101 | structures (:func:`dict`): Datastructures from file 102 | 103 | Returns: 104 | :func:`dict` - Similar words (keys) along with their score (values) 105 | """ 106 | delta = 2 107 | ocr_sims = {} 108 | 109 | word_hash = ocr_key_hash(token) 110 | 111 | sim_hash_list = {} # Using a dictionary avoid multiple entries if a key is retrieved twice 112 | key_index = -1 113 | 114 | # for (key, value) in word_hash: 115 | for key, value in word_hash: 116 | key_index += 1 117 | sim_hash = deepcopy(word_hash) 118 | 119 | for d in range(-delta, delta+1): 120 | if d != 0: 121 | card = max(int(value)+d, 1) 122 | 123 | sim_hash[key_index] = (key, card) 124 | 125 | # Rebuild OCR key string 126 | sim_hash_str = "" 127 | for 
k, v in sim_hash: 128 | sim_hash_str += k + str(v) 129 | 130 | if sim_hash_str in structures["ocrkeys"]: 131 | card_diff = abs(int(value)-card) 132 | 133 | sim_hash_list[sim_hash_str] = [(sim_word, card_diff) 134 | for sim_word in structures["ocrkeys"][sim_hash_str] 135 | if edit_distance(sim_word, token) <= 2] 136 | 137 | for sim_hash_str, sim_list in sim_hash_list.items(): 138 | for sim_word, card_diff in sim_list: 139 | sim_score = rate_ocr_key(structures["occurence_map"], token, sim_word, card_diff) 140 | 141 | if sim_score > 0: 142 | ocr_sims[sim_word] = sim_score 143 | 144 | return ocr_sims 145 | 146 | 147 | def truncate_ocr_sim_list(token, ocr_sims_list, limit=10): 148 | """Truncate the OCR key similarity list to a defined set of possibilities 149 | 150 | Parameters: 151 | token (:func:`str`): Initial token 152 | ocr_sims_list (dict): OCR similarities 153 | limit (int): Final number of similarities 154 | 155 | Returns: 156 | dict - List of similarities to keep 157 | """ 158 | if len(ocr_sims_list) <= limit: 159 | return ocr_sims_list 160 | 161 | ocr_scores = set([sc for sim, sc in ocr_sims_list.items()]) 162 | 163 | # Limit of 10 different scores allowed 164 | sorted_ocr_scores = sorted(ocr_scores, reverse=True)[:limit] 165 | ocr_list = [] 166 | for score in sorted_ocr_scores: 167 | tmp_ocr_list = [ocr_sims for ocr_sims, ocr_score in ocr_sims_list.items() if ocr_score == score] 168 | 169 | if len(ocr_list) + len(tmp_ocr_list) > limit: 170 | list_len = limit - len(ocr_list) 171 | tmp_list = [] 172 | 173 | while len(tmp_list) < list_len: 174 | tmp_list += select_lower_edit_distance(token, tmp_ocr_list) 175 | 176 | if len(ocr_list) + len(tmp_list) == limit: # Final list has exactly 10 elements 177 | ocr_list += tmp_list 178 | break 179 | else: # List has more than 10 arguments (need to chose only the n elements needed) 180 | alpha_tmp_list = [] 181 | 182 | while len(alpha_tmp_list) != list_len: 183 | alpha_word = select_best_alphabetical_word(token, tmp_list) 184 | 185 | alpha_tmp_list.append(alpha_word) 186 | tmp_list = [tkn for tkn in tmp_list if tkn != alpha_word] 187 | 188 | ocr_list += alpha_tmp_list 189 | break 190 | elif len(ocr_list) + len(tmp_ocr_list) == limit: 191 | ocr_list += tmp_ocr_list 192 | break 193 | else: # len(ocr_list) + len(tmp_ocr_list) < limit 194 | ocr_list += tmp_ocr_list 195 | 196 | if len(ocr_list) != limit: 197 | raise IndexError("OCR list is still too big ("+str(len(ocr_list))+"/"+str(limit)+")") 198 | 199 | return {tkn: ocr_sims_list[tkn] for tkn in ocr_list} 200 | 201 | 202 | def split_ocr_list(token, ocr_list): 203 | """Split the OCR list between strong and week OCR words 204 | 205 | Parameters: 206 | token (:func:`str`): Token to correct 207 | ocr_list (:func:`dict`): List of possible OCR correction 208 | Returns: 209 | tuple - Strong OCR words and weak OCR words 210 | """ 211 | 212 | # Build the sorted OCR key list and divide it into 2 different stacks 213 | ocr_words = sorted( 214 | ocr_list.iteritems(), 215 | key=operator.itemgetter(1), 216 | reverse=True 217 | ) 218 | strong_ocr_words = {tkn: sc for tkn, sc in ocr_words[:5]} 219 | weak_ocr_words = {tkn: sc for tkn, sc in ocr_words[5:]} 220 | 221 | min_strong_score = min([sc for tkn, sc in strong_ocr_words.items()]) 222 | max_weak_score = max([sc for tkn, sc in weak_ocr_words.items()]) 223 | 224 | if min_strong_score == max_weak_score: 225 | strong_tmp_ocr_words = [tkn for tkn, score in strong_ocr_words.items() if score == min_strong_score] 226 | weak_tmp_ocr_words = [tkn for tkn, score in 
weak_ocr_words.items() if score == min_strong_score] 227 | 228 | tmp_list = {t: min_strong_score for t in strong_tmp_ocr_words} 229 | tmp_list.update({t: min_strong_score for t in weak_tmp_ocr_words}) 230 | 231 | tmp_strg_list = truncate_ocr_sim_list(token, tmp_list, len(strong_tmp_ocr_words)) 232 | tmp_weak_list = [tkn for tkn in tmp_list.keys() if tkn not in tmp_strg_list] 233 | 234 | strong_ocr_words = {tkn: sc for tkn, sc in strong_ocr_words.items() if tkn not in tmp_list.keys()} 235 | strong_ocr_words.update(tmp_strg_list) 236 | 237 | weak_ocr_words = {tkn: sc for tkn, sc in weak_ocr_words.items() if tkn not in tmp_list.keys()} 238 | weak_ocr_words.update({tkn: min_strong_score for tkn in tmp_weak_list}) 239 | 240 | return strong_ocr_words, weak_ocr_words 241 | 242 | 243 | def build_candidates_list(token, anagrams_list, ocr_sims_list, structures): 244 | """Merge anagram and OCRkey list into one list. 245 | 246 | Parameters: 247 | token (:func:`str`): Cleaned token 248 | anagrams_list (:func:`dict`): Result of `select_anagrams` 249 | ocr_sims_list (:func:`dict`): Result of `select_ocrsims` 250 | structures (:func:`dict`): Datastructures from file 251 | 252 | Returns: 253 | :func:`dict` - Correction tokens (keys) along with their score (values) 254 | """ 255 | final_list = anagrams_list 256 | 257 | ocr_list = truncate_ocr_sim_list(token, ocr_sims_list) 258 | 259 | strong_ocr_list = ocr_list 260 | weak_ocr_list = {} 261 | if len(ocr_list) > 5: 262 | (strong_ocr_list, weak_ocr_list) = split_ocr_list(token, ocr_list) 263 | 264 | for ocr_word, ocr_score in strong_ocr_list.items(): 265 | if ocr_word in final_list.keys(): 266 | final_list[ocr_word] *= ocr_score 267 | del strong_ocr_list[ocr_word] 268 | 269 | strong_ocr_list.update(weak_ocr_list) 270 | 271 | for ocr_word, ocr_score in strong_ocr_list.items(): 272 | if ocr_word not in final_list.keys(): 273 | final_list[ocr_word] = rate_anagram(structures["occurence_map"], token, ocr_word, 1) \ 274 | * rate_ocr_key(structures["occurence_map"], token, ocr_word, 0) 275 | 276 | return final_list 277 | 278 | 279 | def find_correct_case(word, case_mode, structures): 280 | """Select the best case between a set of already encountered cases 281 | 282 | Parameters: 283 | word (:func:`str`): Word to correct 284 | case_mode (int): Choice between lower or upper case (extra choice for undecisive) 285 | structures (dict): List of structures needed to perform the choice 286 | Returns: 287 | :func:`str` - Corrected word 288 | """ 289 | variations = {key: structures["occurence_map"][key] for key in structures["altcase"][word.lower()]} 290 | variations = sorted(variations.iteritems(), key=operator.itemgetter(1), reverse=True) 291 | 292 | tmp_vars = [] 293 | if case_mode == 0: # Upper case spelling 294 | for var in variations: 295 | _word = var[0] 296 | if _word[0].isupper() and sum(char.isupper() for char in _word) > 2: 297 | tmp_vars.append(var) 298 | 299 | if len(tmp_vars) == 0: 300 | tmp_vars = variations 301 | elif case_mode == 1: # Lower case with capital initial 302 | for var in variations: 303 | _word = var[0] 304 | if _word[0].isupper() and sum(char.isupper() for char in _word) <= 2: 305 | tmp_vars.append(var) 306 | 307 | if len(tmp_vars) == 0: 308 | tmp_vars = variations 309 | else: # case_mode == -1 (no capital letters found) 310 | tmp_vars = variations 311 | 312 | max_occ = tmp_vars[0][1] 313 | dist_vars = {term: edit_distance(word, term) for term, occ in tmp_vars if occ == max_occ} 314 | 315 | if len(dist_vars) == 1: 316 | return dist_vars.keys()[0] 
317 | 318 | # Several terms with max occurence still exist 319 | dist_vars = sorted(dist_vars.iteritems(), key=operator.itemgetter(1)) 320 | 321 | min_dist = dist_vars[0][1] 322 | min_dist_vars = [term for term, dist in dist_vars if dist == min_dist] 323 | 324 | if len(min_dist_vars) == 1: 325 | return min_dist_vars[0] 326 | 327 | # Several terms with same Levenhstein distance exist 328 | term_ascii_code = {term: [ord(ch) for ch in term] for term in min_dist_vars} 329 | 330 | for ascii_code in term_ascii_code.values(): 331 | for i in xrange(len(ascii_code)): 332 | code = ascii_code[i] 333 | 334 | # Non a-zA-Z chars will have a 0 value 335 | if code < 65 or 90 < code < 97 or code > 122: 336 | ascii_code[i] = 0 337 | 338 | if case_mode >= 0: 339 | ascii_val = min(term_ascii_code.values()) 340 | 341 | t = [t for t, v in term_ascii_code.items() if v == ascii_val] 342 | 343 | if len(t) > 1: 344 | raise ValueError("Too many value in final array") 345 | 346 | return t[0] 347 | else: 348 | ascii_val = max(term_ascii_code.values()) 349 | 350 | t = [t for t, v in term_ascii_code.items() if v == ascii_val] 351 | 352 | if len(t) > 1: 353 | raise ValueError("Too many value in final array") 354 | 355 | return t[0] 356 | 357 | 358 | def correct_case(token, corrections_map, structures): 359 | """Select the best spelling for a word (case-wise) 360 | 361 | Parameters: 362 | token (:func:`str`): Cleaned token 363 | corrections_map (:func:`dict`): Result of `build_candidates_list` 364 | structures (:func:`dict`): Datastructures from file 365 | 366 | Returns: 367 | :func:`dict` - Corrected tokens (keys) along with their score (values) 368 | """ 369 | alt_case_mode = -1 # Most common variation 370 | if token[0].isupper(): 371 | if sum(char.isupper() for char in token) > 2: 372 | alt_case_mode = 0 # Upper case variation 373 | else: 374 | alt_case_mode = 1 # Lower case variation with capital first letter 375 | 376 | corrected_case_map = {} 377 | for correct_word, score in corrections_map.items(): 378 | if correct_word.find(" ") != -1: 379 | words = correct_word.split(" ") 380 | 381 | keys_left = find_correct_case(words[0], alt_case_mode, structures) 382 | keys_right = find_correct_case(words[1], alt_case_mode, structures) 383 | key = keys_left+" "+keys_right 384 | else: 385 | key = find_correct_case(correct_word, alt_case_mode, structures) 386 | 387 | # If the key already exists we keep the highest score 388 | if key in corrected_case_map.keys(): 389 | old_score = corrected_case_map[key] 390 | corrected_case_map[key] = max(old_score, score) 391 | else: 392 | corrected_case_map[key] = score 393 | 394 | return corrected_case_map 395 | 396 | 397 | def apply_bigram_boost(paragraph, bigrams, occurence_map): 398 | """Compute the bigram boost for every token of a paragraph and apply it to the possible corrections. 
399 | 400 | Parameters: 401 | paragraph (:func:`list`): List of lines 402 | bigrams (:func:`list`): Bigrams for the given paragraph 403 | occurence_map (:func:`dict`): Occurence of unigrams and bigrams 404 | """ 405 | token_index = -1 406 | 407 | for line in paragraph: 408 | for token in line.tokens: 409 | if token[2] is None: 410 | continue 411 | 412 | # Finding adjacent tokens 413 | adjacent_tokens = [] 414 | 415 | if token_index > 0: 416 | adjacent_tokens.append(bigrams[token_index][0]) 417 | else: 418 | adjacent_tokens.append(None) 419 | 420 | token_index += 1 421 | 422 | if token_index < len(bigrams): 423 | adjacent_tokens.append(bigrams[token_index][1]) 424 | else: 425 | adjacent_tokens.append(None) 426 | 427 | # Normalizing adjacent tokens array 428 | for tkn_index in xrange(len(adjacent_tokens)): 429 | adj_tkn = adjacent_tokens[tkn_index] 430 | 431 | if adj_tkn is None: 432 | adjacent_tokens[tkn_index] = [] 433 | continue 434 | 435 | if not adj_tkn[2] is None: 436 | adjacent_tokens[tkn_index] = [] 437 | adj_tkn = sorted(adj_tkn[2].iteritems(), key=operator.itemgetter(1), reverse=True) 438 | 439 | for idx in xrange(min(5, len(adj_tkn))): 440 | adjacent_tokens[tkn_index].append(adj_tkn[idx][0].lower()) 441 | else: 442 | if not adj_tkn[1] is None: 443 | adjacent_tokens[tkn_index] = [adj_tkn[1].lower()] 444 | else: 445 | adjacent_tokens[tkn_index] = [adj_tkn[0].lower()] 446 | 447 | # Computing bigram boost 448 | for correction in token[2].keys(): 449 | bigram_boost = rate_bigram(correction.lower(), adjacent_tokens[0], adjacent_tokens[1], occurence_map) 450 | token[2][correction] *= bigram_boost 451 | 452 | 453 | def select_lower_edit_distance(ref_word, word_list): 454 | """Get the word with the lower edit distance 455 | 456 | Parameters: 457 | ref_word (:func:`str`): Word to correct 458 | word_list (list): List of proposals 459 | 460 | Returns: 461 | :func:`str` - Selected word 462 | """ 463 | word_dict = {word: edit_distance(ref_word, word) for word in word_list} 464 | min_dist = min(word_dict.values()) 465 | 466 | return [word for word, dist in word_dict.items() if dist == min_dist] 467 | 468 | 469 | def select_by_hash(word_list): 470 | """Select the word with the lower md5 hash 471 | 472 | Parameters: 473 | word_list (list): List of proposal 474 | 475 | Returns: 476 | :func:`str` - Selected word 477 | """ 478 | hashes = set([md5(word).hexdigest() for word in word_list]) 479 | 480 | if len(hashes) != len(word_list): 481 | raise Exception("differenciation impossible") 482 | 483 | return [tkn for tkn in word_list if md5(tkn).hexdigest() == min(hashes)][0] 484 | 485 | 486 | def select_best_alphabetical_word(ref_word, word_list): 487 | """Select closest alphabetical word (non alphanumerical chars are set to the same value) 488 | 489 | Parameters: 490 | ref_word (:func:`str`): Word to correct 491 | word_list (list): List of propositions 492 | 493 | Returns: 494 | :func:`str` - Selected word 495 | """ 496 | case_mode = -1 if ref_word[0].isupper() else 0 497 | term_ascii_code = {term: [ord(ch) for ch in term] for term in word_list} 498 | 499 | for ascii_code in term_ascii_code.values(): 500 | for i in xrange(len(ascii_code)): 501 | code = ascii_code[i] 502 | 503 | # Non a-zA-Z chars will have a 0 value 504 | if code < 65 or 90 < code < 97 or code > 122: 505 | ascii_code[i] = 0 506 | 507 | if case_mode >= 0: 508 | ascii_val = min(term_ascii_code.values()) 509 | 510 | tkn_list = [t for t, v in term_ascii_code.items() if v == ascii_val] 511 | 512 | if len(tkn_list) > 1: 513 | return 
select_by_hash(tkn_list) 514 | 515 | return tkn_list[0] 516 | else: 517 | ascii_val = max(term_ascii_code.values()) 518 | 519 | tkn_list = [t for t, v in term_ascii_code.items() if v == ascii_val] 520 | 521 | if len(tkn_list) > 1: 522 | return select_by_hash(tkn_list) 523 | 524 | return tkn_list[0] 525 | 526 | 527 | def select_correction(word, corrections_map): 528 | """Select the best correction for a word given its score 529 | 530 | Parameters: 531 | word (str): Word to select a correction for. 532 | corrections_map (:func:`dict`): Dictionary containing all corrections for a token along with their score 533 | 534 | Returns: 535 | :func:`dict` - Chosen correction(s) along with their score 536 | """ 537 | if corrections_map is None or len(corrections_map) == 1: 538 | return corrections_map 539 | 540 | max_val = max(corrections_map.values()) 541 | final_list = {term: val for term, val in corrections_map.items() if val == max_val} 542 | 543 | if len(final_list) == 1: # One value has the maximum 544 | if final_list.values()[0] > 0.7: # Highly valued terms are chosen by default 545 | return final_list 546 | 547 | first_word = final_list.keys()[0] 548 | 549 | # If the threshold value has not been reached we are looking for a second term 550 | del corrections_map[final_list.keys()[0]] 551 | 552 | max_val = max(corrections_map.values()) 553 | tmp_list = {term: val for term, val in corrections_map.items() if val == max_val} 554 | 555 | if len(tmp_list) == 1: # One value has the second higher grade 556 | final_list.update(tmp_list) 557 | second_word = tmp_list.keys()[0] 558 | else: # Several terms with the same score 559 | # Differenciation on the Levenhstein distance 560 | tmp_list = select_lower_edit_distance(word, tmp_list.keys()) 561 | 562 | if len(tmp_list) == 1: # One term has the lowest score 563 | final_list[tmp_list[0]] = max_val 564 | second_word = tmp_list[0] 565 | else: # Several terms with the same 566 | # Choose the best alphabetical term 567 | second_word = select_best_alphabetical_word(word, tmp_list) 568 | final_list[second_word] = max_val 569 | 570 | # Determine if we need one or two terms 571 | if log(final_list[first_word] / final_list[second_word]) >= 1: 572 | del final_list[second_word] 573 | 574 | return final_list 575 | elif len(final_list) != 2: # More than 2 values share the same maximum 576 | tmp_list = select_lower_edit_distance(word, final_list.keys()) 577 | 578 | if len(tmp_list) == 1: # One word get the min edit distance 579 | first_word = tmp_list[0] 580 | tmp_final_list = final_list 581 | del tmp_final_list[first_word] 582 | 583 | tmp_list = select_lower_edit_distance(word, tmp_final_list.keys()) 584 | 585 | if len(tmp_list) == 1: # One word get the second minimal edit distance 586 | final_list = { 587 | first_word: max_val, 588 | tmp_list[0]: max_val 589 | } 590 | 591 | return final_list 592 | else: # The second minimal edit distance is shared by several terms 593 | best_term = select_best_alphabetical_word(word, tmp_list) 594 | 595 | final_list = { 596 | first_word: max_val, 597 | best_term: max_val 598 | } 599 | 600 | return final_list 601 | elif len(tmp_list) == 2: # Exactly two word get the same min edit distance 602 | final_list = { 603 | tmp_list[0]: max_val, 604 | tmp_list[1]: max_val 605 | } 606 | 607 | return final_list 608 | else: # 609 | best_term_1 = select_best_alphabetical_word(word, tmp_list) 610 | 611 | tmp_list = [term for term in tmp_list if term != best_term_1] 612 | best_term_2 = select_best_alphabetical_word(word, tmp_list) 613 | 614 | 
final_list = { 615 | best_term_1: max_val, 616 | best_term_2: max_val 617 | } 618 | 619 | return final_list 620 | else: # Two words with the same score 621 | return final_list 622 | 623 | 624 | def extract_paragraph_bigrams(paragraph): 625 | """Get bigrams for a given paragraph 626 | 627 | Parameters: 628 | paragraph (list): Paragraph to extract bigrams from 629 | 630 | Returns: 631 | list - Bigram list 632 | """ 633 | p_tokens = [token for line in paragraph for token in line.tokens] 634 | bigram_list = [] 635 | 636 | for index in xrange(len(p_tokens) - 1): 637 | bigram_list.append((p_tokens[index], p_tokens[index + 1])) 638 | 639 | return bigram_list 640 | -------------------------------------------------------------------------------- /packages/denoiser/src/denoiser/models/machine_learning.py: -------------------------------------------------------------------------------- 1 | """Package containing all the machine learning functions and objects 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | import logging 17 | from numpy import mean 18 | from numpy.lib.polynomial import poly1d 19 | 20 | 21 | logger = logging.getLogger("app") 22 | 23 | 24 | class MachineLearningAlgorithm(object): 25 | """Machine learning algorithm 26 | """ 27 | 28 | def __init__(self): 29 | self.training_set = { 30 | "features": [], 31 | "results": [] 32 | } 33 | 34 | self.classifier = None 35 | 36 | logger.info("Model created (new)") 37 | 38 | def set_classifier(self, cls): 39 | """Set the classifier 40 | 41 | Args: 42 | cls (object): Classifier object 43 | """ 44 | self.classifier = cls 45 | logger.info(cls.__class__.__name__+" model loaded") 46 | 47 | def set_training_set(self, features, results): 48 | """Setup the training set and verify its integrity 49 | 50 | Args: 51 | features (list): Training set features 52 | results (list): Training set results 53 | """ 54 | if len(features) != len(results): 55 | raise AttributeError("Number of features and result are different") 56 | 57 | self.training_set["features"] = features 58 | self.training_set["results"] = results 59 | 60 | logger.debug("Training set uploaded") 61 | 62 | def train(self): 63 | """Train the model with the given training set 64 | """ 65 | self.classifier.fit(self.training_set["features"], self.training_set["results"]) 66 | logger.debug("Model trained") 67 | 68 | def classify(self, features): 69 | """Classify features 70 | 71 | Args: 72 | features (list): Features to classify 73 | 74 | Returns: 75 | list: Results of the classification 76 | """ 77 | return self.classifier.predict(features) 78 | 79 | def compute_error(self, features, results): 80 | """Compute classification error 81 | 82 | Args: 83 | features (list): Features to classify 84 | results (list): Expected results 85 | 86 | Returns: 87 | float: Classification error 88 | """ 89 | prediction = self.classifier.predict(features) 90 | error = 0 91 | 92 | for index in xrange(len(prediction)): 93 | if results[index] < 0: 94 | continue 95 | 96 | error += ((prediction[index] - results[index]) / 5)**2 97 | 98 | error /= (2*len(prediction)) 99 | return error 100 | 101 | 102 | class MachineLearningFeatures(object): 103 | """Feature calculator for machine learning 104 | """ 105 | 106 | def __init__(self): 107 | self.features 
= [] 108 | 109 | def extract_features(self, line, unigrams, text_stats): 110 | """Extract features from a given line 111 | 112 | Args: 113 | line (Line): Line to get features from 114 | unigrams (Unigrams): Unigrams for the given line 115 | text_stats (Statistics): Statistics of the text the line is coming from 116 | 117 | Returns: 118 | list: List of the features 119 | """ 120 | # Simple features 121 | features = [ 122 | float(line.stats["orig"].get_stat("lw_char")), 123 | float(line.stats["orig"].get_stat("up_char")), 124 | float(line.stats["orig"].get_stat("sp_char")), 125 | float(line.stats["orig"].get_stat("nb_char")), 126 | float(len(line.tokens)), 127 | ] 128 | 129 | # Additional features 130 | fappend = features.append 131 | fappend(line.get_clean_stats().get_stat("lw_char")) 132 | fappend(line.get_clean_stats().get_stat("up_char")) 133 | fappend(line.get_clean_stats().get_stat("sp_char")) 134 | fappend(line.get_clean_stats().get_stat("nb_char")) 135 | fappend(line.get_line_score()) 136 | fappend(len(line.get_orig_line())) 137 | fappend(len(line.get_clean_line())) 138 | 139 | u = unigrams 140 | 141 | tk_len = [len(token[0]) for token in line.tokens] 142 | word_avg_len = 0 143 | 144 | if len(tk_len) > 0: 145 | word_avg_len = mean(tk_len) 146 | 147 | fappend(float(word_avg_len)) 148 | 149 | t0 = [u[tk[0]] for tk in line.tokens] 150 | s0 = 0 151 | 152 | if len(t0) != 0: 153 | s0 = mean(t0) 154 | 155 | fappend(float(s0)) 156 | 157 | t1 = [u[tk[1]] for tk in line.tokens if not tk[1] is None] 158 | s1 = 0 159 | 160 | if len(t1) != 0: 161 | s1 = mean(t1) 162 | 163 | fappend(float(s1)) 164 | 165 | t2 = [u[t] for tk in line.tokens if not tk[2] is None for t in tk[2].keys()] 166 | s2 = 0 167 | 168 | if len(t2) != 0: 169 | s2 = mean(t2) 170 | 171 | fappend(float(s2)) 172 | 173 | # Regularization 174 | orig_chars = sum(features[:4]) 175 | clean_chars = sum(features[5:9]) 176 | 177 | f = [ 178 | features[0] / orig_chars, 179 | features[1] / orig_chars, 180 | features[2] / orig_chars, 181 | features[3] / orig_chars 182 | ] 183 | 184 | if clean_chars != 0: 185 | f += [features[5] / clean_chars, 186 | features[6] / clean_chars, 187 | features[7] / clean_chars, 188 | features[8] / clean_chars] 189 | else: 190 | f += [0, 0, 0, 0] 191 | 192 | f += [features[9], 193 | features[4] / text_stats.get_stat("word_avg_nb"), 194 | features[12] / text_stats.get_stat("word_avg_length"), 195 | features[10] / text_stats.get_stat("line_avg_length"), 196 | features[11] / text_stats.get_stat("line_avg_length")] 197 | 198 | if features[13] != 0: 199 | f.append(features[14] / features[13]) 200 | f.append(features[15] / features[13]) 201 | else: 202 | f.append(0) 203 | f.append(0) 204 | 205 | features = f 206 | 207 | # Ordering the data set 208 | features = [ 209 | features[11], # Original line average len 210 | features[12], # Clean line average len 211 | features[9], # Original line average len 212 | features[10], # Clean line average len 213 | features[13], # Original line average len 214 | features[14], # Clean line average len 215 | features[0], # Original line average len 216 | features[1], # Clean line average len 217 | features[2], # Original line average len 218 | features[3], # Clean line average len 219 | features[4], # Original line average len 220 | features[5], # Clean line average len 221 | features[6], # Original line average len 222 | features[7], # Clean line average len 223 | ] 224 | 225 | # Polynomial features 226 | degree = 1 227 | poly_feat = [] 228 | p_feat = poly1d(features) 229 | 230 | for d in 
xrange(degree): 231 | poly_feat += (p_feat ** (d+1)).coeffs.tolist() 232 | 233 | del poly_feat[5] 234 | 235 | self.features = poly_feat 236 | 237 | return self.features 238 | -------------------------------------------------------------------------------- /packages/denoiser/src/denoiser/text/__init__.py: -------------------------------------------------------------------------------- 1 | """This module contains necessary classes to parse a file in order to get the :class:`.Text` object. 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from __future__ import division 17 | import re 18 | from nltk.tokenize import word_tokenize 19 | from unidecode import unidecode 20 | import codecs 21 | from collections import Counter 22 | import csv 23 | import logging 24 | from numpy import mean 25 | from denoiser.text.stats import Statistics 26 | 27 | 28 | def tokenize(line): 29 | """Separate line to get clean tokens out of it 30 | 31 | Parameters: 32 | line (:func:`str`): A line of text 33 | 34 | Returns: 35 | list - List of different tokens 36 | """ 37 | separators = "=+/,.:;!?%<>#()&[]{}" 38 | 39 | tokens = [] 40 | tokenized_line = word_tokenize(line) # Will get rid of most of the separators 41 | 42 | for word in tokenized_line: 43 | tmp_tokens = [unidecode(word)] 44 | 45 | for separator in separators: 46 | sep_tokens = [] 47 | 48 | for tmp_token in tmp_tokens: 49 | split_token = tmp_token.split(separator) 50 | 51 | if len(split_token) != 1: # Token has been split 52 | # Concatening the list of token with the separator 53 | tkn_sep_list = [] 54 | 55 | for ind, tkn in enumerate(split_token): 56 | tkn_sep_list.append(tkn) 57 | 58 | if ind != len(split_token) - 1: # Avoid to add the separator at the end 59 | tkn_sep_list.append(unicode(separator)) 60 | 61 | sep_tokens += tkn_sep_list 62 | else: 63 | sep_tokens += split_token 64 | 65 | tmp_tokens = sep_tokens 66 | 67 | tokens += [tkn for tkn in tmp_tokens if tkn != ''] 68 | 69 | return tokens 70 | 71 | 72 | def clean_head_tail(word): 73 | """Clean head and tail of a word 74 | 75 | Parameters: 76 | word (:func:`str`): The word to clean 77 | Returns: 78 | :func:`str` - Cleaned word 79 | """ 80 | cleaning_regexp = re.compile(r"^[^a-zA-Z'-]*([a-zA-Z'-](.*[a-zA-Z'-])?)[^a-zA-Z'-]*$") 81 | alpha_regexp = re.compile(r"[a-zA-Z]") 82 | 83 | word_groups = cleaning_regexp.findall(word) 84 | 85 | # Non matching strings are set as dirty (i.e. cannot be cleaned) 86 | # None is returned 87 | if len(word_groups) == 0: 88 | return None 89 | 90 | # Words containing no letters are set to None 91 | if alpha_regexp.search(word_groups[0][0]) is None: 92 | return None 93 | 94 | return word_groups[0][0] 95 | 96 | 97 | class Text(object): 98 | """Stores the the text from a filename given in parameters 99 | 100 | Args: 101 | fname (str): Path to the file. 102 | 103 | Attributes: 104 | filename (:func:`str`): Name of the file. 105 | text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`. 106 | stats (:class:`.Statistics`): Statistics object. 
107 | """ 108 | 109 | def __init__(self, fname): 110 | self.filename = fname 111 | self.text = [] 112 | self.contains_training_data = False 113 | 114 | self.stats = Statistics(["line_nb", "line_avg_length", "line_total_length", "word_avg_length", 115 | "word_total_length", "word_avg_nb", "word_total_nb"]) 116 | self.stats.set_stat("line_nb", 0) 117 | self.stats.set_stat("line_avg_length", 0) 118 | self.stats.set_stat("line_total_length", 0) 119 | self.stats.set_stat("word_avg_length", 0) 120 | self.stats.set_stat("word_total_length", 0) 121 | self.stats.set_stat("word_avg_nb", 0) 122 | self.stats.set_stat("word_total_nb", 0) 123 | 124 | def read_csv(self): 125 | """Read a CSV file and build the associated text object 126 | 127 | Returns: 128 | `Text` 129 | """ 130 | self.contains_training_data = True 131 | 132 | with open(self.filename, "r") as f: 133 | csv_reader = csv.reader(f) 134 | paragraph = [] 135 | 136 | for row in csv_reader: 137 | if len(row) != 2: 138 | if len(paragraph) != 0: 139 | self.text.append(paragraph) 140 | paragraph = [] 141 | 142 | continue 143 | 144 | line = unicode(row[0].decode("utf-8")) 145 | line = line.strip(" \t\r\n") 146 | 147 | if len(line) == 0: 148 | if len(paragraph) != 0: 149 | self.text.append(paragraph) 150 | paragraph = [] 151 | 152 | continue 153 | 154 | line_object = Line(line, row[1]) 155 | paragraph.append(line_object) 156 | 157 | self.stats.set_stat("line_nb", self.stats.get_stat("line_nb")+1) 158 | self.stats.set_stat("line_total_length", self.stats.get_stat("line_total_length")+len(line_object)) 159 | self.stats.set_stat("word_total_nb", self.stats.get_stat("word_total_nb") + len(line_object.tokens)) 160 | 161 | words_len = sum([len(tkn) for tkn in line_object.tokens]) 162 | self.stats.set_stat("word_total_length", self.stats.get_stat("word_total_length") + words_len) 163 | 164 | if len(paragraph) != 0: 165 | self.text.append(paragraph) 166 | 167 | self.stats.set_stat("line_avg_length", 168 | self.stats.get_stat("line_total_length") / self.stats.get_stat("line_nb")) 169 | self.stats.set_stat("word_avg_length", 170 | self.stats.get_stat("word_total_length") / self.stats.get_stat("word_total_nb")) 171 | self.stats.set_stat("word_avg_nb", 172 | self.stats.get_stat("word_total_nb") / self.stats.get_stat("line_nb")) 173 | 174 | logging.debug(self.filename+" read") 175 | 176 | def read_txt(self): 177 | """Read a text file and build the associated text object 178 | 179 | Returns: 180 | `Text` 181 | """ 182 | self.contains_training_data = False 183 | 184 | with codecs.open(self.filename, "rb", encoding="utf-8") as f: 185 | paragraph = [] 186 | 187 | for line in f: 188 | line = line.strip(" \t\r\n") 189 | 190 | if len(line) == 0: 191 | if len(paragraph) != 0: 192 | self.text.append(paragraph) 193 | paragraph = [] 194 | 195 | continue 196 | 197 | line_object = Line(line) 198 | paragraph.append(line_object) 199 | 200 | self.stats.set_stat("line_nb", self.stats.get_stat("line_nb")+1) 201 | self.stats.set_stat("line_total_length", self.stats.get_stat("line_total_length")+len(line_object)) 202 | self.stats.set_stat("word_total_nb", self.stats.get_stat("word_total_nb") + len(line_object.tokens)) 203 | 204 | words_len = sum([len(tkn) for tkn in line_object.tokens]) 205 | self.stats.set_stat("word_total_length", self.stats.get_stat("word_total_length") + words_len) 206 | 207 | if len(paragraph) != 0: 208 | self.text.append(paragraph) 209 | 210 | self.stats.set_stat("line_avg_length", 211 | self.stats.get_stat("line_total_length") / 
self.stats.get_stat("line_nb")) 212 | self.stats.set_stat("word_avg_length", 213 | self.stats.get_stat("word_total_length") / self.stats.get_stat("word_total_nb")) 214 | self.stats.set_stat("word_avg_nb", 215 | self.stats.get_stat("word_total_nb") / self.stats.get_stat("line_nb")) 216 | 217 | logging.debug(self.filename+" read") 218 | 219 | def get_clean_lines(self): 220 | """Returns cleans line from the text object 221 | 222 | Returns: 223 | list: List of clean lines 224 | """ 225 | lines = [] 226 | 227 | for paragraph in self.text: 228 | for line in paragraph: 229 | if line.grade == 5: 230 | lines.append(line.get_clean_line()) 231 | 232 | if len(lines) > 0 and lines[-1] != "": 233 | lines.append("") 234 | 235 | return lines 236 | 237 | def get_garbage_lines(self): 238 | """Returns garbage lines from the text object 239 | 240 | Returns: 241 | list: List of garbage lines 242 | """ 243 | lines = [] 244 | 245 | for paragraph in self.text: 246 | for line in paragraph: 247 | if line.grade == 0: 248 | lines.append(line.get_orig_line()) 249 | 250 | if len(lines) > 0 and lines[-1] != "": 251 | lines.append("") 252 | 253 | return lines 254 | 255 | def get_unclassified_lines(self): 256 | """Returns unclassified lines from the text object 257 | 258 | Returns: 259 | list: List of unclassified lines 260 | """ 261 | lines = [] 262 | 263 | for paragraph in self.text: 264 | for line in paragraph: 265 | if line.grade % 5 != 0: # Grade is not 0 nor 5 266 | lines.append(line.get_orig_line()) 267 | 268 | if len(lines) > 0 and lines[-1] != "": 269 | lines.append("") 270 | 271 | return lines 272 | 273 | def retrieve_text_score(self): 274 | """Returns some stats and score regarding classification 275 | 276 | Returns: 277 | dict: Dictionary containing the results 278 | """ 279 | # True positive is a garbage string detected as such 280 | score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0} 281 | class_stats = {"classified": 0, "unclassified": 0, "unrated": 0} 282 | 283 | for paragraph in self.text: 284 | for line in paragraph: 285 | if line.grade != 0 and line.grade != 5: 286 | class_stats["unclassified"] += 1 287 | continue 288 | 289 | if line.result is None or line.result < 0: 290 | class_stats["unrated"] += 1 291 | continue 292 | 293 | class_stats["classified"] += 1 294 | 295 | if line.grade == 0: # Line detected as garbage 296 | if line.result == 1: # Line is clean 297 | score_stats["FP"] += 1 # False positive 298 | else: # Line is garbage 299 | score_stats["TP"] += 1 # True postive 300 | else: # Line detected as clean 301 | if line.result == 1: # Line is clean 302 | score_stats["TN"] += 1 # True negative 303 | else: # Line is garbage 304 | score_stats["FN"] += 1 # False negative 305 | 306 | # Precision 307 | divider_pr = score_stats["TP"] + score_stats["FP"] 308 | if divider_pr != 0: 309 | precision = score_stats["TP"] / divider_pr 310 | else: 311 | precision = 0 312 | 313 | # Recall 314 | divider_rc = score_stats["TP"] + score_stats["FN"] 315 | if divider_rc != 0: 316 | recall = score_stats["TP"] / divider_rc 317 | else: 318 | recall = 0 319 | 320 | # F1 score 321 | if precision + recall != 0: 322 | f1 = 2 * precision * recall / (precision + recall) 323 | else: 324 | f1 = 0 325 | 326 | return { 327 | "class": class_stats, 328 | "score": { 329 | "precision": precision, 330 | "recall": recall, 331 | "f1": f1 332 | }, 333 | "raw": score_stats 334 | } 335 | 336 | 337 | class Line(object): 338 | """Represents a line of text and provides datastructures to handle it. 
339 | 340 | Args: 341 | string (unicode): Line to parse. 342 | result (int): (**Optional**) Expected result for a line (either a garbage string or a clean line) 343 | 344 | Attributes: 345 | tokens (:func:`list`): List of tokens contained in the initial string. Every list element is a :func:`list` of 346 | 3 element organized in this order `(original_token, clean_token, corrected_token)` 347 | pos_string (:func:`str`): Reference string containing the position of all the tokens 348 | result (:func:`int` or :data:`None`): Expected result for a line. Helps compute fitness (F1 score) of the 349 | algorithm 350 | grade (:func:`int`): Grade of a line, between 0 (garbage string) and 5 (clean line). 351 | stats (:func:`dict`): Dictionary containing two :class:`.Statistics` objects. Each of them compute the number of 352 | **lower**, **upper** and **special** characters along with **numbers**. 353 | """ 354 | 355 | def __init__(self, string, result=None): 356 | self.tokens = [[tkn, clean_head_tail(tkn), None] for tkn in tokenize(string)] 357 | 358 | self.pos_string = string # String containing the position of each token (e.g. "%0 %1%2 ... %n") 359 | for index, token in enumerate(self.tokens): 360 | self.pos_string = self.pos_string.replace(token[0], "%"+str(index), 1) 361 | 362 | self.result = None 363 | if result is not None: 364 | self.result = int(result) 365 | 366 | if sum([len(t[1]) for t in self.tokens if not t[1] is None]) == 0: 367 | self.grade = 0 368 | else: 369 | self.grade = 3 370 | 371 | self.stats = { 372 | "orig": Statistics(["lw_char", "up_char", "nb_char", "sp_char"]), 373 | "clean": None 374 | } 375 | 376 | tmp_line = re.sub(r'[a-z]', 'a', self.get_orig_line()) # Lower chars replacement 377 | tmp_line = re.sub(r'[A-Z]', 'A', tmp_line) # Upper chars replacement 378 | tmp_line = re.sub(r'[0-9]', '0', tmp_line) # Numbers replacement 379 | tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#', tmp_line) # Special chars replacement 380 | line_stats = Counter(tmp_line) 381 | 382 | self.stats["orig"].set_stat("lw_char", line_stats["a"]) 383 | self.stats["orig"].set_stat("up_char", line_stats["A"]) 384 | self.stats["orig"].set_stat("nb_char", line_stats["0"]) 385 | self.stats["orig"].set_stat("sp_char", line_stats["#"]) 386 | 387 | def raise_grade(self): 388 | """Add 1 to the grade of the line (up to 5) 389 | """ 390 | if self.grade < 5: 391 | self.grade += 1 392 | 393 | def decrease_grade(self): 394 | """Remove 1 to the grade of the line (down to 0) 395 | """ 396 | if self.grade > 0: 397 | self.grade -= 1 398 | 399 | def set_garbage(self): 400 | """Set the grade to 0 401 | """ 402 | self.grade = 0 403 | 404 | def set_clean(self): 405 | """Set the grade to 5 406 | """ 407 | self.grade = 5 408 | 409 | def get_orig_line(self): 410 | """Returns the original line 411 | 412 | Returns: 413 | str: Original line 414 | """ 415 | string = self.pos_string 416 | 417 | for index, token in reversed(list(enumerate(self.tokens))): 418 | string = string.replace("%"+str(index), token[0]) 419 | 420 | return string 421 | 422 | def get_clean_line(self): 423 | """Returns the clean line 424 | 425 | Returns: 426 | str: Clean line 427 | """ 428 | string = self.pos_string 429 | 430 | for index, token in reversed(list(enumerate(self.tokens))): 431 | if not token[2] is None and len(token[2]) > 0: 432 | string = string.replace("%"+str(index), token[2].keys()[0]) 433 | else: # Inline correction is not available 434 | if not token[1] is None: 435 | string = string.replace("%"+str(index), token[1]) 436 | else: # Clean token does not 
exist, use the original token 437 | string = string.replace("%"+str(index), token[0]) 438 | 439 | return re.sub(" +", " ", string).strip() 440 | 441 | def get_orig_stats(self): 442 | """Get original stats of the line 443 | 444 | Returns: 445 | Statistics: Statistics of the original line 446 | """ 447 | return self.stats["orig"] 448 | 449 | def get_clean_stats(self): 450 | """Get clean stats of the line 451 | 452 | Returns: 453 | Statistics: Statistics of the clean line 454 | """ 455 | if self.stats["clean"] is None: # Compute clean stats if it is not already done 456 | self.stats["clean"] = Statistics(["lw_char", "up_char", "nb_char", "sp_char"]) 457 | 458 | tmp_line = re.sub(r'[a-z]', 'a', self.get_clean_line()) # Lower chars replacement 459 | tmp_line = re.sub(r'[A-Z]', 'A', tmp_line) # Upper chars replacement 460 | tmp_line = re.sub(r'[0-9]', '0', tmp_line) # Numbers replacement 461 | tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#', tmp_line) # Special chars replacement 462 | line_stats = Counter(tmp_line) 463 | 464 | self.stats["clean"].set_stat("lw_char", line_stats["a"]) 465 | self.stats["clean"].set_stat("up_char", line_stats["A"]) 466 | self.stats["clean"].set_stat("nb_char", line_stats["0"]) 467 | self.stats["clean"].set_stat("sp_char", line_stats["#"]) 468 | 469 | return self.stats["clean"] 470 | 471 | def get_line_score(self): 472 | """Return a global score of the line 473 | 474 | Returns: 475 | float: Score of the line 476 | """ 477 | score = 0 478 | 479 | if len(self.tokens) == 0: 480 | return score 481 | 482 | for token in [t[2] for t in self.tokens if not t[2] is None]: 483 | score += mean([s for s in token.values()]) 484 | 485 | return score / len(self.tokens) 486 | 487 | def __len__(self): 488 | return len(self.get_orig_line()) 489 | 490 | def __str__(self): 491 | return str(self.tokens) + " | " + str(self.grade) 492 | -------------------------------------------------------------------------------- /packages/denoiser/src/denoiser/text/stats.py: -------------------------------------------------------------------------------- 1 | """Statistic package 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from __future__ import division 17 | import logging 18 | 19 | 20 | class Statistics(object): 21 | """Statistics of one text file 22 | """ 23 | 24 | def __init__(self, stat_names): 25 | if type(stat_names) != list: 26 | raise TypeError 27 | 28 | self.stats = {} 29 | 30 | for name in stat_names: 31 | self.stats[name] = None 32 | 33 | logging.debug("Statistics initialized") 34 | 35 | def set_stat(self, name, value): 36 | """Add a new stat to the model 37 | """ 38 | if name not in self.stats: 39 | raise KeyError("Key '"+name+"' does not exists") 40 | 41 | self.stats[name] = value 42 | 43 | def get_stat(self, name): 44 | """Return a statistic value 45 | 46 | Returns: 47 | float: Value of the stat 48 | """ 49 | if name not in self.stats: 50 | raise KeyError("Key '"+name+"' does not exists") 51 | 52 | return self.stats[name] 53 | 54 | def __str__(self): 55 | return str(self.stats) 56 | -------------------------------------------------------------------------------- /packages/pipeline/README.md: -------------------------------------------------------------------------------- 1 | # pipeline 2 | 3 | **Author:** Philippe Dessauw, philippe.dessauw@nist.gov 4 | 5 | **Contact:** Alden Dima, alden.dima@nist.gov 6 | 7 | ----- 8 | 9 | The pipeline package allows you to convert a corpus of PDF files to clean TXT files. 10 | 11 | ## Installation 12 | 13 | ### Packaging source files 14 | 15 | $> cd /path/to/pipeline 16 | $> python setup.py sdist 17 | 18 | You should now see **dist** package in the main directory. 19 | 20 | ### Installing the package 21 | 22 | $> cd path/to/pipeline/dist 23 | $> pip install pipeline-*version*.tar.gz 24 | 25 | This package is now ready to use! 26 | 27 | ## Contact 28 | 29 | If you have any questions, comments or suggestions about this repository, please send an e-mail to Alden Dima 30 | (alden.dima@nist.gov). 31 | -------------------------------------------------------------------------------- /packages/pipeline/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="pipeline", 5 | version="1.1.0-alpha5", 6 | 7 | description=("Transform PDF files to clean text files using a distributed architecture",), 8 | 9 | author="Philippe Dessauw", 10 | author_email="philippe.dessauw@nist.gov", 11 | 12 | packages=find_packages('src'), 13 | package_dir={ 14 | '': 'src', 15 | }, 16 | 17 | install_requires=[ 18 | 'apputils', 19 | 'denoiser', 20 | 'fabric', 21 | 'matplotlib', 22 | 'redis', 23 | 'hiredis', 24 | 'PyPDF2', 25 | ], 26 | ) 27 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | """Let you start a slave or master process 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | import signal 17 | import sys 18 | from actors import Slave, Master 19 | 20 | orig_sigint = signal.getsignal(signal.SIGINT) 21 | """object: Original INTERUPT signal 22 | """ 23 | 24 | 25 | def run_slave(app_config): 26 | """Start a slave process 27 | 28 | Parameter: 29 | app_config (dict): Application configuration 30 | """ 31 | s = Slave(app_config) 32 | 33 | def terminate(signum, frame): 34 | """Stop the process in a clean way 35 | 36 | Parameters 37 | signum (int): Signal code 38 | frame (object): original signal 39 | """ 40 | signal.signal(signal.SIGINT, orig_sigint) 41 | 42 | try: 43 | s.stop() 44 | sys.exit(0) 45 | except: 46 | sys.exit(1) 47 | 48 | signal.signal(signal.SIGINT, terminate) 49 | s.run() 50 | 51 | 52 | def run_master(app_config): 53 | """Start the master process 54 | 55 | Parameter: 56 | app_config (dict): Application configuration 57 | """ 58 | m = Master(app_config) 59 | 60 | def terminate(signum, frame): 61 | """Stop the process in a clean way 62 | 63 | Parameters 64 | signum (int): Signal code 65 | frame (object): original signal 66 | """ 67 | signal.signal(signal.SIGINT, orig_sigint) 68 | 69 | try: 70 | m.stop() 71 | sys.exit(0) 72 | except: 73 | sys.exit(1) 74 | 75 | signal.signal(signal.SIGINT, terminate) 76 | m.run() 77 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/actors.py: -------------------------------------------------------------------------------- 1 | """Package defining master and slave threads 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | import logging 17 | from os import listdir, getpid, remove 18 | from os.path import join, exists, split 19 | from socket import gethostbyname, gethostname 20 | from time import sleep 21 | from shutil import move 22 | from pipeline.files import FileManager 23 | from pipeline.threads import StoppableThread 24 | from pipeline.utils import create_data_directory 25 | from pipeline.logger import AppLogger, LogWriter 26 | from pipeline.queue import QueueManager, CommandQueueItem 27 | 28 | 29 | class Master(StoppableThread): 30 | """ Master worker 31 | """ 32 | 33 | def __init__(self, app_config): 34 | StoppableThread.__init__(self) 35 | 36 | # ip = app_config["machines"]["master"][0].split('@') 37 | # master_ip = ip[-1:][0] 38 | redis_ip = app_config["redis"]["host"] 39 | redis_port = app_config["redis"]["port"] 40 | 41 | self.logger = AppLogger("master", logging.getLogger("local"), redis_ip, redis_port) 42 | self.log_writer = LogWriter(logging.getLogger("app"), redis_ip, redis_port) 43 | 44 | self.command_queue = QueueManager(host=redis_ip, port=redis_port, qname="commands") 45 | self.finished_queue = QueueManager(host=redis_ip, port=redis_port, qname="finished") 46 | # self.fman = FileManager(master_ip, master_queue_port) 47 | self.fman = FileManager(app_config) 48 | 49 | self.config = app_config 50 | self.input = app_config["dirs"]["input"] 51 | self.output = app_config["dirs"]["output"] 52 | 53 | def run(self): 54 | self.log_writer.start() 55 | self.logger.info("Starting master...") 56 | 57 | # processed_filenames = [] 58 | 59 | while not self.is_stopped(): 60 | self.logger.info("Reading input directory...") 61 | # filenames = [f for f in listdir(self.input) if f not in processed_filenames] 62 | filenames = listdir(self.input) 63 | 64 | if len(filenames) > 0: 65 | self.logger.info(str(len(filenames)) + " file(s) to put in the queue") 66 | 67 | for filename in filenames: 68 | self.logger.debug("Processing %s..." % filename) 69 | full_filename = join(self.input, filename) 70 | dirname = create_data_directory(full_filename, self.config["dirs"]["temp"]) 71 | self.logger.debug("%s has been created." 
% dirname) 72 | 73 | if dirname is not None: 74 | # archive = zip_directory(dirname) 75 | 76 | # self.fman.store_file(archive) 77 | self.command_queue.push(CommandQueueItem(filename=dirname, logger=self.logger, 78 | config=self.config)) 79 | 80 | # processed_filenames.append(filename) 81 | self.logger.info("Incoming files have been put in the queue") 82 | 83 | if len(self.finished_queue) > 0: 84 | self.logger.info("Finished queue not empty") 85 | 86 | while not self.finished_queue.is_empty(): 87 | filename = self.finished_queue.pop() 88 | # self.fman.retrieve_file(filename) 89 | 90 | output_file_path = join(self.config["dirs"]["output"], split(filename)[1]) 91 | if exists(output_file_path): 92 | remove(output_file_path) 93 | 94 | move(filename, self.config["dirs"]["output"]) 95 | # self.fman.delete_file(filename) 96 | 97 | self.logger.info("No more finished job to process") 98 | 99 | sleep(self.config["sleep"]["master"]) # Avoid CPU consuption while waiting 100 | 101 | def stop(self): 102 | self.logger.info("Master stopped") 103 | 104 | self.log_writer.stop() 105 | StoppableThread.stop(self) 106 | 107 | 108 | class Slave(StoppableThread): 109 | """ Slave worker 110 | """ 111 | 112 | def __init__(self, app_config): 113 | StoppableThread.__init__(self) 114 | 115 | self.config = app_config 116 | 117 | # ip = app_config["machines"]["master"][0].split('@') 118 | # master_ip = ip[-1:][0] 119 | redis_ip = app_config["redis"]["host"] 120 | redis_port = app_config["redis"]["port"] 121 | 122 | self.command_queue = QueueManager(host=redis_ip, port=redis_port, qname="commands") 123 | self.finished_queue = QueueManager(host=redis_ip, port=redis_port, qname="finished") 124 | # self.fman = FileManager(master_ip, master_queue_port) 125 | 126 | slave_ip = gethostbyname(gethostname()) 127 | slave_pid = getpid() 128 | uid = slave_ip + "::" + str(slave_pid) 129 | 130 | self.logger = AppLogger(uid, logging.getLogger("local"), redis_ip, redis_port) 131 | self.max_tries = app_config["commands"]["tries"] 132 | 133 | self.logger.info("Slave initiated [redis on "+redis_ip+"]") 134 | 135 | def run(self): 136 | self.logger.info("Starting slave...") 137 | 138 | while not self.is_stopped(): 139 | if not self.command_queue.is_empty(): 140 | cmd_json = self.command_queue.pop() 141 | self.logger.debug("CommandQueueItem(jsondata=%s, ...)" % str(cmd_json)) 142 | cmd = CommandQueueItem(jsondata=cmd_json, logger=self.logger, config=self.config) 143 | 144 | # Start the job after waiting sync between master and worker 145 | sleep(self.config["sleep"]["job"]) 146 | status = cmd.execute() 147 | 148 | # Job returned an error and has reached the limit of tries 149 | if status == 1 and cmd.tries >= self.max_tries: 150 | self.logger.error("Error when processing command") 151 | continue 152 | 153 | if cmd.current_step == -1: 154 | self.logger.info("Pushing to finished queue") 155 | self.finished_queue.push(cmd.filename) 156 | self.logger.info("Job done") 157 | continue 158 | 159 | self.command_queue.push(cmd) 160 | 161 | sleep(self.config["sleep"]["worker"]) # Avoid CPU consumption while waiting 162 | 163 | def stop(self): 164 | self.logger.info("Slave stopped") 165 | StoppableThread.stop(self) 166 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/command.py: -------------------------------------------------------------------------------- 1 | """Command object 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from apputils.fileop import zip_directory, unzip_directory 17 | from pipeline.files import FileManager 18 | 19 | 20 | class Command(object): 21 | """Main command object 22 | """ 23 | 24 | def __init__(self, filename, logger, app_config): 25 | self.logger = logger 26 | 27 | # ip = app_config["machines"]["master"][0].split('@') 28 | # master_ip = ip[-1:][0] 29 | # master_queue_port = app_config["redis"]["port"] 30 | # self.fman = FileManager(master_ip, master_queue_port) 31 | 32 | self.filename = filename 33 | # self.unzipped = None 34 | self.unzipped = filename 35 | self.config = app_config 36 | 37 | self.logger.debug("File to be processed: %s" % str(self.unzipped)) 38 | 39 | # def get_file(self): 40 | # """Retrieve file from redis and unzip it to the local filesystem 41 | # """ 42 | # # Get hash from redis 43 | # self.logger.debug("Retrieving "+self.filename+"...") 44 | # self.fman.retrieve_file(self.filename) 45 | # 46 | # # Write it in the tmp folder 47 | # self._unzip_file() 48 | 49 | # def store_file(self): 50 | # """Zip file on the local filesystem and store it to redis 51 | # """ 52 | # self._zip_file() 53 | # 54 | # # Store it in redis 55 | # self.fman.store_file(self.filename) 56 | 57 | # def _zip_file(self): 58 | # """Check if the file can be zipped and zip it 59 | # """ 60 | # if self.unzipped is None: 61 | # self.logger.error("Zipped directory has not been unzipped") 62 | # return 63 | # 64 | # zip_directory(self.unzipped) 65 | # self.unzipped = None 66 | 67 | # def _unzip_file(self): 68 | # """Check if the file can be unzipped and unzip it 69 | # """ 70 | # if self.unzipped is not None: 71 | # self.logger.error("Archive already unzipped") 72 | # return 73 | # 74 | # self.unzipped = unzip_directory(self.filename) 75 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/commands/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing all the commands 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | 17 | from pipeline.commands.pdfconverter import * 18 | from pipeline.commands.pngreader import * 19 | from pipeline.commands.txtdenoiser import * 20 | 21 | # Put your additional commands here 22 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/commands/pdfconverter.py: -------------------------------------------------------------------------------- 1 | """ Package to convert PDF to PNG 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from os import listdir 17 | from os.path import join, isfile, dirname, splitext, basename, isdir 18 | import PyPDF2 19 | import PythonMagick 20 | from pipeline.command import Command 21 | 22 | 23 | class PDFConverter(Command): 24 | """ Command to convert PDF to PNG. 25 | """ 26 | 27 | def __init__(self, filename, logger, config): 28 | super(PDFConverter, self).__init__(filename, logger, config) 29 | 30 | self.density = config["command"]["density"] 31 | self.depth = config["command"]["depth"] 32 | self.quality = config["command"]["quality"] 33 | 34 | self.logger.debug("PDF converter {density: "+str(self.density) 35 | + "; depth: "+str(self.depth) 36 | + "; quality: "+str(self.quality) + "}") 37 | 38 | def execute(self): 39 | """ Execute the command 40 | """ 41 | self.logger.debug("::: PDF conversion (%s) :::" % str(self.unzipped)) 42 | 43 | if not isdir(self.unzipped): 44 | self.logger.error("%s is not a directory" % self.unzipped) 45 | return 2 46 | 47 | self.logger.debug("Browsing %s for pdf files..." % self.unzipped) 48 | pdf_list = [join(self.unzipped, f) for f in listdir(self.unzipped) 49 | if isfile(join(self.unzipped, f)) and f.endswith(".pdf")] 50 | 51 | if len(pdf_list) != 1: 52 | self.logger.error("Incorrect number of PDF file in " + self.unzipped 53 | + " (" + str(len(pdf_list)) + " found, 1 expected)") 54 | self.finalize() 55 | return 1 56 | 57 | filename = str(pdf_list[0]) 58 | with open(filename, "rb") as pdf: 59 | pdf_filereader = PyPDF2.PdfFileReader(pdf) 60 | pdf_page_nb = pdf_filereader.getNumPages() 61 | 62 | pdf_dirname = dirname(filename) 63 | imagesdir = "png" 64 | 65 | self.logger.debug(str(pdf_page_nb) + " page(s) detected") 66 | for p in xrange(pdf_page_nb): 67 | 68 | try: # Reading the PDF 69 | img = PythonMagick.Image() 70 | img.density(str(self.density)) 71 | img.depth(self.depth) 72 | img.quality(self.quality) 73 | 74 | pdf_page_file = filename + '[' + str(p) + ']' 75 | self.logger.debug("Reading " + pdf_page_file + "...") 76 | img.read(pdf_page_file) 77 | 78 | png_dirname = join(pdf_dirname, imagesdir) 79 | png_filename = splitext(basename(filename))[0] + '-' + str(p) + '.png' 80 | png_page_file = join(png_dirname, png_filename) 81 | self.logger.debug("Writing " + png_page_file + "...") 82 | img.write(png_page_file) 83 | except Exception, e: 84 | self.logger.fatal("An exception has been caugth: "+str(e.message)) 85 | self.finalize() 86 | return 1 87 | 88 | self.finalize() 89 | return 0 90 | 91 | def finalize(self): 92 | """ Finalize the job 93 | """ 94 | # super(PDFConverter, self).store_file() 95 | self.logger.debug("::: PDF conversion END (%s) :::" % str(self.unzipped)) 96 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/commands/pngreader.py: -------------------------------------------------------------------------------- 1 | """Package to convert PNG to TXT 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from os.path import join, isdir, split, dirname, basename, splitext, isfile 17 | from subprocess import check_output, STDOUT 18 | from os import listdir 19 | from shutil import move 20 | from pipeline.command import Command 21 | 22 | 23 | class PNGReader(Command): 24 | """Command to convert PNG to TXT 25 | """ 26 | 27 | def __init__(self, filename, logger, config): 28 | super(PNGReader, self).__init__(filename, logger, config) 29 | 30 | self.proc_count = 1 31 | self.ocropus_dir = self.config["command"]["ocropy"]["location"] 32 | self.rpred_model = self.config["command"]["ocropy"]["model"] 33 | self.python = ["python"] 34 | 35 | self.logger.info("PNG reader initialized") 36 | 37 | def execute(self): 38 | """Execute the command 39 | """ 40 | self.logger.debug("::: PNG reading :::") 41 | # super(PNGReader, self).get_file() 42 | 43 | procs = str(self.proc_count) 44 | 45 | png_dir = join(self.unzipped, "png") 46 | txt_dir = join(self.unzipped, "txt") 47 | 48 | command_list = [ 49 | [join(self.ocropus_dir, 'ocropus-nlbin'), "-Q", procs, join(png_dir, '*.png')], 50 | [join(self.ocropus_dir, 'ocropus-gpageseg'), "-Q", procs, join(png_dir, '*.bin.png')], 51 | [join(self.ocropus_dir, 'ocropus-rpred'), "-Q", procs, "-m", self.rpred_model, 52 | join(png_dir, '*/*.bin.png')], 53 | ] 54 | 55 | # Execute the list of command 56 | for command in command_list: 57 | try: 58 | self.logger.debug("> "+str(command)) 59 | 60 | cmdout = check_output(self.python+command, stderr=STDOUT) 61 | self.logger.info(cmdout) 62 | except Exception, e: 63 | print e 64 | self.logger.fatal("An exception has been caugth: "+str(e.message)) 65 | self.finalize() 66 | return 1 67 | 68 | # Build the resulting text file from every line file 69 | txt_files = [join(png_dir, subdir, f) for subdir in listdir(png_dir) if isdir(join(png_dir, subdir)) 70 | for f in listdir(join(png_dir, subdir)) if f.endswith(".txt")] 71 | self.logger.debug(str(len(txt_files)) + " text file(s) found") 72 | 73 | for f in txt_files: 74 | dirs = split(dirname(f)) 75 | 76 | filename = basename(f) 77 | pagenum = dirs[-1].split("-")[-1] 78 | 79 | move(f, join(txt_dir, "segments", pagenum+"-"+filename)) 80 | 81 | txt_files = sorted([join(txt_dir, "segments", f) for f in listdir(join(txt_dir, "segments")) 82 | if f.endswith(".txt")]) 83 | 84 | text = "" 85 | for f in txt_files: 86 | with open(f, "r") as txt: 87 | lines = txt.readlines() 88 | 89 | for l in lines: 90 | text += l 91 | 92 | pdf_files = [join(self.unzipped, f) for f in listdir(self.unzipped) if isfile(join(self.unzipped, f)) and 93 | f.endswith(".pdf")] 94 | txt_filename = splitext(basename(pdf_files[0]))[0]+".txt" 95 | 96 | with open(join(txt_dir, txt_filename), "w") as output: 97 | output.write(text) 98 | 99 | self.finalize() 100 | return 0 101 | 102 | def finalize(self): 103 | """Finalize the job 104 | """ 105 | # super(PNGReader, self).store_file() 106 | self.logger.debug("::: PNG reading (END) :::") 107 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/commands/txtdenoiser.py: -------------------------------------------------------------------------------- 1 | """Package to clean TXT files 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | import codecs 17 | from os.path import join, isfile, splitext, basename 18 | from os import listdir 19 | from denoiser import Denoiser 20 | from pipeline.command import Command 21 | 22 | 23 | class TXTDenoiser(Command): 24 | """Command to clean TXT files 25 | """ 26 | 27 | def __init__(self, filename, logger, config): 28 | super(TXTDenoiser, self).__init__(filename, logger, config) 29 | self.denoiser = Denoiser(config) 30 | 31 | self.logger.debug("Denoiser initialized") 32 | 33 | def execute(self): 34 | """Execute the command 35 | """ 36 | try: 37 | self.logger.debug("::: Text cleaning :::") 38 | # super(TXTDenoiser, self).get_file() 39 | 40 | txt_dir = join(self.unzipped, "txt") 41 | txt_files = [join(txt_dir, f) for f in listdir(txt_dir) if isfile(join(txt_dir, f)) and f.endswith(".txt")] 42 | 43 | if len(txt_files) != 1: 44 | self.logger.error("Incorrect number of text files") 45 | self.finalize() 46 | return -1 47 | 48 | text_data = self.denoiser.cleanse(txt_files[0], False) 49 | 50 | # Writing classified lines 51 | base_filename = splitext(basename(txt_files[0]))[0] 52 | clean_filename = join(txt_dir, base_filename+".clean.txt") 53 | garbage_filename = join(txt_dir, base_filename+".grbge.txt") 54 | unclassified_filename = join(txt_dir, base_filename+".unclss.txt") 55 | 56 | with codecs.open(clean_filename, "wb", encoding="utf-8") as clean_file: 57 | for line in text_data.get_clean_lines(): 58 | clean_file.write(line+"\n") 59 | 60 | with codecs.open(garbage_filename, "wb", encoding="utf-8") as garbage_file: 61 | for line in text_data.get_garbage_lines(): 62 | garbage_file.write(line+"\n") 63 | 64 | if len(text_data.get_unclassified_lines()) > 0: 65 | with codecs.open(unclassified_filename, "wb", encoding="utf-8") as unclassified_file: 66 | for line in text_data.get_unclassified_lines(): 67 | unclassified_file.write(line+"\n") 68 | except Exception, e: 69 | print e 70 | 71 | self.logger.error("Cleaner has stopped unexpectedly: "+e.message) 72 | self.finalize() 73 | return -2 74 | 75 | self.finalize() 76 | return 0 77 | 78 | def finalize(self): 79 | """Finalize the job 80 | """ 81 | # super(TXTDenoiser, self).store_file() 82 | self.logger.debug("::: Text cleaning (END) :::") 83 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/files.py: -------------------------------------------------------------------------------- 1 | """Package defining Redis file manager 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | import base64 17 | from os import remove 18 | import redis 19 | 20 | 21 | class FileManager(object): 22 | """Redis file manager 23 | """ 24 | 25 | # def __init__(self, host="127.0.0.1", port=6379, db=0): 26 | # self.server = redis.StrictRedis(host, port, db) 27 | # self.hashmap_name = "fman" 28 | def __init__(self, app_config): 29 | self.hashmap = {} # Stores fileid -> filepath 30 | self.config = app_config 31 | 32 | # def retrieve_file(self, filename): 33 | # """Retrieve from redis hashmap 34 | # 35 | # Args 36 | # filename (str): Filename to retrieve 37 | # """ 38 | # b64_hash = self.server.hget(self.hashmap_name, filename) 39 | # data = base64.b64decode(b64_hash) 40 | # 41 | # with open(filename, 'wb') as zip_file: 42 | # zip_file.write(data) 43 | # 44 | # def store_file(self, filename): 45 | # """Store file to redis hashmap 46 | # 47 | # Args 48 | # filename (str): Filename to store 49 | # """ 50 | # with open(filename, 'rb') as zip_file: 51 | # b64_hash = base64.b64encode(zip_file.read()) 52 | # 53 | # self.server.hset(self.hashmap_name, filename, b64_hash) 54 | # remove(filename) 55 | 56 | def delete_file(self, filename): 57 | """Delete file from redis hashmap 58 | 59 | Args 60 | filename (str): Filename to delete 61 | """ 62 | # self.server.hdel(self.hashmap_name, filename) 63 | if filename not in self.hashmap: 64 | raise KeyError("FileManager: filename is not present in hashmap") 65 | 66 | del self.hashmap[filename] 67 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/logger.py: -------------------------------------------------------------------------------- 1 | """Package handling different loggers 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | import json 17 | import logging 18 | from time import sleep 19 | from pipeline.threads import StoppableThread 20 | from pipeline.queue import QueueManager 21 | 22 | 23 | class AppLogger(object): 24 | """Logger publishing to a redis queue 25 | """ 26 | 27 | def __init__(self, uid, local_logger, queue_ip="127.0.0.1", queue_port=6379): 28 | self.log_queue = QueueManager(host=queue_ip, port=queue_port, qname="logging") 29 | self.logger = local_logger 30 | 31 | # Identify the logging process 32 | self.uid = uid 33 | 34 | def log(self, level, message): 35 | """Log a message with a given level 36 | 37 | Parameters 38 | level (int): Log level 39 | message (str): Message to store 40 | """ 41 | log_mess = { 42 | "uid": self.uid, 43 | "lvl": level, 44 | "msg": message 45 | } 46 | 47 | self.log_queue.push(json.dumps(log_mess)) 48 | self.logger.log(level, "["+self.uid+"] "+message) 49 | 50 | def debug(self, message): 51 | """Log a debug message 52 | 53 | Parameters 54 | message (str): Message to store 55 | """ 56 | self.log(logging.DEBUG, message) 57 | 58 | def info(self, message): 59 | """Log an info message 60 | 61 | Parameters 62 | message (str): Message to store 63 | """ 64 | self.log(logging.INFO, message) 65 | 66 | def warning(self, message): 67 | """Log a warning message 68 | 69 | Parameters 70 | message (str): Message to store 71 | """ 72 | self.log(logging.WARNING, message) 73 | 74 | def error(self, message): 75 | """Log an error message 76 | 77 | Parameters 78 | message (str): Message to store 79 | """ 80 | self.log(logging.ERROR, message) 81 | 82 | def fatal(self, message): 83 | """Log a fatal message 84 | 85 | Parameters 86 | message (str): Message to store 87 | """ 88 | self.log(logging.FATAL, message) 89 | 90 | 91 | class LogWriter(StoppableThread): 92 | """Pops element from the logging queue and write them in the proper directory 93 | """ 94 | 95 | def __init__(self, app_logger, queue_ip="127.0.0.1", queue_port=6379): 96 | StoppableThread.__init__(self) 97 | 98 | self.log_queue = QueueManager(host=queue_ip, port=queue_port, qname="logging") 99 | self.logger = app_logger 100 | 101 | def write_logs(self): 102 | """Write logs to a local file 103 | """ 104 | while not self.log_queue.is_empty(): 105 | log_json = self.log_queue.pop() 106 | log_data = json.loads(log_json) 107 | 108 | self.logger.log(log_data["lvl"], "["+log_data["uid"]+"] "+log_data["msg"]) 109 | sleep(0.2) 110 | 111 | def run(self): 112 | self.logger.debug("Logger initiatied") 113 | 114 | while not self.stop_event.isSet(): 115 | self.write_logs() 116 | sleep(0.5) 117 | 118 | self.write_logs() 119 | self.logger.info("Logger stopped") 120 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/queue.py: -------------------------------------------------------------------------------- 1 | """ Package defining Redis queue 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | import json 17 | import redis 18 | from pipeline.commands import * 19 | 20 | 21 | class QueueManager(object): 22 | """ Redis queue manager. 23 | """ 24 | 25 | def __init__(self, host="127.0.0.1", port=6379, db=0, qname=None): 26 | self.server = redis.StrictRedis(host, port, db) 27 | 28 | if qname is not None: 29 | self.queue_name = qname 30 | else: 31 | self.queue_name = "default" 32 | 33 | def push(self, json_object): 34 | """Push JSON on a redis queue 35 | 36 | Parameters 37 | json_object (dict): JSON to push to redis 38 | """ 39 | self.server.rpush(self.queue_name, json_object) 40 | 41 | def pop(self): 42 | """Pop object from a redis queue 43 | 44 | Returns 45 | dict: JSON from redis 46 | """ 47 | return self.server.lpop(self.queue_name) 48 | 49 | def is_empty(self): 50 | """Test if the queue is empty or not 51 | 52 | Returns 53 | bool: True if empty, false otherwise 54 | """ 55 | return len(self) == 0 56 | 57 | def __len__(self): 58 | return self.server.llen(self.queue_name) 59 | 60 | 61 | class CommandQueueItem(object): 62 | """ Command stored in the redis queue. 63 | """ 64 | 65 | def __init__(self, filename="", jsondata="", logger=None, config=None): 66 | if filename != "": 67 | self.current_step = 0 68 | self.filename = filename 69 | self.tries = 0 70 | else: # Rebuild command from JSON 71 | data = json.loads(jsondata) 72 | self.current_step = data["command"] 73 | self.filename = data["filename"] 74 | self.tries = data["tries"] 75 | 76 | # self.filename = join(self.filename) 77 | self.logger = logger 78 | self.config = config 79 | 80 | # Building the command list 81 | self.steps = [] 82 | for cmd in self.config["commands"]["list"]: 83 | cmd_class = None 84 | cmd_config = self.config 85 | 86 | if type(cmd) == str: 87 | cmd_class = eval(cmd) 88 | elif type(cmd) == dict: 89 | if len(cmd.keys()) == 1: 90 | cmd_class = eval(cmd.keys()[0]) 91 | cmd_config["command"] = cmd.values()[0] 92 | if cmd_class is None: 93 | self.logger.fatal("Unreadable command list") 94 | raise SyntaxError( 95 | "Command list is not correctly formatted" 96 | ) 97 | 98 | self.steps.append( 99 | cmd_class(self.filename, self.logger, cmd_config) 100 | ) 101 | 102 | def execute(self): 103 | """Execute the command 104 | 105 | Returns: 106 | int: 0 when no errors happen, >0 otherwise 107 | """ 108 | command = self.steps[self.current_step] 109 | cmd_result = command.execute() 110 | 111 | if cmd_result == 1: # The process has failed 112 | self.tries += 1 113 | return 1 114 | 115 | # Current step is incremented 116 | self.current_step += 1 117 | 118 | # Stop flag 119 | if self.current_step >= len(self.steps): 120 | self.current_step = -1 121 | self.tries = 0 122 | 123 | return 0 124 | 125 | def __str__(self): 126 | repr_str = { 127 | "command": self.current_step, 128 | "filename": self.filename, 129 | "tries": self.tries 130 | } 131 | 132 | return json.dumps(repr_str) 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/threads.py: -------------------------------------------------------------------------------- 1 | """Utility package for threads 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from threading import Thread, Event 17 | 18 | 19 | class StoppableThread(Thread): 20 | """A thread with the ability to be remotely stopped 21 | """ 22 | 23 | def __init__(self): 24 | Thread.__init__(self) 25 | self.stop_event = Event() 26 | 27 | def is_stopped(self): 28 | """Test if a thread is stopped 29 | 30 | Returns 31 | bool: True if stopped, False otherwise 32 | """ 33 | return self.stop_event.isSet() 34 | 35 | def stop(self): 36 | """Stop the thread 37 | """ 38 | self.stop_event.set() 39 | -------------------------------------------------------------------------------- /packages/pipeline/src/pipeline/utils.py: -------------------------------------------------------------------------------- 1 | """Main utility package 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from hashlib import sha256 17 | from os import makedirs 18 | from os.path import join 19 | from random import choice 20 | from shutil import move 21 | from string import ascii_lowercase, digits 22 | from time import strftime, gmtime 23 | 24 | local_config = { 25 | "tmp_dir": "tmp", # FIXME not used anymore 26 | "dirs": ["png", "txt/segments"] 27 | } 28 | 29 | 30 | def create_data_directory(filename, tmp_dir): 31 | """Create the data directory for a PDF file 32 | 33 | Parameters: 34 | filename (:func:`str`): 35 | tmp_dir (str): 36 | 37 | Returns: 38 | :func:`str` - Location of the directory 39 | """ 40 | if not filename.endswith(".pdf"): 41 | return None 42 | 43 | # Generate a unique directory name 44 | file_hash = sha256(open(filename).read()).hexdigest()[0:12] 45 | creation_time = strftime("%Y%m%d.%H%M%S", gmtime()) 46 | rand_str = ''.join(choice(ascii_lowercase + digits) for _ in range(6)) 47 | 48 | tmp_dir = join(tmp_dir, "%s.%s.%s" % (file_hash, creation_time, rand_str)) 49 | 50 | # Creating main directory with the PDF inside 51 | makedirs(tmp_dir) 52 | move(filename, tmp_dir) 53 | 54 | # Creating subdirectories 55 | for subdir in local_config["dirs"]: 56 | makedirs(join(tmp_dir, subdir)) 57 | 58 | return tmp_dir 59 | -------------------------------------------------------------------------------- /ui.py: -------------------------------------------------------------------------------- 1 | """Manage the OCR pipeline, both remotely and locally 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from __future__ import division 17 | import logging 18 | from os import listdir 19 | import os 20 | from apputils.config import load_config 21 | from apputils.fileop import create_directories 22 | from denoiser import Denoiser 23 | from os.path import join, isdir, exists, abspath 24 | from fabric.contrib.console import confirm 25 | from fabric.contrib.project import upload_project 26 | from fabric.contrib.files import exists as fab_exists 27 | from fabric.decorators import task, roles, runs_once 28 | from fabric.api import env, run, local, cd 29 | from fabric.operations import sudo 30 | from fabric.tasks import Task, execute 31 | from fabric.state import output 32 | from shutil import copytree, rmtree 33 | 34 | # Initialize the app_config variable 35 | load_config("conf/app.yaml", os.environ['ROOT']) 36 | from apputils.config import app_config 37 | 38 | logger = logging.getLogger('app') 39 | 40 | # Hosts configurations 41 | # env.roledefs = app_config["machines"] 42 | # env.hosts = [ip for ips in env.roledefs.values() for ip in ips] 43 | 44 | # Extra default configuration 45 | env.warn_only = True 46 | output.update({ 47 | "warnings": False, 48 | "running": False 49 | }) 50 | 51 | # Global variable to know if the execution is done locally or remotely 52 | local_exec = False 53 | 54 | 55 | # @task 56 | # def install(): 57 | # """Install the pipeline on the specified cluster 58 | # """ 59 | # logger.debug("Installing pipeline...") 60 | # 61 | # local_root = os.environ['ROOT'] 62 | # remote_root = app_config['root'] 63 | # 64 | # if local_exec: 65 | # if abspath(local_root) == abspath(remote_root): 66 | # logger.error("Source and destination folder are the same") 67 | # exit(1) 68 | # 69 | # if exists(remote_root): 70 | # if confirm("Existing data will be deleted. Do you want to proceed anyway?", default=False): 71 | # rmtree(remote_root) 72 | # else: 73 | # logger.error("Pipeline destination folder already exists") 74 | # exit(2) 75 | # 76 | # copytree(local_root, remote_root) 77 | # local(remote_root+'/utils/install.sh') 78 | # else: 79 | # if app_config["use_sudo"]: 80 | # run_fn = sudo 81 | # else: 82 | # run_fn = run 83 | # 84 | # if not fab_exists(remote_root): 85 | # logging.debug("Building remote directory...") 86 | # run_fn("mkdir -p "+remote_root) 87 | # else: 88 | # if not confirm("Existing data will be deleted. 
Do you want to proceed anyway?", default=False): 89 | # logger.error("Pipeline destination folder already exists") 90 | # exit(2) 91 | # 92 | # logging.debug("Uploading project...") 93 | # upload_project( 94 | # local_dir=local_root, 95 | # remote_dir=remote_root, 96 | # use_sudo=app_config["use_sudo"] 97 | # ) 98 | # 99 | # if run_fn(remote_root+"/utils/auth.sh").failed: 100 | # logger.error("An error occured with modifying the right for the pipeline") 101 | # exit(3) 102 | # 103 | # if run(remote_root+"/utils/install.sh").failed: 104 | # logger.error("An error occured with the install script") 105 | # exit(4) 106 | # 107 | # logger.info("Pipeline successfully installed") 108 | 109 | 110 | @task 111 | def init(): 112 | """Initialize the app (available for localhost only) 113 | """ 114 | logger.debug("Initializing app...") 115 | 116 | if not local_exec: 117 | logger.error("Pipeline can only be initialized locally") 118 | exit(1) 119 | 120 | # Create project tree 121 | create_directories(app_config["dirs"]) 122 | logger.info("App initialized") 123 | 124 | 125 | @task 126 | def create_models(dataset_dir): 127 | """Initialize the app (available for localhost only) 128 | 129 | Parameters: 130 | dataset_dir (:func:`str`): Path to the training set 131 | """ 132 | logger.debug("Creating models...") 133 | 134 | if not local_exec: 135 | logger.error("Models can only be generated locally") 136 | exit(1) 137 | 138 | # Modify the configuration for local execution 139 | app_config['root'] = os.environ['ROOT'] 140 | 141 | # Generate inline models and train classifier 142 | denoiser = Denoiser(app_config) 143 | 144 | if not exists(dataset_dir) or not isdir(dataset_dir): 145 | logger.error(dataset_dir+" is not a valid directory") 146 | exit(2) 147 | 148 | dataset = [join(dataset_dir, f) for f in listdir(dataset_dir)] 149 | 150 | denoiser.generate_models(dataset) 151 | logger.info("Inline models generated") 152 | 153 | denoiser.train(dataset) 154 | logger.info("Classifier trained") 155 | 156 | 157 | @task 158 | def check(): 159 | """Check that the 3rd party software are all installed 160 | """ 161 | base_dir = "utils/check" 162 | scripts = [join(base_dir, sc) for sc in listdir(base_dir) if sc.endswith(".sh")] 163 | 164 | for script in scripts: 165 | launch_script(script) 166 | 167 | 168 | # @task 169 | # @runs_once 170 | # def start_pipeline(): 171 | # """Start the conversion process across the platform. 
172 | # """ 173 | # execute(start_master) 174 | # execute(start_slave) 175 | 176 | 177 | @task 178 | @roles("master") 179 | def start_master(): 180 | """Start the master server on the local machine 181 | """ 182 | launch_script("utils/run-wrapper.sh", ["--master", ("> %s/master.log" % app_config["dirs"]["logs"])], True) 183 | 184 | 185 | @task 186 | @roles("slaves") 187 | def start_slave(): 188 | """Start a slave on the local machine 189 | """ 190 | launch_script("utils/run-wrapper.sh", ["--slave", ("> %s/slave.log" % app_config["dirs"]["logs"])], True) 191 | 192 | 193 | def launch_script(script_name, script_opts=list(), background=False): 194 | """Launch any script you specify 195 | 196 | Parameters: 197 | script_name (:func:`str`): Path of the script to run 198 | script_opts (:func:`str`): Options to pass to the script 199 | background (bool): Whether to launch the script in background 200 | """ 201 | if local_exec: 202 | root_dir = os.environ['ROOT'] 203 | else: 204 | root_dir = app_config['root'] 205 | 206 | command = join(root_dir, script_name)+" "+" ".join(script_opts) 207 | 208 | logger.debug("Execute: "+command) 209 | 210 | with cd(root_dir): 211 | if local_exec: 212 | if background: 213 | local(command) 214 | return 215 | 216 | if local(command).failed: 217 | logger.error("Command failed "+command) 218 | exit(1) 219 | else: 220 | if background: 221 | run(command) 222 | return 223 | 224 | if run(command).failed: 225 | logger.error("Command failed "+command) 226 | exit(1) 227 | 228 | 229 | def print_help(light=False): 230 | """Print the help message 231 | 232 | Parameter 233 | light (bool): If true, only print the usage 234 | """ 235 | print "Usage: python ui.py [-h] command [args [args ...]]" 236 | 237 | if light: 238 | return 239 | 240 | print "Commands: " 241 | 242 | default_func_help = "No description available" 243 | for func in functions.keys(): 244 | help_str = "* "+func+" "*(20-len(func)) 245 | 246 | if func in functions_help.keys(): 247 | help_str += functions_help[func] 248 | else: 249 | help_str += default_func_help 250 | 251 | print help_str 252 | 253 | print "" 254 | print "Options:" 255 | print_opts = ', '.join(help_opts) 256 | 257 | print print_opts+" "*(22-len(print_opts))+"Print this help message" 258 | 259 | 260 | def setup_local_exec(): 261 | """Set the local_exec variable to true 262 | """ 263 | global local_exec 264 | local_exec = True 265 | 266 | 267 | if __name__ == "__main__": 268 | import sys 269 | args = sys.argv[1:] 270 | 271 | setup_local_exec() 272 | 273 | help_opts = ["--help", "-h"] 274 | functions = {k: v for k, v in locals().items() if isinstance(v, Task) and v.__module__ == "__main__" 275 | and k != "print_help"} 276 | functions_help = {k: v.__doc__.split("\n")[0] for k, v in functions.items() if v.__doc__ is not None} 277 | 278 | if len(args) == 0: 279 | logger.error("No function specified") 280 | print_help(True) 281 | exit(1) 282 | 283 | if len(args) == 1 and args[0] in help_opts: 284 | print_help() 285 | exit(0) 286 | 287 | if args[0] not in functions.keys(): 288 | logger.fatal("Command "+args[0]+" unknown") 289 | print_help(True) 290 | exit(1) 291 | 292 | if len(args) > 1: 293 | functions[args[0]](*args[1:]) 294 | else: 295 | functions[args[0]]() 296 | 297 | 298 | -------------------------------------------------------------------------------- /ui.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### Script interacting w/ the pipeline. Same script work for remote and local access. 
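#
# Illustrative invocations (a sketch, assuming conf/app.yaml is in place and, for the
# --remote path, that the fabric role definitions are configured):
#   ./ui.sh --local init            # create the app directory tree (local only)
#   ./ui.sh --local check           # run the utils/check/*.sh sanity scripts
#   ./ui.sh --local create_models /path/to/trainset
#   ./ui.sh --remote start_master   # forwarded to "fab -f ui.py start_master"
# Without -l/--local or -r/--remote, the command is executed locally by default.
#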
3 | # 4 | # Authors: 5 | # Philippe Dessauw 6 | # philippe.dessauw@nist.gov 7 | # 8 | # Sponsor: 9 | # Alden Dima 10 | # alden.dima@nist.gov 11 | # Information Systems Group 12 | # Software and Systems Division 13 | # Information Technology Laboratory 14 | # National Institute of Standards and Technology 15 | # http://www.nist.gov/itl/ssd/is 16 | ### 17 | source $(dirname $0)/utils/env.sh 18 | 19 | contains() { 20 | local element 21 | 22 | for element in "${@:2}" 23 | do 24 | if [ "$element" == "$1" ] 25 | then 26 | echo 0 27 | return 28 | fi 29 | done 30 | 31 | echo 1 32 | } 33 | 34 | local_opts=("-l" "--local") 35 | remote_opts=("-r" "--remote") 36 | 37 | # Store arguments in another variable to manipulate them easily 38 | args=($*) 39 | 40 | if [ `contains "$1" "${local_opts[@]}"` -eq 0 ] 41 | then 42 | # Local execution 43 | args=("${args[@]:1}") 44 | python2 ui.py ${args[@]} 2>&1 45 | elif [ `contains "$1" "${remote_opts[@]}"` -eq 0 ] 46 | then 47 | # Remote execution 48 | args=("${args[@]:1}") 49 | 50 | command=${args[0]} 51 | args=("${args[@]:1}") 52 | 53 | # Building argument string 54 | arg_count=${#args[@]} 55 | args_str="" 56 | 57 | if [ ${arg_count} -ne 0 ] 58 | then 59 | arg_index=1 60 | args_str=":" 61 | 62 | for arg in ${args[@]} 63 | do 64 | args_str=${args_str}${arg} 65 | 66 | if [ ${arg_index} -ne ${arg_count} ] 67 | then 68 | args_str=${args_str}"," 69 | fi 70 | 71 | (( arg_index++ )) 72 | done 73 | fi 74 | 75 | # Deployment command 76 | fab -f ui.py ${command}${args_str} 2>&1 77 | else 78 | # By default, app is executed locally 79 | python2 ui.py ${args[@]} 2>&1 80 | fi -------------------------------------------------------------------------------- /utils/auth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### Give the logged user regular rights on the ocr-pipeline folder 3 | # 4 | # Authors: 5 | # Philippe Dessauw 6 | # philippe.dessauw@nist.gov 7 | # 8 | # Sponsor: 9 | # Alden Dima 10 | # alden.dima@nist.gov 11 | # Information Systems Group 12 | # Software and Systems Division 13 | # Information Technology Laboratory 14 | # National Institute of Standards and Technology 15 | # http://www.nist.gov/itl/ssd/is 16 | ### 17 | source $(dirname $0)/env.sh 18 | 19 | AUTH_GROUP=`bash -c 'id -gn $SUDO_USER'` 20 | AUTH_USER=`bash -c 'id -un $SUDO_USER'` 21 | 22 | chown ${AUTH_USER}:${AUTH_GROUP} -R ${ROOT} 23 | -------------------------------------------------------------------------------- /utils/check/ocropy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### Testing ocropy 3 | # 4 | # Authors: 5 | # Philippe Dessauw 6 | # philippe.dessauw@nist.gov 7 | # 8 | # Sponsor: 9 | # Alden Dima 10 | # alden.dima@nist.gov 11 | # Information Systems Group 12 | # Software and Systems Division 13 | # Information Technology Laboratory 14 | # National Institute of Standards and Technology 15 | # http://www.nist.gov/itl/ssd/is 16 | ### 17 | source $(dirname $0)/../env.sh 18 | TMP_DIRNAME=`get_value_for_key dirs/temp` || nok 19 | TMP_DIR="${ROOT}/${TMP_DIRNAME}" 20 | OCROPY_DIR=`get_value_for_key commands/list#1/PNGReader/ocropy/location` || nok 21 | 22 | ocropy_check_file="${OCROPY_DIR}/tests/testpage.png" 23 | 24 | echo "[1/4] Checking directory existence..." 25 | if [ ! -d "${OCROPY_DIR}" ] 26 | then 27 | echo "Resource has not been downloaded" 28 | nok 29 | fi 30 | 31 | echo "[2/4] Checking ocropus-nlbin..." 
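# The three checks below exercise the same binarize -> segment -> predict sequence
# (ocropus-nlbin, ocropus-gpageseg, ocropus-rpred) that PNGReader runs on each document.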
32 | python2 ${OCROPY_DIR}/ocropus-nlbin ${ocropy_check_file} -o ${TMP_DIR} 2>&1 >/dev/null 33 | if [ $? -ne 0 ] 34 | then 35 | echo "ocropus-nlbin command failed" 36 | nok 37 | fi 38 | 39 | echo "[3/4] Checking ocropus-gpageseg..." 40 | python2 ${OCROPY_DIR}/ocropus-gpageseg ${TMP_DIR}/*.bin.png 2>&1 >/dev/null 41 | if [ $? -ne 0 ] 42 | then 43 | echo "ocropus-gpageseg command failed" 44 | nok 45 | fi 46 | 47 | echo "[4/4] Checking ocropus-rpred..." 48 | python2 ${OCROPY_DIR}/ocropus-rpred -n ${TMP_DIR}/*/*.bin.png 2>&1 >/dev/null 49 | if [ $? -ne 0 ] 50 | then 51 | echo "ocropus-rpred command failed" 52 | nok 53 | fi 54 | 55 | ok -------------------------------------------------------------------------------- /utils/check/packages.py: -------------------------------------------------------------------------------- 1 | """ Check the presence of Python packages needed to run the pipeline 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | from pipeline import run_master, run_slave 17 | import nltk 18 | 19 | nltk.data.find('tokenizers/punkt') 20 | -------------------------------------------------------------------------------- /utils/check/python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### Test of the python installation 3 | # 4 | # Authors: 5 | # Philippe Dessauw 6 | # philippe.dessauw@nist.gov 7 | # 8 | # Sponsor: 9 | # Alden Dima 10 | # alden.dima@nist.gov 11 | # Information Systems Group 12 | # Software and Systems Division 13 | # Information Technology Laboratory 14 | # National Institute of Standards and Technology 15 | # http://www.nist.gov/itl/ssd/is 16 | ### 17 | source $(dirname $0)/../env.sh 18 | 19 | python2 --version 20 | 21 | PYTHONPATH=${ROOT}/src python2 ${CURRENT_DIR}/packages.py 22 | if [ $? -ne 0 ] 23 | then 24 | echo "Packages inclusion failed" 25 | nok 26 | fi 27 | 28 | ok -------------------------------------------------------------------------------- /utils/check/redis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### Test the Redis installation 3 | # 4 | # Authors: 5 | # Philippe Dessauw 6 | # philippe.dessauw@nist.gov 7 | # 8 | # Sponsor: 9 | # Alden Dima 10 | # alden.dima@nist.gov 11 | # Information Systems Group 12 | # Software and Systems Division 13 | # Information Technology Laboratory 14 | # National Institute of Standards and Technology 15 | # http://www.nist.gov/itl/ssd/is 16 | ### 17 | source $(dirname $0)/../env.sh 18 | TMP_DIRNAME=`get_value_for_key dirs/temp` || nok 19 | TMP_DIR="${ROOT}/${TMP_DIRNAME}" 20 | redis_check_file=${TMP_DIR}/redis.check 21 | 22 | which redis-server > ${redis_check_file} 2>/dev/null 23 | if [ `cat ${redis_check_file} | wc -l` -ne 1 ] || [ ! -e `cat ${redis_check_file}` ] 24 | then 25 | rm ${redis_check_file} 26 | nok 27 | fi 28 | 29 | redis-cli --version | cut -d' ' -f2 > ${redis_check_file} 2>/dev/null 30 | 31 | if [ `cat ${redis_check_file} | cut -d'.' -f1` -eq 2 ] 32 | then 33 | if [ `cat ${redis_check_file} | cut -d'.' 
-f2` -lt 6 ] 34 | then 35 | rm ${redis_check_file} 36 | nok 37 | fi 38 | else 39 | rm ${redis_check_file} 40 | nok 41 | fi 42 | 43 | #REDIS_SERVER=`get_value_for_key machines/master#0` || nok 44 | REDIS_SERVER=`get_value_for_key redis/host` || nok 45 | REDIS_PORT=`get_value_for_key redis/port` || nok 46 | 47 | redis-cli -h ${REDIS_SERVER} -p ${REDIS_PORT} ping > ${redis_check_file} 48 | if [ `cat ${redis_check_file}` != "PONG" ] 49 | then 50 | rm ${redis_check_file} 51 | nok 52 | else 53 | rm ${redis_check_file} 54 | ok 55 | fi -------------------------------------------------------------------------------- /utils/check/xserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### Test that the xvfb-run command is present 3 | # 4 | # Authors: 5 | # Philippe Dessauw 6 | # philippe.dessauw@nist.gov 7 | # 8 | # Sponsor: 9 | # Alden Dima 10 | # alden.dima@nist.gov 11 | # Information Systems Group 12 | # Software and Systems Division 13 | # Information Technology Laboratory 14 | # National Institute of Standards and Technology 15 | # http://www.nist.gov/itl/ssd/is 16 | ### 17 | source $(dirname $0)/../env.sh 18 | TMP_DIRNAME=`get_value_for_key dirs/temp` || nok 19 | TMP_DIR="${ROOT}/${TMP_DIRNAME}" 20 | XVFB_FILE=${TMP_DIR}/xvfb.test 21 | XVFB_MSG="xvfb" 22 | 23 | xvfb-run -a echo ${XVFB_MSG} > ${XVFB_FILE} 2>/dev/null 24 | if [ `cat ${XVFB_FILE} | grep ${XVFB_MSG} | wc -l` -ne 0 ] 25 | then 26 | rm ${XVFB_FILE} 27 | ok 28 | else 29 | rm ${XVFB_FILE} 30 | nok 31 | fi -------------------------------------------------------------------------------- /utils/config.py: -------------------------------------------------------------------------------- 1 | """Prints a given configuration key from a given configuration file 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. 
Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | if __name__ == "__main__": 17 | import sys 18 | import os 19 | from apputils.config import * 20 | 21 | load_config(sys.argv[1], os.environ['ROOT']) 22 | print get_config(sys.argv[2]) 23 | -------------------------------------------------------------------------------- /utils/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### Store basic information about the environment 3 | # 4 | # Can be called from any script using: 5 | # > source path/to/env.sh 6 | # 7 | # Authors: 8 | # Philippe Dessauw 9 | # philippe.dessauw@nist.gov 10 | # 11 | # Sponsor: 12 | # Alden Dima 13 | # alden.dima@nist.gov 14 | # Information Systems Group 15 | # Software and Systems Division 16 | # Information Technology Laboratory 17 | # National Institute of Standards and Technology 18 | # http://www.nist.gov/itl/ssd/is 19 | ### 20 | 21 | # Defining fonctions 22 | function parse_yaml { 23 | local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') 24 | 25 | sed -ne "s|^\($s\)\($w\)$s:$s\"\(.*\)\"$s\$|\1$fs\2$fs\3|p" \ 26 | -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | 27 | awk -F$fs '{ 28 | indent = length($1)/4; 29 | vname[indent] = $2; 30 | 31 | for (i in vname) {if (i > indent) {delete vname[i]}} 32 | if (length($3) > 0) { 33 | vn=""; for (i=0; i ${TMP_CONF_FILE} 63 | #source ${TMP_CONF_FILE} 64 | #rm ${TMP_CONF_FILE} 65 | 66 | # If a python path has been setup and is a directory 67 | #if [ -d "${python_path}" ] 68 | #then 69 | # export PATH=${python_path}/bin:$PATH 70 | #else 71 | # if [ -n "${python_path}" ] 72 | # then 73 | # echo "${python_path} is not a valid directory" 74 | # nok 75 | # fi 76 | #fi 77 | # 78 | ## If a python environment has been setup and is a directory 79 | #if [ -d "${python_virtualenv}" ] 80 | #then 81 | # current_python_env=`python ${ROOT}/utils/prefix.py` 82 | # 83 | # if [ ${current_python_env} != ${python_virtualenv} ] 84 | # then 85 | # source activate ${python_virtualenv} &>/dev/null 86 | # fi 87 | #else 88 | # if [ -n "${python_virtualenv}" ] 89 | # then 90 | # echo "${python_virtualenv} is not a valid directory" 91 | # nok 92 | # fi 93 | #fi 94 | 95 | # Useful variables 96 | CURRENT_DIR=`echo $(cd -P $(dirname $0) && pwd)` 97 | 98 | #TMP_DIRNAME=`get_value_for_key dirs/temp` || nok 99 | #TMP_DIR="${ROOT}/${TMP_DIRNAME}" 100 | 101 | -------------------------------------------------------------------------------- /utils/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### Automatically install all the necessary packages to run the pipeline 3 | # 4 | # Authors: 5 | # Philippe Dessauw 6 | # philippe.dessauw@nist.gov 7 | # 8 | # Sponsor: 9 | # Alden Dima 10 | # alden.dima@nist.gov 11 | # Information Systems Group 12 | # Software and Systems Division 13 | # Information Technology Laboratory 14 | # National Institute of Standards and Technology 15 | # http://www.nist.gov/itl/ssd/is 16 | ### 17 | source $(dirname $0)/env.sh 18 | 19 | pkgs=("apputils" "denoiser" "pipeline") 20 | 21 | for pkg in ${pkgs[@]} 22 | do 23 | if [ `pip list | grep ${pkg} | wc -l` -eq 1 ] 24 | then 25 | pip uninstall -y ${pkg} 26 | fi 27 | 28 | cd ${ROOT}/packages/${pkg} 29 | rm -r dist 30 | python2 setup.py sdist 31 | pip 
install dist/*.tar.gz --no-cache-dir 32 | done -------------------------------------------------------------------------------- /utils/prefix.py: -------------------------------------------------------------------------------- 1 | """Returns the prefix of the local Python installation 2 | 3 | .. Authors: 4 | Philippe Dessauw 5 | philippe.dessauw@nist.gov 6 | 7 | .. Sponsor: 8 | Alden Dima 9 | alden.dima@nist.gov 10 | Information Systems Group 11 | Software and Systems Division 12 | Information Technology Laboratory 13 | National Institute of Standards and Technology 14 | http://www.nist.gov/itl/ssd/is 15 | """ 16 | if __name__ == "__main__": 17 | import sys 18 | print sys.prefix 19 | -------------------------------------------------------------------------------- /utils/run-wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### Wrap the pipeline in a terminal multiplexer (to avoid using nohup, which doesn't work with fabric) 3 | # 4 | # Authors: 5 | # Philippe Dessauw 6 | # philippe.dessauw@nist.gov 7 | # 8 | # Sponsor: 9 | # Alden Dima 10 | # alden.dima@nist.gov 11 | # Information Systems Group 12 | # Software and Systems Division 13 | # Information Technology Laboratory 14 | # National Institute of Standards and Technology 15 | # http://www.nist.gov/itl/ssd/is 16 | ### 17 | if [ `tmux has-session -t pipeline 2>/dev/null; echo $?` -ne 0 ] 18 | then 19 | tmux new -d -s pipeline 20 | else 21 | tmux new-window -t pipeline 22 | fi 23 | 24 | # Send the pipeline command and ensure that it closes the tmux-window on exit 25 | tmux send -t pipeline "bash -c '$(dirname $0)/run.sh $*'; exit" Enter 26 | -------------------------------------------------------------------------------- /utils/run.py: -------------------------------------------------------------------------------- 1 | """This script launches the pipeline on the local machine. The user specify the actor to start on the command line. 2 | 3 | For more information on how this script works, you can use the following command:: 4 | 5 | $ python2 run.py --help 6 | 7 | .. Authors: 8 | Philippe Dessauw 9 | philippe.dessauw@nist.gov 10 | 11 | .. Sponsor: 12 | Alden Dima 13 | alden.dima@nist.gov 14 | Information Systems Group 15 | Software and Systems Division 16 | Information Technology Laboratory 17 | National Institute of Standards and Technology 18 | http://www.nist.gov/itl/ssd/is 19 | """ 20 | import os 21 | 22 | if __name__ == "__main__": 23 | from pipeline import run_master, run_slave 24 | import argparse 25 | 26 | from apputils.config import load_config 27 | load_config("conf/app.yaml", os.environ['ROOT']) 28 | 29 | from apputils.config import app_config 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("-s", "--slave", action="store_true", help="launch a slave process") 33 | parser.add_argument("-m", "--master", action="store_true", help="launch a master process") 34 | args = parser.parse_args() 35 | 36 | if args.master == args.slave: 37 | print "Please choose what kind of process to launch" 38 | parser.print_help() 39 | exit() 40 | elif args.master: 41 | print "Starting master..." 42 | run_master(app_config) 43 | elif args.slave: 44 | print "Starting slave..." 
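        # run_slave() blocks until the process receives SIGINT; in normal operation
        # utils/run.sh wraps this script with xvfb-run and utils/run-wrapper.sh keeps
        # it alive inside a tmux session. Direct invocation (illustrative, assuming
        # ROOT points at the pipeline root):
        #   $ ROOT=/path/to/ocr-pipeline python2 utils/run.py --slave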
45 | run_slave(app_config) 46 | -------------------------------------------------------------------------------- /utils/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### Start a slave or a master on the local machine 3 | # 4 | # Authors: 5 | # Philippe Dessauw 6 | # philippe.dessauw@nist.gov 7 | # 8 | # Sponsor: 9 | # Alden Dima 10 | # alden.dima@nist.gov 11 | # Information Systems Group 12 | # Software and Systems Division 13 | # Information Technology Laboratory 14 | # National Institute of Standards and Technology 15 | # http://www.nist.gov/itl/ssd/is 16 | ### 17 | source $(dirname $0)/env.sh 18 | 19 | xvfb-run -a python2 ${ROOT}/utils/run.py $* 2>&1 20 | --------------------------------------------------------------------------------
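New processing steps are added by subclassing pipeline.command.Command, exporting the class from packages/pipeline/src/pipeline/commands/__init__.py and listing it under commands/list in conf/app.yaml, where CommandQueueItem resolves it by name. The sketch below is illustrative only: the TXTWordCount class and its min_length parameter are hypothetical, it assumes the *.clean.txt output produced by TXTDenoiser, and its return codes follow the convention used by the existing commands (0 on success, 1 for a failure that the slave may retry up to commands/tries times).

    """Hypothetical example command: counts words in the denoised text file."""
    from os import listdir
    from os.path import isfile, join
    from pipeline.command import Command


    class TXTWordCount(Command):
        """Writes a .wc file next to the .clean.txt output of TXTDenoiser"""

        def __init__(self, filename, logger, config):
            super(TXTWordCount, self).__init__(filename, logger, config)

            # Per-command parameters arrive under config["command"], i.e. the value
            # of the {TXTWordCount: {...}} entry in commands/list of conf/app.yaml
            self.min_length = config["command"].get("min_length", 1)

        def execute(self):
            txt_dir = join(self.unzipped, "txt")
            clean_files = [join(txt_dir, f) for f in listdir(txt_dir)
                           if isfile(join(txt_dir, f)) and f.endswith(".clean.txt")]

            if len(clean_files) != 1:
                self.logger.error("Incorrect number of clean text files in " + txt_dir)
                self.finalize()
                return 1  # treated as a retryable failure by CommandQueueItem

            with open(clean_files[0]) as clean_file:
                words = [w for w in clean_file.read().split() if len(w) >= self.min_length]

            with open(clean_files[0] + ".wc", "w") as wc_file:
                wc_file.write(str(len(words)) + "\n")

            self.finalize()
            return 0

        def finalize(self):
            self.logger.debug("::: Word count (END) :::")

The new module would then be re-exported with a star import in pipeline/commands/__init__.py (next to the existing pdfconverter, pngreader and txtdenoiser imports) and appended to the command list in the configuration, for example as the entry "- TXTWordCount: {min_length: 2}".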