113 |
114 | FAMIE: A Fast Active Learning Framework for Multilingual Information Extraction
115 |
116 |
121 |
122 |
123 | )
124 | }
125 | }
126 |
127 | export default withStyles(styles)(WelcomePage);
128 |
--------------------------------------------------------------------------------
/app-ui/src/routers/ProjectStartPage.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ProjectMain from '../components/project-summary/ProjectMain';
3 | import FileUploadMain from '../components/file-upload/FileUploadMain';
4 | import NotFoundPage from '../components/NotFoundPage';
5 | import ProjectParamsPage from '../components/set-project-params/ProjectParamsPage';
6 |
7 |
8 | class ProjectStartPage extends React.Component {
9 |
10 | constructor(props){
11 | super(props);
12 | this.props.setAllProjectVars(this.props.uri);
13 |
14 | console.log("Inside ProjectStartPage", this.props);
15 | }
16 |
17 | componentDidUpdate(prevProps, prevState) {
18 | console.log("ProjectStartPage did update", prevState, prevProps, this.props);
19 | }
20 |
 21 |     render() {
 22 |         if(!this.props.projectName){
 23 |             return <NotFoundPage/>
 24 |         }
 25 |
 26 |         if(this.props.projectUploadFinished && this.props.projectParamsFinished){
 27 |             // upload and parameter setup both done: show the project summary
 28 |             return <ProjectMain {...this.props}/>
 29 |         }
 30 |
 31 |         if(this.props.projectUploadFinished){
 32 |             // data uploaded, project parameters still to be set
 33 |             return <ProjectParamsPage {...this.props}/>
 34 |         }
 35 |
 36 |         // console.log("Doing file upload", props);
 37 |         return <FileUploadMain {...this.props}/>
 38 |     }
56 | }
57 |
58 | export default ProjectStartPage;
--------------------------------------------------------------------------------
/app-ui/src/styles/theme.js:
--------------------------------------------------------------------------------
1 | import { red } from '@material-ui/core/colors';
2 | import { createMuiTheme } from '@material-ui/core/styles';
3 |
4 | // A custom theme for this app
5 | // const theme = createMuiTheme({
6 | // palette: {
7 | // primary: {
8 | // main: '#556cd6',
9 | // },
10 | // secondary: {
11 | // main: '#19857b',
12 | // },
13 | // error: {
14 | // main: red.A400,
15 | // },
16 | // background: {
17 | // default: '#fff',
18 | // },
19 | // },
20 | // props: {
21 | // MuiButton: {
22 | // variant: 'contained',
23 | // color: 'primary'
24 | // },
25 | // },
26 | // });
27 |
28 | // export default theme;
--------------------------------------------------------------------------------
/app-ui/src/utils.js:
--------------------------------------------------------------------------------
1 | export function getSlug(projectName) {
2 | return encodeURIComponent(projectName);
3 | };
4 |
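  5 | // Truncate longString to at most maxLength characters, cutting at the last
  6 | // space within the limit when possible, and appending '...'.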
  7 | export function trimString(longString, maxLength){
  8 |     let displayedName = longString.trim();
  9 |     if(displayedName.length > maxLength){
 10 |         let lastSpace = -1;
 11 |         for(let textCharIndex = 0; textCharIndex < maxLength; textCharIndex++) {
 12 |             const currentChar = displayedName[textCharIndex];
 13 |             if(currentChar === ' '){
 14 |                 lastSpace = textCharIndex;
 15 |             }
 16 |         }
 17 |         if(lastSpace > -1){
 18 |             displayedName = displayedName.substring(0, lastSpace) + '...';
 19 |         }
 20 |         else{
 21 |             displayedName = displayedName.substring(0, maxLength).trim() + '...';
 22 |         }
 23 |     }
 24 |     return displayedName;
 25 | }
--------------------------------------------------------------------------------
/app-ui/state-changes.txt:
--------------------------------------------------------------------------------
1 | Files in this folder are modified from https://github.com/dataqa/dataqa/tree/master/dataqa-ui
2 |
--------------------------------------------------------------------------------
/app-ui/webpack.common.js:
--------------------------------------------------------------------------------
1 | const path = require('path');
2 |
3 | module.exports = {
4 | entry: './src/app.js',
5 | output: {
6 | path: path.join(__dirname, 'public'),
7 | filename: 'bundle.js'
8 | },
9 | module: {
10 | rules: [{
11 | loader: 'babel-loader',
12 | test: /\.js$/,
13 | exclude: /node_modules/,
14 | options: {
15 | plugins: ['recharts', 'lodash']
16 | }
17 | }, {
18 | test: /\.s?css$/,
19 | use: [
20 | 'style-loader',
21 | 'css-loader',
22 | 'sass-loader'
23 | ]
24 | }]
25 | }
26 | };
27 |
--------------------------------------------------------------------------------
/app-ui/webpack.dev.js:
--------------------------------------------------------------------------------
1 | const { merge } = require('webpack-merge');
2 | const common = require('./webpack.common.js');
3 | const path = require('path');
4 |
5 | module.exports = merge(common, {
6 | mode: 'development',
7 | devtool: 'eval-cheap-module-source-map',
8 | devServer: {
9 | static: {
10 | directory: path.join(__dirname, 'public'),
11 | },
12 | proxy: {
13 | '/api': {
14 | target: 'http://localhost:8888',
15 | secure: false
16 | }
17 | },
18 | historyApiFallback: true,
19 | hot: true
20 | }
21 | });
22 |
--------------------------------------------------------------------------------
/app-ui/webpack.prod.js:
--------------------------------------------------------------------------------
1 | const { merge } = require('webpack-merge');
2 | const common = require('./webpack.common.js');
3 |
4 | module.exports = merge(common, {
5 | mode: 'production',
6 | devtool: 'source-map'
7 | });
8 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/_static/style.css:
--------------------------------------------------------------------------------
1 | .wy-nav-content {
2 | max-width: 1500px !important;
3 | }
4 |
--------------------------------------------------------------------------------
/docs/source/_templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 | {% block extrahead %}
  3 |     <link rel="stylesheet" href="{{ pathto('_static/style.css', 1) }}" type="text/css">
4 | {% endblock %}
5 |
--------------------------------------------------------------------------------
/docs/source/architecture.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/docs/source/architecture.jpg
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('../'))
16 | sys.path.insert(0, os.path.abspath('../..'))
17 |
18 |
19 | # -- Project information -----------------------------------------------------
20 |
21 | project = 'famie'
22 | copyright = '2022, NLP Group at the University of Oregon'
23 | author = 'NLP Group at the University of Oregon'
24 |
25 | # The full version, including alpha/beta/rc tags
26 | release = ''
27 |
28 |
29 | # -- General configuration ---------------------------------------------------
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = [
35 | 'sphinx.ext.autodoc',
36 | 'sphinx.ext.coverage',
37 | 'sphinx.ext.napoleon',
38 | 'recommonmark',
39 | 'sphinx.ext.viewcode'
40 | ]
41 |
42 | source_suffix = ['.rst', '.md']
43 |
44 | master_doc = 'index'
45 |
46 |
47 | # Add any paths that contain templates here, relative to this directory.
48 | templates_path = ['_templates']
49 |
50 | # List of patterns, relative to source directory, that match files and
51 | # directories to ignore when looking for source files.
52 | # This pattern also affects html_static_path and html_extra_path.
53 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
54 |
55 |
56 | # -- Options for HTML output -------------------------------------------------
57 |
58 | # The theme to use for HTML and HTML Help pages. See the documentation for
59 | # a list of builtin themes.
60 | #
61 | html_theme = 'sphinx_rtd_theme'
62 |
63 | # Add any paths that contain custom static files (such as style sheets) here,
64 | # relative to this directory. They are copied after the builtin static files,
65 | # so a file named "default.css" will overwrite the builtin "default.css".
66 | html_static_path = ['_static']
67 |
--------------------------------------------------------------------------------
/docs/source/format.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "id": 2,
  3 |   "text": "On\npourra toujours parler à propos d'Averroès de \"décentrement du Sujet\".",
  4 |   "dspan": [149, 222],
  5 |   "tokens": [
  6 |     {"id": 1, "text": "On", "dspan": [149, 151], "span": [0, 2]},
  7 |     {"id": 2, "text": "pourra", "dspan": [152, 158], "span": [3, 9]},
  8 |     {"id": 3, "text": "toujours", "dspan": [159, 167], "span": [10, 18]},
  9 |     {"id": 4, "text": "parler", "dspan": [168, 174], "span": [19, 25]},
 10 |     {"id": 5, "text": "à", "dspan": [175, 176], "span": [26, 27]},
 11 |     {"id": 6, "text": "propos", "dspan": [177, 183], "span": [28, 34]},
 12 |     {"id": 7, "text": "d'", "dspan": [184, 186], "span": [35, 37]},
 13 |     {"id": 8, "text": "Averroès", "dspan": [186, 194], "span": [37, 45]},
 14 |     {"id": 9, "text": "de", "dspan": [195, 197], "span": [46, 48]},
 15 |     {"id": 10, "text": "\"", "dspan": [198, 199], "span": [49, 50]},
 16 |     {"id": 11, "text": "décentrement", "dspan": [199, 211], "span": [50, 62]},
 17 |     {"id": [12, 13], "text": "du", "expanded": [{"id": 12, "text": "de"}, {"id": 13, "text": "le"}], "span": [63, 65], "dspan": [212, 214]},
 18 |     {"id": 14, "text": "Sujet", "dspan": [215, 220], "span": [66, 71]},
 19 |     {"id": 15, "text": "\"", "dspan": [220, 221], "span": [71, 72]},
 20 |     {"id": 16, "text": ".", "dspan": [221, 222], "span": [72, 73]}
 21 |   ]
 22 | }
--------------------------------------------------------------------------------
/docs/source/howitworks.rst:
--------------------------------------------------------------------------------
1 | How FaMIE works
2 | =================
3 |
4 | In this section, we briefly present the most important details of the technologies used by FaMIE.
5 |
6 | .. figure:: ../../pics/full-process-color.png
7 | :width: 500
8 | :alt: Proxy Active Learning process
9 | :align: center
10 |
 11 | Incorporating current large-scale language models into the traditional AL process would dramatically increase the model training time, introducing long idle periods for annotators that might reduce annotation quality and quantity.
 12 | To address this issue without sacrificing final performance, FAMIE introduces **Proxy Active Learning**. In particular, a small proxy model is used to select unlabeled data, while the main model is trained during the annotators' long annotation time (i.e., main model training and data annotation are done in parallel). Given the main model trained in the previous iteration, knowledge distillation is employed to synchronize the knowledge between the main and proxy models at the current iteration.
13 |
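 14 | The following is a minimal, self-contained sketch of one such iteration. All names here (``StubModel``, ``annotate``, ``proxy_al_iteration``) are illustrative stand-ins rather than FAMIE's internal API; the point is the control flow, in which main-model training overlaps with human annotation. In the shipped configuration (``passed_args.json``), the proxy embedding is mMiniLMv2-L6 and the main (target) embedding is XLM-RoBERTa base.
 15 |
 16 | .. code-block:: python
 17 |
 18 |     import threading
 19 |
 20 |     class StubModel:
 21 |         """Illustrative stand-in for FAMIE's proxy/main sequence labelers."""
 22 |         def train(self, labeled): pass
 23 |         def select(self, unlabeled, k): return unlabeled[:k]  # FAMIE ranks by e.g. MNLP
 24 |         def distill_from(self, teacher): pass
 25 |
 26 |     def annotate(batch):
 27 |         # stands in for the human annotators labeling the selected batch
 28 |         return [(text, ['O']) for text in batch]
 29 |
 30 |     def proxy_al_iteration(proxy, main, labeled, unlabeled, k=50):
 31 |         proxy.train(labeled)                            # fast: small proxy model
 32 |         batch = proxy.select(unlabeled, k)              # proxy picks the next batch
 33 |         trainer = threading.Thread(target=main.train, args=(labeled,))
 34 |         trainer.start()                                 # main model trains in parallel...
 35 |         new_examples = annotate(batch)                  # ...while annotators label
 36 |         trainer.join()
 37 |         proxy.distill_from(main)                        # distillation syncs the two models
 38 |         return labeled + new_examples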
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. trankit documentation master file, created by
2 | sphinx-quickstart on March 31 10:21:23 2021.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | FaMIE's Documentation
7 | ================================================
8 |
  9 | FAMIE is a comprehensive and efficient **active learning** (AL) toolkit for **multilingual information extraction** (IE). FAMIE is designed to address a fundamental problem in existing AL frameworks where annotators need to wait for a long time between annotation batches due to the time-consuming nature of model training and data selection at each AL iteration. With a novel proxy AL mechanism (see :doc:`howitworks`) and the integration of our SOTA multilingual toolkit `Trankit <https://github.com/nlp-uoregon/trankit>`_, FAMIE can quickly provide users with a labeled dataset and a ready-to-use model for different IE tasks over 100 languages.
10 |
11 | FAMIE's github: https://github.com/nlp-uoregon/famie
12 |
13 | FAMIE's demo website: http://nlp.uoregon.edu:9000/
14 |
15 | .. toctree::
16 | :maxdepth: 2
17 | :caption: Introduction
18 |
19 | installation
20 | overview
21 | howitworks
22 |
--------------------------------------------------------------------------------
/docs/source/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | Installing *FaMIE* is easily done via one of the following methods:
4 |
5 | ## Using pip
6 |
7 | ```
8 | pip install famie
9 | ```
 10 | This command installs *FaMIE* and all of its dependencies automatically.
11 |
12 | ## From source
13 | ```
14 | git clone https://github.com/nlp-uoregon/famie
15 | cd famie
16 | pip install -e .
17 | ```
 18 | These commands clone our GitHub repo and install *FaMIE* from source.
19 |
 20 | If you encounter any problem with the installation, please raise an issue [here](https://github.com/nlp-uoregon/famie/issues/new) to let us know. Thanks.
21 |
--------------------------------------------------------------------------------
/docs/source/overview.md:
--------------------------------------------------------------------------------
1 | # Quick examples
2 |
3 | ## Initialization
4 | To start an annotation session, please use the following command:
  5 | ```
6 | famie start
7 | ```
  8 | This starts a server on the user's local machine (no data or models ever leave the machine). Users can then access FAMIE's web interface at http://127.0.0.1:9000/
9 |
 10 | To start a new project, users need to upload an unlabeled dataset file together with an entity type file (both in plain text format) to the web interface. After that, they are directed to a data statistics page. Clicking the button in the bottom left corner starts the labeling process.
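 11 | Both files are plain text: the unlabeled dataset has one document per line, and the entity type file lists one entity type per line. A minimal illustration (the file names and labels below are only examples):
 12 |
 13 | `unlabeled-data.txt`:
 14 | ```
 15 | Oregon is a beautiful state!
 16 | The University of Oregon is located in Eugene.
 17 | ```
 18 |
 19 | `entity-types.txt`:
 20 | ```
 21 | Location
 22 | Organization
 23 | ```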
11 |
12 |
13 |
14 | ## Annotation
15 |
 16 | For each data sample, annotators first select a label from the dropdown, then highlight the appropriate spans for that label.
17 |
18 |
19 |
 20 | Annotators continue labeling until all entities in the given sentence are covered; they then click the save button and the next arrow to move on to the next example.
21 |
22 |
 23 | After labeling all unlabeled data of the current iteration, clicking **Finish Iter** takes users to a waiting page for the next iteration (during this time, the proxy model is retrained with the newly labeled data, which usually takes about 3 to 5 minutes).
24 |
25 |
26 | ## Output
27 | FAMIE allows users to download the trained models and annotated data of the current round via the web interface.
28 |
29 |
 30 | FAMIE also provides a simple and intuitive code interface for interacting with the resulting labeled dataset and trained main models after the AL process.
33 |
34 | ```python
35 | import famie
36 |
37 | # access a project via its name
38 | p = famie.get_project('named-entity-recognition')
39 |
40 | # access the project's labeled data
41 | data = p.get_labeled_data() # a Python dictionary
42 |
43 | # export the project's labeled data to a file
44 | p.export_labeled_data('data.json')
45 |
46 | # export the project's trained model to a file
47 | p.export_trained_model('model.ckpt')
48 |
49 | # access the project's trained model
50 | model = p.get_trained_model()
51 |
52 | # access a trained model from file
53 | model = famie.load_model_from_file('model.ckpt')
54 |
55 | # use the trained model to make predictions
56 | model.predict('Oregon is a beautiful state!')
 57 | # ['B-Location', 'O', 'O', 'O', 'O']
 58 | ```
--------------------------------------------------------------------------------
/pics/0_newproj.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/pics/0_newproj.png
--------------------------------------------------------------------------------
/pics/1_select_label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/pics/1_select_label.png
--------------------------------------------------------------------------------
/pics/2_anno_span.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/pics/2_anno_span.png
--------------------------------------------------------------------------------
/pics/3_save_next.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/pics/3_save_next.png
--------------------------------------------------------------------------------
/pics/4_fin_prox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/pics/4_fin_prox.png
--------------------------------------------------------------------------------
/pics/download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/pics/download.png
--------------------------------------------------------------------------------
/pics/full-process-color.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/pics/full-process-color.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask>=1.1.1
2 | flask-cors==3.0.10
3 | html5lib==1.1
4 | numpy>=1.19.2
5 | requests>=2.23
6 | scikit-learn>=0.22;python_version>="3.8"
7 | scikit-learn<0.22;python_version<"3.8"
8 | adapter-transformers==2.1.0
9 | langid==1.1.6
10 | trankit==1.1.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import setuptools
3 | from sys import platform
4 | import os
5 |
6 | PKG_DIR = os.path.dirname(os.path.abspath(__file__))
7 |
8 | required = Path("requirements.txt").read_text().splitlines()
9 |
10 | HERE = Path(__file__).parent
11 |
12 | # The text of the README file
13 | README = (HERE / "README.md").read_text()
14 |
15 | def package_files(directory):
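 16 |     # collect every file under `directory`, dropping the first two path
 17 |     # components ('src/famie') so paths are relative to the installed package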
16 | paths = []
17 | for (path, directories, filenames) in os.walk(directory):
18 | for filename in filenames:
19 | package_relative_path = Path(*Path(path).parts[2:], filename)
20 | paths.append(str(package_relative_path))
21 | return paths
22 |
23 |
24 | extra_files = []
25 | extra_files.extend(package_files(Path('', 'src/famie', 'config')))
26 |
27 | extra_files.extend(["api/static/bundle.js",
28 | "api/templates/index.html"])
29 |
30 | setuptools.setup(
31 | name="famie",
32 | version="0.3.0",
33 | author="NLP Group at the University of Oregon",
34 | author_email="thien@cs.uoregon.edu",
35 | description="FAMIE: A Fast Active Learning Framework for Multilingual Information Extraction",
36 | long_description=README,
37 | long_description_content_type="text/markdown",
38 | url="https://github.com/nlp-uoregon/famie",
39 | python_requires='>=3.6',
40 | install_requires=required,
41 | package_dir={'': 'src'},
42 | packages=setuptools.find_packages(where='src'),
43 | package_data={"famie": extra_files},
44 | entry_points={'console_scripts': 'famie=famie.entry_points.run_app:main'},
45 | data_files=[('.', ["requirements.txt"])],
46 | license='GPL-3.0 License',
47 | classifiers=[
48 | 'Development Status :: 4 - Beta',
49 |
50 | 'Intended Audience :: Developers',
51 | 'Intended Audience :: Education',
52 | 'Intended Audience :: Science/Research',
53 | 'Intended Audience :: Information Technology',
54 | 'Topic :: Scientific/Engineering',
55 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
56 | 'Topic :: Scientific/Engineering :: Information Analysis',
57 | 'Topic :: Text Processing',
58 | 'Topic :: Text Processing :: Linguistic',
59 | 'Topic :: Software Development',
60 | 'Topic :: Software Development :: Libraries',
61 |
62 | 'Programming Language :: Python :: 3.6',
63 | 'Programming Language :: Python :: 3.7',
64 | 'Programming Language :: Python :: 3.8',
65 | ],
66 | )
67 |
--------------------------------------------------------------------------------
/src/famie/api/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Date: Feb 11, 2022
  3 | Modified from: https://github.com/dataqa/dataqa/blob/master/src/dataqa/api/__init__.py
4 | '''
5 | from flask import Flask
6 |
7 |
8 | def create_app():
9 | app = Flask(__name__)
10 | return app
11 |
--------------------------------------------------------------------------------
/src/famie/api/active_learning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/src/famie/api/active_learning/__init__.py
--------------------------------------------------------------------------------
/src/famie/api/active_learning/config.py:
--------------------------------------------------------------------------------
1 | import os, json, trankit, torch
2 | from transformers import XLMRobertaTokenizer
3 | from .constants import WORKING_DIR
4 |
5 |
6 | class Config:
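  7 |     # Reads the AL selection strategy and the proxy/target embedding names from
  8 |     # passed_args.json when it exists, otherwise falls back to the defaults below.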
7 | def __init__(self, args_fpath):
8 | if os.path.exists(args_fpath):
9 | with open(args_fpath) as f:
10 | passed_args = json.load(f)
11 | else:
12 | passed_args = {
13 | 'selection': 'mnlp',
14 | 'target_embedding': 'nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large',
15 | 'proxy_embedding': 'nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large'
16 | }
17 | # print('passed args: {}'.format(passed_args))
18 | self.cache_dir = os.path.join(WORKING_DIR, 'resource')
19 | self.target_embedding_name = passed_args['target_embedding']
20 | self.proxy_embedding_name = passed_args['proxy_embedding']
21 |
22 | self.trankit_tokenizer = trankit.Pipeline('english', cache_dir=os.path.join(WORKING_DIR, 'cache/trankit'))
23 | self.proxy_tokenizer = XLMRobertaTokenizer.from_pretrained(self.proxy_embedding_name,
24 | cache_dir=os.path.join(self.cache_dir,
25 | self.proxy_embedding_name),
26 | do_lower_case=False)
27 | self.target_tokenizer = XLMRobertaTokenizer.from_pretrained(self.target_embedding_name,
28 | cache_dir=os.path.join(self.cache_dir,
29 | self.target_embedding_name),
30 | do_lower_case=False)
31 | self.max_sent_length = 200
32 |
33 | self.target_reduction_factor = 4
34 | self.proxy_reduction_factor = 2
35 | self.embedding_dropout = 0.4
36 | self.hidden_num = 200
37 |
38 | self.adapter_learning_rate = 2e-4
39 | self.adapter_weight_decay = 2e-4
40 | self.learning_rate = 1e-3
41 | self.weight_decay = 1e-3
42 |
43 | self.batch_size = 16
44 | self.proxy_max_epoch = 20
45 | self.target_max_epoch = 40
46 | self.seed = 3456
47 | self.accumulate_step = 1
48 | self.grad_clipping = 4.5
49 |
50 | self.distill = True
51 | self.selection = passed_args['selection']
52 | assert self.selection in ['random', 'bertkm', 'badge', 'mnlp']
53 |
54 | self.num_examples_per_iter = 50
55 |
56 | self.vocabs = {}
57 |
58 | if torch.cuda.is_available():
59 | self.use_gpu = True
60 | else:
61 | self.use_gpu = False
62 |
63 |
64 | config = Config(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'passed_args.json'))
65 |
--------------------------------------------------------------------------------
/src/famie/api/active_learning/constants.py:
--------------------------------------------------------------------------------
1 | import os, json
2 |
3 |
4 | def ensure_dir(dir_path):
5 | os.makedirs(dir_path, exist_ok=True)
6 |
7 |
8 | CODE2LANG = {
9 | 'af': 'afrikaans', 'ar': 'arabic', 'hy': 'armenian', 'eu': 'basque', 'be': 'belarusian', 'bg': 'bulgarian',
10 | 'ca': 'catalan', 'zh': 'chinese', 'hr': 'croatian', 'cs': 'czech',
11 | 'da': 'danish', 'nl': 'dutch', 'en': 'english', 'et': 'estonian', 'fi': 'finnish', 'fr': 'french', 'gl': 'galician',
12 | 'de': 'german', 'el': 'greek', 'he': 'hebrew', 'hi': 'hindi',
13 | 'hu': 'hungarian', 'id': 'indonesian', 'ga': 'irish', 'it': 'italian', 'ja': 'japanese', 'kk': 'kazakh',
14 | 'ko': 'korean', 'ku': 'kurmanji', 'la': 'latin', 'lv': 'latvian',
15 | 'lt': 'lithuanian', 'mr': 'marathi', 'nn': 'norwegian-nynorsk', 'nb': 'norwegian-bokmaal', 'fa': 'persian',
16 | 'pl': 'polish', 'pt': 'portuguese', 'ro': 'romanian',
17 | 'ru': 'russian', 'sr': 'serbian', 'sk': 'slovak', 'sl': 'slovenian', 'es': 'spanish',
18 | 'sv': 'swedish', 'ta': 'tamil', 'te': 'telugu', 'tr': 'turkish',
19 | 'uk': 'ukrainian', 'ur': 'urdu', 'ug': 'uyghur', 'vi': 'vietnamese'
20 | }
21 |
22 | SUPPORTED_TASKS = {
23 | 'ner'
24 | }
25 |
26 | DEBUG = True
27 |
28 | WORKING_DIR = os.path.dirname(os.path.realpath(__file__))
29 | DATABASE_DIR = os.path.join(WORKING_DIR, 'database')
30 | PROJECT_INFO_FPATH = os.path.join(DATABASE_DIR, 'project2info.json')
31 | LOG_DIR = os.path.join(WORKING_DIR, 'logs')
32 | PRETRAIN_DIR = os.path.join(WORKING_DIR, 'pretrain-dir')
33 |
34 | OUTPUT_DIR = os.path.join(WORKING_DIR, 'famie-output')
35 |
36 | SIGNAL_DIR = {'base': os.path.join(WORKING_DIR, 'signals')}
37 | TASK_NAME_FPATH = os.path.join(SIGNAL_DIR['base'], 'task_name.txt')
38 |
39 | ensure_dir(SIGNAL_DIR['base'])
40 | ensure_dir(DATABASE_DIR)
41 | ensure_dir(PRETRAIN_DIR)
42 |
43 | for task in SUPPORTED_TASKS:
44 | SIGNAL_DIR[task] = os.path.join(SIGNAL_DIR['base'], task)
45 | ensure_dir(SIGNAL_DIR[task])
46 |
47 | LISTEN_TIME = 1
48 | MAX_EXAMPLES_PER_ITER = 10000
49 |
50 | STOP_CONTROLLER = 'stop-controller'
51 |
52 | PAUSE_MODEL = 'pause-model'
53 | RUN_TARGET = 'run-target'
54 | RUN_PROXY = 'run-proxy'
55 | PROXY_PREDICTS = 'proxy-predicts'
56 | TARGET_PREDICTS = 'target-predicts'
57 |
58 | SIGNALS = {
59 | STOP_CONTROLLER: 'Stop the controller',
60 | PAUSE_MODEL: 'Pause model',
61 | RUN_TARGET: 'Run the target model',
62 | RUN_PROXY: 'Run the proxy model',
63 | PROXY_PREDICTS: 'Use proxy model to make predictions',
64 | TARGET_PREDICTS: 'Use target model to make predictions'
65 | }
66 |
67 | EMBEDDING2DIM = {
68 | 'xlm-roberta-large': 1024,
69 | 'xlm-roberta-base': 768,
70 | 'nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large': 384,
71 | 'nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large': 384
72 | }
73 |
74 | PROXY_PRETRAINED_TRIGGER_MODEL_PATH = os.path.join(PRETRAIN_DIR, 'proxy.pretrained-trigger.ckpt')
75 | PROXY_PRETRAINED_ARGUMENT_MODEL_PATH = os.path.join(PRETRAIN_DIR, 'proxy.pretrained-argument.ckpt')
76 | TARGET_PRETRAINED_TRIGGER_MODEL_PATH = os.path.join(PRETRAIN_DIR, 'target.pretrained-trigger.ckpt')
77 | TARGET_PRETRAINED_ARGUMENT_MODEL_PATH = os.path.join(PRETRAIN_DIR, 'target.pretrained-argument.ckpt')
78 |
79 | if not os.path.exists(PROJECT_INFO_FPATH):
80 | with open(PROJECT_INFO_FPATH, 'w') as f:
81 | json.dump({}, f)
82 |
83 | SPAN_KEYS = ["id", "start", "end", "text", "entity_id"]
84 |
85 | PROXY_BATCH_FIELDS = [
86 | 'example_ids', 'texts', 'tokens', 'anchors', 'piece_idxs', 'attention_masks', 'token_lens',
87 | 'label_idxs', 'token_nums', 'distill_mask',
88 | 'tch_lbl_dist', 'transitions'
89 | ]
90 |
91 | TARGET_BATCH_FIELDS = [
92 | 'example_ids', 'texts', 'tokens', 'anchors', 'piece_idxs', 'attention_masks', 'token_lens',
93 | 'label_idxs', 'token_nums', 'distill_mask'
94 | ]
95 |
96 | AL_BATCH_FIELDS = [
97 | 'example_ids', 'tokens', 'anchors', 'piece_idxs', 'attention_masks', 'token_lens',
98 | 'labels', 'label_idxs', 'token_nums'
99 | ]
100 |
101 | CKPT_KEYS = {'project_name', 'project_task_type', 'lang', 'embedding_name', 'hidden_num', 'vocabs', 'weights'}
102 |
103 |
104 | def convert_ckpt_to_json(ckpt_fpath):
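105 |     # load a torch checkpoint and turn its weight tensors into plain Python
106 |     # lists so the whole checkpoint can be serialized as JSON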
105 | import json, torch
106 |
107 | ckpt = torch.load(ckpt_fpath) if torch.cuda.is_available() else torch.load(ckpt_fpath, map_location=torch.device('cpu'))
108 | assert set(ckpt.keys()) == CKPT_KEYS
109 |
110 | for param_name in ckpt['weights']:
111 | ckpt['weights'][param_name] = ckpt['weights'][param_name].data.cpu().numpy().tolist()
112 |
113 | return ckpt
114 |
115 |
116 | def convert_json_to_ckpt(json_fpath, use_gpu):
117 | import torch
118 |
119 | with open(json_fpath) as f:
120 | ckpt = json.load(f)
121 |
122 | assert set(ckpt.keys()) == CKPT_KEYS
123 |
124 | for param_name in ckpt['weights']:
125 | ckpt['weights'][param_name] = torch.tensor(ckpt['weights'][param_name]).cuda() if use_gpu else torch.tensor(
126 | ckpt['weights'][param_name])
127 |
128 | return ckpt
129 |
--------------------------------------------------------------------------------
/src/famie/api/active_learning/controllers.py:
--------------------------------------------------------------------------------
1 | # define controllers for active learning modules
2 | from .models import SeqLabel, ConditionalSeqLabel
3 | from .trainers import *
4 | from .utils import *
5 |
6 | import time, os
7 | import _thread as thread
8 |
9 |
10 | class Controller:
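 11 |     # Coordinates the proxy/target trainers for one task. Communication happens
 12 |     # through per-task signal files; the trainers poll for signals in background
 13 |     # threads started by listen().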
11 | def __init__(self, config, task):
12 | assert task in SUPPORTED_TASKS
13 |
14 | self.config = config
15 | self.task = task
16 | self.signal_fpath = os.path.join(SIGNAL_DIR[task], 'signal.controller.txt')
17 |
18 | self.reset_signal()
19 |
20 | self.model = None
21 | self.dataset = None
22 | self.trainer = None
23 | self.is_listening = False
24 |
25 | def reset_signal(self):
26 | with open(self.signal_fpath, 'w') as f:
27 | f.write(PAUSE_MODEL)
28 |
29 | def read_signal(self):
30 | with open(self.signal_fpath) as f:
31 | signal = f.read().strip().lower()
32 |
33 | return signal
34 |
35 | def receive_signal(self, signal):
36 | with open(self.signal_fpath, 'w') as f:
37 | f.write(signal)
38 |
39 | def stop(self):
40 | self.receive_signal(STOP_CONTROLLER)
41 |
42 | def run_proxy_model(self, unlabeled_data, project_name):
43 | self.trainer['proxy'].receive_signal(RUN_PROXY, unlabeled_data, project_name)
44 | self.trainer['target'].receive_signal(RUN_PROXY, unlabeled_data, project_name)
45 |
46 | def run_target_model(self, project_name):
47 | self.trainer['proxy'].receive_signal(RUN_TARGET, [], project_name)
48 | self.trainer['target'].receive_signal(RUN_TARGET, [], project_name)
49 |
50 | def proxy_model_predicts(self, unlabeled_data, project_name):
51 | if self.trainer['proxy'].is_trained:
52 | self.trainer['proxy'].receive_signal(PROXY_PREDICTS, unlabeled_data, project_name)
53 | if self.trainer['target'].is_trained:
54 | self.trainer['target'].receive_signal(PROXY_PREDICTS, unlabeled_data, project_name)
55 |
56 | def target_model_predicts(self, unlabeled_data, project_name):
57 | if self.trainer['proxy'].is_trained:
58 | self.trainer['proxy'].receive_signal(TARGET_PREDICTS, unlabeled_data, project_name)
59 | if self.trainer['target'].is_trained:
60 | self.trainer['target'].receive_signal(TARGET_PREDICTS, unlabeled_data, project_name)
61 |
62 | def stop_listening(self):
63 | if self.trainer:
64 | self.trainer['proxy'].receive_signal(STOP_CONTROLLER, [], None)
65 | self.trainer['target'].receive_signal(STOP_CONTROLLER, [], None)
66 |
67 | self.model = None
68 | self.dataset = None
69 | self.trainer = None
70 | self.is_listening = False
71 | # print('{} controller: stopped listening!'.format(self.task))
72 |
73 | def listen(self, project_state):
74 | project_dir = project_state['project_dir']
75 | project_id = project_state['project_id']
76 | project_task_type = project_state['project_task_type']
77 | project_annotations = project_state['annotations']
78 | provided_labeled_data = project_state['provided_labeled_data']
79 |
80 | if not self.is_listening:
81 | print('-' * 50)
82 | print("Loading models for project '{}'...".format(project_id))
83 | self.is_listening = True
84 |
85 | self.config.vocabs[project_id] = {
86 | 'entity-type': {}, 'entity-label': {'O': 0}
87 | }
88 | with open(os.path.join(project_dir, 'vocabs.json')) as f:
89 | self.config.vocabs[project_id]['entity-type'] = json.load(f)
90 |
91 | for entity_type in self.config.vocabs[project_id]['entity-type']:
92 | self.config.vocabs[project_id]['entity-label']['B-{}'.format(entity_type)] = len(
93 | self.config.vocabs[project_id]['entity-label'])
94 | self.config.vocabs[project_id]['entity-label']['I-{}'.format(entity_type)] = len(
95 | self.config.vocabs[project_id]['entity-label'])
96 |
97 | self.dataset = {
98 | 'proxy': ProxyDataset(self.config, project_id, project_dir, project_annotations, provided_labeled_data),
99 | 'target': TargetDataset(self.config, project_id, project_dir, project_annotations,
100 | provided_labeled_data)
101 | }
102 | self.dataset['target'].lang = self.dataset['proxy'].lang
103 |
104 | if project_task_type == 'conditional':
105 |             print('initializing ConditionalSeqLabel models...')
106 | self.model = {
107 | 'proxy': ConditionalSeqLabel(self.config, project_id, model_name='proxy'),
108 | 'target': ConditionalSeqLabel(self.config, project_id, model_name='target')
109 | }
110 | else:
111 | print('initializing SeqLabel models...')
112 | assert project_task_type == 'unconditional'
113 | self.model = {
114 | 'proxy': SeqLabel(self.config, project_id, model_name='proxy'),
115 | 'target': SeqLabel(self.config, project_id, model_name='target')
116 | }
117 |
118 | self.trainer = {
119 | 'proxy': ProxyTrainer(self.config, self.task, self.model['proxy'], self.dataset['proxy'],
120 | project_task_type),
121 | 'target': TargetTrainer(self.config, self.task, self.model['target'], self.dataset['target'],
122 | project_task_type)
123 | }
124 |
125 | thread.start_new_thread(self.trainer['proxy'].start_listening, ())
126 | thread.start_new_thread(self.trainer['target'].start_listening, ())
127 | print('-' * 50)
128 | else:
129 | self.dataset['proxy'].update_data(project_annotations, provided_labeled_data)
130 | self.dataset['target'].update_data(project_annotations, provided_labeled_data)
131 |
132 |
133 | if __name__ == '__main__':
134 |     # minimal smoke test; Controller.listen() additionally requires a
135 |     # project_state dict with 'project_dir', 'project_id', 'project_task_type',
136 |     # 'annotations' and 'provided_labeled_data' keys
137 |     controller = Controller({}, 'ner')
138 |     print('Listening ...')
139 |     while True:
140 |         time.sleep(1)
--------------------------------------------------------------------------------
/src/famie/api/active_learning/passed_args.json:
--------------------------------------------------------------------------------
1 | {"selection": "mnlp", "proxy_embedding": "nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large", "target_embedding": "xlm-roberta-base"}
--------------------------------------------------------------------------------
/src/famie/api/api_fns/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/src/famie/api/api_fns/__init__.py
--------------------------------------------------------------------------------
/src/famie/api/api_fns/project_creation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/src/famie/api/api_fns/project_creation/__init__.py
--------------------------------------------------------------------------------
/src/famie/api/api_fns/project_creation/common.py:
--------------------------------------------------------------------------------
1 | '''
2 | Date: Feb 11, 2022
  3 | Modified from: https://github.com/dataqa/dataqa/blob/master/src/dataqa/api/api_fns/project_creation/common.py
4 | '''
5 |
6 | from abc import ABC, abstractmethod
7 | import csv
8 | import json
9 | from pathlib import Path
10 | import random
11 |
12 | from famie.api.api_fns.utils import check_file_size, get_column_names, get_decoded_stream
13 | from famie.constants import (ES_GROUND_TRUTH_NAME_FIELD,
14 | FILE_TYPE_DOCUMENTS,
15 | FILE_TYPE_DOCUMENTS_WIKI,
16 | INPUT_FILE_SPECS,
17 | MAPPINGS,
18 | PROJECT_TYPE_CLASSIFICATION,
19 | PROJECT_TYPE_NER)
20 |
21 |
22 | class UploadedFile(ABC):
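 23 |     # Uploaded datasets are plain text with one document per line; iterating
 24 |     # over this object yields {'text': line} for every non-empty line.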
23 |
24 | def __init__(self, project_type, input_data, file_type):
25 | # run input param checks
26 | if project_type in [PROJECT_TYPE_CLASSIFICATION, PROJECT_TYPE_NER]:
27 | if file_type not in [FILE_TYPE_DOCUMENTS, FILE_TYPE_DOCUMENTS_WIKI]:
28 | raise Exception(f"File type {file_type} is not supported for project type {project_type}.")
29 |
30 | if file_type == FILE_TYPE_DOCUMENTS_WIKI and project_type != PROJECT_TYPE_NER:
31 | raise Exception(f"Using urls is not supported for project type {project_type}.")
32 |
33 | self.project_type = project_type
34 | self.input_data = input_data
 35 |         if not isinstance(input_data, list):
36 | self.filename = input_data.filename
37 | self.total_documents = 0
38 | self.file_type = file_type # documents, kb or documents_wiki
39 | self.processed_data = None
40 |
41 | def do_all_file_checks(self):
42 | self.input_data = get_decoded_stream(self.input_data)
43 |
44 | def read_line(self):
45 | num_lines = 0
46 | for line in self.input_data.readlines():
47 | line = line.strip()
48 | if line:
49 | yield line
50 | num_lines += 1
51 |
52 | def __iter__(self):
53 | for line in self.read_line():
54 | self.total_documents += 1
55 | yield {'text': line}
56 |
57 | @abstractmethod
58 | def process_file(self, es_uri, index_name, get_row, project_full_path, spacy_binary_filepath):
59 | pass
60 |
61 |
62 | class ES_indexer(object):
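 63 |     # Bulk-indexes rows into Elasticsearch. The `create_new_index` and
 64 |     # `bulk_upload` helpers referenced below are not defined in this file;
 65 |     # they come from the upstream dataqa codebase this module was adapted from.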
63 |
64 | def __init__(self, es_uri, index_name, get_row, mapping_specs):
65 | self.es_uri = es_uri
66 | self.index_name = index_name
67 | self.mapping_es = mapping_specs["mapping_es"]
68 | self.settings_es = mapping_specs.get("settings_es")
69 | self.get_row = get_row
70 | self.bulk_line_size = 100
71 | self.num_read_lines = 0
72 | self.num_indexed_docs = 0
73 | self.current_rows = []
74 |
75 | def create_new_index(self):
76 | create_new_index(self.es_uri, self.index_name, self.mapping_es, self.settings_es)
77 |
78 | def __enter__(self):
79 | return self
80 |
81 | def index_line(self, line):
82 | if (self.num_read_lines > 0) and (self.num_read_lines % self.bulk_line_size == 0):
83 | try:
84 | bulk_load_documents(self.es_uri, self.index_name, self.current_rows, self.num_indexed_docs)
85 | self.num_indexed_docs = self.num_read_lines
86 | self.current_rows = []
87 | except Exception as e:
88 | print(f"Error bulk loading lines "
89 | f"{self.num_indexed_docs} to {self.num_indexed_docs + len(self.current_rows) - 1} to elasticsearch")
90 | raise
91 |
92 | new_row = self.get_row(line)
93 | self.current_rows.append(new_row)
94 | self.num_read_lines += 1
95 |
96 | def __exit__(self, type, value, traceback):
97 | # do things at exit time
98 | if self.current_rows:
99 | try:
100 | bulk_load_documents(self.es_uri, self.index_name, self.current_rows, self.num_indexed_docs)
101 | except:
102 | print(f"Error bulk loading lines "
103 | f"{self.num_indexed_docs} to {self.num_indexed_docs + len(self.current_rows) - 1} to elasticsearch")
104 | raise
105 |
106 |
107 | def bulk_load_documents(es_uri, index_name, list_docs, start_doc_ind):
108 | json_data = []
109 | num_docs = 0
110 | for doc in list_docs:
111 | json_data.append(json.dumps({"index": {"_index": index_name,
112 | "_id": start_doc_ind + num_docs}}))
113 | if "id" not in doc:
114 | doc["id"] = start_doc_ind + num_docs
115 | json_data.append(json.dumps(doc))
116 | num_docs += 1
117 | request_body = "\n".join(json_data) + "\n"
118 | bulk_upload(es_uri, request_body)
119 |
120 |
121 | def index_df(es_uri, index_name, df, get_row):
122 | num_lines = 100
123 | rows = []
124 | start_doc_ind = 0
125 |
126 | for ind, row in df.iterrows():
127 | if (ind > 0) and (ind % num_lines == 0):
128 | try:
129 | bulk_load_documents(es_uri, index_name, rows, start_doc_ind)
130 | start_doc_ind = ind
131 | rows = []
132 | except:
133 | print(f"Error bulk loading lines "
134 | f"{start_doc_ind} to {start_doc_ind + num_lines - 1} to elasticsearch")
135 | raise
136 |
137 | new_row = get_row(row)
138 | rows.append(new_row)
139 |
140 | if rows:
141 | bulk_load_documents(es_uri, index_name, rows, start_doc_ind)
142 |
143 |
144 | def get_random_int_5_digits():
145 | return random.randint(10000, 99999)
146 |
147 |
148 | def sanitise_string(s):
149 | return ''.join(e for e in s if (e.isalnum()) or e == '_').lower()
150 |
151 |
152 | def get_random_index_name(prefix):
153 | index_name = sanitise_string(prefix)
154 | suffix = get_random_int_5_digits()
155 | index_name = f"{index_name}_{suffix}"
156 | return index_name
157 |
158 |
159 | def get_upload_key(project_type, file_type):
160 | if file_type not in INPUT_FILE_SPECS[project_type]:
161 | raise Exception(f"File type {file_type} not supported for project of type {project_type}.")
162 | return INPUT_FILE_SPECS[project_type][file_type]['upload_key']
163 |
164 |
165 | def check_column_names(file, column_names):
166 | actual_column_names = get_column_names(file)
167 | for column_name in column_names:
168 | if not column_name in actual_column_names:
169 | raise Exception(f"File needs to contain a \"{column_name}\" column")
170 | return actual_column_names
171 |
--------------------------------------------------------------------------------
/src/famie/api/api_fns/project_settings/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/src/famie/api/api_fns/project_settings/__init__.py
--------------------------------------------------------------------------------
/src/famie/api/api_fns/project_settings/supervised.py:
--------------------------------------------------------------------------------
1 | '''
2 | Date: Feb 11, 2022
  3 | Modified from: https://github.com/dataqa/dataqa/blob/master/src/dataqa/api/api_fns/project_settings/supervised.py
4 | '''
5 | from itertools import cycle
6 | from famie.api.api_fns.utils import check_file_size, get_decoded_stream
7 | from famie.constants import COLOURS
8 | import json
9 |
10 |
11 | def add_class_colours(class_names):
12 | for class_colour, class_item in zip(cycle(COLOURS), class_names):
13 | class_item['colour'] = class_colour
14 |
15 |
16 | def get_class_names(file_bytes):
17 | file = get_decoded_stream(file_bytes)
18 | lines = [line.strip() for line in file.readlines() if line.strip()]
19 | class_names = []
20 | for line_ind, line in enumerate(lines):
21 | class_name = line
22 | if len(class_name) == 0:
23 | raise Exception(f"There is an empty class name on line {line_ind + 1}.")
24 | class_names.append({"id": line_ind, "name": class_name})
25 |
26 | add_class_colours(class_names)
27 | return class_names
28 |
29 |
30 | def set_class_names(file_bytes):
31 | class_names = get_class_names(file_bytes)
32 | return class_names
33 |
34 |
35 | def convert_to_bio2(ori_tags):
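 36 |     # convert IOB1 tags (where 'I-' may open an entity) to BIO2, where every
 37 |     # entity starts with 'B-', e.g. ['I-PER', 'I-PER', 'O'] -> ['B-PER', 'I-PER', 'O']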
36 | bio2_tags = []
37 | for i, tag in enumerate(ori_tags):
38 | if tag == 'O':
39 | bio2_tags.append(tag)
40 | elif tag[0] == 'I':
41 | if i == 0 or ori_tags[i - 1] == 'O' or ori_tags[i - 1][1:] != tag[1:]:
42 | bio2_tags.append('B' + tag[1:])
43 | else:
44 | bio2_tags.append(tag)
45 | else:
46 | bio2_tags.append(tag)
47 | return bio2_tags
48 |
49 |
50 | def get_example_from_lines(sent_lines):
51 | tokens = []
52 | ner_tags = []
53 | for line in sent_lines:
54 | array = line.split()
55 | assert len(array) >= 2
56 | tokens.append(array[0])
57 | ner_tags.append(array[1])
58 | ner_tags = convert_to_bio2(ner_tags)
59 | return {'tokens': [{'text': t} for t in tokens], 'labels': ner_tags}
60 |
61 |
62 | def get_examples_from_bio_fpath(raw_lines):
63 | sent_lines = []
64 | bio2_examples = []
65 | for line in raw_lines:
66 | line = line.strip()
67 | if '-DOCSTART-' in line:
68 | continue
69 | if len(line) > 0:
70 | array = line.split()
71 | if len(array) < 2:
72 | continue
73 | else:
74 | sent_lines.append(line)
75 | elif len(sent_lines) > 0:
76 | example = get_example_from_lines(sent_lines)
77 | bio2_examples.append(example)
78 | bio2_examples[-1]['example_id'] = 'provided-example-{}'.format(len(bio2_examples))
79 |
80 | sent_lines = []
81 |
82 | if len(sent_lines) > 0:
83 | bio2_examples.append(get_example_from_lines(sent_lines))
84 | bio2_examples[-1]['example_id'] = 'provided-example-{}'.format(len(bio2_examples))
85 |
86 | return bio2_examples
87 |
88 |
89 | def parse_labeled_data(file_bytes, project_task_type):
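 90 |     # Accepts either CoNLL-style BIO lines ("token TAG" per line, blank line
 91 |     # between sentences) or JSON lines with 'text', 'tokens' and 'event_mentions'.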
90 | file = get_decoded_stream(file_bytes)
91 | lines = [line.strip() for line in file.readlines()]
92 | if len(lines) == 0:
93 | raise Exception("Provided data is empty.")
94 | try:
95 | d = json.loads(lines[0])
96 | data_format = 'json'
97 | except json.decoder.JSONDecodeError:
98 | data_format = 'BIO'
99 |
100 | if data_format == 'BIO':
101 | # labeled data in BIO format
102 | provided_labeled_data = get_examples_from_bio_fpath(lines)
103 | for example in provided_labeled_data:
104 | example['project_task_type'] = 'unconditional'
105 | example['anchor'] = -1
106 | example['anchor_type'] = 'unknown'
107 | else:
108 | if project_task_type == 'unconditional':
109 | provided_labeled_data = []
110 | for line in lines:
111 | if line:
112 | example = json.loads(line)
113 | inst = {
114 | 'example_id': 'provided-example-{}'.format(len(provided_labeled_data)),
115 | 'text': example['text'],
116 | 'tokens': example['tokens'],
117 | 'anchor': -1,
118 | 'anchor_type': 'unknown'
119 | }
120 | inst['labels'] = ['O'] * len(example['tokens'])
121 | for event in example['event_mentions']:
122 | trigger_start, trigger_end, event_type = event['trigger']
123 | if trigger_end > len(inst['tokens']):
124 | continue
125 | inst['labels'][trigger_start] = 'B-{}'.format(event_type)
126 | for k in range(trigger_start + 1, trigger_end):
127 | inst['labels'][k] = 'I-{}'.format(event_type)
128 |
129 | provided_labeled_data.append(inst)
130 | else:
131 | assert project_task_type == 'conditional'
132 | provided_labeled_data = []
133 | for line in lines:
134 | if line:
135 | example = json.loads(line)
136 |
137 | for event in example['event_mentions']:
138 | trigger_start, trigger_end, event_type = event['trigger']
139 |
140 | inst = {
141 | 'example_id': 'provided-example-{}'.format(len(provided_labeled_data)),
142 | 'text': example['text'],
143 | 'tokens': example['tokens'],
144 | 'anchor': trigger_start,
145 | 'anchor_type': event_type
146 | }
147 |
148 | inst['labels'] = ['O'] * len(example['tokens'])
149 |
150 | for argument in event['arguments']:
151 | arg_start, arg_end, arg_role = argument
152 | if arg_end > len(inst['tokens']):
153 | continue
154 |
155 | inst['labels'][arg_start] = 'B-{}'.format(arg_role)
156 | for k in range(arg_start + 1, arg_end):
157 | inst['labels'][k] = 'I-{}'.format(arg_role)
158 |
159 | provided_labeled_data.append(inst)
160 |
161 | return provided_labeled_data
162 |
--------------------------------------------------------------------------------
/src/famie/api/api_fns/utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Date: Feb 11, 2022
  3 | Modified from: https://github.com/dataqa/dataqa/blob/master/src/dataqa/api/api_fns/utils.py
4 | '''
5 | import io
6 | import os
7 |
8 |
9 | def get_column_names(file):
10 | first_line = file.readline()
11 | try:
12 | column_names = first_line.strip("\n").split(',')
13 | except:
14 | raise Exception("Need to load a csv file")
15 | file.seek(0)
16 | return column_names
17 |
18 |
19 | def get_decoded_stream(file_bytes):
20 | file = io.TextIOWrapper(file_bytes, encoding='utf-8')
21 | return file
22 |
23 |
 24 | def check_file_size(file):
 25 |     lines = [line.strip() for line in file.readlines() if line.strip()]
 26 |     if len(lines) == 0:
 27 |         raise Exception("File is empty")
 28 |     file.seek(0)  # rewind so callers can re-read the stream
28 |
--------------------------------------------------------------------------------
/src/famie/api/blueprints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/src/famie/api/blueprints/__init__.py
--------------------------------------------------------------------------------
/src/famie/api/blueprints/common.py:
--------------------------------------------------------------------------------
1 | '''
2 | Date: Feb 11, 2022
  3 | Modified from: https://github.com/dataqa/dataqa/blob/master/src/dataqa/api/blueprints/common.py
4 | '''
5 | from datetime import datetime
6 | from distutils.util import strtobool
7 | import json
8 | import shutil
9 |
10 | from flask import (current_app,
11 | render_template,
12 | request,
13 | Blueprint)
14 |
15 | from famie.api.api_fns.project_creation.common import get_upload_key
16 |
17 | from famie.api.api_fns.project_creation.supervised import create_supervised_project
18 |
19 | from famie.constants import (ALL_PROJECT_TYPES,
20 | PROJECT_TYPE_CLASSIFICATION,
21 | PROJECT_TYPE_ED,
22 | PROJECT_TYPE_NER)
23 | from famie.api.active_learning.controllers import *
24 | from famie.api.active_learning.config import config
25 |
26 | bp = Blueprint('api', __name__)
27 |
28 | al_controllers = {'active_task': 'ner', 'active_project': 'asdf0qweo23904123ieaewklf'}
29 | for supported_task in SUPPORTED_TASKS:
30 | al_controllers[supported_task] = Controller(config, task=supported_task)
31 |
32 |
33 | @bp.route("/")
34 | def index():
35 | return render_template('index.html')
36 |
37 |
38 | @bp.route('/', defaults={'path': ''})
 39 | @bp.route('/<path:path>')
40 | def catch_all(path):
41 | return render_template('index.html')
42 |
43 |
44 | @bp.route('/hello')
45 | def hello_world():
46 | return 'Hello, World!'
47 |
48 |
49 | @bp.route('/api/upload', methods=['POST'])
50 | def upload():
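 51 |     # expects a multipart form with: 'file' (the dataset), 'project_name',
 52 |     # 'column_names' (a JSON mapping), 'project_type', 'file_type', and an
 53 |     # upload id under the key returned by get_upload_key()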
51 | start_time = datetime.now()
52 | file = request.files['file']
53 | if not file:
54 | raise Exception("File is undefined")
55 |
56 | project_name = request.form['project_name']
57 | if not project_name:
58 | raise Exception("Project name undefined")
59 |
60 | try:
61 | column_name_mapping = json.loads(request.form['column_names'])
62 | except:
63 | raise Exception(f"Column name mapping not passed correctly")
64 | if not column_name_mapping:
65 | raise Exception("Column names undefined")
66 |
67 | project_type = request.form['project_type']
68 | if not project_type:
69 | raise Exception("Project type undefined")
70 |
71 | try:
72 | file_type = request.form['file_type']
73 | upload_key = get_upload_key(project_type, file_type)
74 | except:
75 | raise Exception(f"File type undefined or incorrect")
76 |
77 | upload_id = request.form[upload_key]
78 | if not upload_id:
79 | raise Exception("Need upload id for polling")
80 |
81 | if project_type not in ALL_PROJECT_TYPES:
82 | raise Exception(f"Non-recognised project type {project_type}. "
83 | f"Needs to be one of: {ALL_PROJECT_TYPES}")
84 |
85 | class_names = None
86 |
87 | input_params = (config,
88 | detect_lang,
89 | file,
90 | project_name,
91 | project_type,
92 | upload_id,
93 | file_type)
94 |
95 | upload_folder = os.path.join(DATABASE_DIR, project_name)
96 | ensure_dir(upload_folder)
97 | project_id, project_task_type = create_supervised_project(*input_params, upload_folder)
98 |
99 | update_project_to_database(project_info={
100 | 'id': project_id, 'name': project_name, 'type': project_type, 'project_task_type': project_task_type,
101 | 'classes': [], 'index_name': project_name, 'filename': os.path.join(upload_folder, 'unlabeled-data.json')
102 | })
103 |
104 | if project_id:
105 | end_time = datetime.now()
106 | print(f"Uploading took {(end_time - start_time).seconds / 60} minutes.")
107 | print('Uploading... done!')
108 | return json.dumps({"id": project_id, "class_names": class_names})
109 |
110 |
111 | @bp.route('/api/delete-project/<project_name>', methods=['DELETE'])
112 | def delete_project(project_name):
113 | if not project_name:
114 | raise Exception("Project name undefined")
115 | upload_folder = os.path.join(DATABASE_DIR, project_name)
116 | shutil.rmtree(upload_folder, ignore_errors=True)
117 | active_task = al_controllers['active_task']
118 | if project_name == al_controllers['active_project']:
119 | al_controllers[active_task].stop_listening()
120 | return "success"
121 |
122 |
123 | @bp.route('/api/get-projects', methods=['GET'])
124 | def get_projects():
125 | # stop all controllers when users see the list of projects
126 | for task in SUPPORTED_TASKS:
127 | al_controllers[task].stop_listening()
128 |
129 | project_list = get_project_list()
130 | return json.dumps(project_list)
131 |
132 |
133 | @bp.route('/api/export-labels', methods=['POST'])
134 | def export_labels_api():
135 | project_name = request.form['project_name']
136 | if not project_name:
137 | raise Exception("Project name undefined")
138 |
139 | labeled_fpath = os.path.join(OUTPUT_DIR, project_name, 'labeled-data.json')
140 | if not os.path.exists(labeled_fpath):
141 | print('{} does not exist!'.format(labeled_fpath))
142 | return ''
143 | else:
144 | with open(labeled_fpath) as f:
145 | data = f.read().strip()
146 | return data
147 |
--------------------------------------------------------------------------------
/src/famie/api/static/bundle.js.LICENSE.txt:
--------------------------------------------------------------------------------
1 | /*
2 | object-assign
3 | (c) Sindre Sorhus
4 | @license MIT
5 | */
6 |
7 | /* @license
8 | Papa Parse
9 | v5.3.1
10 | https://github.com/mholt/PapaParse
11 | License: MIT
12 | */
13 |
14 | /*!
15 | Copyright (c) 2018 Jed Watson.
16 | Licensed under the MIT License (MIT), see
17 | http://jedwatson.github.io/classnames
18 | */
19 |
20 | /*!
21 | * Sizzle CSS Selector Engine v2.3.6
22 | * https://sizzlejs.com/
23 | *
24 | * Copyright JS Foundation and other contributors
25 | * Released under the MIT license
26 | * https://js.foundation/
27 | *
28 | * Date: 2021-02-16
29 | */
30 |
31 | /*!
32 | * jQuery JavaScript Library v3.6.0
33 | * https://jquery.com/
34 | *
35 | * Includes Sizzle.js
36 | * https://sizzlejs.com/
37 | *
38 | * Copyright OpenJS Foundation and other contributors
39 | * Released under the MIT license
40 | * https://jquery.org/license
41 | *
42 | * Date: 2021-03-02T17:08Z
43 | */
44 |
45 | /*! decimal.js-light v2.5.1 https://github.com/MikeMcl/decimal.js-light/LICENCE */
46 |
47 | /**
48 | * A better abstraction over CSS.
49 | *
50 | * @copyright Oleg Isonen (Slobodskoi) / Isonen 2014-present
51 | * @website https://github.com/cssinjs/jss
52 | * @license MIT
53 | */
54 |
55 | /** @license React v0.13.6
56 | * scheduler.production.min.js
57 | *
58 | * Copyright (c) Facebook, Inc. and its affiliates.
59 | *
60 | * This source code is licensed under the MIT license found in the
61 | * LICENSE file in the root directory of this source tree.
62 | */
63 |
64 | /** @license React v16.13.1
65 | * react-is.production.min.js
66 | *
67 | * Copyright (c) Facebook, Inc. and its affiliates.
68 | *
69 | * This source code is licensed under the MIT license found in the
70 | * LICENSE file in the root directory of this source tree.
71 | */
72 |
73 | /** @license React v16.8.0
74 | * react-dom.production.min.js
75 | *
76 | * Copyright (c) Facebook, Inc. and its affiliates.
77 | *
78 | * This source code is licensed under the MIT license found in the
79 | * LICENSE file in the root directory of this source tree.
80 | */
81 |
82 | /** @license React v16.8.0
83 | * react.production.min.js
84 | *
85 | * Copyright (c) Facebook, Inc. and its affiliates.
86 | *
87 | * This source code is licensed under the MIT license found in the
88 | * LICENSE file in the root directory of this source tree.
89 | */
90 |
91 | /**!
92 | * @fileOverview Kickass library to create and place poppers near their reference elements.
93 | * @version 1.16.1-lts
94 | * @license
95 | * Copyright (c) 2016 Federico Zivolo and contributors
96 | *
97 | * Permission is hereby granted, free of charge, to any person obtaining a copy
98 | * of this software and associated documentation files (the "Software"), to deal
99 | * in the Software without restriction, including without limitation the rights
100 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
101 | * copies of the Software, and to permit persons to whom the Software is
102 | * furnished to do so, subject to the following conditions:
103 | *
104 | * The above copyright notice and this permission notice shall be included in all
105 | * copies or substantial portions of the Software.
106 | *
107 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
109 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
110 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
111 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
112 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
113 | * SOFTWARE.
114 | */
115 |
--------------------------------------------------------------------------------
/src/famie/api/static/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | 
4 | <head>
5 | <meta charset="utf-8">
6 | <meta name="viewport" content="width=device-width, initial-scale=1">
7 | 
8 | 
9 | <title>FaMIE</title>
10 | </head>
11 | 
12 | <body>
13 | <div id="root"></div>
14 | <script src="bundle.js"></script>
15 | </body>
16 | </html>
--------------------------------------------------------------------------------
/src/famie/api/templates/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | 
4 | <head>
5 | <meta charset="utf-8">
6 | <meta name="viewport" content="width=device-width, initial-scale=1">
7 | 
8 | 
9 | <title>FAMIE's Demo Website</title>
10 | </head>
11 | 
12 | <body>
13 | <div id="root"></div>
14 | <script src="{{ url_for('static', filename='bundle.js') }}"></script>
15 | </body>
16 | </html>
--------------------------------------------------------------------------------
/src/famie/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/src/famie/config/__init__.py
--------------------------------------------------------------------------------
/src/famie/config/common_config.ini:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | ES_HOST = http://localhost
3 | FLASK_PORT = 9000
4 |
--------------------------------------------------------------------------------
/src/famie/config/config_reader.py:
--------------------------------------------------------------------------------
1 | '''
2 | Date: Feb 11, 2022
3 | Modified from: https://github.com/dataqa/dataqa/blob/master/src/dataqa/config/config_reader.py
4 | '''
5 | import configparser
6 | from famie.constants import HOME, ROOT_PATH
7 | from pathlib import Path
8 |
9 | APP_CONFIG_FILE = str(Path(ROOT_PATH, "config/common_config.ini"))
10 |
11 |
12 | def read_config(platform_config_path=None):
13 | config = configparser.ConfigParser()
14 | config.optionxform = str
15 | config["DEFAULT"]["home"] = HOME
16 | config["DEFAULT"]["root"] = ROOT_PATH
17 | config.read(APP_CONFIG_FILE)
18 | if platform_config_path is not None:
19 | config.read(platform_config_path)
20 | return config
21 |
--------------------------------------------------------------------------------
/src/famie/entry_points/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/src/famie/entry_points/__init__.py
--------------------------------------------------------------------------------
/src/famie/entry_points/run_app.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from famie.scripts import start_app
3 |
4 | def command_start(args):
5 | print("Starting a new session...")
6 | start_app.main(args)
7 |
8 |
9 | def main():
10 | parser = argparse.ArgumentParser(
11 | description="FAMIE: A Fast Active Learning Framework for Multilingual Information Extraction.")
12 | subparsers = parser.add_subparsers()
13 | parser_start = subparsers.add_parser("start", help="Subparser for creating a new session.")
14 | parser_start.add_argument("--selection",
15 | type=str,
16 | default="mnlp",
17 | help="Data selection strategy",
18 | choices=['mnlp', 'bertkm', 'badge', 'random'])
19 | parser_start.add_argument("--port",
20 | type=str,
21 | default="9000",
22 | help="Port specification")
23 | parser_start.add_argument("--target_embedding",
24 | type=str,
25 | default='xlm-roberta-base',
26 |                           help="Pretrained language model for the main model, default='xlm-roberta-base'",
27 | choices=['xlm-roberta-base', 'xlm-roberta-large'])
28 | parser_start.add_argument("--proxy_embedding",
29 | type=str,
30 | default='nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large',
31 | help="Pretrained Language Model for the proxy model, default='nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large'",
32 | choices=['nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large',
33 | 'nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large'])
34 | parser_start.set_defaults(handler=command_start)
35 |
36 | '''
37 | if args.action == 'run':
38 | start_app.main()
39 | elif args.action == 'uninstall':
40 | uninstall_app.main(config)
41 | '''
42 | args = parser.parse_args()
43 | if hasattr(args, "handler"):
44 | args.handler(args)
45 |
46 |
47 | if __name__ == "__main__":
48 | main()
49 |
--------------------------------------------------------------------------------
/src/famie/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-uoregon/famie/e771caa31bb6b1ffc27d6c27c4cdb4913ab94aa8/src/famie/scripts/__init__.py
--------------------------------------------------------------------------------
/src/famie/scripts/start_app.py:
--------------------------------------------------------------------------------
1 | '''
2 | Date: Feb 11, 2022
3 | Modified from: https://github.com/dataqa/dataqa/blob/master/src/dataqa/scripts/start_app.py
4 | '''
5 | import argparse
6 | import os, json
7 | import webbrowser
8 | from famie.config.config_reader import read_config
9 | from famie.api.active_learning.constants import WORKING_DIR
10 | from pathlib import Path
11 | from flask_cors import CORS
12 |
13 |
14 | def main(args):
15 | ######### passing arguments ########
16 | with open(os.path.join(WORKING_DIR, 'passed_args.json'), 'w') as f:
17 | json.dump({
18 | 'selection': args.selection,
19 | 'proxy_embedding': args.proxy_embedding,
20 | 'target_embedding': args.target_embedding
21 | }, f)
22 | ####################################
23 |
24 | config = read_config()
25 |
26 | from famie.api import create_app
27 |
28 | application = create_app()
29 |
30 | from famie.api.blueprints.common import bp
31 | from famie.api.blueprints.supervised import supervised_bp
32 |
33 | application.register_blueprint(bp)
34 | application.register_blueprint(supervised_bp)
35 |
36 | application.config.from_mapping(config.items("DEFAULT"))
37 |
38 | print("FAMIE's Web Interface is available at: http://127.0.0.1:{}/".format(args.port))
39 | print('-' * 50)
40 |
41 | CORS(application)
42 |
43 | application.run(debug=False,
44 | port=args.port,
45 | host='0.0.0.0')
46 |
--------------------------------------------------------------------------------
/src/famie/scripts/uninstall_app.py:
--------------------------------------------------------------------------------
1 | '''
2 | Date: Feb 11, 2022
3 | Modified from: https://github.com/dataqa/dataqa/blob/master/src/dataqa/scripts/uninstall_app.py
4 | '''
5 | import os
6 | import shutil
7 |
8 |
9 | def main(config):
10 | upload_folder = config["DEFAULT"]["UPLOAD_FOLDER"]
11 |
12 | if os.path.exists(upload_folder):
13 | reply = input(f"Delete directory {upload_folder}? [y/[n]] ")
14 | if reply.lower().strip() == "y":
15 | shutil.rmtree(upload_folder)
16 | else:
17 | print("Doing nothing")
18 |
19 |
20 | if __name__ == "__main__":
21 |     from famie.config.config_reader import read_config
22 |     main(read_config())  # requires UPLOAD_FOLDER to be set in the config
--------------------------------------------------------------------------------