├── .coveragerc ├── .editorconfig ├── .eslintignore ├── .eslintrc.js ├── .gitignore ├── .gitlab-ci.yml ├── .htmllintrc ├── .stylelintrc.js ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── doc └── images │ ├── nlg-ide-input.png │ ├── nlg-ide-toplist.gif │ └── nlg-template-settings.png ├── examples └── intro-narrative-api.ipynb ├── nlg ├── __init__.py ├── app │ ├── __init__.py │ ├── body.html │ ├── error │ │ ├── 400.html │ │ ├── 401.html │ │ ├── 403.html │ │ ├── 404.html │ │ └── 500.html │ ├── gramex.yaml │ ├── html │ │ ├── demo.html │ │ └── template-navbar.html │ ├── index.html │ ├── login.html │ ├── nlg.js │ ├── setup.sh │ ├── style.css │ ├── template-navbar.html │ └── templates │ │ ├── demo.tmpl │ │ ├── new-variable.tmpl │ │ ├── template-settings.tmpl │ │ └── variable-settings.tmpl ├── grammar.py ├── narrative.py ├── search.py ├── tests │ ├── __init__.py │ ├── data │ │ ├── actors.csv │ │ └── imdb_ratings.csv │ ├── test_grammar.py │ ├── test_narrative.py │ ├── test_search.py │ ├── test_utils.py │ └── test_webapp.py ├── utils.py └── webapp.py ├── requirements.txt ├── setup.cfg └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | show_missing = True 3 | skip_covered = True -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # .editorconfig maintains consistent coding styles between different editors. 2 | # Get plugins at http://editorconfig.org/ 3 | # - Sublime text: https://github.com/sindresorhus/editorconfig-sublime 4 | # - Notepad++: https://github.com/editorconfig/editorconfig-notepad-plus-plus 5 | 6 | root = true 7 | 8 | # Apply common styles for most standard code files. 9 | # Do not apply to * - that covers binary files as well 10 | [*.{js,html,php,py,css,svg,json,less,yaml,yml,scss,xml,sh,java,bat,R,tmpl}] 11 | end_of_line = lf 12 | insert_final_newline = true 13 | trim_trailing_whitespace = true 14 | charset = utf-8 15 | # Stick to 2-space indenting by default, to conserve space 16 | indent_style = space 17 | indent_size = 2 18 | 19 | [*.py] 20 | indent_size = 4 21 | 22 | [Makefile] 23 | indent_style = tab 24 | indent_size = 4 25 | 26 | [testlib/test_config/config.empty.yaml] 27 | insert_final_newline = false 28 | [tests/dir/gramex.yaml] 29 | insert_final_newline = false 30 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | **/node_modules/* 2 | app/node_modules/* 3 | docs/* 4 | 5 | # Our Gitlab runner uses eslint@2.6.0 to allow eslint-template. 
6 | # chromecapture.js requires ecmaVersion 8 which eslint@2.6.0 does not support 7 | # So let's not eslint that 8 | gramex/apps/capture/chromecapture.js 9 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "plugins": [ 3 | "template" // Handle Tornado templates and JS in HTML files 4 | ], 5 | "env": { 6 | "es6": true, // Allow ES6 in JavaScript 7 | "browser": true, // Include browser globals 8 | "jquery": true, // Include jQuery and $ 9 | "mocha": true // Include it(), assert(), etc 10 | }, 11 | "globals": { 12 | "_": true, // underscore.js 13 | "d3": true, // d3.js 14 | "vg": true, // vega.js 15 | "L": true, // leaflet.js 16 | "ga": true, // Google analytics 17 | "G": true, // G.min.js 18 | "topojson": true, // topojson.js 19 | "moment": true, // moment.js 20 | "numeral": true, // numeral.js 21 | "assert": true // chai.js 22 | }, 23 | "extends": "eslint:recommended", 24 | "rules": { 25 | /* Override default rules */ 26 | "indent": ["off", 2], // We eventually want 2 space indentation 27 | "linebreak-style": ["off", "unix"], // We eventually want UNIX style line 28 | "quotes": ["off", "double"], // We may go for a double-quotes style 29 | "semi": ["off", "never"] // We may go for a no-semicolon style 30 | } 31 | }; 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | TODO 2 | 3 | # Ignore auto-generated documentation 4 | docs/gramex*.rst 5 | docs/modules.rst 6 | 7 | # Ignore files generated by testcases 8 | tests/**/*.test 9 | tests/**/gen.* 10 | testlib/**/gen.* 11 | .noseids 12 | 13 | # Ignore byte-compiled / optimised / DLL files 14 | *.py[cod] 15 | 16 | # Filenames should NOT have spaces 17 | * * 18 | 19 | # Ignore SQLite3 files. 
Gramex creates some automatically 20 | *.sqlite3 21 | *.sqlite3-journal 22 | 23 | # Cache folders used for testing 24 | .cache-* 25 | .pytest_cache 26 | 27 | # Ignore log files 28 | *.log* 29 | 30 | # Don't commit data files, except what's required for testing or by Gramex apps 31 | *.csv 32 | *.xls* 33 | !tests/**/*.csv 34 | !tests/*.xlsx 35 | !testlib/**/*.csv 36 | !testlib/*.xlsx 37 | !gramex/apps/**/*.csv 38 | !gramex/apps/guide/**/*.xlsx 39 | 40 | *.ppt* 41 | !testlib/input.pptx 42 | !tests/template.pptx 43 | !gramex/apps/guide/formhandler/input.pptx 44 | !gramex/apps/guide/pptxhandler/examples-input.pptx 45 | 46 | # Don't commit databases created by test cases 47 | tests/*.db 48 | 49 | # Don't commit uploads created by test cases 50 | tests/uploads 51 | 52 | # Don't commit ZIP files, except what's required for testing 53 | *.7z 54 | *.zip 55 | !tests/*.zip 56 | 57 | # Documents 58 | *.doc* 59 | *.pdf 60 | 61 | # Avoid media files 62 | *.avi 63 | *.mp* 64 | *.wmv 65 | 66 | # Backup files 67 | ~$* 68 | *~ 69 | *.bak* 70 | 71 | # Sublime-text workspaces, etc 72 | *.sublime-* 73 | .vscode/ 74 | .vim/ 75 | 76 | # IPython Notebook checkpoints 77 | .ipynb_checkpoints 78 | 79 | # Typically bash.exe.stackdump on Cygwin 80 | *.stackdump 81 | 82 | # Node modules and bower components 83 | node_modules 84 | bower_components 85 | 86 | # Prefer yarn.lock over package-lock.json 87 | package-lock.json 88 | 89 | # Windows shortcut files 90 | *.lnk 91 | 92 | # Windows / Mac OS junk files 93 | Desktop.ini 94 | $RECYCLE.BIN/ 95 | *[Tt]humbs.db 96 | *.DS_Store 97 | 98 | # R history files 99 | .RHistory 100 | 101 | # C extensions 102 | *.so 103 | 104 | # Packages 105 | *.egg 106 | *.eggs 107 | *.egg-info 108 | dist 109 | build 110 | eggs 111 | parts 112 | bin 113 | var 114 | sdist 115 | develop-eggs 116 | .installed.cfg 117 | lib 118 | lib64 119 | 120 | # Installer logs 121 | pip-log.txt 122 | 123 | # Unit test / coverage reports 124 | .coverage 125 | .tox 126 | nosetests.xml 127 | htmlcov 128 | cover 129 | 130 | # Translations 131 | *.mo 132 | 133 | # Mr Developer 134 | .mr.developer.cfg 135 | .project 136 | .pydevproject 137 | 138 | # Pycharm 139 | .idea 140 | 141 | # Complexity 142 | output/*.html 143 | output/*/index.html 144 | 145 | # Sphinx 146 | docs/_build 147 | 148 | # For Linux FUSE file system 149 | .fuse_hidden* 150 | 151 | # IDE 152 | .vim 153 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | validate: 2 | tags: [py3] 3 | script: 4 | - python setup.py develop 5 | - python -m spacy download en 6 | - gramex license accept 7 | - nosetests -sv --with-coverage --cover-package=nlg 8 | 9 | 10 | deploy: 11 | stage: deploy 12 | script: deploy 13 | only: [dev] 14 | tags: [py3] 15 | variables: 16 | SERVER: ubuntu@uat.gramener.com # Deploy to uat.gramener.com/app-name/ 17 | URL: nlg-tmplgen 18 | SETUP: sh setup.sh 19 | VERSION: py3v1 20 | PORT: 8040 21 | -------------------------------------------------------------------------------- /.htmllintrc: -------------------------------------------------------------------------------- 1 | // NLG .htmllintrc v1.2 2 | { 3 | "plugins": [], 4 | 5 | "attr-bans": [ 6 | "align", 7 | "background", 8 | "border", 9 | // "frameborder", // frameborder is used in YouTube embeds 10 | "longdesc", 11 | "marginwidth", 12 | "marginheight", 13 | "scrolling" 14 | ], 15 | "attr-name-style": false, 16 | "attr-no-dup": false, // attr name may be 
computed, and get replaced by {} 17 | "attr-no-unsafe-char": false, // title contains single quotes ' 18 | "attr-quote-style": "double", // attributes contain double quotes 19 | "attr-req-value": false, 20 | "class-no-dup": true, // no duplicate classes in a tag 21 | "doctype-first": false, // snippet templates need not begin with doctype 22 | "doctype-html5": true, 23 | "fig-req-figcaption": false, 24 | "focusable-tabindex-style": false, 25 | "head-req-title": false, // title may be inside a Block.run() 26 | "href-style": false, 27 | "html-req-lang": false, 28 | "id-class-ignore-regex": "\\{ *\\}", // ignore tornado template id / class 29 | "id-class-no-ad": false, 30 | "id-class-style": false, // no styles enforced for now 31 | "id-no-dup": false, // template replacement IDs { } cause duplication 32 | "img-req-alt": "allownull", // for dynamic image content 33 | "img-req-src": false, 34 | "indent-style": "spaces", 35 | "indent-width": 2, 36 | "label-req-for": false, // cannot use if multiple forms with same key 37 | "line-end-style": false, // raises too many errors 38 | "raw-ignore-regex": "<%.*?%>\\s*|{[%#{].*?[%#}]}\\s*", // ignore templates 39 | "spec-char-escape": false, // using > or < is not that big a deal 40 | "table-req-caption": false, 41 | "tag-bans": [ 42 | // "b", // Bootstrap caret example uses 43 | // "i", // Font-awesome icons use 44 | "s", // avoid strike tag, deprecated 45 | // "style", // Single-page templates need style tag 46 | "u", 47 | "strike", 48 | "font", 49 | "center" 50 | ], 51 | "tag-name-lowercase": true, 52 | "tag-name-match": true, 53 | "tag-self-close": false, 54 | "title-max-len": false, // we sometimes have tables inside the title="" 55 | "title-no-dup": true 56 | } 57 | -------------------------------------------------------------------------------- /.stylelintrc.js: -------------------------------------------------------------------------------- 1 | "use strict" 2 | 3 | module.exports = { 4 | rules: { 5 | "at-rule-no-unknown": true, 6 | "block-no-empty": true, 7 | "color-no-invalid-hex": true, 8 | "comment-no-empty": true, 9 | "declaration-block-no-duplicate-properties": [ 10 | true, 11 | { 12 | ignore: ["consecutive-duplicates-with-different-values"] 13 | } 14 | ], 15 | "declaration-block-no-shorthand-property-overrides": true, 16 | "font-family-no-duplicate-names": true, 17 | "font-family-no-missing-generic-family-keyword": true, 18 | "function-calc-no-unspaced-operator": true, 19 | "function-linear-gradient-no-nonstandard-direction": true, 20 | "keyframe-declaration-no-important": true, 21 | "media-feature-name-no-unknown": true, 22 | "no-descending-specificity": true, 23 | "no-duplicate-at-import-rules": true, 24 | "no-duplicate-selectors": true, 25 | "no-empty-source": true, 26 | "no-extra-semicolons": true, 27 | "no-invalid-double-slash-comments": true, 28 | "property-no-unknown": true, 29 | "selector-pseudo-class-no-unknown": true, 30 | "selector-pseudo-element-no-unknown": true, 31 | "selector-type-no-unknown": true, 32 | "string-no-newline": true, 33 | "unit-no-unknown": true 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | # Run in Python 3 only. 
Drop Python 2 testing 4 | language: python 5 | python: '3.7' 6 | 7 | dist: xenial 8 | sudo: yes 9 | 10 | # Cache modules for faster builds 11 | cache: 12 | timeout: 1000 13 | pip: true 14 | npm: true 15 | yarn: true 16 | # Don't cache miniconda directory. It's slower. Fresh install takes ~200s. 17 | # But caching takes ~150s (extraction) + ~190s (re-packing) = ~340s (slower). 18 | # directories: 19 | # - $HOME/miniconda 20 | 21 | install: 22 | # Install miniconda 23 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/miniconda.sh 24 | - bash $HOME/miniconda.sh -b -u -p $HOME/miniconda 25 | - export PATH="$HOME/miniconda/bin:$PATH" 26 | - hash -r 27 | - conda config --set always_yes yes --set changeps1 no 28 | # Install pip modules 29 | - pip install flake8 pep8-naming flake8-gramex flake8-blind-except flake8-print flake8-debugger nose coverage 30 | - npm install -g yarn 31 | - yarn global add eclint eslint eslint-plugin-html eslint-plugin-template htmllint-cli 32 | - yarn install 33 | # Set up variables 34 | - export BRANCH=$TRAVIS_BRANCH 35 | 36 | script: 37 | - eclint check '**/*.html' '**/*.js' '**/*.css' '**/*.yaml' '**/*.md' 38 | - htmllint 39 | - flake8 40 | - bandit nlg --recursive --format csv || true 41 | - pip install -e . 42 | - gramex setup nlg/app 43 | - nosetests -sv --with-coverage --cover-package=nlg 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Gramex-NLG is licensed under the [MIT License][1] 2 | 3 | Copyright (c) 2019, Gramener 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | 23 | Gramex includes [third party libraries][2] with permissive licenses. 24 | 25 | [1]: https://opensource.org/licenses/MIT 26 | [2]: https://learn.gramener.com/guide/license/thirdparty.md 27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft nlg/app 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |Build Status| 2 | 3 | nlg 4 | === 5 | 6 | Natural Language Generation component for 7 | `Gramex `__. The NLG module is 8 | designed to work as a Python library, as well as a `Gramex 9 | application `__. 
10 | 11 | The library: 12 | 13 | 1. Automatically creates Tornado templates from English text in the 14 | context of a dataset. 15 | 2. Allows for modification and generalization of these templates. 16 | 3. Renders these templates as a unified narrative. 17 | 18 | Installation 19 | ------------ 20 | 21 | The NLG library can be installed from PyPI as follows: 22 | 23 | .. code:: bash 24 | 25 | $ pip install nlg 26 | $ python -m spacy download en_core_web_sm 27 | $ gramex setup ui 28 | 29 | or from source as follows: 30 | 31 | .. code:: bash 32 | 33 | $ git clone https://github.com/gramener/gramex-nlg.git 34 | $ cd gramex-nlg 35 | $ pip install -e . 36 | $ gramex setup ./app 37 | 38 | Usage 39 | ----- 40 | 41 | Using the Python library 42 | ~~~~~~~~~~~~~~~~~~~~~~~~ 43 | 44 | To get started, see the `example notebook here `_. 45 | 46 | .. code:: python 47 | 48 | >>> import pandas as pd 49 | >>> from gramex import data 50 | 51 | >>> # load some data 52 | >>> df = pd.read_csv('iris.csv') 53 | 54 | >>> # specify a FormHandler operation - find the average sepal_width per species 55 | >>> fh_args = {'_by': ['species'], '_c': ['sepal_width|avg'], '_sort': ['sepal_width|avg']} 56 | 57 | >>> # Draw a sample 58 | >>> xdf = df.sample(frac=0.1, random_state=10) 59 | 60 | >>> # perform the FormHandler operation on the data 61 | >>> print(data.filter(xdf, fh_args.copy())) 62 | species sepal_width|avg 63 | 2 virginica 2.70 64 | 1 versicolor 2.92 65 | 0 setosa 3.15 66 | 67 | >>> # Write something about the output 68 | >>> from nlg.utils import load_spacy_model 69 | >>> nlp = load_spacy_model() >>> text = nlp("The virginica species has the least average sepal_width.") 70 | 71 | >>> # Generate a template 72 | >>> from nlg.search import templatize 73 | >>> tmpl = templatize(text, fh_args, xdf) 74 | >>> print(tmpl) 75 | {% set fh_args = {"_by": ["species"], "_c": ["sepal_width|avg"], "_sort": ["sepal_width|avg"]} %} 76 | {% set df = U.gfilter(orgdf, fh_args.copy()) %} 77 | {% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %} 78 | The {{ df["species"].iloc[0] }} species has the least average {{ fh_args["_sort"][0].lower() }}. 79 | 80 | >>> # Render the same template with new data. 81 | >>> print(render(df, tmpl).decode('utf8')) 82 | The versicolor species has the least average sepal_width|avg. 83 | 84 | Using the NLG IDE 85 | ~~~~~~~~~~~~~~~~~ 86 | 87 | The NLG module ships with an IDE. The IDE is a `Gramex 88 | application `__. 89 | 90 | To use it, install the NLG module as indicated above, and add the 91 | following to your ``gramex.yaml``: 92 | 93 | .. code:: yaml 94 | 95 | variables: 96 | NLG_ROOT: 97 | function: nlg.utils._locate_app_config() 98 | 99 | import: 100 | nlg: 101 | path: $NLG_ROOT 102 | YAMLURL: $YAMLURL/nlg 103 | 104 | This configuration mounts the app at the ``/nlg/`` resource. Start Gramex to access it. 105 | 106 | The Gramex NLG IDE 107 | ------------------ 108 | 109 | The NLG component depends on two sources of information: 110 | 111 | 1. A source dataset, which can be uploaded to the IDE. A dataset is 112 | uniquely identified with its filename. Once uploaded, the file 113 | persists and is available for selection from the app. Any *file* that 114 | makes a valid URL for 115 | `FormHandler `__ can be 116 | used with the NLG app. 117 | 2. A *narrative*, which is a collection of templates and rules around 118 | them. The narrative consists of the configuration which governs the 119 | rendered text. An existing narrative can be uploaded through the "Add 120 | Data" button, or can be created through the IDE.
Once created, the 121 | narrative can be named and becomes available for selection from the 122 | "Add Data" modal. 123 | 124 | The NLG IDE 125 | ----------- 126 | 127 | The primary purpose of the IDE is to create or edit narratives based on 128 | a dataset. Once a dataset has been selected, it is exposed in the IDE as 129 | a `FormHandler 130 | table `__. 131 | 132 | .. figure:: doc/images/nlg-ide-input.png 133 | :alt: 134 | 135 | Users can now type English text into the IDE and add it to the 136 | narrative. This automatically templatizes the text, and adds the 137 | template to the narrative. For example, typing "Humphrey Bogart is at 138 | the top of the list." does this: 139 | 140 | .. figure:: doc/images/nlg-ide-toplist.gif 141 | :alt: 142 | 143 | This means that the input statement has been templatized and added to 144 | the narrative. The part of the input text that was successfully 145 | templatized is highlighted in green. Clicking on the spanner button next 146 | to a template opens the `Template Settings <#template-settings>`__ 147 | modal. 148 | 149 | Template Settings 150 | ----------------- 151 | 152 | .. figure:: doc/images/nlg-template-settings.png 153 | :alt: 154 | 155 | This dialog provides configuration options for all template attributes: 156 | 157 | 1. **Template Name** - Each template can optionally be named. 158 | 2. **Condition** - Any Python expression which evaluates to a boolean 159 | may be set as a condition, which controls whether the template is 160 | rendered. 161 | 3. The actual Tornado template itself can be edited. Any valid Tornado 162 | template is acceptable. 163 | 4. **Token Settings** - Every token from the input text that finds a 164 | match in the dataset or in FormHandler arguments (i.e. every token 165 | that is highlighted in the preview) is converted into a `template 166 | expression `__. 167 | Such tokens have their own attributes, as follows: 168 | 169 | - **Token search results** - if a token is found in more than one 170 | place (say, a dataframe cell as well as a FormHandler argument), 171 | this setting allows the user to select the right result. 172 | - **Grammar options** - the NLG engine may automatically apply 173 | certain string formatting or lexical operations to the template 174 | expression to make it match the input text. Any number of these 175 | operations can be enabled / disabled through this setting. 176 | - **Make variable** - a token may be set as a local variable within 177 | the template. 178 | - **Ignore** - the template expression corresponding to the token 179 | may be ignored, and set back to the literal input text. 180 | 181 | 5. **Run Template** - Run the current template against the dataframe and 182 | preview its output. 183 | 6. **Save Template** - Save the template. Note that this is required if 184 | the template has been manually edited in the textarea. 185 | 186 | Naming and Saving a Narrative 187 | ----------------------------- 188 | 189 | Once a narrative has been fully configured, it can be named and saved. 190 | Doing so causes it to appear in the narrative dropdown menu on the app. 191 | 192 | Sharing a Narrative 193 | ------------------- 194 | 195 | After a narrative has been named and saved, it can be shared in two modes: 196 | 197 | 1. **IDE mode** - This option lets users copy a URL that redirects to 198 | the IDE, with the current dataset and the current narrative set in 199 | the session. 200 | 2. **Embed mode** - Copy an HTML snippet to embed into a page which 201 | contains a FormHandler table.
The template will render live as the 202 | table changes. 203 | 204 | 205 | Glossary: Grammar of Data-Driven Narratives 206 | =========================================== 207 | 208 | This section describes the building blocks of Gramex's approach to natural language generation. 209 | These concepts serve as primitives to the logic and automation capabilities of the NLG engine. 210 | 211 | 1. **Narrative** - A *narrative* is a piece of text written by a user or generated by a machine which contains facts about a dataset. 212 | A narrative in its entirety is assumed to be a function of three items: 213 | 214 | a. A dataset 215 | b. Operations on that dataset 216 | c. Some "source text" provided by the user. 217 | 218 | For example, the following is a narrative about the `Fisher Iris dataset `_. 219 | 220 | The iris dataset contains measurements from a hundred and fifty samples of three unique species of the iris flower - setosa, versicolor and virginica. The species are equally distributed within the dataset, so that each species has fifty samples. For each sample, four measurements are taken - sepal length, sepal width, petal length and petal width. The average petal length of the setosa is significantly less than that of versicolor or virginica. The average petal width of virginica is much higher than that of versicolor. However, there is no pair of features that can uniquely identify a species. The presence of such properties makes the iris dataset ideal for explaining machine learning concepts. 221 | 222 | 2. **Nugget** - A *nugget* is ideally a single sentence which conveys a fact about the data. Each sentence in the example narrative except the last two is a nugget. Note that each nugget derives its facts from the source data directly, or from the result of some operation on the data. For example, the following nugget 223 | 224 | The average petal length of the setosa is significantly less than that of versicolor or virginica. 225 | 226 | derives from a groupby-and-average operation on one column of the dataset. Some nuggets, like the one enumerating the number of samples in the dataset, derive from the raw dataset, *not* from the result of any operations on it. A narrative is essentially an ordered collection of nuggets. 227 | 228 | 3. **Variables** - A *variable* is a piece of text which can change with the data or the operations performed on it. Here is a reproduction of the example narrative, with all variables shown in bold. 229 | 230 | The iris dataset contains measurements from **a hundred and fifty** samples of **three** unique species of the iris flower - **setosa, versicolor and virginica**. The species are equally distributed within the dataset, so that each species has **fifty** samples. For each sample, **four** measurements are taken - **sepal length, sepal width, petal length and petal width**. The **average petal length** of the setosa is significantly **less** than that of versicolor or virginica. The **average petal width** of virginica is much **higher** than that of versicolor. However, there is no pair of features that can uniquely identify a species. The presence of such properties makes the iris dataset ideal for explaining machine learning concepts. 231 | 232 | Note that each variable has two defining components: 233 | 234 | * a *source text*, as initially provided by the user 235 | * one or more *formulae*, which compute the value of the variable for a specific instance of the data.
Note that the source text of a variable may be found in multiple places within a dataset, and as such, a variable may have multiple formulae - one of which will have to be preferred by the user. 236 | 237 | For example, for the first variable in the example narrative, "hundred and fifty" is the source text, and the formula is any machine code that counts the number of rows in the dataset and translates it into a human-readable form. A variable may additionally have other attributes, like: 238 | 239 | * a set of linguistic *inflections* which determine the form of the rendered variable text - these are distinct from the formula itself, in that the formula creates the base form of the text and inflections modify the base form. 240 | * a *name* used to identify the variable within the template of the nugget 241 | 242 | 243 | Thus, narratives are composed from nuggets, and nuggets from variables. This grammar allows the NLG engine to approach the problem of data-driven, machine-generated narratives in a more *compositional* manner than a *generative* one. 244 | 245 | .. |Build Status| image:: https://travis-ci.org/gramener/gramex-nlg.svg?branch=dev 246 | :target: https://travis-ci.org/gramener/gramex-nlg 247 | 248 | -------------------------------------------------------------------------------- /doc/images/nlg-ide-input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramener/gramex-nlg/aa508f66b6ce38eeb3ea3deac96f458865b896ff/doc/images/nlg-ide-input.png -------------------------------------------------------------------------------- /doc/images/nlg-ide-toplist.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramener/gramex-nlg/aa508f66b6ce38eeb3ea3deac96f458865b896ff/doc/images/nlg-ide-toplist.gif -------------------------------------------------------------------------------- /doc/images/nlg-template-settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramener/gramex-nlg/aa508f66b6ce38eeb3ea3deac96f458865b896ff/doc/images/nlg-template-settings.png -------------------------------------------------------------------------------- /examples/intro-narrative-api.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Introduction to NLG's Narrative API\n", 8 | "===================================\n", 9 | "\n", 10 | "This notebook is an introduction to Gramex NLG's Narrative API. Here we will learn how to create data-driven narratives with the NLG module, by going over the building blocks of the API.\n", 11 | "\n", 12 | "Getting Started\n", 13 | "---------------\n", 14 | "\n", 15 | "If the NLG module is not installed, install it as follows:\n", 16 | "\n", 17 | "```bash\n", 18 | "$ pip install nlg\n", 19 | "```" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "Test the installation by running the following cell:" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "from nlg.search import templatize\n", 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Next, let's load some data.
For this tutorial, we will be using [this](https://raw.githubusercontent.com/gramener/gramex-nlg/master/nlg/tests/data/actors.csv) dataset. Please download the file and load it as a pandas dataframe." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 68 | "\n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | "
categorynameratingvotes
0ActorsHumphrey Bogart0.570197109
1ActorsCary Grant0.438602142
2ActorsJames Stewart0.988374120
3ActorsMarlon Brando0.102045108
4ActorsFred Astaire0.20887784
5ActressesKatharine Hepburn0.03918863
6ActressesBette Davis0.28280714
7ActressesAudrey Hepburn0.12019794
8ActressesIngrid Bergman0.29614052
9ActorsSpencer Tracy0.466311192
10ActorsCharlie Chaplin0.24442676
\n", 158 | "
" 159 | ], 160 | "text/plain": [ 161 | " category name rating votes\n", 162 | "0 Actors Humphrey Bogart 0.570197 109\n", 163 | "1 Actors Cary Grant 0.438602 142\n", 164 | "2 Actors James Stewart 0.988374 120\n", 165 | "3 Actors Marlon Brando 0.102045 108\n", 166 | "4 Actors Fred Astaire 0.208877 84\n", 167 | "5 Actresses Katharine Hepburn 0.039188 63\n", 168 | "6 Actresses Bette Davis 0.282807 14\n", 169 | "7 Actresses Audrey Hepburn 0.120197 94\n", 170 | "8 Actresses Ingrid Bergman 0.296140 52\n", 171 | "9 Actors Spencer Tracy 0.466311 192\n", 172 | "10 Actors Charlie Chaplin 0.244426 76" 173 | ] 174 | }, 175 | "execution_count": 2, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "# Replace the path with wherever you have downloaded the dataset.\n", 182 | "df = pd.read_csv('../nlg/tests/data/actors.csv')\n", 183 | "df" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "Let us now sort the dataframe by the `rating` column. NLG is designed to work with Gramex's [FormHandler](https://learn.gramener.com/guide/formhandler). Therefore, we will use FormHandler's own DSL to make any transformation on the dataset." 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 3, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "from gramex.data import filter as gfilter # do not clobber the `filter` function from the Python stdlib\n", 200 | "sort_args = {'_sort': ['-rating']}" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Note that the `_sort` key in the dictionary above tells Gramex to sort a dataframe by the given columns. The value of they key is a _list_, indicating that dataframes can be sorted by multiple columns. Also, the hyphen before the column name indicates that the sorting is _descending_." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 4, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "xdf = gfilter(df, sort_args.copy())" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 5, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/html": [ 227 | "
\n", 228 | "\n", 241 | "\n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
categorynameratingvotes
2ActorsJames Stewart0.988374120
0ActorsHumphrey Bogart0.570197109
9ActorsSpencer Tracy0.466311192
1ActorsCary Grant0.438602142
8ActressesIngrid Bergman0.29614052
\n", 289 | "
" 290 | ], 291 | "text/plain": [ 292 | " category name rating votes\n", 293 | "2 Actors James Stewart 0.988374 120\n", 294 | "0 Actors Humphrey Bogart 0.570197 109\n", 295 | "9 Actors Spencer Tracy 0.466311 192\n", 296 | "1 Actors Cary Grant 0.438602 142\n", 297 | "8 Actresses Ingrid Bergman 0.296140 52" 298 | ] 299 | }, 300 | "execution_count": 5, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "xdf.head()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "Now, let's write something about this dataset. It is apparent that James Stewart has the highest rating." 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 6, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "from nlg.utils import load_spacy_model\n", 323 | "nlp = load_spacy_model()\n", 324 | "\n", 325 | "text = nlp(\"James Stewart is the actor with the highest rating.\")" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "The entry-point into the NLG module is the [`nlg.search.templatize`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/search.py#L478) function. This function uses:\n", 333 | "* a dataframe\n", 334 | "* operations on the dataframe (as FormHandler arguments)\n", 335 | "* some text about the dataset\n", 336 | "\n", 337 | "to create a [`Nugget`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/narrative.py#L102) object. To learn more about the `Nugget` object and it's methods, see the [README](https://github.com/gramener/gramex-nlg/tree/dev#glossary-grammar-of-data-driven-narratives)." 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 7, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stderr", 347 | "output_type": "stream", 348 | "text": [ 349 | "/home/jaidevd/src/nlg/nlg/search.py:62: UserWarning: Ignoring lemmatization.\n", 350 | " warnings.warn('Ignoring lemmatization.')\n", 351 | "/home/jaidevd/src/nlg/nlg/search.py:92: UserWarning: Ignoring lemmatization.\n", 352 | " warnings.warn('Ignoring lemmatization.')\n", 353 | "/home/jaidevd/src/nlg/nlg/search.py:80: FutureWarning: Series.nonzero() is deprecated and will be removed in a future version.Use Series.to_numpy().nonzero() instead\n", 354 | " indices = {array[i]: i for i in mask.nonzero()[0]}\n", 355 | "/home/jaidevd/src/nlg/nlg/search.py:109: UserWarning: Cannot lemmatize multi-word cells.\n", 356 | " warnings.warn('Cannot lemmatize multi-word cells.')\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "nugget = templatize(text, sort_args, df)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 8, 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "text/plain": [ 372 | "{% set fh_args = {\"_sort\": [\"-rating\"]} %}\n", 373 | "{% set df = U.gfilter(orgdf, fh_args.copy()) %}\n", 374 | "{% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}\n", 375 | "{# Do not edit above this line. #}\n", 376 | "{{ df[\"name\"].iloc[0] }} is the {{ G.singular(df[\"category\"].iloc[-2]).lower() }} with the highest rating." 
377 | ] 378 | }, 379 | "execution_count": 8, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "nugget" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "As we see, a nugget has an underlying [Tornado template](https://www.tornadoweb.org/en/stable/template.html) which has been auto-generated by the `templatize` function. Let's see how well this template re-renders on the dataset." 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 9, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "b' James Stewart is the actor with the highest rating.'\n" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "print(nugget.render(df))" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "The text above is identical to the input text, but this is generated from a template. Essentially, we can pass any dataframe to the [`.render`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/narrative.py#L190) method of the nugget object, and the text will be rendered in the context of that data. To test this, let's create a copy of the dataframe and give all the artists a random rating." 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 10, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "import numpy as np\n", 426 | "np.random.seed(12345)\n", 427 | "\n", 428 | "fake_ratings = df.copy()\n", 429 | "fake_ratings['rating'] = np.random.rand(df.shape[0])" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "Let's see who the top rated artist is in this new, fake dataset." 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 11, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "data": { 446 | "text/html": [ 447 | "
\n", 448 | "\n", 461 | "\n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | "
categorynameratingvotes
6ActressesBette Davis0.96451514
0ActorsHumphrey Bogart0.929616109
8ActressesIngrid Bergman0.74890752
10ActorsCharlie Chaplin0.74771576
9ActorsSpencer Tracy0.653570192
\n", 509 | "
" 510 | ], 511 | "text/plain": [ 512 | " category name rating votes\n", 513 | "6 Actresses Bette Davis 0.964515 14\n", 514 | "0 Actors Humphrey Bogart 0.929616 109\n", 515 | "8 Actresses Ingrid Bergman 0.748907 52\n", 516 | "10 Actors Charlie Chaplin 0.747715 76\n", 517 | "9 Actors Spencer Tracy 0.653570 192" 518 | ] 519 | }, 520 | "execution_count": 11, 521 | "metadata": {}, 522 | "output_type": "execute_result" 523 | } 524 | ], 525 | "source": [ 526 | "fake_ratings.sort_values('rating', ascending=False).head()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "Now, let's see if our original nugget is able to adapt to this new dataset." 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 12, 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/plain": [ 544 | "b' Bette Davis is the actor with the highest rating.'" 545 | ] 546 | }, 547 | "execution_count": 12, 548 | "metadata": {}, 549 | "output_type": "execute_result" 550 | } 551 | ], 552 | "source": [ 553 | "nugget.render(fake_ratings)" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "Clearly, that is false. Bette Davis is the _actress_ with the highest rating. To see what went wrong, let's take a look at the template again." 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 13, 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "name": "stdout", 570 | "output_type": "stream", 571 | "text": [ 572 | "{% set fh_args = {\"_sort\": [\"-rating\"]} %}\n", 573 | "{% set df = U.gfilter(orgdf, fh_args.copy()) %}\n", 574 | "{% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}\n", 575 | "{# Do not edit above this line. #}\n", 576 | "{{ df[\"name\"].iloc[0] }} is the {{ G.singular(df[\"category\"].iloc[-2]).lower() }} with the highest rating.\n" 577 | ] 578 | } 579 | ], 580 | "source": [ 581 | "print(nugget.template)" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "As we can see, the words 'actor' or 'actress' don't appear in the template. This means that the template-generator has correctly figured out that these words are dependent on the transformed dataset. However, it has not managed to determine the exact formula for this.\n", 589 | "\n", 590 | "Any token in the input text which is data-dependent, is called a [`Variable`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/narrative.py#L27). To see which words in a nugget are variables, take a look at the `.variables` attribute of the nugget." 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 14, 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/plain": [ 601 | "{James Stewart: {{ df[\"name\"].iloc[0] }},\n", 602 | " actor: {{ G.singular(df[\"category\"].iloc[-2]).lower() }}}" 603 | ] 604 | }, 605 | "execution_count": 14, 606 | "metadata": {}, 607 | "output_type": "execute_result" 608 | } 609 | ], 610 | "source": [ 611 | "nugget.variables" 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "metadata": {}, 617 | "source": [ 618 | "We see here that there are two tokens from the original text - `\"James Stewart\"` and `\"actor\"` that have been identified as variables. Only, the Python _expression_ for determining one of them is wrong. 
Whether the highest rated artist is an actor or an actress needs to be found from the `\"category\"` column of the first row.\n", 619 | "\n", 620 | "To fix this, we can use the [`.set_expr`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/narrative.py#L58) method of the respective variable. The `.set_expr` method accepts any valid Python expression as a string." 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 15, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "var = nugget.get_var('actor')" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 16, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "var.set_expr('df[\"category\"].iloc[0]')" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 17, 644 | "metadata": {}, 645 | "outputs": [ 646 | { 647 | "data": { 648 | "text/plain": [ 649 | "{{ G.singular(df[\"category\"].iloc[0]).lower() }}" 650 | ] 651 | }, 652 | "execution_count": 17, 653 | "metadata": {}, 654 | "output_type": "execute_result" 655 | } 656 | ], 657 | "source": [ 658 | "var" 659 | ] 660 | }, 661 | { 662 | "cell_type": "markdown", 663 | "metadata": {}, 664 | "source": [ 665 | "Now that we have fixed the variable, let's re-render the nugget on the fake dataset." 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 18, 671 | "metadata": {}, 672 | "outputs": [ 673 | { 674 | "data": { 675 | "text/plain": [ 676 | "b' Bette Davis is the actress with the highest rating.'" 677 | ] 678 | }, 679 | "execution_count": 18, 680 | "metadata": {}, 681 | "output_type": "execute_result" 682 | } 683 | ], 684 | "source": [ 685 | "nugget.render(fake_ratings)" 686 | ] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": {}, 691 | "source": [ 692 | "----" 693 | ] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "metadata": {}, 698 | "source": [ 699 | "There is scope for yet more automation. Note that the last word in the text, \"rating\", matches the name of the column by which the dataframe has been sorted. Therefore, even that can be turned into a variable. Essentially, we want the template to render the name of whichever column is used to sort the data, in place of rating.\n", 700 | "\n", 701 | "New variables can be added to a nugget using the [`.add_var`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/narrative.py#L236) method of the nugget object, as follows:" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 19, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "var_token = text[-2]  # The spacy token corresponding to \"rating\"" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 20, 716 | "metadata": {}, 717 | "outputs": [], 718 | "source": [ 719 | "var_expr = 'fh_args[\"_sort\"][0]'  # The Python expression to detect the sorted column" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": 21, 725 | "metadata": {}, 726 | "outputs": [ 727 | { 728 | "data": { 729 | "text/plain": [ 730 | "{% set fh_args = {\"_sort\": [\"-rating\"]} %}\n", 731 | "{% set df = U.gfilter(orgdf, fh_args.copy()) %}\n", 732 | "{% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}\n", 733 | "{# Do not edit above this line. #}\n", 734 | "{{ df[\"name\"].iloc[0] }} is the {{ G.singular(df[\"category\"].iloc[0]).lower() }} with the highest {{ fh_args[\"_sort\"][0] }}."
735 ] 736 | }, 737 | "execution_count": 21, 738 | "metadata": {}, 739 | "output_type": "execute_result" 740 | } 741 | ], 742 | "source": [ 743 | "nugget.add_var(var_token, expr=var_expr)\n", 744 | "nugget" 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "----\n", 752 | "Let us now test a scenario where we sort the dataframe by votes." 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 22, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "data": { 762 | "text/plain": [ 763 | "{% set fh_args = {\"_sort\": [\"-votes\"]} %}\n", 764 | "{% set df = U.gfilter(orgdf, fh_args.copy()) %}\n", 765 | "{% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}\n", 766 | "{# Do not edit above this line. #}\n", 767 | "{{ df[\"name\"].iloc[0] }} is the {{ G.singular(df[\"category\"].iloc[0]).lower() }} with the highest {{ fh_args[\"_sort\"][0] }}." 768 | ] 769 | }, 770 | "execution_count": 22, 771 | "metadata": {}, 772 | "output_type": "execute_result" 773 | } 774 | ], 775 | "source": [ 776 | "nugget.fh_args = {'_sort': ['-votes']}\n", 777 | "nugget" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": 23, 783 | "metadata": {}, 784 | "outputs": [ 785 | { 786 | "data": { 787 | "text/plain": [ 788 | "b' Spencer Tracy is the actor with the highest votes.'" 789 | ] 790 | }, 791 | "execution_count": 23, 792 | "metadata": {}, 793 | "output_type": "execute_result" 794 | } 795 | ], 796 | "source": [ 797 | "nugget.render(df)" 798 | ] 799 | }, 800 | { 801 | "cell_type": "markdown", 802 | "metadata": {}, 803 | "source": [ 804 | "---" 805 | ] 806 | }, 807 | { 808 | "cell_type": "markdown", 809 | "metadata": {}, 810 | "source": [ 811 | "Now we know how to create templates from raw text, and how to assign tokens within the text as data-dependent variables. In forthcoming examples, we will explore:\n", 812 | "\n", 813 | "1. how to design more complex variable expressions - especially those that cannot be defined as short and simple Python strings\n", 814 | "2. how to create longer narratives by putting together different nuggets (see the sketch below)."
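,
"\n",
"\n",
"As a minimal sketch of that second point (an illustration assembled only from the API demonstrated in this notebook, not a dedicated narrative-composition API), several nuggets can be rendered against the same data and their outputs joined:\n",
"\n",
"```python\n",
"# Sketch: compose a narrative by rendering several nuggets on the same data.\n",
"# Assumes `df`, `nlp` and `templatize` from the cells above; the join is\n",
"# plain string handling, not an NLG API.\n",
"facts = [\n",
"    ('James Stewart is the actor with the highest rating.', {'_sort': ['-rating']}),\n",
"    ('Spencer Tracy is the actor with the highest votes.', {'_sort': ['-votes']}),\n",
"]\n",
"nuggets = [templatize(nlp(text), fh_args, df) for text, fh_args in facts]\n",
"narrative = ' '.join(n.render(df).decode('utf8').strip() for n in nuggets)\n",
"print(narrative)\n",
"```"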
815 | ] 816 | } 817 | ], 818 | "metadata": { 819 | "kernelspec": { 820 | "display_name": "Python 3", 821 | "language": "python", 822 | "name": "python3" 823 | }, 824 | "language_info": { 825 | "codemirror_mode": { 826 | "name": "ipython", 827 | "version": 3 828 | }, 829 | "file_extension": ".py", 830 | "mimetype": "text/x-python", 831 | "name": "python", 832 | "nbconvert_exporter": "python", 833 | "pygments_lexer": "ipython3", 834 | "version": "3.6.8" 835 | } 836 | }, 837 | "nbformat": 4, 838 | "nbformat_minor": 4 839 | } 840 | -------------------------------------------------------------------------------- /nlg/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | __version__ = '0.1.3' 4 | 5 | try: 6 | __NLG_SETUP__ 7 | except NameError: 8 | __NLG_SETUP__ = False 9 | 10 | 11 | if __NLG_SETUP__: 12 | sys.stderr.write('Partial import of nlg during the build process.\n') 13 | else: 14 | from .search import templatize # NOQA: F401 15 | from .grammar import get_gramopts 16 | grammar_options = get_gramopts() 17 | __all__ = ['templatize', 'grammar_options'] 18 | -------------------------------------------------------------------------------- /nlg/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramener/gramex-nlg/aa508f66b6ce38eeb3ea3deac96f458865b896ff/nlg/app/__init__.py -------------------------------------------------------------------------------- /nlg/app/body.html: -------------------------------------------------------------------------------- 1 | {% set admin_kwargs = handler.kwargs.get('admin_kwargs', '') or {} %} 2 | 3 | {% from nlg.webapp import read_current_config, get_user_dir, is_user_authenticated %} 4 | {% set dsid = read_current_config(handler).get('dsid') %} 5 | 6 |
7 |
8 |
9 |
10 | 13 |
14 |
15 |
16 | 17 | 18 | 74 | 75 |
76 |
77 |
78 | 99 |
100 |
101 |
102 |
103 |
104 | 107 | 110 | 113 |
114 | 118 | 122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 | 139 |
140 |
141 | 142 | 143 |
144 |
145 |
146 | 147 | 149 |
150 |
151 |
152 | 156 |
157 |
158 | 160 |
161 |
162 | 164 |
165 |
166 |
167 |
168 | {% if dsid %} 169 | {% from os.path import basename %} 170 | {% set user_dir = basename(get_user_dir(handler)) %} 171 |
172 | {% end %} 173 | 174 | 182 | 183 | 196 | 197 | 198 | 215 | 216 | 217 | 232 |
233 | 234 | 235 | 236 | 237 | 279 | -------------------------------------------------------------------------------- /nlg/app/error/400.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Bad Request 8 | 17 | 18 | 19 |

Bad request

20 | {% set exception = kwargs['exc_info'][1] %} 21 | 22 | 23 | {% for key, val in vars(exception).items() %} 24 | 25 | 26 | 27 | 28 | {% end %} 29 | 30 |
{{ key.replace('_', ' ').title() }}{{ val }}
31 | 32 | 33 | -------------------------------------------------------------------------------- /nlg/app/error/401.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Forbidden 8 | 15 | 16 | 17 |

You are not logged in

18 |

You must be logged in to perform this action.

19 | {% set login_url = handler.kwargs.get('login_url', None) %} 20 | {% import gramex %} 21 | {% set app_login = 'gramex.conf.app.settings.login_url' %} 22 |

Log in and try again. If that fails, contact the app owner.

23 |
HTTP 401: Unauthorized
24 | 25 | 26 | {# 27 | 28 | Note: this template is rarely called, because: 29 | 30 | - basehandler.py redirects to login_url if 401: UNAUTHORIZED 31 | - authhandler.py explicitly renders specific templates if 401: UNAUTHORIZED 32 | 33 | The rare cases where this is used are: 34 | 35 | - If an application explicitly raises a 401 36 | - If basehandler.py raises a 401 37 | - for an OTP request when user is not logged in 38 | - if the request is not GET/HEAD, it is not redirected to login_url 39 | 40 | #} 41 | -------------------------------------------------------------------------------- /nlg/app/error/403.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Forbidden 8 | 15 | 16 | 17 |

You need access

18 | {% try %}{% set xsrf = handler.check_xsrf_cookie() or True %}{% except %}{% set xsrf = False %}{% end %} 19 | {% if handler.request.method not in {'GET', 'HEAD', 'OPTIONS'} and not xsrf %} 20 |

Your app sent a {{ handler.request.method }} request without an XSRF cookie.

21 | {% elif handler.current_user %} 22 |

You are logged in, but as a user that cannot access this page.

23 | {% import yaml %} 24 |
{{ yaml.safe_dump(handler.current_user, default_flow_style=False) }}
25 | {% else %} 26 |

You are not logged in.

27 | {% end %} 28 | {% set login_url = None %} 29 | {% try %} 30 | {% set login_url = handler.kwargs.auth.login_url %} 31 | {% except %} 32 | {% try %} 33 | {% import gramex %} 34 | {% set login_url = gramex.conf.app.settings.login_url %} 35 | {% except %} 36 | {% end %} 37 | {% end %} 38 | {% if login_url %} 39 |

Try logging in again.

40 | {% end %} 41 |

Contact the app owner for more information.

42 |
HTTP 403: Forbidden
43 | 44 | 45 | -------------------------------------------------------------------------------- /nlg/app/error/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | File missing 8 | 15 | 16 | 17 |

File missing

18 |

This page cannot be found

19 |
{{ handler.request.uri }}
20 |

Details are on the server log.

21 |
HTTP 404: Not Found
22 | 23 | 24 | -------------------------------------------------------------------------------- /nlg/app/error/500.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Application error 8 | 18 | 19 | 20 |
21 |

Application error

22 |

This application made a mistake.

23 | {% set exception = kwargs['exc_info'][1] %} 24 | {% if hasattr(exception, 'reason') %} 25 |

Reason: {{ exception.reason }}

26 | {% end %} 27 |
{{ repr(exception) }}
28 |

Details are on the server log.

29 |
30 | 31 | {% from gramex import conf %} 32 | {% if conf.app.settings.serve_traceback %} 33 |
34 | {% import traceback %} 35 |
{{ ''.join(traceback.format_exception(*kwargs['exc_info'])) }}
36 |

Traceback is only for development. Disable it in gramex.yaml with app.settings.serve_traceback: false

37 |
38 |

Show traceback

39 | 47 | {% end %} 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /nlg/app/gramex.yaml: -------------------------------------------------------------------------------- 1 | # Configurable variables: 2 | # $NLG_AUTH: 3 | # Authentication is needed for saving and sharing narratives. 4 | # Set this variable to any valid auth configuration to use it 5 | # within the NLG app. 6 | variables: 7 | NLG_AUTH: /$YAMLURL/login/ 8 | NLG_BASE: /$YAMLURL/ 9 | 10 | import: 11 | ui: 12 | path: $GRAMEXAPPS/ui/gramex.yaml 13 | YAMLURL: $YAMLURL/ui/ 14 | languagetool: 15 | path: $GRAMEXAPPS/languagetool/gramex.yaml 16 | YAMLURL: $YAMLURL/languagetool/ 17 | 18 | url: 19 | demo-embed-$*: 20 | pattern: /$YAMLURL/demoembed 21 | handler: FileHandler 22 | kwargs: 23 | path: $YAMLPATH/templates/demo.tmpl 24 | template: true 25 | headers: 26 | Cache-Control: no-store 27 | move-nuggets-$*: 28 | pattern: /$YAMLURL/movenugget/(\d+)/(\d+) 29 | handler: FunctionHandler 30 | kwargs: 31 | function: nlg.webapp.move_nuggets 32 | headers: 33 | Cache-Control: no-store 34 | save-narrative-$*: 35 | pattern: /$YAMLURL/saveNarrative/(.*) 36 | handler: FunctionHandler 37 | kwargs: 38 | function: nlg.webapp.save_narrative 39 | nlg-default-login-$*: 40 | pattern: /$YAMLURL/login/ 41 | handler: SimpleAuth 42 | kwargs: 43 | credentials: 44 | alpha: alpha 45 | beta: beta 46 | gamma: gamma 47 | template: $YAMLPATH/login.html 48 | view-narrative-cache-$*: 49 | pattern: /$YAMLURL/narratives 50 | handler: FunctionHandler 51 | kwargs: 52 | function: nlg.webapp.get_narrative_cache 53 | xsrf_cookies: false 54 | headers: 55 | Content-Type: application/json 56 | Cache-Control: no-store 57 | narrative-download-$*: 58 | pattern: /$YAMLURL/download 59 | handler: FunctionHandler 60 | kwargs: 61 | function: nlg.webapp.download_narrative 62 | xsrf_cookies: false 63 | headers: 64 | Content-Type: application/json 65 | Content-Disposition: attachment; filename=narrative.json 66 | nlg-new-variable-tmpl-$*: 67 | pattern: /$YAMLURL/newvariable/(\d+)/(.*) 68 | handler: FunctionHandler 69 | kwargs: 70 | function: nlg.webapp.new_variable_tmpl 71 | xsrf_cookies: false 72 | headers: 73 | Content-Type: text/html 74 | nlg-new-variable-add-$*: 75 | pattern: /$YAMLURL/newvar/(\d+)/(.*) 76 | handler: FunctionHandler 77 | kwargs: 78 | function: nlg.webapp.add_new_variable 79 | xsrf_cookies: false 80 | nlg-variable-settings-$*: 81 | pattern: /$YAMLURL/variablesettings/(\d+)/(.*) 82 | handler: FunctionHandler 83 | kwargs: 84 | function: nlg.webapp.get_variable_settings_tmpl 85 | xsrf_cookies: false 86 | headers: 87 | Content-Type: text/html 88 | nlg-update-variable-$*: 89 | pattern: /$YAMLURL/updatevar/(\d+)/(.*) 90 | handler: FunctionHandler 91 | kwargs: 92 | function: nlg.webapp.set_variable_settings_tmpl 93 | xsrf_cookies: false 94 | nlg-get-nugget-$*: 95 | pattern: /$YAMLURL/nuggets/(\d+) 96 | handler: FunctionHandler 97 | kwargs: 98 | function: nlg.webapp.get_nugget 99 | xsrf_cookies: false 100 | methods: [GET, POST, DELETE] 101 | headers: 102 | Content-Type: application/json 103 | nlg-nugget-settings-$*: 104 | pattern: /$YAMLURL/nuggetsettings/(\d+) 105 | handler: FunctionHandler 106 | kwargs: 107 | function: nlg.webapp.get_nugget_settings_tmpl 108 | xsrf_cookies: false 109 | headers: 110 | Content-Type: text/html 111 | nlg-home-$*: 112 | pattern: /$YAMLURL/ 113 | handler: FileHandler 114 | kwargs: 115 | auth: 116 | login_url: $NLG_AUTH 117 | path: $YAMLPATH/index.html 118 | transform: 119 | "index.html": 120 | 
function: template 121 | nlg-condition-$*: 122 | pattern: /$YAMLURL/condition/(\d+) 123 | handler: FunctionHandler 124 | kwargs: 125 | function: nlg.webapp.add_condition 126 | xsrf_cookies: false 127 | nlg-data-selector-$*: 128 | pattern: /$YAMLURL/initform 129 | handler: FunctionHandler 130 | kwargs: 131 | headers: 132 | Cache-Control: no-store 133 | xsrf_cookies: false 134 | function: nlg.webapp.init_form 135 | redirect: 136 | query: next 137 | tablepreview-$*: 138 | pattern: /$YAMLURL/preview/(.*)/(.*) 139 | handler: FormHandler 140 | kwargs: 141 | url: $GRAMEXDATA/nlg/{_0}/{_1} 142 | headers: 143 | Cache-Control: no-store 144 | nlg-static_files-$*: 145 | pattern: /$YAMLURL/(.*) 146 | handler: FileHandler 147 | kwargs: 148 | path: $YAMLPATH 149 | nlg-config-handler-$*: 150 | pattern: /$YAMLURL/initconf 151 | handler: FunctionHandler 152 | kwargs: 153 | function: nlg.webapp.get_init_config 154 | headers: 155 | Content-Type: application/json 156 | Cache-Control: no-store 157 | textproc-$*: 158 | pattern: /$YAMLURL/textproc 159 | handler: FunctionHandler 160 | kwargs: 161 | function: nlg.webapp.process_text 162 | xsrf_cookies: false 163 | headers: 164 | Content-Type: application/json 165 | rendertmpl-$*: 166 | pattern: /$YAMLURL/render-template/(\d+)? 167 | handler: FunctionHandler 168 | kwargs: 169 | function: nlg.webapp.render_template 170 | xsrf_cookies: false 171 | headers: 172 | Content-Type: text/plain 173 | Cache-Control: no-store 174 | renderall-$*: 175 | pattern: /$YAMLURL/renderall 176 | handler: FunctionHandler 177 | kwargs: 178 | function: nlg.webapp.render_narrative 179 | headers: 180 | Content-Type: application/json 181 | Cache-Control: no-store 182 | render-live-template-$*: 183 | pattern: /$YAMLURL/render-live-template 184 | handler: FunctionHandler 185 | kwargs: 186 | function: nlg.webapp.render_live_template 187 | headers: 188 | Content-Type: text/plain 189 | -------------------------------------------------------------------------------- /nlg/app/html/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | nlg-demos 8 | 9 | 10 | 11 | 12 | 13 | {% set base = '.' %} 14 | {% include template-navbar.html %} 15 | {% from web_app import read_demo_config %} 16 | {% set dsid, nrid = read_demo_config(handler) %} 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /nlg/app/html/template-navbar.html: -------------------------------------------------------------------------------- 1 | 51 | -------------------------------------------------------------------------------- /nlg/app/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | GramexNLG Template Generator 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | {% from gramex import variables %} 19 | {% set nlg_base = variables['NLG_BASE'].rstrip('/') %} 20 |
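{# nlg_base is used by the client scripts to build endpoint URLs such as ${nlg_base}/initconf (see nlg.js); stripping the trailing slash avoids double slashes in those URLs. #}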
21 | {% include "./body.html" %} 22 |
23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /nlg/app/login.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | foo Login 8 | 9 | 10 | 11 | 12 | 13 | {% set base = '..' %} 14 | {% include template-navbar.html %} 15 | {% set kwargs = handler.kwargs %} 16 | {% try %}{% set user = kwargs.user.arg %}{% except %}{% set user = 'user' %}{% end %} 17 | {% try %}{% set password = kwargs.password.arg %}{% except %}{% set password = 'password' %}{% end %} 18 |
19 |
20 | {% if error %} 21 |
22 |

Error logging in

23 |

{{ error['error'] }}

24 |
code: {{ error['code'] }}
25 |
26 | {% end %} 27 |
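<!-- Login form (assumed to follow here): the user/password field names posted come from the SimpleAuth handler's kwargs, with 'user' and 'password' as fallbacks (set in the try/except blocks above). -->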
28 |
29 | 30 | 31 |
32 |
33 | 34 | 35 |
36 | 37 |

38 | {% if kwargs.get('forgot') %} 39 |

Forgot password

40 | {% end %} 41 |
Default logins: alpha, beta or gamma (password matches the username)
42 |
43 |
44 |
45 | 46 | 47 | {% if 'hash' in kwargs.get('password', {}) %} 48 | 49 | 57 | {% end %} 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /nlg/app/nlg.js: -------------------------------------------------------------------------------- 1 | /* globals currentTemplateIndex, grammarOptions, templates, args, df, currentEventHandlers, nlg_base, g1 */ 2 | /* exported addToNarrative, setInitialConfig, checkTemplate, saveTemplate, 3 | addCondition, addName, shareNarrative, copyToClipboard, 4 | findAppliedInflections, checkSelection */ 5 | /* eslint-disable no-global-assign */ 6 | var narrative_name, dataset_name 7 | var styleparams = {bold: true, italic: false, underline: false, style: 'para'} 8 | 9 | function openDemoPage() { 10 | if (!(narrative_name)) { 11 | $('#saveModal').modal({show: true}) 12 | } else { 13 | let fh_url = encodeURIComponent($('.formhandler').attr('data-src')) 14 | let url = `${nlg_base}/demoembed?fh_url=${fh_url}&nname=${narrative_name}` 15 | window.open(url) 16 | } 17 | } 18 | 19 | function activateStyleControl() { 20 | if (styleparams.bold) { 21 | $('#boldpreview').addClass('active') 22 | } 23 | if (styleparams.italic) { 24 | $('#italicpreview').addClass('active') 25 | } 26 | if (styleparams.underline) { 27 | $('#ulinepreview').addClass('active') 28 | } 29 | 30 | if (styleparams.style == 'para') { 31 | $('#parastyle').prop('checked', true) 32 | } else { 33 | $('#liststyle').prop('checked', true) 34 | } 35 | renderByStyle() 36 | } 37 | 38 | function toggleRenderStyle(e) { 39 | if (e.currentTarget.id == "parastyle") { 40 | if ($('#parastyle').prop('checked')) { 41 | styleparams.style = "para" 42 | } 43 | } else if (e.currentTarget.id == "liststyle") { 44 | if ($('#liststyle').prop('checked')) { 45 | styleparams.style = "list" 46 | } 47 | } else if (e.currentTarget.id == "boldpreview") { 48 | if ($('#boldpreview').hasClass('active')) { 49 | styleparams.bold = true 50 | } else { 51 | styleparams.bold = false 52 | } 53 | } else if (e.currentTarget.id == "italicpreview") { 54 | if ($('#italicpreview').hasClass('active')) { 55 | styleparams.italic = true 56 | } else { 57 | styleparams.italic = false 58 | } 59 | } else if (e.currentTarget.id == "ulinepreview") { 60 | if ($('#ulinepreview').hasClass('active')) { 61 | styleparams.underline = true 62 | } else { 63 | styleparams.underline = false 64 | } 65 | } 66 | renderByStyle() 67 | } 68 | 69 | function renderByStyle() { 70 | let url = g1.url.parse(`${nlg_base}/renderall`) 71 | url.update(styleparams) 72 | $.getJSON(url.toString()).done((e) => { 73 | $(`#previewspan`).html(e.render) 74 | styleparams = e.style 75 | }) 76 | } 77 | 78 | function makeControlDroppable(elem, ondrop) { 79 | elem.on('dragover', (e) => { 80 | e.preventDefault() 81 | e.stopPropagation() 82 | }) 83 | elem.on('dragleave', (e) => { 84 | e.preventDefault() 85 | e.stopPropagation() 86 | }) 87 | elem.on('drop', (e) => { 88 | e.preventDefault() 89 | e.stopPropagation() 90 | ondrop(e) 91 | }) 92 | } 93 | 94 | function prepDrag(row) { 95 | row.on('dragstart', (e) => { 96 | e.dataTransfer = e.originalEvent.dataTransfer 97 | e.dataTransfer.setData('text', e.target.id)}) 98 | } 99 | 100 | 101 | function findControlRow(elem) { 102 | if (!(elem.id)) { 103 | return false 104 | } else { 105 | return elem.id.match(/^controlrow-\d+$/) 106 | } 107 | } 108 | 109 | function findDropPosition(y) { 110 | let rows = _.filter(Object.values($('tr[id^=controlrow]')), findControlRow) 111 | let bottoms = _.flatMap(rows, (r) => {return 
r.getBoundingClientRect().bottom}) 112 | for (let i=0; i { 201 | $('#variable-settings').html(e) 202 | } 203 | ) 204 | } else { 205 | let textSel = hasTextSelection() 206 | if (textSel) { 207 | $.get(`${nlg_base}/newvariable/${currentTemplateIndex}/${textSel.join(',')}`).done( 208 | (e) => { 209 | $('#variable-settings').html(e) 210 | } 211 | ) 212 | } 213 | } 214 | } 215 | 216 | function addToNarrative() { 217 | // Pick text from the input textarea, templatize, and add to the narrative. 218 | $.post( 219 | `${nlg_base}/textproc`, 220 | JSON.stringify({ 221 | 'args': args, 'data': df, 222 | 'text': $('#textbox').val() 223 | }), (pl) => { 224 | pl = new Template(pl) 225 | templates.push(pl) 226 | renderPreview(null) 227 | } 228 | ) 229 | } 230 | 231 | function renderPreview(fh) { 232 | // Render the preview of all current templates on the front page. 233 | if (fh) { 234 | df = fh.formdata 235 | args = g1.url.parse(g1.url.parse(window.location.href).hash).searchList 236 | refreshTemplates() 237 | return true 238 | } 239 | $('#template-preview').template({n_templates: templates.length}) 240 | makeControlDroppable($('#controltable'), handleDrop) 241 | for (let i = 0; i < templates.length; i++) { 242 | // add the remove listener 243 | var deleteListener = function () { deleteTemplate(i) } 244 | $(`#rm-btn-${i}`).on('click', deleteListener) 245 | 246 | // add setting listener 247 | var settingsListener = function () { triggerTemplateSettings(i) } 248 | $(`#settings-btn-${i}`).on('click', settingsListener) 249 | 250 | // Add the preview 251 | $.get(`${nlg_base}/render-template/${i}`).done( 252 | (e) => {$(`#preview-${i}`).html(e)} 253 | ) 254 | // prep the row for dragging 255 | prepDrag($(`#controlrow-${i}`)) 256 | } 257 | renderByStyle() 258 | } 259 | 260 | 261 | function refreshTemplate(n) { 262 | // Refresh the nth template from the backend 263 | $.getJSON(`${nlg_base}/nuggets/${n}`).done((e) => { 264 | templates[n] = new Template(e) 265 | $('#tmpl-setting-preview').html(templates[n].previewHTML()) 266 | renderPreview(null) 267 | }) 268 | } 269 | 270 | function refreshTemplates() { 271 | // Refresh the output of all templates in the current narrative. 272 | templates = [] 273 | $.getJSON(`${nlg_base}/narratives`).done((e) => { 274 | if (e.narrative.length > 0) { 275 | for (let i=0; i { 301 | $('#tmpllist').html(e) 302 | } 303 | ) 304 | } 305 | 306 | 307 | function setInitialConfig() { 308 | // At page ready, load the latest config for the authenticated user 309 | // and show it. 310 | $.getJSON(`${nlg_base}/initconf`).done((e) => { 311 | narrative_name = e.nrid 312 | refreshTemplates() 313 | Object.assign(styleparams, e.style) 314 | activateStyleControl() 315 | }) 316 | } 317 | 318 | function addName() { 319 | // Add an optional name to a template. 
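// (If the editor field is empty, the existing name is left unchanged.)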
320 | let name = $('#tmpl-name-editor').val() 321 | if (name) { 322 | templates[currentTemplateIndex].name = name 323 | } 324 | } 325 | 326 | function embedNarrative(url, nname, selector) { 327 | renderLiveNarrative(url, nname, selector) 328 | $(window).on('#?', function (e, fh_args) { 329 | let xurl = g1.url.parse(url).join(fh_args).toString() 330 | renderLiveNarrative(xurl, nname, selector) 331 | }).urlchange() 332 | } 333 | 334 | 335 | function renderLiveNarrative(url, nname, selector) { 336 | $.getJSON(url).done((e) => { 337 | $.post( 338 | `${nlg_base}/render-live-template`, 339 | JSON.stringify({ 340 | data: e, 341 | nrid: nname 342 | }), 343 | (f) => {$(selector).html(f)} 344 | ) 345 | }) 346 | } 347 | 348 | 349 | function getNarrativeEmbedCode() { 350 | // Generate embed code for this narrative. 351 | let html = ` 352 |
353 | 354 | ` 359 | return html 360 | } 361 | 362 | function generateEmbedCode() { 363 | let url = $('#embedTargetURL').val() 364 | $('#embedCodeText').text(url) 365 | } 366 | 367 | function saveNarrative(name) { 368 | narrative_name = name 369 | $.get(`${nlg_base}/saveNarrative/${name}`) 370 | } 371 | -------------------------------------------------------------------------------- /nlg/app/setup.sh: -------------------------------------------------------------------------------- 1 | pip install nlg 2 | python -m spacy download en_core_web_sm 3 | -------------------------------------------------------------------------------- /nlg/app/style.css: -------------------------------------------------------------------------------- 1 | /* UI component styles. Customize via ?bootstrap-variable=encoded-value. Example: 2 | Colors. Can be a name or a number (e.g. %23aabbcc). Preserve the hues below. 3 | primary=blue 4 | success=green 5 | info=cyan 6 | warning=orange 7 | danger=red 8 | secondary=grey 9 | light=lightgrey 10 | dark=darkgrey 11 | body-bg=white 12 | body-color=black 13 | Fonts. Can be a system font or Open+Sans, Roboto, Lato, Anton, Monserrat 14 | font-family-base=Segoe+UI 15 | headings-font-family=Segoe+UI 16 | Other 17 | https://github.com/twbs/bootstrap/blob/v4-dev/scss/_variables.scss 18 | */ 19 | @import url("ui/bootstraptheme.css?body-bg=white&navbar-dark-color=rgba(255%2C255%2C255%2C.8)&navbar-dark-hover-color=white"); 20 | /* For v4 icons, use url("ui/font-awesome/css/font-awesome.min.css") */ 21 | @import url("ui/@fortawesome/fontawesome-free/css/all.min.css"); 22 | 23 | 24 | /* custom styles for app: nlg-demos */ 25 | -------------------------------------------------------------------------------- /nlg/app/template-navbar.html: -------------------------------------------------------------------------------- 1 | 51 | -------------------------------------------------------------------------------- /nlg/app/templates/demo.tmpl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | GramexNLG Demo 7 | 8 | 9 | 10 | 11 | 12 |
13 |
14 |
15 |
16 |
17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /nlg/app/templates/new-variable.tmpl: -------------------------------------------------------------------------------- 1 |

Add new variable

2 | 3 |
5 | 6 |
7 | 8 |
9 | 10 |
11 |
12 | 13 |
14 | 15 |
16 | 17 |
18 |
19 | 20 | 21 |
22 | 23 | 36 | -------------------------------------------------------------------------------- /nlg/app/templates/template-settings.tmpl: -------------------------------------------------------------------------------- 1 | {% from tornado.escape import xhtml_unescape %} 2 |
3 |
4 |
5 |

{{ xhtml_unescape(template['previewHTML']) }}

6 |
7 |
8 |
9 |
10 | 11 | 12 |
13 | 14 |
15 | 16 | 17 |
18 |
19 |
20 |
21 | 23 |
24 |

25 | 27 | 29 |
30 | 59 | -------------------------------------------------------------------------------- /nlg/app/templates/variable-settings.tmpl: -------------------------------------------------------------------------------- 1 |
3 | 4 |
5 | 6 |
7 | 8 |
9 |
10 | 11 |
12 | 13 |
14 | 23 |
24 |
25 | 26 |
27 | 28 |
29 | 30 |
31 |
32 | 33 |
34 | 35 |
36 | 46 |
47 |
48 | 49 | 50 |
51 | 52 | 65 | -------------------------------------------------------------------------------- /nlg/grammar.py: -------------------------------------------------------------------------------- 1 | from inflect import engine 2 | from tornado.template import Template 3 | from math import floor # noqa: F401 4 | 5 | from nlg.utils import load_spacy_model, set_nlg_gramopt, get_lemmatizer 6 | 7 | infl = engine() 8 | nlp = load_spacy_model() 9 | 10 | 11 | def is_plural_noun(text): 12 | """Whether given text is a plural noun.""" 13 | doc = load_spacy_model()(text) 14 | for t in list(doc)[::-1]: 15 | if not t.is_punct: 16 | return t.tag_ in ('NNS', 'NNPS') 17 | return False 18 | 19 | 20 | is_singular_noun = lambda x: not is_plural_noun(x) # NOQA: E731 21 | 22 | 23 | @set_nlg_gramopt(source='G', fe_name='Concate Items') 24 | def concatenate_items(items, sep=', '): 25 | """Concatenate a sequence of tokens into an English string. 26 | 27 | Parameters 28 | ---------- 29 | 30 | items : list-like 31 | List / sequence of items to be printed. 32 | sep : str, optional 33 | Separator to use when generating the string 34 | 35 | Returns 36 | ------- 37 | str 38 | """ 39 | if len(items) == 0: 40 | return "" 41 | if len(items) == 1: 42 | return items[0] 43 | items = list(map(str, items)) 44 | if sep == ', ': 45 | s = sep.join(items[:-1]) 46 | s += ' and ' + items[-1] 47 | else: 48 | s = sep.join(items) 49 | return s 50 | 51 | 52 | @set_nlg_gramopt(source='G', fe_name='Pluralize') 53 | def plural(word): 54 | """Pluralize a word. 55 | 56 | Parameters 57 | ---------- 58 | 59 | word : str 60 | word to pluralize 61 | 62 | Returns 63 | ------- 64 | str 65 | Plural of `word` 66 | """ 67 | if not is_plural_noun(word): 68 | word = infl.plural(word) 69 | return word 70 | 71 | 72 | @set_nlg_gramopt(source='G', fe_name='Singularize') 73 | def singular(word): 74 | """ 75 | Singularize a word. 76 | 77 | Parameters 78 | ---------- 79 | word : str 80 | Word to singularize. 81 | 82 | Returns 83 | ------- 84 | str 85 | Singular of `word`. 86 | """ 87 | if is_plural_noun(word): 88 | word = infl.singular_noun(word) 89 | return word 90 | 91 | 92 | # @set_nlg_gramopt(source='G', fe_name='Pluralize by') 93 | def pluralize_by(word, by): 94 | """ 95 | Pluralize a word depending on another argument. 96 | 97 | Parameters 98 | ---------- 99 | word : str 100 | Word to pluralize 101 | by : any 102 | Any object checked for a pluralish value. If a sequence, it must have 103 | length greater than 1 to qualify as plural. 104 | 105 | Returns 106 | ------- 107 | str 108 | Plural or singular of `word`. 109 | """ 110 | if hasattr(by, '__iter__'): 111 | if len(by) > 1: 112 | word = plural(word) 113 | else: 114 | word = singular(word) 115 | else: 116 | if by > 1: 117 | word = plural(word) 118 | else: 119 | word = singular(word) 120 | return word 121 | 122 | 123 | # @set_nlg_gramopt(source='G', fe_name='Pluralize like') 124 | def pluralize_like(x, y): 125 | """ 126 | Pluralize a word if another is a plural. 127 | 128 | Parameters 129 | ---------- 130 | x : str 131 | The word to pluralize. 132 | y : str 133 | The word to check. 134 | 135 | Returns 136 | ------- 137 | str 138 | Plural of `x` if `y` is plural, else singular. 
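Examples
--------
A minimal sketch (illustrative only; the output depends on the
`inflect` engine and the spacy tagger):

>>> pluralize_like('actor', 'ratings')
'actors'
>>> pluralize_like('actors', 'rating')
'actor'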
139 | """ 140 | if not is_plural_noun(y): 141 | return singular(x) 142 | return plural(x) 143 | 144 | 145 | @set_nlg_gramopt(source='str', fe_name='Capitalize') 146 | def capitalize(word): 147 | return word.capitalize() 148 | 149 | 150 | @set_nlg_gramopt(source='str', fe_name='Lowercase') 151 | def lower(word): 152 | return word.lower() 153 | 154 | 155 | @set_nlg_gramopt(source='str', fe_name='Swapcase') 156 | def swapcase(word): 157 | return word.swapcase() 158 | 159 | 160 | @set_nlg_gramopt(source='str', fe_name='Title') 161 | def title(word): 162 | return word.title() 163 | 164 | 165 | @set_nlg_gramopt(source='str', fe_name='Uppercase') 166 | def upper(word): 167 | return word.upper() 168 | 169 | 170 | # @set_nlg_gramopt(source='G', fe_name='Lemmatize') 171 | def lemmatize(word, target_pos): 172 | return get_lemmatizer()(word, target_pos) 173 | 174 | 175 | def _token_inflections(x, y): 176 | """ 177 | If two words share the same root, find lexical changes required for turning 178 | one into another. 179 | 180 | Parameters 181 | ---------- 182 | x : spacy.token.Tokens 183 | y : spacy.token.Tokens 184 | 185 | Examples 186 | -------- 187 | >>> _token_inflections('language', 'Language') 188 | ['upper'] 189 | >>> _token_inflections('language', 'languages') 190 | ['plural'] 191 | """ 192 | if x.lemma_ != y.lemma_: 193 | return [] 194 | 195 | inflections = [] 196 | 197 | # check if x and y are singulars or plurals of each other. 198 | number_infl = _number_inflection(x, y) 199 | if number_infl: 200 | inflections.append(number_infl) 201 | 202 | shp_infl = _shape_inflection(x, y, prev=number_infl) 203 | if shp_infl: 204 | inflections.append(shp_infl) 205 | 206 | # Disable detecting inflections until they can be 207 | # processed without intervention. 208 | # if x.pos_ != y.pos_: 209 | # return lemmatize 210 | return inflections 211 | 212 | 213 | def _shape_inflection(x, y, prev=False): 214 | if not prev: 215 | prev = lambda x: x # noqa: E731 216 | if len(prev(x.text)) == len(y.text): 217 | for methname in ['capitalize', 'lower', 'swapcase', 'title', 'upper']: 218 | func = lambda x: getattr(x, methname)() # NOQA: E731 219 | if func(prev(x.text)) == y.text: 220 | return globals()[methname] 221 | return False 222 | 223 | 224 | def _number_inflection(x, y): 225 | if is_singular_noun(y.text): 226 | if singular(x.text).lower() == y.text.lower(): 227 | return singular 228 | elif is_plural_noun(y.text): 229 | if plural(x.text).lower() == y.text.lower(): 230 | return plural 231 | return False 232 | 233 | 234 | def find_inflections(search, fh_args, df): 235 | """ 236 | Find lexical inflections between words in input text and the search results 237 | obtained from FormHandler arguments and dataframes. 238 | 239 | Parameters 240 | ---------- 241 | search : nlg.search.DFSearchResults 242 | The DFSearchResults object corresponding to `text` and `df` 243 | fh_args : dict 244 | FormHandler arguments. 245 | df : pandas.DataFrame 246 | The source dataframe. 247 | 248 | Returns 249 | ------- 250 | dict 251 | With keys as tokens found in the dataframe or FH args, and values as 252 | list of inflections applied on them to make them closer match tokens in `text`. 
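Examples
--------
A shape sketch with hypothetical search results, in which the token
"actor" is matched by a template that renders as "actors" (so `singular`
must be applied to the rendered value to recover the token's form):

>>> find_inflections(search, fh_args, df)  # doctest: +SKIP
{actor: [singular]}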
253 | """ 254 | inflections = {} 255 | for token, tklist in search.items(): 256 | tmpl = [t['tmpl'] for t in tklist if t.get('enabled', False)][0] 257 | rendered = Template('{{{{ {} }}}}'.format(tmpl)).generate( 258 | df=df, fh_args=fh_args).decode('utf8') 259 | if rendered != token.text: 260 | x = nlp(rendered)[0] 261 | infl = _token_inflections(x, token) 262 | if infl: 263 | inflections[token] = infl 264 | return inflections 265 | 266 | 267 | def get_gramopts(): 268 | """Find all Grammar and token inflection options from the NLG library. 269 | Primarily used for creating the select box in the template settings dialog.""" 270 | funcs = {} 271 | module = globals().copy() 272 | for attrname in module: 273 | obj = module[attrname] 274 | if obj and getattr(obj, 'gramopt', False): 275 | funcs[obj.fe_name] = { 276 | 'fe_name': obj.fe_name, 'source': obj.source, 'func_name': attrname 277 | } 278 | return funcs 279 | 280 | 281 | if __name__ == "__main__": 282 | print(get_gramopts()) # noqa 283 | -------------------------------------------------------------------------------- /nlg/narrative.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | 5 | """The Narrative class.""" 6 | import json 7 | import re 8 | import warnings 9 | 10 | from spacy.tokens import Token, Span, Doc 11 | from tornado.template import Template 12 | 13 | from nlg import utils, grammar 14 | 15 | t_templatize = lambda x: '{{ ' + x + ' }}' # noqa: E731 16 | nlp = utils.load_spacy_model() 17 | 18 | 19 | def _templatizer_factory(bold, italic, underline): 20 | def templatizer(x): 21 | x = t_templatize(x) 22 | if bold: 23 | x = f"{x}" 24 | if italic: 25 | x = f"{x}" 26 | if underline: 27 | x = f"{x}" 28 | return x 29 | return templatizer 30 | 31 | 32 | def _check_unique_token(t, doc): 33 | if len([c for c in doc if c.text == t]) > 1: 34 | msg = f'There is more than one token in the document that matches the text "{t}".' \ 35 | + " Using the first match." \ 36 | + " Please use a `spacy.token.Token` instance for searching." 37 | warnings.warn(msg) 38 | 39 | 40 | class Variable(object): 41 | """ 42 | NLG Variable 43 | 44 | A variable is a piece of text which can change with the data or the operations performed on it. 45 | Each variable has two defining components: 46 | 47 | * a source text, as initially provided by the user, and 48 | * one or more *formulae*, which compute the value of the variable for a 49 | specific instance of the data. 50 | 51 | The source text of a variable may be found in multiple places within a dataset, and as such, 52 | a variable may have multiple formulae - one of which will have to be preferred by the user. 53 | A variable may additionally have other attributes, like: 54 | 55 | * a set of linguistic inflections which determine the form of the rendered variable text - 56 | these are distinct from the formula itself, in that the formula creates the base form 57 | of the text and inflections modify the base form. 
58 | * a *name* used to identify the variable within the template of the nugget 59 | """ 60 | 61 | def __init__(self, token, sources=None, varname='', inflections=None): 62 | self._token = token 63 | if sources is None: 64 | sources = [] 65 | self.sources = sources 66 | self.varname = varname 67 | if inflections is None: 68 | inflections = [] 69 | self.inflections = inflections 70 | self.templatizer = t_templatize 71 | 72 | def to_dict(self): 73 | """Serialize the variable to dict.""" 74 | payload = {'text': self._token.text} 75 | token = self._token 76 | if isinstance(token, Token): 77 | payload['index'] = token.i 78 | payload['idx'] = token.idx 79 | elif isinstance(token, Span): 80 | payload['index'] = token.start, token.end 81 | payload['idx'] = token[0].idx 82 | elif isinstance(token, Doc): 83 | payload['index'] = 0 84 | payload['idx'] = 0 85 | payload['sources'] = self.sources 86 | payload['varname'] = self.varname 87 | payload['inflections'] = self.inflections 88 | return payload 89 | 90 | def set_expr(self, expr): 91 | """Change the formula or expression for the variable. 92 | 93 | Parameters 94 | ---------- 95 | expr : str 96 | Python expression used to determine the value of the variable. 97 | """ 98 | tmpl = self.enabled_source 99 | tmpl['tmpl'] = expr 100 | 101 | @property 102 | def enabled_source(self): 103 | for tmpl in self.sources: 104 | if tmpl.get('enabled', False): 105 | return tmpl 106 | 107 | def enable_source(self, tmpl): 108 | if isinstance(tmpl, int): 109 | for source in self.sources: 110 | source['enabled'] = False 111 | self.sources[tmpl]['enabled'] = True 112 | elif tmpl in [c['tmpl'] for c in self.sources]: 113 | for source in self.sources: 114 | if source['tmpl'] == tmpl: 115 | source['enabled'] = True 116 | else: 117 | source['enabled'] = False 118 | else: 119 | raise ValueError('Variable source not found.') 120 | 121 | @property 122 | def template(self): 123 | tmpl = self.enabled_source 124 | tmplstr = tmpl['tmpl'] 125 | 126 | for i in self.inflections: 127 | tmplstr = self._add_inflection(tmplstr, i) 128 | 129 | varname = tmpl.get('varname', '') 130 | if varname: 131 | return tmplstr 132 | 133 | return self.templatizer(tmplstr) 134 | 135 | def _add_inflection(self, tmplstr, infl): 136 | func = infl['func_name'] 137 | source = infl['source'] 138 | if source == 'str': 139 | tmplstr += f'.{func}()' 140 | else: 141 | tmplstr = f'{source}.{func}({tmplstr})' 142 | return tmplstr 143 | 144 | def __repr__(self): 145 | return self.template 146 | 147 | 148 | class Nugget(object): 149 | """ 150 | Gramex-NLG Nugget 151 | 152 | A nugget is ideally a single sentence which conveys an insight about the data. 153 | It is created by searching the source dataframe and operations performed on it 154 | for entities found in the input text. 155 | 156 | Note: This class is not meant to be instantiated directly. Please use `nlg.templatize`. 
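Example
-------
A sketch of the usual entry point (mirrors the doctests elsewhere in
this module, using the bundled actors.csv):

>>> from nlg import templatize
>>> df = pd.read_csv('actors.csv')
>>> text = nlp('Spencer Tracy has 192 votes.')
>>> nugget = templatize(text, {}, df)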
157 | """ 158 | 159 | def __init__(self, text, tokenmap=None, inflections=None, fh_args=None, 160 | condition=None, template="", name=""): 161 | self.doc = text 162 | self.tokenmap = {} 163 | if inflections is None: 164 | inflections = {} 165 | if tokenmap is not None: 166 | for tk, tkobj in tokenmap.items(): 167 | if isinstance(tkobj, Variable): 168 | token = tkobj 169 | elif isinstance(tkobj, list): 170 | token = Variable(tk, tkobj, inflections=inflections.get(tk)) 171 | self.tokenmap[tk] = token 172 | if fh_args is not None: 173 | self.fh_args = fh_args 174 | else: 175 | self.fh_args = {} 176 | self._template = template 177 | self.condition = condition 178 | self.name = name 179 | self.templatizer = t_templatize 180 | 181 | def to_dict(self): 182 | """Serialze the nugget to dict.""" 183 | payload = {} 184 | payload['text'] = self.doc.text 185 | tokenmap = [] 186 | for _, variable in self.tokenmap.items(): 187 | tokenmap.append(variable.to_dict()) 188 | payload['tokenmap'] = tokenmap 189 | payload['fh_args'] = self.fh_args 190 | payload['condition'] = self.condition 191 | payload['name'] = self.name 192 | payload['template'] = self.template 193 | return payload 194 | 195 | @classmethod 196 | def from_json(cls, obj): 197 | if isinstance(obj, str): 198 | obj = json.loads(obj) 199 | 200 | text = obj.pop('text') 201 | obj['text'] = nlp(text) 202 | 203 | tokenlist = obj.pop('tokenmap') 204 | tokenmap = {} 205 | for tk in tokenlist: 206 | index = tk.pop('index') 207 | if isinstance(index, int): 208 | token = obj['text'][index] 209 | elif isinstance(index, (list, tuple)): 210 | start, end = index 211 | token = obj['text'][start:end] 212 | tk.pop('idx') 213 | tk.pop('text') 214 | tokenmap[token] = Variable(token, **tk) 215 | obj['tokenmap'] = tokenmap 216 | 217 | return cls(**obj) 218 | 219 | @property 220 | def variables(self): 221 | return self.tokenmap 222 | 223 | def get_var(self, t): 224 | """Get a variable from the nugget. 225 | 226 | Parameters 227 | ---------- 228 | t : any 229 | The string, or token corresponding to the variable. 230 | Using strings is discouraged, since the nugget may have 231 | more than one variable which renders to the same string form. 232 | Using spacy tokens is unambiguous. 
233 | 234 | Returns 235 | ------- 236 | nlg.narrative.Variable 237 | 238 | Example 239 | ------- 240 | >>> from nlg import templatize 241 | >>> df = pd.read_csv('actors.csv') 242 | >>> text = nlp("Charlie Chaplin has 76 votes.") 243 | >>> nugget = templatize(text, {}, df) 244 | >>> nugget.get_var('Charlie Chaplin') 245 | {{ df["name"].iloc[-1] }} 246 | """ 247 | if len(self.tokenmap) == 1: 248 | token, var = tuple(self.tokenmap.items())[0] 249 | if isinstance(token, Doc): 250 | variable = var 251 | elif isinstance(t, Token): 252 | variable = self.tokenmap.get(t, False) 253 | elif isinstance(t, str): 254 | _check_unique_token(t, self.doc) 255 | variable = False 256 | for token in self.doc: 257 | if token.text == t: 258 | variable = self.tokenmap.get(token, False) 259 | else: 260 | if isinstance(t, int): 261 | token = self.doc[t] 262 | elif isinstance(t, (list, tuple)): 263 | start, end = t 264 | token = self.doc[start:end] 265 | variable = self.tokenmap.get(token, False) 266 | if variable: 267 | return variable 268 | raise KeyError('Variable not found.') 269 | 270 | @property 271 | def template(self): 272 | sent = self.doc.text 273 | for tk, tkobj in self.tokenmap.items(): 274 | tmpl = tkobj.template 275 | sent = sent.replace(tk.text, tmpl) 276 | if tkobj.varname: 277 | pattern = re.escape(tmpl) 278 | sent = re.sub(pattern, self.templatizer(tkobj.varname), sent) 279 | sent = f'{{% set {tkobj.varname} = {tmpl} %}}\n' + sent 280 | if self.condition: 281 | sent = f'{{% if {self.condition} %}}\n' + sent + '\n{% end %}' 282 | return self.add_fh_args(sent) 283 | 284 | def _set_templatizer(self, func): 285 | self.templatizer = func 286 | for _, variable in self.tokenmap.items(): 287 | variable.templatizer = self.templatizer 288 | 289 | def _reset_templatizer(self): 290 | self._set_templatizer(t_templatize) 291 | 292 | def to_html(self, bold=True, italic=False, underline=False, **kwargs): 293 | self._set_templatizer(_templatizer_factory(bold, italic, underline)) 294 | try: 295 | s = self.render(**kwargs) 296 | return s 297 | finally: 298 | self._reset_templatizer() 299 | 300 | def __repr__(self): 301 | return self.template 302 | 303 | def render(self, df, fh_args=None, **kwargs): 304 | """Render the template for the given set of arguments. 305 | 306 | Parameters 307 | ---------- 308 | df : pandas.DataFrame 309 | The dataframe to use in the new rendering. 310 | 311 | fh_args : dict 312 | FormHandler arguments to use to transform the dataframe. 313 | 314 | **kwargs : dict 315 | Arguments passed to the `tornado.template.Template.generate` method. 316 | 317 | Returns 318 | ------- 319 | str 320 | Rendered string. 
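Note that tornado's `Template.generate` returns `bytes` rather than `str`
(hence the `b'...'` output in the example below); callers such as
`Narrative.render` decode it with UTF-8.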
321 | 322 | Example 323 | ------- 324 | >>> from nlg import templatize 325 | >>> df = pd.read_csv('actors.csv') 326 | >>> text = nlp("Humphrey Bogart is at the top of the list.") 327 | >>> nugget = templatize(text, {}, df) 328 | >>> nugget.render(df.iloc[1:]) 329 | b'Cary Grant is at the top of the list' 330 | """ 331 | if fh_args is not None: 332 | self.fh_args = fh_args 333 | else: 334 | fh_args = {} 335 | kwargs['fh_args'] = fh_args 336 | return Template( 337 | self.template, whitespace='oneline').generate( 338 | df=df, orgdf=df, U=utils, G=grammar, **kwargs) 339 | 340 | def add_fh_args(self, sent): 341 | if self.fh_args: 342 | fh_args = json.dumps(self.fh_args) 343 | tmpl = f'{{% set fh_args = {fh_args} %}}\n' 344 | tmpl += f'{{% set df = U.gfilter(orgdf, fh_args.copy()) %}}\n' 345 | tmpl += f'{{% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}}\n' 346 | tmpl += '{# Do not edit above this line. #}\n' 347 | return tmpl + sent 348 | return sent 349 | 350 | def add_var(self, token, varname='', expr=''): 351 | """Set a token within the source document as a variable. 352 | 353 | Parameters 354 | ---------- 355 | token : int or spacy.tokens.Token or spacy.tokens.Span 356 | If `token` is an integer, it is interpreted as the position of the token 357 | in the source document. 358 | 359 | varname : str, optional 360 | Optional variable name used to refer to the variable within the Tornado template. 361 | 362 | expr : str, optional 363 | Python expression used to determine the value of the variable. 364 | Note that if `expr` is not provided, it has to be passed at the time of rendering the 365 | template. (See the `nlg.narrative.Nugget.render` method) 366 | 367 | Example 368 | ------- 369 | >>> from nlg import templatize 370 | >>> df = pd.read_csv('actors.csv') 371 | >>> fh_args = {'_sort': ['-rating']} 372 | >>> text = nlp("James Stewart is the actor with the highest rating.") 373 | >>> nugget = templatize(text, fh_args, df) 374 | >>> nugget.add_var(-2, 'sort_col', 'fh_args["_sort"][0]') 375 | """ 376 | if not (varname or expr): 377 | raise ValueError('One of `varname` or `expr` must be provided.') 378 | if isinstance(token, int): 379 | token = self.doc[token] 380 | elif isinstance(token, (list, tuple)): 381 | token = self.doc.char_span(*token) 382 | try: 383 | if any([token in c for c in self.tokenmap if isinstance(c, (Span, Doc))]): 384 | raise ValueError('Token is already contained in another variable.') 385 | except TypeError: 386 | pass 387 | source = [{'tmpl': expr, 'type': 'user', 'enabled': True}] 388 | self.tokenmap[token] = Variable(token, sources=source, varname=varname) 389 | 390 | 391 | class Narrative(list): 392 | """A list to hold only Nuggets.""" 393 | 394 | default_style = dict(style='para', liststyle='html', bold=True, italic=False, underline=False) 395 | 396 | def render(self, sep=' ', **kwargs): 397 | return sep.join([c.render(**kwargs).decode('utf8') for c in self]) 398 | 399 | def to_html(self, style='para', liststyle='html', bold=True, italic=False, underline=False, 400 | **kwargs): 401 | self.html_style = { 402 | 'bold': bold, 'italic': italic, 'underline': underline, 403 | 'style': style, 'liststyle': liststyle 404 | } 405 | rendered = [c.to_html(bold, italic, underline, **kwargs).decode('utf8') for c in self] 406 | if style == 'para': 407 | s = ' '.join(rendered) 408 | elif style == 'list': 409 | if liststyle == "html": 410 | l_render = "".join(["
<li>{}</li>".format(r) for r in rendered]) 411 | s = f"<ul>{l_render}</ul>
    " 412 | elif liststyle == 'markdown': 413 | s = "\n".join(["* " + r for r in rendered]) 414 | else: 415 | raise ValueError('Unknown liststyle.') 416 | return s 417 | 418 | def move(self, x, y): 419 | raise NotImplementedError 420 | 421 | def to_dict(self): 422 | return {'narrative': [c.to_dict() for c in self], 423 | 'style': getattr(self, 'html_style', self.default_style)} 424 | 425 | @classmethod 426 | def from_json(cls, obj): 427 | narrative = cls() 428 | for nugget in obj['narrative']: 429 | narrative.append(Nugget.from_json(nugget)) 430 | narrative.html_style = obj['style'] 431 | return narrative 432 | -------------------------------------------------------------------------------- /nlg/search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # vim:fenc=utf-8 3 | 4 | """ 5 | Search tools. 6 | """ 7 | 8 | from itertools import chain 9 | import warnings 10 | 11 | import numpy as np 12 | import pandas as pd 13 | from tornado.template import Template 14 | 15 | from nlg import grammar 16 | from nlg import narrative 17 | from nlg import utils 18 | 19 | SEARCH_PRIORITIES = [ 20 | # {'type': 'doc'}, 21 | {'type': 'ne'}, # A match which is a named entity gets the highest priority 22 | {'location': 'fh_args'}, # than one that is a formhandler arg 23 | {'location': 'colname'}, # than one that is a column name 24 | {'type': 'quant'}, # etc 25 | {'location': 'cell'} 26 | ] 27 | 28 | 29 | def _sort_search_results(items, priorities=SEARCH_PRIORITIES): 30 | """ 31 | Sort a list of search results by `priorities`. 32 | 33 | Parameters 34 | ---------- 35 | items : dict 36 | Dictionary containing search results, where keys are tokens and values 37 | are lists of locations where the token was found. Preferably this should 38 | be a `DFSearchResults` object. 39 | priorities : list, optional 40 | List of rules that allow sorting of search results. A `rule` is any 41 | subset of a search result dictionary. Lower indices indicate higher priorities. 42 | 43 | Returns 44 | ------- 45 | dict 46 | Prioritized search results - for each {token: search_matches} pair, sort 47 | search_matches such that a higher priority search result is enabled. 48 | """ 49 | if len(items) > 1: 50 | match_ix = [[p.items() <= item.items() for p in priorities] for item in items] 51 | min_match = [m.index(True) for m in match_ix] 52 | items[min_match.index(min(min_match))]['enabled'] = True 53 | else: 54 | items[0]['enabled'] = True 55 | return items 56 | 57 | 58 | def _preprocess_array_search(text, array, literal=False, case=False, lemmatize=True, 59 | nround=False): 60 | nlp = utils.load_spacy_model() 61 | if case or nround: 62 | raise NotImplementedError 63 | 64 | if literal and lemmatize: 65 | warnings.warn('Ignoring lemmatization.') 66 | 67 | if not (literal or lemmatize): 68 | warnings.warn( 69 | 'One of `literal` or `lemmatize` must be True. 
Falling back to lemmatize=True') 70 | literal, lemmatize = False, True 71 | 72 | if literal: # ignore every other flag else 73 | tokens = pd.Series([c.text for c in text], index=text) 74 | 75 | elif lemmatize: 76 | tokens = pd.Series([c.lemma_ for c in text], index=text) 77 | if array.ndim == 1: 78 | array = array.map(nlp) 79 | array = pd.Series([token.lemma_ for doc in array for token in doc]) 80 | elif array.ndim == 2: 81 | for col in array.columns[array.dtypes == np.dtype('O')]: 82 | s = [c if isinstance(c, str) else str(c) for c in array[col]] 83 | s = [nlp(c) for c in s] 84 | try: 85 | array[col] = [token.lemma_ for doc in s for token in doc] 86 | except ValueError: 87 | warnings.warn('Cannot lemmatize multi-word cells.') 88 | if not case: # still need to respect the `case` param 89 | array[col] = array[col].str.lower() 90 | 91 | return tokens, array 92 | 93 | 94 | def _remerge_span_tuples(results): 95 | # re-merge span objects that end up as tuples; see issue #25 96 | unmerged_spans = [k for k in results if isinstance(k, tuple)] 97 | for span in unmerged_spans: 98 | start, end = span[0].idx, span[-1].idx + len(span[-1]) 99 | new_span = span[0].doc.char_span(start, end) 100 | results[new_span] = results.pop(span) 101 | return results 102 | 103 | 104 | def _text_search_array(text, array, case=False): 105 | array = array.astype(str) 106 | if not case: 107 | stext = text.lower() 108 | if array.ndim == 1: 109 | array = array.map(lambda x: x.lower()) 110 | elif array.ndim == 2: 111 | for col in array: 112 | array[col] = array[col].str.lower() 113 | else: 114 | stext = text 115 | mask = array == stext 116 | if not mask.any(axis=None): 117 | return [] 118 | indices = mask.values.nonzero() 119 | if array.ndim == 1: 120 | return indices[0] 121 | if array.ndim == 2: 122 | return indices 123 | 124 | 125 | def _search_1d_array(text, array, literal=False, case=False, lemmatize=True, 126 | nround=False): 127 | tokens, array = _preprocess_array_search(text, array, literal, case, lemmatize, nround) 128 | mask = array.isin(tokens) 129 | if not mask.any(): 130 | return {} 131 | if isinstance(mask, pd.Series): 132 | nz = mask.to_numpy().nonzero()[0] 133 | else: 134 | nz = mask.nonzero()[0] 135 | indices = {array[i]: i for i in nz} 136 | tk = tokens[tokens.isin(array)] 137 | return _remerge_span_tuples({token: indices[s] for token, s in tk.items()}) 138 | 139 | 140 | def _search_2d_array(text, array, literal=False, case=False, lemmatize=True, nround=False): 141 | array = array.astype(str) 142 | tokens, array = _preprocess_array_search(text, array, literal, case, lemmatize, nround) 143 | mask = array.isin(tokens.values) 144 | if not mask.any().any(): 145 | return {} 146 | indices = {array.iloc[i, j]: (i, j) for i, j in zip(*mask.values.nonzero())} 147 | tk = tokens[tokens.isin(array.values.ravel())] 148 | return _remerge_span_tuples({token: indices[s] for token, s in tk.items()}) 149 | 150 | 151 | def _df_maxlen(df): 152 | # Find the length of the longest string present in the columns, indices or values of a df 153 | col_max = max([len(c) for c in df.columns.astype(str)]) 154 | ix_max = max([len(c) for c in df.index.astype(str)]) 155 | array_max = max([df[c].astype(str).apply(len).max() for c in df]) 156 | return max(col_max, ix_max, array_max) 157 | 158 | 159 | # TODO: Can this be done with defaultdict? 160 | class DFSearchResults(dict): 161 | """A convenience wrapper around `dict` to collect search results. 
162 | 163 | Different from `dict` in that values are always lists, and setting to 164 | existing key appends to the list. 165 | """ 166 | 167 | def __setitem__(self, key, value): 168 | if key not in self: 169 | super(DFSearchResults, self).__setitem__(key, [value]) 170 | elif self[key][0] != value: 171 | self[key].append(value) 172 | 173 | def update(self, other): 174 | # Needed because the default update method doesn't seem to use setitem 175 | for k, v in other.items(): 176 | self[k] = v 177 | 178 | def clean(self): 179 | """Sort the search results for each token by priority and un-overlap tokens.""" 180 | for k, v in self.items(): 181 | _sort_search_results(v) 182 | # unoverlap the keys 183 | to_remove = [] 184 | for k in self: 185 | to_search = self.keys() - {k} 186 | if utils.is_overlap(k, to_search): 187 | to_remove.append(k) 188 | for i in to_remove: 189 | del self[i] 190 | 191 | 192 | class DFSearch(object): 193 | """Make a dataframe searchable.""" 194 | 195 | def __init__(self, df, nlp=None, **kwargs): 196 | """Default constrictor. 197 | 198 | Parameters 199 | ---------- 200 | df : pd.DataFrame 201 | The dataframe to search. 202 | nlp : A `spacy.lang` model, optional 203 | """ 204 | self.df = df 205 | # What do results contain? 206 | # A map of tokens to list of search results. 207 | self.results = DFSearchResults() 208 | if not nlp: 209 | nlp = utils.load_spacy_model() 210 | self.matcher = kwargs.get('matcher', utils.make_np_matcher(nlp)) 211 | self.ents = [] 212 | 213 | def search(self, text, colname_fmt='df.columns[{}]', 214 | cell_fmt='df["{}"].iloc[{}]', **kwargs): 215 | """ 216 | Search the dataframe. 217 | 218 | Parameters 219 | ---------- 220 | text : spacy.Doc 221 | The text to search. 222 | colname_fmt : str, optional 223 | String format to describe dataframe columns in the search results, 224 | can be one of 'df.columns[{}]' or 'df[{}]'. 225 | cell_fmt : str, optional 226 | String format to describe dataframe values in the search results. 227 | Can be one of 'df.iloc[{}, {}]', 'df.loc[{}, {}]', 'df[{}][{}]', etc. 228 | 229 | Returns 230 | ------- 231 | dict 232 | A dictionary who's keys are tokens from `text` found in 233 | the source dataframe, and values are a list of locations in the df 234 | where they are found. 
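Example
-------
A sketch (assuming `nlp` is the loaded spacy model and `df` holds the
bundled actors.csv, which has a "rating" column):

>>> dfs = DFSearch(df)
>>> dfs.search(nlp('the actor with the highest rating'))  # doctest: +SKIP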
235 | """ 236 | self.search_nes(text) 237 | if len(text.text) <= _df_maxlen(self.df): 238 | for i in _text_search_array(text.text, self.df.columns): 239 | self.results[text] = {'location': 'colname', 'tmpl': colname_fmt.format(i), 240 | 'type': 'doc'} 241 | for x, y in zip(*_text_search_array(text.text, self.df)): 242 | x = utils.sanitize_indices(self.df.shape, x, 0) 243 | y = utils.sanitize_indices(self.df.shape, y, 1) 244 | self.results[text] = { 245 | 'location': 'cell', 'tmpl': cell_fmt.format(self.df.columns[y], x), 246 | 'type': 'doc'} 247 | 248 | else: 249 | for token, ix in self.search_columns(text, **kwargs).items(): 250 | ix = utils.sanitize_indices(self.df.shape, ix, 1) 251 | self.results[token] = {'location': 'colname', 'tmpl': colname_fmt.format(ix), 252 | 'type': 'token'} 253 | 254 | for token, (x, y) in self.search_table(text, **kwargs).items(): 255 | x = utils.sanitize_indices(self.df.shape, x, 0) 256 | y = utils.sanitize_indices(self.df.shape, y, 1) 257 | self.results[token] = { 258 | 'location': 'cell', 'tmpl': cell_fmt.format(self.df.columns[y], x), 259 | 'type': 'token'} 260 | self.search_quant([c for c in text if c.pos_ == 'NUM']) 261 | # self.search_derived_quant([c.text for c in selfdoc if c.pos_ == 'NUM']) 262 | 263 | return self.results 264 | 265 | def search_nes(self, doc, colname_fmt='df.columns[{}]', cell_fmt='df["{}"].iloc[{}]'): 266 | """Find named entities in text, and search for them in the dataframe. 267 | 268 | Parameters 269 | ---------- 270 | text : str 271 | The text to search. 272 | """ 273 | self.ents = utils.ner(doc, self.matcher) 274 | for token, ix in self.search_columns(self.ents, literal=True).items(): 275 | ix = utils.sanitize_indices(self.df.shape, ix, 1) 276 | self.results[token] = { 277 | 'location': 'colname', 278 | 'tmpl': colname_fmt.format(ix), 'type': 'ne' 279 | } 280 | for token, (x, y) in self.search_table(self.ents, literal=True).items(): 281 | x = utils.sanitize_indices(self.df.shape, x, 0) 282 | y = utils.sanitize_indices(self.df.shape, y, 1) 283 | self.results[token] = { 284 | 'location': 'cell', 285 | 'tmpl': cell_fmt.format(self.df.columns[y], x), 'type': 'ne'} 286 | 287 | def search_table(self, text, **kwargs): 288 | """Search the `.values` attribute of the dataframe for tokens in `text`.""" 289 | kwargs['array'] = self.df.copy() 290 | return self._search_array(text, **kwargs) 291 | 292 | def search_columns(self, text, **kwargs): 293 | """Search df columns for tokens in `text`.""" 294 | kwargs['array'] = self.df.columns 295 | return self._search_array(text, **kwargs) 296 | 297 | def search_quant(self, quants, nround=2, cell_fmt='df["{}"].iloc[{}]'): 298 | """Search the dataframe for a set of quantitative values. 299 | 300 | Parameters 301 | ---------- 302 | quants : list / array like 303 | The values to search. 304 | nround : int, optional 305 | Numeric values in the dataframe are rounded to these many 306 | significant digits before searching. 
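Example
-------
A sketch (`quants` is expected to hold spacy tokens tagged as numbers,
which is how `search` collects them; 109 appears in the bundled
actors.csv):

>>> dfs.search_quant([t for t in nlp('has 109 votes') if t.pos_ == 'NUM'])  # doctest: +SKIP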
307 | """ 308 | dfclean = utils.sanitize_df(self.df, nround) 309 | qarray = np.array([c.text for c in quants]) 310 | quants = np.array(quants) 311 | n_quant = qarray.astype('float').round(nround) 312 | for x, y in zip(*dfclean.isin(n_quant).values.nonzero()): 313 | x = utils.sanitize_indices(dfclean.shape, x, 0) 314 | y = utils.sanitize_indices(dfclean.shape, y, 1) 315 | tk = quants[n_quant == dfclean.iloc[x, y]][0] 316 | self.results[tk] = { 317 | 'location': 'cell', 'tmpl': cell_fmt.format(self.df.columns[y], x), 318 | 'type': 'quant'} 319 | 320 | def search_derived_quant(self, quants, nround=2): 321 | """Search the common derived dataframe parameters for a set of quantitative values. 322 | 323 | Parameters 324 | ---------- 325 | quants : list / array like 326 | The values to search. 327 | nround : int, optional 328 | Numeric values in the dataframe are rounded to these many 329 | significant digits before searching. 330 | """ 331 | dfclean = utils.sanitize_df(self.df, nround) 332 | quants = np.array(quants) 333 | # n_quant = quants.astype('float').round(2) 334 | 335 | for num in quants: 336 | if int(num) == len(dfclean): 337 | self.results[num] = { 338 | 'location': 'cell', 'tmpl': "len(df)", 339 | 'type': 'quant'} 340 | 341 | def _search_array(self, text, array, literal=False, 342 | case=False, lemmatize=True, nround=False): 343 | """Search for tokens in text within an array. 344 | 345 | Parameters 346 | ---------- 347 | text : str or spacy document 348 | Text to search 349 | array : array-like 350 | Array to search in. 351 | literal : bool, optional 352 | Whether to match tokens to values literally. 353 | case : bool, optional 354 | If true, run a case sensitive search. 355 | lemmatize : bool, optional 356 | If true (default), search on lemmas of tokens and values. 357 | nround : int, optional 358 | Significant digits used to round `array` before searching. 359 | 360 | Returns 361 | ------- 362 | dict 363 | Mapping of tokens to a sequence of indices within `array`. 
364 | 365 | Example 366 | ------- 367 | >>> _search_array('3', np.arange(5)) 368 | {'3': [3]} 369 | >>> df = pd.DataFrame(np.eye(3), columns='one punch man'.split()) 370 | >>> _search_array('1', df.values) 371 | {'1': [(0, 0), (1, 1), (2, 2)]} 372 | >>> _search_array('punched man', df.columns) 373 | {'punched': [1], 'man': [2]} 374 | >>> _search_array('1 2 buckle my shoe', df.index) 375 | {'1': [1], '2': [2]} 376 | """ 377 | if array.ndim == 1: 378 | func = _search_1d_array 379 | else: 380 | func = _search_2d_array 381 | return func(text, array, literal, case, lemmatize, nround) 382 | # if len(res) == 0: # Fall back on searching the whole string, not just the entities 383 | # res = func([text], array, literal, case, lemmatize, nround) 384 | # return res 385 | 386 | 387 | def _search_fh_args(entities, args, key, lemmatized): 388 | colnames = args.get(key, False) 389 | if not colnames: 390 | return {} 391 | nlp = utils.load_spacy_model() 392 | argtokens = list(chain(*[nlp(c) for c in colnames])) 393 | res = {} 394 | for i, token in enumerate(argtokens): 395 | for ent in entities: 396 | if lemmatized and (token.lemma_ == ent.lemma_): 397 | match = True 398 | elif token.text == ent.text: 399 | match = True 400 | else: 401 | match = False 402 | if match: 403 | res[ent] = { 404 | 'type': 'token', 'tmpl': f"fh_args['{key}'][{i}]", 405 | 'location': 'fh_args' 406 | } 407 | return res 408 | 409 | 410 | def _search_groupby(entities, args, lemmatized=True): 411 | return _search_fh_args(entities, args, key='_by', lemmatized=lemmatized) 412 | 413 | 414 | def _search_sort(entities, args, lemmatized=True): 415 | return _search_fh_args(entities, args, key='_sort', lemmatized=lemmatized) 416 | 417 | 418 | def _search_select(entities, args, lemmatized=True): 419 | return _search_fh_args(entities, args, key='_c', lemmatized=lemmatized) 420 | 421 | 422 | def search_args(entities, args, lemmatized=True, fmt='fh_args["{}"][{}]', 423 | argkeys=('_sort', '_by', '_c')): 424 | """ 425 | Search formhandler arguments provided as URL query parameters. 426 | 427 | Parameters 428 | ---------- 429 | entities : list 430 | list of named entities found in the source text 431 | args : dict 432 | FormHandler args as parsed by g1.url.parse(...).searchList 433 | lemmatized : bool, optional 434 | whether to search on lemmas of text values 435 | fmt : str, optional 436 | String format used to describe FormHandler arguments in the template 437 | argkeys : list, optional 438 | Formhandler argument keys to be considered for the search. Any key not 439 | present in this will be ignored. 440 | # TODO: Column names can be keys too!! 441 | 442 | Returns 443 | ------- 444 | dict 445 | Mapping of entities / tokens to objects describing where they are found 446 | in Formhandler arguemnts. 
Each search result object has the following 447 | structure: 448 | { 449 | 'type': 'some token', 450 | 'location': 'fh_args', 451 | 'tmpl': 'fh_args['_by'][0]' # The template that gets this token from fh_args 452 | } 453 | """ 454 | args = {k: v for k, v in args.items() if k in argkeys} 455 | search_res = {} 456 | entities = list(chain(*entities)) 457 | search_res.update(_search_groupby(entities, args, lemmatized=lemmatized)) 458 | search_res.update(_search_sort(entities, args, lemmatized=lemmatized)) 459 | search_res.update(_search_select(entities, args, lemmatized=lemmatized)) 460 | return search_res 461 | 462 | 463 | def _search(text, args, df, copy=False): 464 | """Construct a tornado template which regenerates some 465 | text from a dataframe and formhandler arguments. 466 | 467 | The pipeline consists of: 468 | 1. cleaning the text and the dataframe 469 | 2. searching the dataframe and FH args for tokens in the text 470 | 3. detecting inflections on the tokens. 471 | 472 | Parameters 473 | ---------- 474 | text : spacy.Doc 475 | Input text 476 | args : dict 477 | Formhandler arguments 478 | df : pd.DataFrame 479 | Source dataframe. 480 | 481 | Returns 482 | -------- 483 | tuple 484 | of search results, cleaned text and token inflections. The webapp uses 485 | these to construct a tornado template. 486 | """ 487 | # utils.load_spacy_model() 488 | if copy: 489 | df = df.copy() 490 | df = utils.gfilter(df, args.copy()) 491 | # Do this only if needed: 492 | # clean_text = utils.sanitize_text(text.text) 493 | args = utils.sanitize_fh_args(args, df) 494 | # Is this correct? 495 | dfs = DFSearch(df) 496 | dfix = dfs.search(text) 497 | dfix.update(search_args(dfs.ents, args)) 498 | dfix.clean() 499 | inflections = grammar.find_inflections(dfix, args, df) 500 | _infl = {} 501 | for token, funcs in inflections.items(): 502 | _infl[token] = [] 503 | for func in funcs: 504 | _infl[token].append({ 505 | 'source': func.source, 506 | 'fe_name': func.fe_name, 507 | 'func_name': func.__name__ 508 | }) 509 | # FIXME: Why return text if it's unchanged? 510 | return dfix, text, _infl 511 | 512 | 513 | def _make_inflection_string(tmpl, infl): 514 | source = infl['source'] 515 | func_name = infl['func_name'] 516 | if source == 'str': 517 | tmpl += f'.{func_name}()' 518 | else: 519 | tmpl = f'{source}.{func_name}({tmpl})' 520 | return tmpl 521 | 522 | 523 | def templatize_token(token, results, inflection): 524 | for r in results: 525 | if r.get('enabled', False): 526 | break 527 | tmpl = r['tmpl'] 528 | if inflection: 529 | for i in inflection: 530 | tmpl = _make_inflection_string(tmpl, i) 531 | return narrative.t_templatize(tmpl) 532 | 533 | 534 | def templatize(text, args, df): 535 | """Construct an NLG Nugget which templatizes the given text in 536 | the context of a dataframe, and FormHandler operations on it. 537 | 538 | Parameters 539 | ---------- 540 | text : spacy.tokens.Doc 541 | Input document 542 | args : dict 543 | Formhandler arguments 544 | df : pd.DataFrame 545 | Source dataframe. 546 | 547 | Returns 548 | ------- 549 | nlg.narrative.Nugget 550 | An NLG Nugget object containing the template for the input text. 551 | 552 | Example 553 | ------- 554 | >>> from gramex import data 555 | >>> from nlg.utils import load_spacy_model 556 | >>> df = pd.read_csv('iris.csv') 557 | >>> fh_args = {'_by': ['species']} 558 | >>> df = data.filter(df, fh_args.copy()) 559 | >>> nlp = load_spacy_model() 560 | >>> text = 'The iris dataset has 3 species - setosa, versicolor and virginica.' 
561 | >>> nugget = templatize(text, fh_args, df) 562 | >>> print(template) 563 | {% set fh_args = {"_by": ["species"]} %} 564 | {% set df = U.gfilter(orgdf, fh_args.copy()) %} 565 | The iris dataset has 3 {{ df.columns[0] }} - {{ df["species"].iloc[0] }}, \ 566 | {{ df["species"].iloc[1] }} and {{ df["species"].iloc[-1] }}. 567 | """ 568 | dfix, clean_text, infl = _search(text, args, df) 569 | return narrative.Nugget(clean_text, dfix, infl, args) 570 | 571 | 572 | def add_manual_template(input_template, manual_template=None): 573 | """Append user defined template for any word in the original text. 574 | 575 | Parameters 576 | ---------- 577 | input_template : str 578 | Input text 579 | manual_template : dict 580 | Doct to add with key=word in the text, valu=dataframe expression 581 | 582 | 583 | Returns 584 | ------- 585 | str 586 | Tornado template corresponding to the text and data. 587 | 588 | Example 589 | ------- 590 | input_template = "The iris dataset has 3 {{ df.columns[0] }} - {{ df["species"].iloc[0] }}, \ 591 | {{ df["species"].iloc[1] }} and {{ df["species"].iloc[-1] }}." 592 | manual_template = {"3" : "{{ "+ len(df["species"].unique()) + " }}" } 593 | 594 | output_template = "The iris dataset has "{{ "+ len(df["species"].unique()) + \ 595 | " }}" {{ df.columns[0] }} - {{ df["species"].iloc[0] }}, \ 596 | {{ df["species"].iloc[1] }} and {{ df["species"].iloc[-1] }}." 597 | 598 | """ 599 | if manual_template is None: 600 | return input_template 601 | 602 | for key in manual_template: 603 | replace_with = "{{ " + manual_template[key][0]['tmpl'] + " }}" 604 | input_template = input_template.replace(key, replace_with) 605 | return input_template 606 | 607 | 608 | def render(df, template): 609 | return Template(template).generate(orgdf=df, U=utils, G=grammar) 610 | -------------------------------------------------------------------------------- /nlg/tests/__init__.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | -------------------------------------------------------------------------------- /nlg/tests/data/actors.csv: -------------------------------------------------------------------------------- 1 | category,name,rating,votes 2 | Actors,Humphrey Bogart,0.57019677,109 3 | Actors,Cary Grant,0.438601513,142 4 | Actors,James Stewart,0.988373838,120 5 | Actors,Marlon Brando,0.102044811,108 6 | Actors,Fred Astaire,0.208876756,84 7 | Actresses,Katharine Hepburn,0.039187792,63 8 | Actresses,Bette Davis,0.282806963,14 9 | Actresses,Audrey Hepburn,0.120196561,94 10 | Actresses,Ingrid Bergman,0.296140198,52 11 | Actors,Spencer Tracy,0.466310773,192 12 | Actors,Charlie Chaplin,0.244425592,76 13 | -------------------------------------------------------------------------------- /nlg/tests/data/imdb_ratings.csv: -------------------------------------------------------------------------------- 1 | category,name,rating,votes 2 | Series,How I Met Your Mother,8.3,561820.0 3 | Movies,Inception,8.8,1915875.0 4 | Series,Dexter,8.6,614596.0 5 | Movies,The Shawshank Redemption,9.3,2184030.0 6 | Movies,The Godfather,9.2,1504601.0 7 | Series,Game of Thrones,9.3,1633121.0 8 | Series,Sherlock,9.1,743103.0 9 | Movies,The Dark Knight,9.0,2167100.0 10 | Movies,The Lord of the Rings: The Return of the King,8.9,1553705.0 11 | Movies,Fight Club,8.8,1744466.0 12 | Movies,The Matrix,8.7,1572571.0 13 | Series,The Big Bang Theory,8.1,677647.0 14 | Series,The Walking Dead,8.2,814294.0 15 | Series,Friends,8.9,732626.0 16 | Series,Breaking Bad,9.5,1303750.0 17 | Series,Stranger Things,8.8,701867.0 18 | Movies,Forrest Gump,8.8,1683919.0 19 | Movies,Pulp Fiction,8.9,1715698.0 20 | Movies,The Lord of the Rings: The Fellowship of the Ring,8.8,1565594.0 21 | Series,True Detective,9.0,464140.0 22 | -------------------------------------------------------------------------------- /nlg/tests/test_grammar.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | 5 | """ 6 | Tests for the nlg.grammar module. 
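
A rough sketch of the helpers exercised below; the outputs mirror the
assertions in TestGrammar:

>>> import nlg.grammar as G
>>> G.plural('language')
'languages'
>>> G.singular('bacteria')
'bacterium'
>>> G.concatenate_items('abc')
'a, b and c'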
7 | """ 8 | import os 9 | import unittest 10 | 11 | import pandas as pd 12 | 13 | import nlg.grammar as G # noqa: N812 14 | from nlg import utils 15 | from nlg.search import search_args, DFSearch 16 | 17 | nlp = utils.load_spacy_model() 18 | op = os.path 19 | 20 | 21 | class TestGrammar(unittest.TestCase): 22 | 23 | def test_is_plural(self): 24 | self.assertTrue(G.is_plural_noun("languages")) 25 | # self.assertTrue(G.is_plural("geese")) 26 | self.assertTrue(G.is_plural_noun("bacteria")) 27 | self.assertTrue(G.is_plural_noun("Office supplies")) 28 | 29 | def test_concatenate_items(self): 30 | self.assertEqual(G.concatenate_items("abc"), "a, b and c") 31 | self.assertEqual(G.concatenate_items([1, 2, 3], sep=""), "123") 32 | self.assertFalse(G.concatenate_items([])) 33 | 34 | def test_pluralize(self): 35 | self.assertEqual(G.plural("language"), "languages") 36 | self.assertEqual(G.plural("languages"), "languages") 37 | self.assertEqual(G.plural("bacterium"), "bacteria") 38 | self.assertEqual(G.plural("goose"), "geese") 39 | 40 | def test_singular(self): 41 | self.assertEqual(G.singular("languages"), "language") 42 | self.assertEqual(G.singular("language"), "language") 43 | self.assertEqual(G.singular("bacteria"), "bacterium") 44 | # self.assertEqual(G.singular("geese"), "goose") 45 | 46 | def test_pluralize_by(self): 47 | self.assertEqual(G.pluralize_by("language", [1, 2]), "languages") 48 | self.assertEqual(G.pluralize_by("languages", [1]), "language") 49 | self.assertEqual(G.pluralize_by("language", []), "language") 50 | self.assertEqual(G.pluralize_by("language", 1), "language") 51 | self.assertEqual(G.pluralize_by("language", 2), "languages") 52 | 53 | def test_number_inflection(self): 54 | text = nlp('Actors and actors.') 55 | x, y = text[0], text[-2] 56 | infl = G._number_inflection(x, y) 57 | self.assertEqual(infl, G.plural) 58 | 59 | text = nlp('Actors and dancers.') 60 | x, y = text[0], text[-2] 61 | infl = G._number_inflection(x, y) 62 | self.assertFalse(infl) 63 | 64 | def test_shape_inflections(self): 65 | text = nlp('Actors is plural of actors.') 66 | x, y = text[0], text[-2] 67 | infl = G._shape_inflection(x, y) 68 | self.assertEqual(infl, G.lower) 69 | 70 | def test_inflections(self): 71 | text = nlp('James Stewart is the actor with the highest rating.') 72 | df = pd.read_csv(op.join(op.dirname(__file__), "data", "actors.csv"), 73 | encoding='utf8') 74 | fh_args = {'_sort': ['-rating']} 75 | df = utils.gfilter(df, fh_args.copy()) 76 | args = utils.sanitize_fh_args(fh_args, df) 77 | dfs = DFSearch(df) 78 | dfix = dfs.search(text) 79 | dfix.update(search_args(dfs.ents, args)) 80 | dfix.clean() 81 | infl = G.find_inflections(dfix, fh_args, df) 82 | x, y = infl[text[4]] 83 | self.assertEqual(x, G.singular) 84 | self.assertEqual(y, G.lower) 85 | -------------------------------------------------------------------------------- /nlg/tests/test_narrative.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | 5 | """ 6 | Tests for the nlg.narrative module. 
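
A Nugget wraps a single templatized sentence; a Narrative is an ordered
collection of Nuggets. A minimal sketch of the round trip these tests
exercise (assuming the bundled actors.csv is loaded as df):

>>> nugget = templatize(nlp('James Stewart is the actor with the '
...                         'highest rating.'), {'_sort': ['-rating']}, df)
>>> nugget.render(df).lstrip().decode('utf8')
'James Stewart is the actor with the highest rating.'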
7 | """ 8 | 9 | import os 10 | import re 11 | import unittest 12 | 13 | import pandas as pd 14 | from spacy.tokens import Doc 15 | 16 | from nlg import templatize 17 | from nlg.narrative import Nugget, Narrative 18 | from nlg.utils import load_spacy_model 19 | 20 | op = os.path 21 | nlp = load_spacy_model() 22 | 23 | 24 | class TestNarrative(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.df = pd.read_csv(op.join(op.dirname(__file__), "data", "actors.csv"), 29 | encoding='utf8') 30 | cls.text = nlp('James Stewart is the actor with the highest rating.') 31 | cls.nugget = templatize(cls.text, {'_sort': ['-rating']}, cls.df) 32 | 33 | def test_nugget_variables(self): 34 | varnames = set([c.text for c in self.nugget.variables]) 35 | self.assertSetEqual(varnames, {'James Stewart', 'actor', 'rating'}) 36 | 37 | def test_nugget_get_var(self): 38 | with self.assertRaises(KeyError): 39 | self.nugget.get_var('James Stewart') 40 | var = self.nugget.get_var('actor') 41 | self.assertEqual(str(var), '{{ G.singular(df["category"].iloc[0]).lower() }}') 42 | 43 | def test_nugget_render(self): 44 | df = self.df 45 | rendered = self.nugget.render(self.df) 46 | self.assertEqual(rendered.lstrip().decode('utf8'), self.text.text) 47 | xdf = df[df['category'] == 'Actors'].copy() 48 | xdf['rating'] = 1 - df.loc[xdf.index, 'rating'] 49 | rendered = self.nugget.render(xdf) 50 | self.assertEqual(rendered.lstrip().decode('utf8'), 51 | 'Marlon Brando is the actor with the highest rating.') 52 | 53 | def test_set_expr(self): 54 | var = self.nugget.get_var('actor') 55 | var.set_expr('df["category"].iloc[0]') 56 | self.assertEqual(str(var), '{{ G.singular(df["category"].iloc[0]).lower() }}') 57 | xdf = self.df[self.df['category'] == 'Actresses'] 58 | rendered = self.nugget.render(xdf) 59 | self.assertEqual(rendered.lstrip().decode('utf8'), 60 | 'Ingrid Bergman is the actress with the highest rating.') 61 | 62 | def test_add_var(self): 63 | var = self.nugget.get_var('actor') 64 | var_token, var_exp = self.text[-2], 'fh_args["_sort"][0]' 65 | for k in self.nugget.tokenmap: 66 | if k.text == 'rating': 67 | break 68 | del self.nugget.tokenmap[k] 69 | var.set_expr('df["category"].iloc[0]') 70 | self.nugget.add_var(var_token, expr=var_exp) 71 | 72 | # sort by votes 73 | self.nugget.fh_args = {'_sort': ['-votes']} 74 | rendered = self.nugget.render(self.df) 75 | self.assertEqual(rendered.lstrip().decode('utf8'), 76 | 'Spencer Tracy is the actor with the highest votes.') 77 | xdf = self.df[self.df['category'] == 'Actresses'] 78 | rendered = self.nugget.render(xdf) 79 | self.assertEqual(rendered.lstrip().decode('utf8'), 80 | 'Audrey Hepburn is the actress with the highest votes.') 81 | 82 | # Set the ratings back 83 | self.nugget.fh_args = {'_sort': ['-rating']} 84 | rendered = self.nugget.render(self.df) 85 | self.assertEqual(rendered.lstrip().decode('utf8'), 86 | 'James Stewart is the actor with the highest rating.') 87 | xdf = self.df[self.df['category'] == 'Actresses'] 88 | rendered = self.nugget.render(xdf) 89 | self.assertEqual(rendered.lstrip().decode('utf8'), 90 | 'Ingrid Bergman is the actress with the highest rating.') 91 | 92 | def test_serialize(self): 93 | pl = self.nugget.to_dict() 94 | self.assertEqual(pl['text'], self.text.text) 95 | self.assertDictEqual(pl['fh_args'], {'_sort': ['-rating']}) 96 | tokenmap = pl['tokenmap'] 97 | ideal = [ 98 | { 99 | 'text': 'rating', 'index': 8, 'idx': 44, 100 | 'sources': [ 101 | { 102 | 'tmpl': 'fh_args["_sort"][0]', 'type': 'user', 103 | 'enabled': 
True 104 | } 105 | ], 106 | 'varname': '', 'inflections': [] 107 | }, 108 | { 109 | 'index': (0, 2), 'idx': 0, 'text': 'James Stewart', 110 | 'sources': [ 111 | { 112 | 'location': 'cell', 'tmpl': 'df["name"].iloc[0]', 'type': 'ne', 113 | 'enabled': True 114 | } 115 | ], 116 | 'varname': '', 'inflections': [] 117 | }, 118 | { 119 | 'index': 4, 'idx': 21, 'text': 'actor', 120 | 'sources': [ 121 | { 122 | 'location': 'cell', 'tmpl': 'df["category"].iloc[0]', 'type': 'token', 123 | 'enabled': True 124 | } 125 | ], 126 | 'varname': '', 127 | 'inflections': [ 128 | {'source': 'G', 'fe_name': 'Singularize', 'func_name': 'singular'}, 129 | {'source': 'str', 'fe_name': 'Lowercase', 'func_name': 'lower'} 130 | ] 131 | } 132 | ] 133 | tokenmap = sorted(tokenmap, key=lambda x: x['text']) 134 | ideal = sorted(ideal, key=lambda x: x['text']) 135 | self.assertListEqual(ideal, tokenmap) 136 | 137 | def test_deserialize(self): 138 | pl = self.nugget.to_dict() 139 | nugget = Nugget.from_json(pl) 140 | actual = nugget.render(self.df).lstrip().decode('utf8') 141 | self.assertEqual(actual, self.text.text) 142 | 143 | def test_doc_serialize(self): 144 | nugget = templatize(nlp('Humphrey Bogart'), {}, self.df) 145 | pl = nugget.to_dict() 146 | self.assertEqual(len(pl['tokenmap']), 1) 147 | var = nugget.get_var(0) 148 | self.assertTrue(isinstance(var._token, Doc)) 149 | self.assertEqual(var._token.text, 'Humphrey Bogart') 150 | var_serialized = pl['tokenmap'][0] 151 | self.assertEqual(var_serialized['text'], 'Humphrey Bogart') 152 | self.assertEqual(var_serialized['idx'], 0) 153 | self.assertEqual(len(var_serialized['sources']), 1) 154 | source = var_serialized['sources'][0] 155 | self.assertEqual(source['tmpl'], 'df["name"].iloc[0]') 156 | 157 | def test_narrative_html(self): 158 | text = nlp('Katharine Hepburn is the actress with the least rating.') 159 | fh_args = {'_sort': ['-rating']} 160 | nugget = templatize(text, fh_args, self.df) 161 | narrative = Narrative([self.nugget, nugget]) 162 | 163 | # test default render 164 | actual = narrative.to_html(df=self.df) 165 | actual = re.sub(r'\s+', ' ', actual) 166 | ideal = ' James Stewart is the actor ' \ 167 | + 'with the highest rating. Katharine Hepburn is ' \ 168 | + 'the actress with the least rating.' 169 | self.assertEqual(ideal, actual) 170 | 171 | # test other options 172 | actual = narrative.to_html(bold=False, df=self.df) 173 | actual = re.sub(r'\s+', ' ', actual) 174 | no_bold = ideal.replace('', '') 175 | no_bold = no_bold.replace('', '') 176 | self.assertEqual(actual, no_bold) 177 | 178 | actual = narrative.to_html(italic=True, df=self.df) 179 | actual = re.sub(r'\s+', ' ', actual) 180 | italic = ideal.replace('', '') 181 | italic = italic.replace('', '') 182 | self.assertEqual(actual, italic) 183 | 184 | actual = narrative.to_html(underline=True, df=self.df) 185 | actual = re.sub(r'\s+', ' ', actual) 186 | italic = ideal.replace('', '') 187 | italic = italic.replace('', '') 188 | self.assertEqual(actual, italic) 189 | 190 | def test_parastyle(self): 191 | text = nlp('Katharine Hepburn is the actress with the least rating.') 192 | fh_args = {'_sort': ['-rating']} 193 | nugget = templatize(text, fh_args, self.df) 194 | narrative = Narrative([self.nugget, nugget]) 195 | 196 | actual = narrative.to_html(style='list', df=self.df) 197 | actual = re.sub(r'\s+', ' ', actual) 198 | ideal = '
    • James Stewart is the actor ' \ 199 | + 'with the highest rating.' \ 200 | + '
    • Katharine Hepburn is ' \ 201 | + 'the actress with the least rating.
    ' 202 | self.assertEqual(actual, ideal) 203 | 204 | actual = narrative.to_html(bold=False, style='list', liststyle='markdown', df=self.df) 205 | actual = [re.sub(r'\s+', ' ', c) for c in actual.splitlines()] 206 | ideal = [ 207 | '* James Stewart is the actor with the highest rating.', 208 | '* Katharine Hepburn is the actress with the least rating.' 209 | ] 210 | self.assertListEqual(actual, ideal) 211 | 212 | def test_condition(self): 213 | try: 214 | self.nugget.condition = 'df["category"].nunique() == 2' 215 | actual = self.nugget.render(self.df) 216 | self.assertEqual(actual.lstrip().rstrip(), 217 | b'James Stewart is the actor with the highest rating.') 218 | xdf = self.df[self.df['category'] == 'Actors'] 219 | actual = self.nugget.render(xdf) 220 | self.assertRegexpMatches(actual.decode('utf8'), r'^\s*$') 221 | finally: 222 | self.nugget.condition = None 223 | -------------------------------------------------------------------------------- /nlg/tests/test_search.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | 5 | """ 6 | Tests of the nlg.search module 7 | """ 8 | 9 | import os.path as op 10 | import re 11 | import unittest 12 | 13 | import pandas as pd 14 | from spacy.tokens import Span 15 | from tornado.template import Template 16 | 17 | from nlg import search, utils 18 | 19 | nlp = utils.load_spacy_model() 20 | matcher = utils.make_np_matcher(nlp) 21 | 22 | 23 | class TestDFSearch(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | fpath = op.join(op.dirname(__file__), "data", "actors.csv") 28 | cls.df = pd.read_csv(fpath, encoding='utf-8') 29 | cls.dfs = search.DFSearch(cls.df) 30 | 31 | def test__search_1d_array_literal(self): 32 | text = nlp('The votes, name and rating of the artists.') 33 | res = search._search_1d_array(text, self.df.columns, literal=True) 34 | ideal = {text[1]: 3, text[3]: 1, text[5]: 2} 35 | self.assertDictEqual(res, ideal) 36 | 37 | def test__search_1d_array_lemmatize(self): 38 | text = nlp('The votes, names and ratings of the artists.') 39 | res = search._search_1d_array(text, self.df.columns) 40 | ideal = {text[1]: 3, text[3]: 1, text[5]: 2} 41 | self.assertDictEqual(res, ideal) 42 | 43 | def test__search_2d_array_literal(self): 44 | text = nlp( 45 | "James Stewart is the actor with the highest rating of 0.988373838 and 120 votes.") 46 | xdf = self.df.sort_values('rating', ascending=False) 47 | res = search._search_2d_array(text, xdf, literal=True) 48 | ideal = {text[-5]: (0, 2), text[-3]: (0, 3)} 49 | self.assertDictEqual(res, ideal) 50 | 51 | def test__search_2d_array_lemmatize(self): 52 | text = nlp( 53 | "James Stewart is the actor with the highest rating of 0.988373838 and 120 votes.") 54 | xdf = self.df.sort_values('rating', ascending=False) 55 | res = search._search_2d_array(text, xdf) 56 | ideal = {text[-5]: (0, 2), text[-3]: (0, 3), text[4]: (9, 0)} 57 | self.assertDictEqual(res, ideal) 58 | 59 | def test__search_array(self): 60 | sent = nlp("The votes, names and ratings of artists.") 61 | res = self.dfs._search_array(sent, self.df.columns, literal=True) 62 | self.assertDictEqual(res, {sent[1]: 3}) 63 | 64 | res = self.dfs._search_array(sent, self.df.columns) 65 | self.assertDictEqual(res, {sent[1]: 3, sent[3]: 1, sent[5]: 2}) 66 | 67 | def test_dfsearch_lemmatized(self): 68 | df = pd.DataFrame.from_dict( 69 | { 70 | "partner": ["Lata Mangeshkar", "Asha Bhosale", "Mohammad Rafi"], 71 | "song": [20, 5, 15], 
72 | } 73 | ) 74 | sent = nlp("Kishore Kumar sang the most songs with Lata Mangeshkar.") 75 | dfs = search.DFSearch(df) 76 | self.assertDictEqual( 77 | dfs.search(sent, lemmatize=True), 78 | { 79 | sent[5]: [{"location": "colname", "type": "token", "tmpl": "df.columns[1]"}], 80 | sent[-3:-1]: [ 81 | {'location': 'cell', 'tmpl': 'df["partner"].iloc[0]', 'type': 'ne'}], 82 | } 83 | ) 84 | 85 | def test_search_df(self): 86 | fpath = op.join(op.dirname(__file__), "data", "actors.csv") 87 | df = pd.read_csv(fpath, encoding='utf-8') 88 | df.sort_values("votes", ascending=False, inplace=True) 89 | df.reset_index(inplace=True, drop=True) 90 | dfs = search.DFSearch(df) 91 | sent = nlp("Spencer Tracy is the top voted actor.") 92 | self.assertDictEqual( 93 | dfs.search(sent), 94 | { 95 | sent[:2]: [ 96 | {'location': 'cell', 'tmpl': 'df["name"].iloc[0]', 'type': 'ne'} 97 | ], 98 | sent[-3]: [{'location': 'colname', 'tmpl': 'df.columns[-1]', 'type': 'token'}], 99 | sent[-2]: [ 100 | {'location': 'cell', 'tmpl': 'df["category"].iloc[-4]', 'type': 'token'}] 101 | } 102 | ) 103 | 104 | 105 | class TestSearch(unittest.TestCase): 106 | 107 | @classmethod 108 | def setUpClass(cls): 109 | fpath = op.join(op.dirname(__file__), "data", "actors.csv") 110 | cls.df = pd.read_csv(fpath, encoding='utf-8') 111 | fpath = op.join(op.dirname(__file__), "data", "imdb_ratings.csv") 112 | cls.imdb = pd.read_csv(fpath, encoding='utf-8') 113 | 114 | def test_dfsearches(self): 115 | x = search.DFSearchResults() 116 | x['hello'] = 'world' 117 | x['hello'] = 'world' 118 | self.assertDictEqual(x, {'hello': ['world']}) 119 | x = search.DFSearchResults() 120 | x['hello'] = 'world' 121 | x['hello'] = 'underworld' 122 | self.assertDictEqual(x, {'hello': ['world', 'underworld']}) 123 | 124 | def test_search_args(self): 125 | args = utils.sanitize_fh_args({"_sort": ["-votes"]}, self.df) 126 | doc = nlp("James Stewart is the top voted actor.") 127 | ents = utils.ner(doc, matcher) 128 | self.assertDictEqual( 129 | search.search_args(ents, args), 130 | { 131 | doc[-3]: { 132 | "tmpl": "fh_args['_sort'][0]", 133 | "type": "token", 134 | "location": "fh_args" 135 | } 136 | } 137 | ) 138 | 139 | def test_search_args_literal(self): 140 | args = utils.sanitize_fh_args({"_sort": ["-rating"]}, self.df) 141 | doc = nlp("James Stewart has the highest rating.") 142 | ents = utils.ner(doc, matcher) 143 | self.assertDictEqual(search.search_args(ents, args, lemmatized=False), 144 | {doc[-2]: { 145 | "tmpl": "fh_args['_sort'][0]", 146 | "location": "fh_args", 147 | "type": "token"}}) 148 | 149 | def test_templatize(self): 150 | df = self.df.sort_values("votes", ascending=False) 151 | df.reset_index(inplace=True, drop=True) 152 | 153 | doc = nlp(""" 154 | Spencer Tracy is the top votes actor, followed by Cary Grant. 155 | The least votes actress is Bette Davis, trailing at only 14 votes, followed by 156 | Ingrid Bergman at a rating of 0.296140198. 157 | """) 158 | ideal = """ 159 | {{ df['name'].iloc[0] }} is the top {{ fh_args['_sort'][0] }} 160 | {{ df['category'].iloc[-4] }}, followed by {{ df['name'].iloc[1] }}. 161 | The least {{ fh_args['_sort'][0] }} {{ df['category'].iloc[-1] }} is 162 | {{ df['name'].iloc[-1] }}, trailing at only {{ df['votes'].iloc[-1] }} 163 | {{ df.columns[-1] }}, followed by {{ df['name'].iloc[-2] }} at a {{ df.columns[2] }} 164 | of {{ df['rating'].iloc[-2] }}. 
165 | """ 166 | args = {"_sort": ["-votes"]} 167 | tokenmap, text, inflections = search._search(doc, args, df, copy=True) 168 | actual = text.text 169 | for token, tmpls in tokenmap.items(): 170 | tmpl = [t for t in tmpls if t.get('enabled', False)][0] 171 | actual = actual.replace(token.text, 172 | '{{{{ {} }}}}'.format(tmpl['tmpl'])) 173 | cleaner = lambda x: re.sub(r"\s+", " ", x) # NOQA: E731 174 | ideal, actual = map(cleaner, (ideal, actual)) 175 | args = utils.sanitize_fh_args(args, df) 176 | ideal = Template(ideal).generate(df=df, fh_args=args) 177 | actual = Template(actual).generate(df=df, fh_args=args) 178 | self.assertEqual(ideal, actual) 179 | self.assertDictEqual( 180 | inflections, 181 | { 182 | doc[7]: [{'fe_name': 'Singularize', 'source': 'G', 'func_name': 'singular'}, 183 | {'fe_name': 'Lowercase', 'source': 'str', 'func_name': 'lower'}], 184 | doc[18]: [ # noqa: E912 185 | {'fe_name': 'Singularize', 'source': 'G', 'func_name': 'singular'}, 186 | {'fe_name': 'Lowercase', 'source': 'str', 'func_name': 'lower'}] 187 | } 188 | # Don't detect inflections until they can be processed without intervention 189 | # 'voted': [{'source': 'G', 'fe_name': 'Lemmatize', 'func_name': 'lemmatize'}]} 190 | ) 191 | 192 | def test_search_sort(self): 193 | results = [ 194 | {'tmpl': 'df.loc[0, "name"]', 'type': 'ne', 'location': 'cell'}, 195 | {'tmpl': 'df.columns[0]', 'type': 'token', 'location': 'colname'}, 196 | {'tmpl': 'args["_sort"][0]', 'type': 'token', 'location': 'fh_args'} 197 | ] 198 | _sorted = search._sort_search_results(results) 199 | enabled = [c for c in _sorted if c.get('enabled', False)] 200 | self.assertListEqual(enabled, results[:1]) 201 | 202 | results = [ 203 | {'tmpl': 'df.columns[0]', 'type': 'token', 'location': 'colname'}, 204 | {'tmpl': 'args["_sort"][0]', 'type': 'token', 'location': 'fh_args'}, 205 | {'tmpl': 'df["foo"].iloc[0]', 'type': 'token', 'location': 'cell'} 206 | ] 207 | _sorted = search._sort_search_results(results) 208 | enabled = [c for c in _sorted if c.get('enabled', False)] 209 | self.assertListEqual(enabled, results[1:2]) 210 | 211 | results = [ 212 | {'tmpl': 'df.columns[0]', 'type': 'token', 'location': 'colname'}, 213 | {'tmpl': 'args["_sort"][0]', 'type': 'token', 'location': 'cell'}, 214 | {'tmpl': 'df["foo"].iloc[0]', 'type': 'quant', 'location': 'cell'} 215 | ] 216 | _sorted = search._sort_search_results(results) 217 | enabled = [c for c in _sorted if c.get('enabled', False)] 218 | self.assertListEqual(enabled, results[:1]) 219 | 220 | results = [ 221 | {'tmpl': 'args["_sort"][0]', 'type': 'token', 'location': 'cell'}, 222 | {'tmpl': 'df["foo"].iloc[0]', 'type': 'quant', 'location': 'cell'} 223 | ] 224 | _sorted = search._sort_search_results(results) 225 | enabled = [c for c in _sorted if c.get('enabled', False)] 226 | self.assertListEqual(enabled, results[1:]) 227 | 228 | def test_single_entity_search(self): 229 | text = nlp("Humphrey Bogart") 230 | nugget = search.templatize(text, {}, self.df) 231 | self.assertEqual(len(nugget.tokenmap), 1) 232 | for token, variable in nugget.tokenmap.items(): 233 | break 234 | self.assertEqual(token.text, text.ents[0].text) 235 | self.assertEqual(variable.template, '{{ df["name"].iloc[0] }}') 236 | 237 | def test_literal_search(self): 238 | texts = ['How I Met Your Mother', 'Sherlock', 'Dexter', 'Breaking Bad'] 239 | for t in texts: 240 | doc = nlp(t) 241 | nugget = search.templatize(doc, {}, self.imdb) 242 | self.assertEqual(len(nugget.tokenmap), 1) 243 | for token, variable in nugget.tokenmap.items(): 
244 | self.assertEqual(token.text, t) 245 | self.assertRegex(nugget.template, r'{{ df\["name"\].iloc\[-*\d+\] }}') 246 | 247 | def test_search_short_strings(self): 248 | # Check strings that are shorter than the max length of the df, 249 | # but still not a literal match 250 | nugget = search.templatize(nlp('Dexter is a good show'), {}, self.imdb) 251 | self.assertEqual(len(nugget.tokenmap), 1) 252 | _, variable = nugget.tokenmap.popitem() 253 | self.assertRegex(variable.enabled_source['tmpl'], r'df\["name"\].iloc\[-*\d+\]') 254 | 255 | def test_token_span_overlap(self): 256 | df = pd.DataFrame([('Technology', 1, 3), ('Furniture', 2, 2), ('Office Supplies', 3, 1)]) 257 | df.columns = ['Category', 'Number', 'Sales'] 258 | text = nlp('Technology has the highest sales, followed by furniture and office supplies.') 259 | dfs = search.DFSearch(df) 260 | results = dfs.search(text) 261 | results.clean() 262 | self.assertEqual(len(results), 3) 263 | self.assertIn('Technology', [c.text for c in results]) 264 | for k in results: 265 | if k.text == 'Technology': 266 | self.assertTrue(isinstance(k, Span)) 267 | 268 | 269 | if __name__ == "__main__": 270 | unittest.main() 271 | -------------------------------------------------------------------------------- /nlg/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | 5 | """Tests for nlg.utils""" 6 | 7 | import os 8 | import unittest 9 | 10 | import pandas as pd 11 | 12 | from nlg import utils 13 | 14 | 15 | nlp = utils.load_spacy_model() 16 | matcher = utils.make_np_matcher(nlp) 17 | op = os.path 18 | 19 | 20 | class TestUtils(unittest.TestCase): 21 | 22 | def test_join_words(self): 23 | sent = 'The quick brown fox jumps over the lazy dog.' 24 | self.assertEqual(utils.join_words(sent), sent.rstrip('.')) 25 | self.assertEqual(utils.join_words(sent, ''), sent.rstrip('.').replace(' ', '')) 26 | self.assertEqual(utils.join_words('-Office supplies'), 'Office supplies') 27 | 28 | def test_sanitize_args(self): 29 | args = {'_by': ['category'], '_c': ['votes|avg'], '_sort': ['-votes|avg']} 30 | df = pd.read_csv(op.join(op.dirname(__file__), 'data', 'actors.csv'), encoding='utf8') 31 | self.assertDictEqual( 32 | utils.sanitize_fh_args(args, df), 33 | { 34 | '_by': ['category'], 35 | '_c': ['votes'], 36 | '_sort': ['votes|avg'] 37 | } 38 | ) 39 | 40 | @unittest.skip('NER is unstable.') 41 | def test_ner(self): 42 | sent = nlp( 43 | """ 44 | US President Donald Trump is an entrepreneur and 45 | used to run his own reality show named 'The Apprentice'.""" 46 | ) 47 | ents = utils.ner(sent, matcher) 48 | self.assertSetEqual( 49 | set([c.text for c in utils.unoverlap(ents)]), 50 | { 51 | "Donald Trump", 52 | "Apprentice", 53 | "US President", 54 | "President Donald", 55 | "entrepreneur", 56 | "reality show" 57 | }, 58 | ) 59 | 60 | def test_sanitize_indices(self): 61 | self.assertEqual(utils.sanitize_indices((3, 3), 0), 0) 62 | self.assertEqual(utils.sanitize_indices((3, 3), 1), 1) 63 | self.assertEqual(utils.sanitize_indices((3, 3), 2), -1) 64 | self.assertEqual(utils.sanitize_indices((3, 3), 0, 1), 0) 65 | self.assertEqual(utils.sanitize_indices((3, 3), 1, 1), 1) 66 | self.assertEqual(utils.sanitize_indices((3, 3), 2, 1), -1) 67 | 68 | @unittest.skip('WIP') 69 | def test_infer_quant(self): 70 | text = 'Of the three species, setosa has the highest average sepal width.' 
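# NOTE: spelled-out numbers like 'three' have an alphabetic .shape_ ('xxxx'),
# which QUANT_PATTERN in nlg.utils does not match - hence the 'WIP' skip above.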
71 | doc = nlp(text) 72 | self.assertEqual(utils.infer_quant(doc[2]), 3) 73 | 74 | text = 'Of the 3 species, setosa has the highest average sepal width.' 75 | doc = nlp(text) 76 | self.assertEqual(utils.infer_quant(doc[2]), 3) 77 | 78 | text = 'The value of pi is 3.14.' 79 | doc = nlp(text) 80 | self.assertEqual(utils.infer_quant(doc[-2]), 3.14) # noqa: E912 81 | 82 | 83 | if __name__ == "__main__": 84 | unittest.main() 85 | -------------------------------------------------------------------------------- /nlg/tests/test_webapp.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import pandas as pd 5 | 6 | from nlg import templatize 7 | from nlg.utils import load_spacy_model 8 | from nlg import webapp as app 9 | 10 | 11 | nlp = load_spacy_model() 12 | op = os.path 13 | 14 | 15 | class TestWebApp(TestCase): 16 | 17 | @classmethod 18 | def setUpClass(cls): 19 | cls.df = pd.read_csv(op.join(op.dirname(__file__), "data", "actors.csv"), 20 | encoding='utf8') 21 | fh_args = {'_sort': ['-rating']} 22 | cls.text = nlp('James Stewart is the actor with the highest rating.') 23 | cls.nugget = templatize(cls.text, fh_args, cls.df) 24 | 25 | def test_preview_html(self): 26 | html = '{}' 27 | ideal = html.format("James Stewart") + " is the " 28 | ideal += html.format('actor') + " with the highest " + html.format('rating') + '.' 29 | template = self.nugget.to_dict() 30 | self.assertEqual(app.get_preview_html(template, True), ideal) 31 | 32 | text = nlp("James Stewart, Humphrey Bogart, Marlon Brando and Ingrid Bergman are actors.") 33 | names = ['James Stewart', 'Humphrey Bogart', 'Marlon Brando', 'Ingrid Bergman'] 34 | ideal = ", ".join([html.format(name) for name in names[:-1]]) 35 | ideal += " and " + html.format(names[-1]) + " are " + html.format('actors') + "." 36 | nugget = templatize(text, {}, self.df) 37 | template = nugget.to_dict() 38 | actual = app.get_preview_html(template, True) 39 | self.assertEqual(actual, ideal) 40 | 41 | def test_preview_html_noninteractive(self): 42 | html = '{}' 43 | ideal = html.format("James Stewart") + " is the " 44 | ideal += html.format('actor') + " with the highest " + html.format('rating') + "." 45 | template = self.nugget.to_dict() 46 | self.assertEqual(app.get_preview_html(template), ideal) 47 | 48 | text = nlp("James Stewart, Humphrey Bogart, Marlon Brando and Ingrid Bergman are actors.") 49 | names = ['James Stewart', 'Humphrey Bogart', 'Marlon Brando', 'Ingrid Bergman'] 50 | ideal = ", ".join([html.format(name) for name in names[:-1]]) 51 | ideal += " and " + html.format(names[-1]) + " are " + html.format('actors') + "." 52 | nugget = templatize(text, {}, self.df) 53 | template = nugget.to_dict() 54 | actual = app.get_preview_html(template) 55 | self.assertEqual(actual, ideal) 56 | -------------------------------------------------------------------------------- /nlg/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # vim:fenc=utf-8 3 | 4 | """ 5 | Miscellaneous utilities. 
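
For example, sanitize_indices rewrites positions in the latter half of an
axis as negative indices, so templates built on a sample dataframe keep
pointing at "the last row" when the data changes; a sketch mirroring
tests/test_utils.py:

>>> sanitize_indices((3, 3), 0)
0
>>> sanitize_indices((3, 3), 2)
-1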
6 | """ 7 | import os.path as op 8 | import re 9 | 10 | import pandas as pd 11 | from spacy.tokens import Token, Doc, Span 12 | from tornado.template import Template 13 | 14 | from gramex.data import filter as gfilter # NOQA: F401 15 | from gramex.data import ( 16 | _filter_groupby_columns, _filter_select_columns, _filter_sort_columns, _filter_col, 17 | _agg_sep 18 | ) 19 | 20 | NP_RULES = { 21 | 'NP1': [{'POS': 'PROPN', 'OP': '+'}], 22 | 'NP2': [{'POS': 'NOUN', 'OP': '+'}], 23 | 'NP3': [{'POS': 'ADV', 'OP': '+'}, {'POS': 'VERB', 'OP': '+'}], 24 | 'NP4': [{'POS': 'ADJ', 'OP': '+'}, {'POS': 'VERB', 'OP': '+'}], 25 | 'QUANT': [{'POS': 'NUM', 'OP': '+'}] 26 | } 27 | QUANT_PATTERN = re.compile(r'(^\.d+|^d+\.?(d?)+)') 28 | _spacy = { 29 | 'model': False, 30 | 'lemmatizer': False, 31 | 'matcher': False 32 | } 33 | 34 | 35 | def _locate_app_config(): 36 | return op.join(op.dirname(__file__), 'app', 'gramex.yaml') 37 | 38 | 39 | def load_spacy_model(): 40 | """Load the spacy model when required.""" 41 | if not _spacy['model']: 42 | from spacy import load 43 | nlp = load('en_core_web_sm') 44 | _spacy['model'] = nlp 45 | else: 46 | nlp = _spacy['model'] 47 | return nlp 48 | 49 | 50 | def get_lemmatizer(): 51 | if not _spacy['lemmatizer']: 52 | from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES 53 | from spacy.lemmatizer import Lemmatizer 54 | lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) 55 | _spacy['lemmatizer'] = lemmatizer 56 | else: 57 | lemmatizer = _spacy['lemmatizer'] 58 | return lemmatizer 59 | 60 | 61 | def make_np_matcher(nlp, rules=NP_RULES): 62 | """Make a rule based noun phrase matcher. 63 | 64 | Parameters 65 | ---------- 66 | nlp : `spacy.lang` 67 | The spacy model to use. 68 | rules : dict, optional 69 | Mapping of rule IDS to spacy attribute patterns, such that each mapping 70 | defines a noun phrase structure. 71 | 72 | Returns 73 | ------- 74 | `spacy.matcher.Matcher` 75 | """ 76 | if not _spacy['matcher']: 77 | from spacy.matcher import Matcher 78 | matcher = Matcher(nlp.vocab) 79 | for k, v in rules.items(): 80 | matcher.add(k, None, v) 81 | _spacy['matcher'] = matcher 82 | else: 83 | matcher = _spacy['matcher'] 84 | return matcher 85 | 86 | 87 | def render_search_result(text, results, **kwargs): 88 | for token, tokenlist in results.items(): 89 | tmpl = [t for t in tokenlist if t.get('enabled', False)][0] 90 | text = text.replace(token, '{{{{ {} }}}}'.format(tmpl['tmpl'])) 91 | return Template(text).generate(**kwargs).decode('utf-8') 92 | 93 | 94 | def join_words(x, sep=' '): 95 | return sep.join(re.findall(r'\w+', x, re.IGNORECASE)) 96 | 97 | 98 | class set_nlg_gramopt(object): # noqa: class to be used as a decorator 99 | """Decorator for adding callables to grammar options of the webapp. 
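
Example
-------
A minimal sketch: the keyword arguments become attributes on the decorated
callable, which the webapp reads to build its grammar/inflection options
(the attribute values below are illustrative):

>>> @set_nlg_gramopt(source='G', fe_name='Pluralize')
... def plural(word):
...     pass
>>> plural.gramopt, plural.source, plural.fe_name
(True, 'G', 'Pluralize')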
100 | """ 101 | def __init__(self, **kwargs): 102 | self.kwargs = kwargs 103 | 104 | def __call__(self, func): 105 | func.gramopt = True 106 | for k, v in self.kwargs.items(): 107 | if not getattr(func, k, False): 108 | setattr(func, k, v) 109 | return func 110 | 111 | 112 | def is_overlap(x, y): 113 | """Whether the token x is contained within any span in the sequence y.""" 114 | if len(y) == 0: 115 | return False 116 | if isinstance(x, Token): 117 | if x.pos_ == "NUM": 118 | return False 119 | elif 'NUM' in [c.pos_ for c in x]: 120 | return False 121 | if len(y) > 1: 122 | if isinstance(x, Token): 123 | return any([x.text in yy.text for yy in y]) 124 | y = y.pop() 125 | if isinstance(x, (Token, Span)) and isinstance(y, Doc): 126 | return x.doc == y 127 | return False 128 | 129 | 130 | def unoverlap(tokens): 131 | """From a set of tokens, remove all tokens that are contained within 132 | others.""" 133 | textmap = {c: c for c in tokens} 134 | newtokens = [] 135 | for token in tokens: 136 | if not is_overlap(textmap[token], set(tokens) - {token}): 137 | newtokens.append(token) 138 | return [textmap[t] for t in newtokens] 139 | 140 | 141 | def ner(doc, matcher, match_ids=False, remove_overlap=True): 142 | """Find all NEs and other nouns in a spacy doc. 143 | 144 | Parameters 145 | ---------- 146 | doc: spacy.tokens.doc.Doc 147 | The document in which to search for entities. 148 | matcher: spacy.matcher.Matcher 149 | The rule based matcher to use for finding noun phrases. 150 | match_ids: list, optional 151 | IDs from the spacy matcher to filter from the matches. 152 | remove_overlap: bool, optional 153 | Whether to remove overlapping tokens from the result. 154 | 155 | Returns 156 | ------- 157 | list 158 | List of spacy.token.span.Span objects. 159 | """ 160 | entities = set() 161 | for span in doc.ents: 162 | newtokens = [c for c in span if not c.is_space] 163 | if newtokens: 164 | newspan = doc[newtokens[0].i: (newtokens[-1].i + 1)] 165 | entities.add(newspan) 166 | if not match_ids: 167 | entities.update([doc[start:end] for _, start, end in matcher(doc)]) 168 | else: 169 | for m_id, start, end in matcher(doc): 170 | if matcher.vocab.strings[m_id] in match_ids: 171 | entities.add(doc[start:end]) 172 | if remove_overlap: 173 | entities = unoverlap(entities) 174 | return entities 175 | 176 | 177 | def sanitize_indices(shape, i, axis=0): 178 | n = shape[axis] 179 | if i <= n // 2: 180 | return i 181 | return -(n - i) 182 | 183 | 184 | def sanitize_text(text, d_round=2): 185 | """All text cleaning and standardization logic goes here.""" 186 | nums = re.findall(r'\d+\.\d+', text) 187 | for num in nums: 188 | text = re.sub(num, str(round(float(num), d_round)), text) 189 | return text 190 | 191 | 192 | def sanitize_df(df, d_round=2, **options): 193 | """All dataframe cleaning and standardizing logic goes here.""" 194 | for c in df.columns[df.dtypes == float]: 195 | df[c] = df[c].round(d_round) 196 | return df 197 | 198 | 199 | def sanitize_fh_args(args, df): 200 | columns = df.columns 201 | meta = { 202 | 'filters': [], # Applied filters as [(col, op, val), ...] 203 | 'ignored': [], # Ignored filters as [(col, vals), ...] 204 | 'sort': [], # Sorted columns as [(col, asc), ...] 205 | 'offset': 0, # Offset as integer 206 | 'limit': None, # Limit as integer - None if not applied 207 | 'by': [], # Group by columns as [col, ...] 
208 | }
209 | res = {}
210 | if '_by' in args:
211 | res['_by'] = _filter_groupby_columns(args['_by'], columns, meta)
212 | col_list = args.get('_c', False)
213 | if not col_list:
214 | col_list = [col + _agg_sep + 'sum' for col in columns # noqa
215 | if pd.api.types.is_numeric_dtype(df[col])]
216 | res['_c'] = []
217 | for c in col_list:
218 | res['_c'].append(_filter_col(c, df.columns)[0])
219 | columns = col_list
220 | elif '_c' in args:
221 | selected, _ = _filter_select_columns(args['_c'], columns, meta)
222 | res['_c'] = [c[0] for c in selected]
223 | if '_sort' in args:
224 | sort = _filter_sort_columns(args, columns, meta)
225 | res['_sort'] = [c[0] for c in sort]
226 | return res
227 | 
228 | 
229 | def add_html_styling(template, style):
230 | """Add HTML styling spans to template elements.
231 | 
232 | Parameters
233 | ----------
234 | template : str
235 | A tornado template
236 | style : dict or bool
237 | If False, no styling is added.
238 | If True, a default bgcolor is added to template variables.
239 | If dict, expected to contain HTML span styling elements.
240 | 
241 | Returns
242 | -------
243 | str
244 | Modified template with each variable stylized.
245 | 
246 | Example
247 | -------
248 | >>> t = 'Hello, {{ name }}!'
249 | >>> add_html_styling(t, True)
250 | 'Hello, <span style="background-color:#c8f442">{{ name }}</span>!'
251 | >>> add_html_styling(t, False)
252 | 'Hello, {{ name }}!'
253 | >>> add_html_styling(t, {'background-color': '#ffffff', 'font-family': 'monospace'})
254 | 'Hello, <span style="background-color:#ffffff;font-family:monospace">{{ name }}</span>!'
255 | """
256 | 
257 | if not style:
258 | return template
259 | pattern = re.compile(r'\{\{[^\{\}]+\}\}')
260 | if isinstance(style, dict):
261 | # convert the style dict into a stylized HTML span
262 | spanstyle = ';'.join(['{}:{}'.format(k, v) for k, v in style.items()])
263 | else:
264 | spanstyle = 'background-color:#c8f442'
265 | for m in re.finditer(pattern, template):
266 | token = m.group()
267 | repl = '<span style="{ss}">{token}</span>'.format(
268 | ss=spanstyle, token=token)
269 | template = re.sub(re.escape(token), repl, template, 1)
270 | return '
<p>
    {template}
</p>
    '.format(template=template) 271 | 272 | 273 | def infer_quant(token): 274 | """Infer the quantitative value from a token which has POS == 'NUM' or is like_num. 275 | 276 | Parameters 277 | ---------- 278 | token : `spacy.tokens.Token` 279 | A spacy token representing a number / scalar. This can be anything with a POS attribute of 280 | 'NUM' or is like_nnum 281 | 282 | Returns 283 | ------- 284 | float or int 285 | 286 | Example 287 | ------- 288 | >>> doc = nlp('Aryabhatta invented the zero.') 289 | >>> infer_quant(doc[-2]) 290 | 0 291 | """ 292 | if re.fullmatch(QUANT_PATTERN, token.shape_): 293 | if "." in token.text: 294 | return float(token.text) 295 | return int(token.text) 296 | -------------------------------------------------------------------------------- /nlg/webapp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # vim:fenc=utf-8 3 | 4 | """ 5 | Module for gramex exposure. This shouldn't be imported anywhere, only for use 6 | with gramex. 7 | """ 8 | import glob 9 | import json 10 | import os 11 | import os.path as op 12 | 13 | from gramex.config import variables 14 | from gramex.config import app_log # noqa: F401 15 | import pandas as pd 16 | from tornado.template import Loader 17 | 18 | from nlg import utils, templatize, grammar_options 19 | from nlg.narrative import Narrative 20 | 21 | DATAFILE_EXTS = {'.csv', '.xls', '.xlsx', '.tsv'} 22 | NARRATIVE_CACHE = {} 23 | 24 | nlg_path = op.join(variables['GRAMEXDATA'], 'nlg') 25 | nlp = utils.load_spacy_model() 26 | tmpl_loader = Loader(op.join(op.dirname(__file__), "app", "templates"), autoescape=None) 27 | 28 | if not op.isdir(nlg_path): 29 | os.mkdir(nlg_path) 30 | 31 | 32 | def get_config_modal(handler): 33 | return tmpl_loader.load("init-config-modal.tmpl").generate(handler=handler) 34 | 35 | 36 | def get_narrative_cache(handler): 37 | narrative = NARRATIVE_CACHE.get(handler.current_user.id, Narrative()) 38 | return json.dumps(narrative.to_dict()) 39 | 40 | 41 | download_narrative = get_narrative_cache 42 | load_narrative = get_narrative_cache 43 | 44 | 45 | def new_variable_tmpl(handler): 46 | nugget_id = int(handler.path_args[0]) 47 | variable_ix = handler.path_args[1] 48 | nugget = NARRATIVE_CACHE[handler.current_user.id][nugget_id] 49 | start, end = map(int, variable_ix.split(',')) 50 | span = nugget.doc.text[start:end] 51 | nlg_base = variables['NLG_BASE'].rstrip('/') 52 | return tmpl_loader.load("new-variable.tmpl").generate( 53 | nugget_id=nugget_id, text=span, variable_ix=variable_ix, nlg_base=nlg_base) 54 | 55 | 56 | def add_new_variable(handler): 57 | nugget = NARRATIVE_CACHE[handler.current_user.id][int(handler.path_args[0])] 58 | start, end = map(int, handler.path_args[1].split(',')) 59 | nugget.add_var([start, end], expr=handler.args['expr'][0]) 60 | return nugget.template 61 | 62 | 63 | def get_preview_html(template, interactive=False): 64 | """get_preview_html 65 | 66 | Parameters 67 | ---------- 68 | template : {{_type_}} 69 | 70 | 71 | Returns 72 | ------- 73 | 74 | Example 75 | ------- 76 | """ 77 | text = template['text'] 78 | if interactive: 79 | html = '{}' 80 | else: 81 | html = '{}' 82 | l_offset = len(html.format('')) 83 | offset = 0 84 | tokenmap = sorted(template['tokenmap'], key=lambda x: x['idx']) 85 | for token in tokenmap: 86 | start = token['idx'] + offset 87 | end = start + len(token['text']) 88 | prefix = text[:start] 89 | suffix = text[end:] 90 | text = prefix + html.format(token['text']) + suffix 91 | offset += l_offset 
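# Each wrap grows the text by len(html.format('')) characters, so `offset`
# shifts every subsequent token's `idx` by l_offset per replacement.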
92 | return text 93 | 94 | 95 | def get_variable_settings_tmpl(handler): 96 | nugget_id, variable_ix = handler.path_args 97 | nugget = NARRATIVE_CACHE[handler.current_user.id][int(nugget_id)] 98 | if not variable_ix.isdigit(): 99 | start, stop = map(int, variable_ix.split(",")) 100 | variable = nugget.get_var((start, stop)).to_dict() 101 | else: 102 | variable_i = int(variable_ix) 103 | variable = nugget.get_var(variable_i).to_dict() 104 | tmpl = tmpl_loader.load("variable-settings.tmpl") 105 | return tmpl.generate( 106 | variable=variable, nugget_id=nugget_id, variable_id=variable_ix, 107 | grammar_options=grammar_options) 108 | 109 | 110 | def set_variable_settings_tmpl(handler): 111 | nugget_id, variable_ix = handler.path_args 112 | nugget = NARRATIVE_CACHE[handler.current_user.id][int(nugget_id)] 113 | if not variable_ix.isdigit(): 114 | variable_i = map(int, variable_ix.split(",")) 115 | else: 116 | variable_i = int(variable_ix) 117 | variable = nugget.get_var(variable_i) 118 | # handler.args will be something like 119 | # {'sourcetext': [''], 'sources': ['0'], 'expr': ['foo'], 'inflections': ['Singularize']} 120 | 121 | expr = handler.args['expr'][0] 122 | if expr: # Ignore the default value of the sources dropdown if expression is present 123 | variable.set_expr(expr) 124 | else: 125 | source = int(handler.args['sources'][0]) 126 | if variable.sources[source]['tmpl'] != variable.enabled_source: 127 | variable.enable_source(source) 128 | 129 | inflections = handler.args.get('inflections', False) 130 | if inflections: 131 | variable.inflections = [grammar_options[i] for i in inflections] 132 | else: 133 | variable.inflections = [] 134 | return nugget.template 135 | 136 | 137 | def get_nugget_settings_tmpl(handler): 138 | nugget = get_nugget(handler) 139 | nugget_id = int(handler.path_args[0]) 140 | nlg_base = variables['NLG_BASE'].rstrip('/') 141 | return tmpl_loader.load("template-settings.tmpl").generate( 142 | template=nugget, nugget_id=nugget_id, nlg_base=nlg_base) 143 | 144 | 145 | def add_condition(handler): 146 | nugget = NARRATIVE_CACHE[handler.current_user.id][int(handler.path_args[0])] 147 | nugget.condition = handler.args['condition'][0] 148 | 149 | 150 | def get_nugget(handler): 151 | nugget_id = int(handler.path_args[0]) 152 | if 'delete' in handler.args: 153 | del NARRATIVE_CACHE[handler.current_user.id][nugget_id] 154 | return NARRATIVE_CACHE[handler.current_user.id].to_dict() 155 | else: 156 | nugget = NARRATIVE_CACHE[handler.current_user.id][nugget_id] 157 | nugget = nugget.to_dict() 158 | nugget['previewHTML'] = get_preview_html(nugget, True) 159 | return nugget 160 | 161 | 162 | def clean_anonymous_files(): 163 | """Remove all files uploaded by anonymous users. 
164 | This may be used at startup when deploying the app.""" 165 | import shutil 166 | anon_dir = op.join(nlg_path, 'anonymous') 167 | if op.isdir(anon_dir): 168 | shutil.rmtree(anon_dir) 169 | 170 | 171 | def is_user_authenticated(handler): 172 | """Check if the current user is authenticated.""" 173 | current_user = getattr(handler, 'current_user', False) 174 | return bool(current_user) 175 | 176 | 177 | def get_user_dir(handler): 178 | if is_user_authenticated(handler): 179 | dirpath = op.join(nlg_path, handler.current_user.id) 180 | else: 181 | dirpath = op.join(nlg_path, 'anonymous') 182 | return dirpath 183 | 184 | 185 | def render_live_template(handler): 186 | """Given a narrative ID and df records, render the template.""" 187 | payload = json.loads(handler.request.body) 188 | df = pd.DataFrame.from_records(payload['data']) 189 | nrid = payload['nrid'] 190 | if not nrid.endswith('.json'): 191 | nrid += '.json' 192 | with open(op.join(get_user_dir(handler), nrid), 'r', encoding='utf8') as fin: 193 | narrative = json.load(fin) 194 | narrative = Narrative.from_json(narrative) 195 | return narrative.to_html(**narrative.html_style, df=df) 196 | 197 | 198 | def get_style_kwargs(handler_args): 199 | style_kwargs = { 200 | 'style': handler_args.pop('style', ['para'])[0], 201 | 'liststyle': handler_args.pop('liststyle', ['html'])[0], 202 | } 203 | style_kwargs.update({k: json.loads(v[0]) for k, v in handler_args.items()}) 204 | return style_kwargs 205 | 206 | 207 | def render_narrative(handler): 208 | orgdf = get_original_df(handler) 209 | narrative = NARRATIVE_CACHE.get(handler.current_user.id, False) 210 | if narrative: 211 | style_kwargs = get_style_kwargs(handler.args) 212 | pl = {'render': narrative.to_html(**style_kwargs, df=orgdf), 213 | 'style': narrative.html_style} 214 | else: 215 | pl = {'render': '', 'style': Narrative.default_style} 216 | return pl 217 | 218 | 219 | def get_original_df(handler): 220 | """Get the original dataframe which was uploaded to the webapp.""" 221 | data_dir = get_user_dir(handler) 222 | meta_path = op.join(data_dir, 'meta.cfg') 223 | if op.isfile(meta_path): 224 | with open(meta_path, 'r') as fout: # noqa: No encoding for json 225 | meta = json.load(fout) 226 | dataset_path = op.join(data_dir, meta['dsid']) 227 | return pd.read_csv(dataset_path, encoding='utf-8') 228 | 229 | 230 | def render_template(handler): 231 | """Render a set of templates against a dataframe and formhandler actions on it.""" 232 | orgdf = get_original_df(handler) 233 | nugget = NARRATIVE_CACHE[handler.current_user.id][int(handler.path_args[0])] 234 | return nugget.render(orgdf) 235 | 236 | 237 | def save_nugget(sid, nugget): 238 | narrative = NARRATIVE_CACHE.get(sid, Narrative()) 239 | narrative.append(nugget) 240 | if len(narrative) > 0: 241 | NARRATIVE_CACHE[sid] = narrative 242 | # outpath = op.join(nlg_path, sid + ".json") 243 | # with open(outpath, 'w', encoding='utf8') as fout: 244 | # json.dump([n.to_dict() for n in narrative], fout, indent=4) 245 | 246 | 247 | def process_text(handler): 248 | """Process English text in the context of a df and formhandler arguments 249 | to templatize it.""" 250 | payload = json.loads(handler.request.body.decode('utf8')) 251 | df = pd.DataFrame.from_records(payload['data']) 252 | args = payload.get('args', {}) or {} 253 | nugget = templatize(nlp(payload['text']), args.copy(), df) 254 | save_nugget(handler.current_user.id, nugget) 255 | nugget = nugget.to_dict() 256 | nugget['previewHTML'] = get_preview_html(nugget) 257 | return nugget 258 | 
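# A sketch of the JSON body `process_text` expects (field names taken from the
# function above; the route mapping to this handler lives in gramex.yaml):
#
#   {
#       "text": "James Stewart is the actor with the highest rating.",
#       "args": {"_sort": ["-rating"]},
#       "data": [{"category": "Actors", "name": "James Stewart", ...}]
#   }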
259 | 260 | def read_current_config(handler): 261 | """Read the current data and narrative IDs written to the session file.""" 262 | user_dir = get_user_dir(handler) 263 | meta_path = op.join(user_dir, 'meta.cfg') 264 | if not op.isdir(user_dir): 265 | os.mkdir(user_dir) 266 | if not op.isfile(meta_path): 267 | return {} 268 | with open(meta_path, 'r') as fout: # noqa: No encoding for json 269 | meta = json.load(fout) 270 | return meta 271 | 272 | 273 | def get_dataset_files(handler): 274 | """Get all filenames uploaded by the user. 275 | 276 | Parameters 277 | ---------- 278 | handler : tornado.RequestHandler 279 | 280 | Returns 281 | ------- 282 | list 283 | List of filenames. 284 | """ 285 | files = glob.glob('{}/*'.format(get_user_dir(handler))) 286 | return [f for f in files if op.splitext(f)[-1].lower() in DATAFILE_EXTS] 287 | 288 | 289 | def get_narrative_config_files(handler): 290 | """Get list of narrative config files generated by the user. 291 | 292 | Parameters 293 | ---------- 294 | handler : tornado.RequestHandler 295 | 296 | Returns 297 | ------- 298 | list 299 | List of narrative configurations. 300 | """ 301 | return glob.glob('{}/*.json'.format(get_user_dir(handler))) 302 | 303 | 304 | def init_form(handler): 305 | """Process input from the landing page and write the current session config.""" 306 | meta = {} 307 | data_dir = get_user_dir(handler) 308 | if not op.isdir(data_dir): 309 | os.makedirs(data_dir) 310 | 311 | # handle dataset 312 | data_file = handler.request.files.get('data-file', [{}])[0] 313 | if data_file: 314 | # TODO: Unix filenames may not be valid Windows filenames. 315 | outpath = op.join(data_dir, data_file['filename']) 316 | with open(outpath, 'wb') as fout: 317 | fout.write(data_file['body']) 318 | else: 319 | dataset = handler.args['dataset'][0] 320 | outpath = op.join(data_dir, dataset) 321 | # shutil.copy(outpath, fh_fpath) 322 | meta['dsid'] = op.basename(outpath) 323 | 324 | # handle config 325 | config_name = handler.get_argument('narrative', '') 326 | if config_name: 327 | outpath = op.join(data_dir, config_name) 328 | # shutil.copy(config_path, op.join(local_data_dir, 'config.json')) 329 | else: 330 | conf_file = handler.request.files.get('config-file', [{}])[0] 331 | if conf_file: 332 | outpath = op.join(data_dir, conf_file['filename']) 333 | with open(outpath, 'wb') as fout: 334 | fout.write(conf_file['body']) 335 | else: 336 | outpath = False 337 | if outpath: 338 | meta['nrid'] = op.basename(outpath) 339 | 340 | # write meta config 341 | with open(op.join(data_dir, 'meta.cfg'), 'w') as fout: # NOQA 342 | json.dump(meta, fout, indent=4) 343 | 344 | 345 | def get_init_config(handler): 346 | """Get the initial default configuration for the current user.""" 347 | user_dir = get_user_dir(handler) 348 | metapath = op.join(user_dir, 'meta.cfg') 349 | if op.isfile(metapath): 350 | with open(metapath, 'r') as fout: # NOQA: no encoding for JSON 351 | meta = json.load(fout) 352 | narrative_file = meta.get('nrid', '') 353 | narrative_name = op.splitext(narrative_file)[0] 354 | config_file = op.join(user_dir, narrative_file) 355 | if op.isfile(config_file): 356 | with open(config_file, 'r') as fout: # NOQA: no encoding for JSON 357 | meta['config'] = json.load(fout) 358 | global NARRATIVE_CACHE 359 | NARRATIVE_CACHE = {} 360 | NARRATIVE_CACHE[handler.current_user.id] = \ 361 | Narrative.from_json(meta['config']) 362 | app_log.debug('Initial config loaded from {}'.format(config_file)) 363 | return { 364 | 'style': 
NARRATIVE_CACHE[handler.current_user.id].html_style, 365 | 'nrid': narrative_name} 366 | return {} 367 | 368 | 369 | def save_narrative(handler): 370 | name = handler.path_args[0] 371 | if not name.endswith('.json'): 372 | name += '.json' 373 | outpath = op.join(get_user_dir(handler), name) 374 | with open(outpath, 'w', encoding='utf8') as fout: 375 | json.dump(NARRATIVE_CACHE[handler.current_user.id].to_dict(), 376 | fout, indent=4) 377 | 378 | 379 | def move_nuggets(handler): 380 | pop, drop = map(int, handler.path_args) 381 | narrative = NARRATIVE_CACHE[handler.current_user.id] 382 | popped = narrative.pop(pop) 383 | narrative.insert(drop, popped) 384 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This is intended for installation of the Gramex app. 2 | # Do NOT use for anything. 3 | git+https://github.com/gramener/gramex-nlg@dev#egg=nlg 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | 4 | [pep8] 5 | ignore = E265,E402 6 | 7 | [flake8] 8 | exclude=build,dist,docs,.eggs,node_modules,.vscode 9 | max-line-length=99 10 | ; E911 allows use of str(). Required for pathlib.Path to string conversions 11 | ; N802 ignores "function name should be in lowercase". Required for 12 | ; tearDownModule(), extendMarkdown, etc where function name is pre-defined 13 | ignore=E911,N802 14 | 15 | [nosetests] 16 | verbosity=2 17 | nocapture=1 18 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | """ 6 | NLG Setup. 7 | """ 8 | 9 | import builtins 10 | from setuptools import setup, find_packages 11 | 12 | 13 | builtins.__NLG_SETUP__ = True 14 | 15 | # Setuptools config 16 | NAME = "nlg" 17 | DESCRIPTION = "Natural Language Generation framework for Python." 18 | with open('README.rst', encoding='utf-8') as f: 19 | LONG_DESCRIPTION = f.read() 20 | MAINTAINER = 'Jaidev Deshpande' 21 | MAINTAINER_EMAIL = 'jaidev.deshpande@gramener.com' 22 | URL = "https://github.com/gramener/gramex-nlg" 23 | DOWNLOAD_URL = 'https://pypi.org/project/nlg/#files' 24 | LICENSE = 'MIT' 25 | PROJECT_URLS = { 26 | 'Bug Tracker': 'https://github.com/gramener/gramex-nlg/issues', 27 | 'Documentation': 'https://learn.gramener.com/guide/nlg', 28 | 'Source Code': 'https://github.com/gramener/gramex-nlg' 29 | } 30 | 31 | # Requirements 32 | install_requires = [ 33 | 'gramex', 34 | 'humanize', 35 | 'inflect', 36 | 'spacy==2.1.8', 37 | ] 38 | 39 | # Setup 40 | import nlg # NOQA: E402 41 | setup( 42 | name=NAME, 43 | maintainer=MAINTAINER, 44 | maintainer_email=MAINTAINER_EMAIL, 45 | description=DESCRIPTION, 46 | license=LICENSE, 47 | url=URL, 48 | download_url=DOWNLOAD_URL, 49 | include_package_data=True, 50 | version=nlg.__version__, 51 | long_description=LONG_DESCRIPTION, 52 | packages=find_packages(), 53 | install_requires=install_requires 54 | ) 55 | --------------------------------------------------------------------------------