├── .coveragerc
├── .editorconfig
├── .eslintignore
├── .eslintrc.js
├── .gitignore
├── .gitlab-ci.yml
├── .htmllintrc
├── .stylelintrc.js
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.rst
├── doc
└── images
│ ├── nlg-ide-input.png
│ ├── nlg-ide-toplist.gif
│ └── nlg-template-settings.png
├── examples
└── intro-narrative-api.ipynb
├── nlg
├── __init__.py
├── app
│ ├── __init__.py
│ ├── body.html
│ ├── error
│ │ ├── 400.html
│ │ ├── 401.html
│ │ ├── 403.html
│ │ ├── 404.html
│ │ └── 500.html
│ ├── gramex.yaml
│ ├── html
│ │ ├── demo.html
│ │ └── template-navbar.html
│ ├── index.html
│ ├── login.html
│ ├── nlg.js
│ ├── setup.sh
│ ├── style.css
│ ├── template-navbar.html
│ └── templates
│ │ ├── demo.tmpl
│ │ ├── new-variable.tmpl
│ │ ├── template-settings.tmpl
│ │ └── variable-settings.tmpl
├── grammar.py
├── narrative.py
├── search.py
├── tests
│ ├── __init__.py
│ ├── data
│ │ ├── actors.csv
│ │ └── imdb_ratings.csv
│ ├── test_grammar.py
│ ├── test_narrative.py
│ ├── test_search.py
│ ├── test_utils.py
│ └── test_webapp.py
├── utils.py
└── webapp.py
├── requirements.txt
├── setup.cfg
└── setup.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | show_missing = True
3 | skip_covered = True
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # .editorconfig maintains consistent coding styles between different editors.
2 | # Get plugins at http://editorconfig.org/
3 | # - Sublime text: https://github.com/sindresorhus/editorconfig-sublime
4 | # - Notepad++: https://github.com/editorconfig/editorconfig-notepad-plus-plus
5 |
6 | root = true
7 |
8 | # Apply common styles for most standard code files.
9 | # Do not apply to * - that covers binary files as well
10 | [*.{js,html,php,py,css,svg,json,less,yaml,yml,scss,xml,sh,java,bat,R,tmpl}]
11 | end_of_line = lf
12 | insert_final_newline = true
13 | trim_trailing_whitespace = true
14 | charset = utf-8
15 | # Stick to 2-space indenting by default, to conserve space
16 | indent_style = space
17 | indent_size = 2
18 |
19 | [*.py]
20 | indent_size = 4
21 |
22 | [Makefile]
23 | indent_style = tab
24 | indent_size = 4
25 |
26 | [testlib/test_config/config.empty.yaml]
27 | insert_final_newline = false
28 | [tests/dir/gramex.yaml]
29 | insert_final_newline = false
30 |
--------------------------------------------------------------------------------
/.eslintignore:
--------------------------------------------------------------------------------
1 | **/node_modules/*
2 | app/node_modules/*
3 | docs/*
4 |
5 | # Our Gitlab runner uses eslint@2.6.0 to allow eslint-template.
6 | # chromecapture.js requires ecmaVersion 8 which eslint@2.6.0 does not support
7 | # So let's not eslint that
8 | gramex/apps/capture/chromecapture.js
9 |
--------------------------------------------------------------------------------
/.eslintrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | "plugins": [
3 | "template" // Handle Tornado templates and JS in HTML files
4 | ],
5 | "env": {
6 | "es6": true, // Allow ES6 in JavaScript
7 | "browser": true, // Include browser globals
8 | "jquery": true, // Include jQuery and $
9 | "mocha": true // Include it(), assert(), etc
10 | },
11 | "globals": {
12 | "_": true, // underscore.js
13 | "d3": true, // d3.js
14 | "vg": true, // vega.js
15 | "L": true, // leaflet.js
16 | "ga": true, // Google analytics
17 | "G": true, // G.min.js
18 | "topojson": true, // topojson.js
19 | "moment": true, // moment.js
20 | "numeral": true, // numeral.js
21 | "assert": true // chai.js
22 | },
23 | "extends": "eslint:recommended",
24 | "rules": {
25 | /* Override default rules */
26 | "indent": ["off", 2], // We eventually want 2 space indentation
27 | "linebreak-style": ["off", "unix"], // We eventually want UNIX style line
28 | "quotes": ["off", "double"], // We may go for a double-quotes style
29 | "semi": ["off", "never"] // We may go for a no-semicolon style
30 | }
31 | };
32 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | TODO
2 |
3 | # Ignore auto-generated documentation
4 | docs/gramex*.rst
5 | docs/modules.rst
6 |
7 | # Ignore files generated by testcases
8 | tests/**/*.test
9 | tests/**/gen.*
10 | testlib/**/gen.*
11 | .noseids
12 |
13 | # Ignore byte-compiled / optimised / DLL files
14 | *.py[cod]
15 |
16 | # Filenames should NOT have spaces
17 | * *
18 |
19 | # Ignore SQLite3 files. Gramex creates some automatically
20 | *.sqlite3
21 | *.sqlite3-journal
22 |
23 | # Cache folders used for testing
24 | .cache-*
25 | .pytest_cache
26 |
27 | # Ignore log files
28 | *.log*
29 |
30 | # Don't commit data files, except what's required for testing or by Gramex apps
31 | *.csv
32 | *.xls*
33 | !tests/**/*.csv
34 | !tests/*.xlsx
35 | !testlib/**/*.csv
36 | !testlib/*.xlsx
37 | !gramex/apps/**/*.csv
38 | !gramex/apps/guide/**/*.xlsx
39 |
40 | *.ppt*
41 | !testlib/input.pptx
42 | !tests/template.pptx
43 | !gramex/apps/guide/formhandler/input.pptx
44 | !gramex/apps/guide/pptxhandler/examples-input.pptx
45 |
46 | # Don't commit databases created by test cases
47 | tests/*.db
48 |
49 | # Don't commit uploads created by test cases
50 | tests/uploads
51 |
52 | # Don't commit ZIP files, except what's required for testing
53 | *.7z
54 | *.zip
55 | !tests/*.zip
56 |
57 | # Documents
58 | *.doc*
59 | *.pdf
60 |
61 | # Avoid media files
62 | *.avi
63 | *.mp*
64 | *.wmv
65 |
66 | # Backup files
67 | ~$*
68 | *~
69 | *.bak*
70 |
71 | # Sublime-text workspaces, etc
72 | *.sublime-*
73 | .vscode/
74 | .vim/
75 |
76 | # IPython Notebook checkpoints
77 | .ipynb_checkpoints
78 |
79 | # Typically bash.exe.stackdump on Cygwin
80 | *.stackdump
81 |
82 | # Node modules and bower components
83 | node_modules
84 | bower_components
85 |
86 | # Prefer yarn.lock over package-lock.json
87 | package-lock.json
88 |
89 | # Windows shortcut files
90 | *.lnk
91 |
92 | # Windows / Mac OS junk files
93 | Desktop.ini
94 | $RECYCLE.BIN/
95 | *[Tt]humbs.db
96 | *.DS_Store
97 |
98 | # R history files
 99 | .Rhistory
100 |
101 | # C extensions
102 | *.so
103 |
104 | # Packages
105 | *.egg
106 | *.eggs
107 | *.egg-info
108 | dist
109 | build
110 | eggs
111 | parts
112 | bin
113 | var
114 | sdist
115 | develop-eggs
116 | .installed.cfg
117 | lib
118 | lib64
119 |
120 | # Installer logs
121 | pip-log.txt
122 |
123 | # Unit test / coverage reports
124 | .coverage
125 | .tox
126 | nosetests.xml
127 | htmlcov
128 | cover
129 |
130 | # Translations
131 | *.mo
132 |
133 | # Mr Developer
134 | .mr.developer.cfg
135 | .project
136 | .pydevproject
137 |
138 | # Pycharm
139 | .idea
140 |
141 | # Complexity
142 | output/*.html
143 | output/*/index.html
144 |
145 | # Sphinx
146 | docs/_build
147 |
148 | # For Linux FUSE file system
149 | .fuse_hidden*
150 |
151 | # IDE
152 | .vim
153 |
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | validate:
2 | tags: [py3]
3 | script:
4 | - python setup.py develop
5 | - python -m spacy download en
6 | - gramex license accept
7 | - nosetests -sv --with-coverage --cover-package=nlg
8 |
9 |
10 | deploy:
11 | stage: deploy
12 | script: deploy
13 | only: [dev]
14 | tags: [py3]
15 | variables:
16 | SERVER: ubuntu@uat.gramener.com # Deploy to uat.gramener.com/app-name/
17 | URL: nlg-tmplgen
18 | SETUP: sh setup.sh
19 | VERSION: py3v1
20 | PORT: 8040
21 |
--------------------------------------------------------------------------------
/.htmllintrc:
--------------------------------------------------------------------------------
1 | // NLG .htmllintrc v1.2
2 | {
3 | "plugins": [],
4 |
5 | "attr-bans": [
6 | "align",
7 | "background",
8 | "border",
9 | // "frameborder", // frameborder is used in YouTube embeds
10 | "longdesc",
11 | "marginwidth",
12 | "marginheight",
13 | "scrolling"
14 | ],
15 | "attr-name-style": false,
16 | "attr-no-dup": false, // attr name may be computed, and get replaced by {}
17 | "attr-no-unsafe-char": false, // title contains single quotes '
18 | "attr-quote-style": "double", // attributes contain double quotes
19 | "attr-req-value": false,
20 | "class-no-dup": true, // no duplicate classes in a tag
21 | "doctype-first": false, // snippet templates need not begin with doctype
22 | "doctype-html5": true,
23 | "fig-req-figcaption": false,
24 | "focusable-tabindex-style": false,
25 | "head-req-title": false, // title may be inside a Block.run()
26 | "href-style": false,
27 | "html-req-lang": false,
28 | "id-class-ignore-regex": "\\{ *\\}", // ignore tornado template id / class
29 | "id-class-no-ad": false,
30 | "id-class-style": false, // no styles enforced for now
31 | "id-no-dup": false, // template replacement IDs { } cause duplication
32 | "img-req-alt": "allownull", // for dynamic image content
33 | "img-req-src": false,
34 | "indent-style": "spaces",
35 | "indent-width": 2,
36 | "label-req-for": false, // cannot use if multiple forms with same key
37 | "line-end-style": false, // raises too many errors
38 | "raw-ignore-regex": "<%.*?%>\\s*|{[%#{].*?[%#}]}\\s*", // ignore templates
39 | "spec-char-escape": false, // using > or < is not that big a deal
40 | "table-req-caption": false,
41 | "tag-bans": [
42 | // "b", // Bootstrap caret example uses
43 | // "i", // Font-awesome icons use
44 | "s", // avoid strike tag, deprecated
45 | // "style", // Single-page templates need style tag
46 | "u",
47 | "strike",
48 | "font",
49 | "center"
50 | ],
51 | "tag-name-lowercase": true,
52 | "tag-name-match": true,
53 | "tag-self-close": false,
54 | "title-max-len": false, // we sometimes have tables inside the title=""
55 | "title-no-dup": true
56 | }
57 |
--------------------------------------------------------------------------------
/.stylelintrc.js:
--------------------------------------------------------------------------------
1 | "use strict"
2 |
3 | module.exports = {
4 | rules: {
5 | "at-rule-no-unknown": true,
6 | "block-no-empty": true,
7 | "color-no-invalid-hex": true,
8 | "comment-no-empty": true,
9 | "declaration-block-no-duplicate-properties": [
10 | true,
11 | {
12 | ignore: ["consecutive-duplicates-with-different-values"]
13 | }
14 | ],
15 | "declaration-block-no-shorthand-property-overrides": true,
16 | "font-family-no-duplicate-names": true,
17 | "font-family-no-missing-generic-family-keyword": true,
18 | "function-calc-no-unspaced-operator": true,
19 | "function-linear-gradient-no-nonstandard-direction": true,
20 | "keyframe-declaration-no-important": true,
21 | "media-feature-name-no-unknown": true,
22 | "no-descending-specificity": true,
23 | "no-duplicate-at-import-rules": true,
24 | "no-duplicate-selectors": true,
25 | "no-empty-source": true,
26 | "no-extra-semicolons": true,
27 | "no-invalid-double-slash-comments": true,
28 | "property-no-unknown": true,
29 | "selector-pseudo-class-no-unknown": true,
30 | "selector-pseudo-element-no-unknown": true,
31 | "selector-type-no-unknown": true,
32 | "string-no-newline": true,
33 | "unit-no-unknown": true
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # Config file for automatic testing at travis-ci.org
2 |
3 | # Run in Python 3 only. Drop Python 2 testing
4 | language: python
5 | python: '3.7'
6 |
7 | dist: xenial
8 | sudo: yes
9 |
10 | # Cache modules for faster builds
11 | cache:
12 | timeout: 1000
13 | pip: true
14 | npm: true
15 | yarn: true
16 | # Don't cache miniconda directory. It's slower. Fresh install takes ~200s.
17 | # But caching takes ~150s (extraction) + ~190s (re-packing) = ~340s (slower).
18 | # directories:
19 | # - $HOME/miniconda
20 |
21 | install:
22 | # Install miniconda
23 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/miniconda.sh
24 | - bash $HOME/miniconda.sh -b -u -p $HOME/miniconda
25 | - export PATH="$HOME/miniconda/bin:$PATH"
26 | - hash -r
27 | - conda config --set always_yes yes --set changeps1 no
28 | # Install pip modules
29 | - pip install flake8 pep8-naming flake8-gramex flake8-blind-except flake8-print flake8-debugger nose coverage
30 | - npm install -g yarn
31 | - yarn global add eclint eslint eslint-plugin-html eslint-plugin-template htmllint-cli
32 | - yarn install
33 | # Set up variables
34 | - export BRANCH=$TRAVIS_BRANCH
35 |
36 | script:
37 | - eclint check '**/*.html' '**/*.js' '**/*.css' '**/*.yaml' '**/*.md'
38 | - htmllint
39 | - flake8
40 | - bandit nlg --recursive --format csv || true
41 | - pip install -e .
42 | - gramex setup nlg/app
43 | - nosetests -sv --with-coverage --cover-package=nlg
44 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Gramex-NLG is licensed under the [MIT License][1]
2 |
3 | Copyright (c) 2019, Gramener
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
22 |
23 | Gramex includes [third party libraries][2] with permissive licenses.
24 |
25 | [1]: https://opensource.org/licenses/MIT
26 | [2]: https://learn.gramener.com/guide/license/thirdparty.md
27 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | graft nlg/app
2 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | |Build Status|
2 |
3 | nlg
4 | ===
5 |
6 | Natural Language Generation component for
 7 | `Gramex <https://github.com/gramener/gramex>`__. The NLG module is
 8 | designed to work as a Python library, as well as a `Gramex
 9 | application <https://learn.gramener.com/guide/apps/>`__.
10 |
11 | The library:
12 |
13 | 1. Automatically creates tornado templates from English text in the
14 | context of a dataset.
15 | 2. Allows for modification and generalization of these templates.
16 | 3. Renders these templates as a unified narrative.
17 |
18 | Installation
19 | ------------
20 |
21 | The NLG library can be installed from PyPI as follows:
22 |
23 | .. code:: bash
24 |
25 | $ pip install nlg
26 | $ python -m spacy download en_core_web_sm
27 | $ gramex setup ui
28 |
29 | or from source as follows:
30 |
31 | .. code:: bash
32 |
33 | $ git clone https://github.com/gramener/gramex-nlg.git
34 | $ cd gramex-nlg
35 | $ pip install -e .
36 | $ gramex setup ./app
37 |
38 | Usage
39 | -----
40 |
41 | Using the Python library
42 | ~~~~~~~~~~~~~~~~~~~~~~~~
43 |
 44 | To get started, see the `example notebook here <https://github.com/gramener/gramex-nlg/blob/master/examples/intro-narrative-api.ipynb>`_.
45 |
46 | .. code:: python
47 |
48 | >>> import pandas as pd
49 | >>> from gramex import data
50 |
51 | >>> # load some data
52 | >>> df = pd.read_csv('iris.csv')
53 |
54 | >>> # specify a FormHandler operation - find the average sepal_width per species
55 | >>> fh_args = {'_by': ['species'], '_c': ['sepal_width|avg'], '_sort': ['sepal_width|avg']}
56 |
57 | >>> # Draw a sample
58 | >>> xdf = df.sample(frac=0.1, random_state=10)
59 |
60 | >>> # perform the FormHandler operation on the data
61 | >>> print(data.filter(xdf, fh_args.copy()))
62 | species sepal_width|avg
63 | 2 virginica 2.70
64 | 1 versicolor 2.92
65 | 0 setosa 3.15
66 |
67 | >>> # Write something about the output
 68 | >>> from nlg.utils import load_spacy_model
 69 | >>> nlp = load_spacy_model(); text = nlp("The virginica species has the least average sepal_width.")
70 |
71 | >>> # Generate a template
72 | >>> from nlg.search import templatize
73 | >>> tmpl = templatize(text, fh_args, xdf)
74 | >>> print(tmpl)
75 | {% set fh_args = {"_by": ["species"], "_c": ["sepal_width|avg"], "_sort": ["sepal_width|avg"]} %}
76 | {% set df = U.gfilter(orgdf, fh_args.copy()) %}
77 | {% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}
78 | The {{ df["species"].iloc[0] }} species has the least average {{ fh_args["_sort"][0].lower() }}.
79 |
80 | >>> # Render the same template with new data.
81 | >>> print(render(df, tmpl).decode('utf8'))
82 | The versicolor species has the least average sepal_width|avg.
83 |
84 | Using the NLG IDE
85 | ~~~~~~~~~~~~~~~~~
86 |
87 | The NLG module ships with an IDE. The IDE is a `Gramex
88 | application `__.
89 |
90 | To use it, install the NLG module as indicated above, and add the
91 | following to your ``gramex.yaml``:
92 |
93 | .. code:: yaml
94 |
95 | variables:
96 | NLG_ROOT:
97 | function: nlg.utils._locate_app_config()
98 |
99 | import:
100 | nlg:
101 | path: $NLG_ROOT
102 | YAMLURL: $YAMLURL/nlg
103 |
104 | This configuration mounts the app at the ``/nlg/`` resource. Start gramex to access it.
105 |
106 | The Gramex NLG IDE
107 | ------------------
108 |
109 | The NLG component depends on two sources of information:
110 |
111 | 1. A source dataset, which can be uploaded on to the IDE. A dataset is
112 | uniquely identified with its filename. Once uploaded, the file
113 | persists and is available for selection from the app. Any *file* that
114 | makes a valid URL for
115 | `FormHandler `__ can be
116 | used with the NLG app.
117 | 2. A *narrative*, which is a collection of templates and rules around
118 | them. The narrative consists of the configuration which governs the
119 | rendered text. An existing narrative can be uploaded through the "Add
120 | Data" button, or can be created through the IDE. Once created, the
121 | narrative can be named and becomes available for selection from the
122 | "Add Data" modal.
123 |
124 | The NLG IDE
125 | -----------
126 |
127 | The primary purpose of the IDE is to create or edit narratives based on
128 | a dataset. Once a dataset has been selected, it is exposed in the IDE as
129 | a `FormHandler
130 | table `__.
131 |
132 | .. figure:: doc/images/nlg-ide-input.png
133 | :alt:
134 |
135 | Users can now type English text into the IDE and add it to the
136 | narrative. This automatically templatizes the text, and adds the
137 | template to the narrative. For example, typing "Humphrey Bogart is at
138 | the top of the list." does this:
139 |
140 | .. figure:: doc/images/nlg-ide-toplist.gif
141 | :alt:
142 |
143 | This means that the input statement has been templatized and added to
144 | the narrative. The part of the input text that was successfully
145 | templatized is highlighted in green. Clicking on the spanner button next
146 | to a template opens the `Template Settings <#template-settings>`__
147 | modal.
148 |
149 | Template Settings
150 | -----------------
151 |
152 | .. figure:: doc/images/nlg-template-settings.png
153 | :alt:
154 |
155 | This dialog provides configuration options for all template attributes:
156 |
157 | 1. **Template Name** - Each template can optionally be named.
158 | 2. **Condition** - Any Python expression which evaluates to a boolean
159 | may be set as a condition, which controls whether the template is
160 | rendered.
161 | 3. The actual Tornado template itself can be edited. Any valid Tornado
162 | template is acceptable.
163 | 4. **Token Settings** - Every token from the input text that finds a
164 | match in the dataset or in FormHandler arguments (i.e. every token
165 | that is highlighted in the preview) is converted into a `template
166 | expression `__.
167 | Such tokens have their own attributes, as follows:
168 |
169 | - **Token search results** - if a token is found in more than one
170 | place (say, a dataframe cell as well as a FormHandler argument),
171 | this setting allows the user to select the right result.
172 | - **Grammar options** - the NLG engine may automatically apply
173 | certain string formatting or lexical operations to the template
174 | expression to make it match the input text. Any number of these
175 | operations can be enabled / disabled through this setting.
176 | - **Make variable** - a token may be set as a local variable within
177 | the template.
178 | - **Ignore** - the template expression corresponding to the token
179 | may be ignored, and set back to the literal input text.
180 |
181 | 5. **Run Template** - Run the current template against the dataframe and
182 | preview its output.
183 | 6. **Save Template** - Save the template. Note that this is required if
184 | the template has been manually edited in the textarea.
185 |
186 | Naming and Saving a Narrative
187 | -----------------------------
188 |
189 | Once a narrative has been fully configured, it can be named and saved.
 190 | Doing so causes it to appear in the narrative dropdown menu on the app.
191 |
192 | Sharing a Narrative
193 | -------------------
194 |
 195 | After a narrative has been named and saved, it can be shared in two modes:
196 |
197 | 1. **IDE mode** - This option lets users copy a URL that redirects to
198 | the IDE, with the current dataset and the current narrative set in
199 | the session.
200 | 2. **Embed mode** - Copy an HTML snippet to embed into a page which
201 | contains a Formhandler table. The template will render live as the
202 | table changes.
203 |
204 |
205 | Glossary: Grammar of Data-Driven Narratives
206 | ===========================================
207 |
208 | This section describes the building blocks of Gramex's approach to natural language generation.
209 | These concepts serve as primitives to the logic and automation capabilities of the NLG engine.
210 |
211 | 1. **Narrative** - A *narrative* is a piece of text written by a user or generated by a machine which contains facts about a dataset.
 212 | A narrative in its entirety is assumed to be a function of three items:
213 |
214 | a. A dataset
215 | b. Operations on that dataset
216 | c. Some "source text" provided by the user.
217 |
218 | For example, the following is a narrative about the `Fisher Iris dataset `_.
219 |
 220 | The iris dataset contains measurements from a hundred and fifty samples of three unique species of the iris flower - setosa, versicolor and virginica. The species are equally distributed within the dataset, so that each species has fifty samples. For each sample, four measurements are taken - sepal width, petal width, petal length and sepal length. The average petal length of the setosa is significantly less than that of versicolor or virginica. The average petal width of virginica is much higher than that of versicolor. However, there is no pair of features that can uniquely identify a species. The presence of such properties makes the iris dataset ideal for explaining machine learning concepts.
221 |
222 | 2. **Nugget** - A *nugget* is ideally a single sentence which conveys a fact about the data. Each sentence in the example narrative except the last two is a nugget. Note that each nugget derives its facts from the source data directly, or from the result of some operation on the data. For example, the following nugget
223 |
224 | The average petal length of the setosa is significantly less than that of versicolor or virginica.
225 |
226 | derives from a groupby-and-average operation on one column of the dataset. Some nuggets, like the one enumerating the number of samples in the dataset, derive from the raw dataset, *not* from the result of any operations on it. A narrative is essentially an ordered collection of nuggets.
227 |
228 | 3. **Variables** - A *variable* is a piece of text which can change with the data or the operations performed on it. Here is a reproduction of the example narrative, with all variables shown in bold.
229 |
 230 | The iris dataset contains measurements from **a hundred and fifty** samples of **three** unique species of the iris flower - **setosa, versicolor and virginica**. The species are equally distributed within the dataset, so that each species has **fifty** samples. For each sample, **four** measurements are taken - **sepal width, petal width, petal length and sepal length**. The **average petal length** of the setosa is significantly **less** than that of versicolor or virginica. The **average petal width** of virginica is much **higher** than that of versicolor. However, there is no pair of features that can uniquely identify a species. The presence of such properties makes the iris dataset ideal for explaining machine learning concepts.
231 |
232 | Note that each variable has two defining components:
233 |
234 | * a *source text*, as initially provided by the user
235 | * one or more *formulae*, which compute the value of the variable for a specific instance of the data. Note that the source text of a variable may be found in multiple places within a dataset, and as such, a variable may have multiple formulae - one of which will have to be preferred by the user.
236 |
237 | For example, for the first variable in example narrative, "hundred and fifty" is the source text, and the formula is any machine code that counts the number of rows in the dataset and translates it into a human-readable form. A variable may additionally have other attributes, like:
238 |
239 | * a set of linguistic *inflections* which determine the form of the rendered variable text - these are distinct from the formula itself, in that the formula creates the base form of the text and inflections modify the base form.
240 | * a *name* used to identify the variable within the template of the nugget
241 |
242 |
243 | Thus, narratives are composed from nuggets, and nuggets from variables. This grammar allows the NLG engine to approach the problem of data-driven, machine-generated narratives in a more *compositional* manner than a *generative* one.
244 |
245 | .. |Build Status| image:: https://travis-ci.org/gramener/gramex-nlg.svg?branch=dev
246 | :target: https://travis-ci.org/gramener/gramex-nlg
247 |
248 |
--------------------------------------------------------------------------------
/doc/images/nlg-ide-input.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramener/gramex-nlg/aa508f66b6ce38eeb3ea3deac96f458865b896ff/doc/images/nlg-ide-input.png
--------------------------------------------------------------------------------
/doc/images/nlg-ide-toplist.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramener/gramex-nlg/aa508f66b6ce38eeb3ea3deac96f458865b896ff/doc/images/nlg-ide-toplist.gif
--------------------------------------------------------------------------------
/doc/images/nlg-template-settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramener/gramex-nlg/aa508f66b6ce38eeb3ea3deac96f458865b896ff/doc/images/nlg-template-settings.png
--------------------------------------------------------------------------------
/examples/intro-narrative-api.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Introduction to NLG's Narrative API\n",
8 | "===================================\n",
9 | "\n",
10 | "This notebook is an introduction to Gramex NLG's Narrative API. Here we will learn how to create data-driven narratives with the NLG module, by going over the building blocks of the API.\n",
11 | "\n",
12 | "Getting Started\n",
13 | "---------------\n",
14 | "\n",
15 | "If the NLG module is not installed, install it as follows:\n",
16 | "\n",
17 | "```bash\n",
18 | "$ pip install nlg\n",
19 | "```"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "Test the installation by running the following cell:"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 1,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "from nlg.search import templatize\n",
36 | "import pandas as pd"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
 43 | "Next, let's load some data. For this tutorial, we will be using [this](https://raw.githubusercontent.com/gramener/gramex-nlg/master/nlg/tests/data/actors.csv) dataset. Please download the file and load it as a pandas dataframe."
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/html": [
54 | "
\n",
55 | "\n",
68 | "
\n",
69 | " \n",
70 | "
\n",
71 | "
\n",
72 | "
category
\n",
73 | "
name
\n",
74 | "
rating
\n",
75 | "
votes
\n",
76 | "
\n",
77 | " \n",
78 | " \n",
79 | "
\n",
80 | "
0
\n",
81 | "
Actors
\n",
82 | "
Humphrey Bogart
\n",
83 | "
0.570197
\n",
84 | "
109
\n",
85 | "
\n",
86 | "
\n",
87 | "
1
\n",
88 | "
Actors
\n",
89 | "
Cary Grant
\n",
90 | "
0.438602
\n",
91 | "
142
\n",
92 | "
\n",
93 | "
\n",
94 | "
2
\n",
95 | "
Actors
\n",
96 | "
James Stewart
\n",
97 | "
0.988374
\n",
98 | "
120
\n",
99 | "
\n",
100 | "
\n",
101 | "
3
\n",
102 | "
Actors
\n",
103 | "
Marlon Brando
\n",
104 | "
0.102045
\n",
105 | "
108
\n",
106 | "
\n",
107 | "
\n",
108 | "
4
\n",
109 | "
Actors
\n",
110 | "
Fred Astaire
\n",
111 | "
0.208877
\n",
112 | "
84
\n",
113 | "
\n",
114 | "
\n",
115 | "
5
\n",
116 | "
Actresses
\n",
117 | "
Katharine Hepburn
\n",
118 | "
0.039188
\n",
119 | "
63
\n",
120 | "
\n",
121 | "
\n",
122 | "
6
\n",
123 | "
Actresses
\n",
124 | "
Bette Davis
\n",
125 | "
0.282807
\n",
126 | "
14
\n",
127 | "
\n",
128 | "
\n",
129 | "
7
\n",
130 | "
Actresses
\n",
131 | "
Audrey Hepburn
\n",
132 | "
0.120197
\n",
133 | "
94
\n",
134 | "
\n",
135 | "
\n",
136 | "
8
\n",
137 | "
Actresses
\n",
138 | "
Ingrid Bergman
\n",
139 | "
0.296140
\n",
140 | "
52
\n",
141 | "
\n",
142 | "
\n",
143 | "
9
\n",
144 | "
Actors
\n",
145 | "
Spencer Tracy
\n",
146 | "
0.466311
\n",
147 | "
192
\n",
148 | "
\n",
149 | "
\n",
150 | "
10
\n",
151 | "
Actors
\n",
152 | "
Charlie Chaplin
\n",
153 | "
0.244426
\n",
154 | "
76
\n",
155 | "
\n",
156 | " \n",
157 | "
\n",
158 | "
"
159 | ],
160 | "text/plain": [
161 | " category name rating votes\n",
162 | "0 Actors Humphrey Bogart 0.570197 109\n",
163 | "1 Actors Cary Grant 0.438602 142\n",
164 | "2 Actors James Stewart 0.988374 120\n",
165 | "3 Actors Marlon Brando 0.102045 108\n",
166 | "4 Actors Fred Astaire 0.208877 84\n",
167 | "5 Actresses Katharine Hepburn 0.039188 63\n",
168 | "6 Actresses Bette Davis 0.282807 14\n",
169 | "7 Actresses Audrey Hepburn 0.120197 94\n",
170 | "8 Actresses Ingrid Bergman 0.296140 52\n",
171 | "9 Actors Spencer Tracy 0.466311 192\n",
172 | "10 Actors Charlie Chaplin 0.244426 76"
173 | ]
174 | },
175 | "execution_count": 2,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "# Replace the path with wherever you have downloaded the dataset.\n",
182 | "df = pd.read_csv('../nlg/tests/data/actors.csv')\n",
183 | "df"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "Let us now sort the dataframe by the `rating` column. NLG is designed to work with Gramex's [FormHandler](https://learn.gramener.com/guide/formhandler). Therefore, we will use FormHandler's own DSL to make any transformation on the dataset."
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 3,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "from gramex.data import filter as gfilter # do not clobber the `filter` function from the Python stdlib\n",
200 | "sort_args = {'_sort': ['-rating']}"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "Note that the `_sort` key in the dictionary above tells Gramex to sort a dataframe by the given columns. The value of the key is a _list_, indicating that dataframes can be sorted by multiple columns. Also, the hyphen before the column name indicates that the sorting is _descending_."
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 4,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "xdf = gfilter(df, sort_args.copy())"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 5,
222 | "metadata": {},
223 | "outputs": [
224 | {
225 | "data": {
226 | "text/html": [
227 | "
\n",
228 | "\n",
241 | "
\n",
242 | " \n",
243 | "
\n",
244 | "
\n",
245 | "
category
\n",
246 | "
name
\n",
247 | "
rating
\n",
248 | "
votes
\n",
249 | "
\n",
250 | " \n",
251 | " \n",
252 | "
\n",
253 | "
2
\n",
254 | "
Actors
\n",
255 | "
James Stewart
\n",
256 | "
0.988374
\n",
257 | "
120
\n",
258 | "
\n",
259 | "
\n",
260 | "
0
\n",
261 | "
Actors
\n",
262 | "
Humphrey Bogart
\n",
263 | "
0.570197
\n",
264 | "
109
\n",
265 | "
\n",
266 | "
\n",
267 | "
9
\n",
268 | "
Actors
\n",
269 | "
Spencer Tracy
\n",
270 | "
0.466311
\n",
271 | "
192
\n",
272 | "
\n",
273 | "
\n",
274 | "
1
\n",
275 | "
Actors
\n",
276 | "
Cary Grant
\n",
277 | "
0.438602
\n",
278 | "
142
\n",
279 | "
\n",
280 | "
\n",
281 | "
8
\n",
282 | "
Actresses
\n",
283 | "
Ingrid Bergman
\n",
284 | "
0.296140
\n",
285 | "
52
\n",
286 | "
\n",
287 | " \n",
288 | "
\n",
289 | "
"
290 | ],
291 | "text/plain": [
292 | " category name rating votes\n",
293 | "2 Actors James Stewart 0.988374 120\n",
294 | "0 Actors Humphrey Bogart 0.570197 109\n",
295 | "9 Actors Spencer Tracy 0.466311 192\n",
296 | "1 Actors Cary Grant 0.438602 142\n",
297 | "8 Actresses Ingrid Bergman 0.296140 52"
298 | ]
299 | },
300 | "execution_count": 5,
301 | "metadata": {},
302 | "output_type": "execute_result"
303 | }
304 | ],
305 | "source": [
306 | "xdf.head()"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {},
312 | "source": [
313 | "Now, let's write something about this dataset. It is apparent that James Stewart has the highest rating."
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 6,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "from nlg.utils import load_spacy_model\n",
323 | "nlp = load_spacy_model()\n",
324 | "\n",
325 | "text = nlp(\"James Stewart is the actor with the highest rating.\")"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "metadata": {},
331 | "source": [
332 | "The entry-point into the NLG module is the [`nlg.search.templatize`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/search.py#L478) function. This function uses:\n",
333 | "* a dataframe\n",
334 | "* operations on the dataframe (as FormHandler arguments)\n",
335 | "* some text about the dataset\n",
336 | "\n",
337 | "to create a [`Nugget`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/narrative.py#L102) object. To learn more about the `Nugget` object and its methods, see the [README](https://github.com/gramener/gramex-nlg/tree/dev#glossary-grammar-of-data-driven-narratives)."
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 7,
343 | "metadata": {},
344 | "outputs": [
345 | {
346 | "name": "stderr",
347 | "output_type": "stream",
348 | "text": [
349 | "/home/jaidevd/src/nlg/nlg/search.py:62: UserWarning: Ignoring lemmatization.\n",
350 | " warnings.warn('Ignoring lemmatization.')\n",
351 | "/home/jaidevd/src/nlg/nlg/search.py:92: UserWarning: Ignoring lemmatization.\n",
352 | " warnings.warn('Ignoring lemmatization.')\n",
353 | "/home/jaidevd/src/nlg/nlg/search.py:80: FutureWarning: Series.nonzero() is deprecated and will be removed in a future version.Use Series.to_numpy().nonzero() instead\n",
354 | " indices = {array[i]: i for i in mask.nonzero()[0]}\n",
355 | "/home/jaidevd/src/nlg/nlg/search.py:109: UserWarning: Cannot lemmatize multi-word cells.\n",
356 | " warnings.warn('Cannot lemmatize multi-word cells.')\n"
357 | ]
358 | }
359 | ],
360 | "source": [
361 | "nugget = templatize(text, sort_args, df)"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": 8,
367 | "metadata": {},
368 | "outputs": [
369 | {
370 | "data": {
371 | "text/plain": [
372 | "{% set fh_args = {\"_sort\": [\"-rating\"]} %}\n",
373 | "{% set df = U.gfilter(orgdf, fh_args.copy()) %}\n",
374 | "{% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}\n",
375 | "{# Do not edit above this line. #}\n",
376 | "{{ df[\"name\"].iloc[0] }} is the {{ G.singular(df[\"category\"].iloc[-2]).lower() }} with the highest rating."
377 | ]
378 | },
379 | "execution_count": 8,
380 | "metadata": {},
381 | "output_type": "execute_result"
382 | }
383 | ],
384 | "source": [
385 | "nugget"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "As we see, a nugget has an underlying [Tornado template](https://www.tornadoweb.org/en/stable/template.html) which has been auto-generated by the `templatize` function. Let's see how well this template re-renders on the dataset."
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 9,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "name": "stdout",
402 | "output_type": "stream",
403 | "text": [
404 | "b' James Stewart is the actor with the highest rating.'\n"
405 | ]
406 | }
407 | ],
408 | "source": [
409 | "print(nugget.render(df))"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "metadata": {},
415 | "source": [
416 | "The text above is identical to the input text, but this is generated from a template. Essentially, we can pass any dataframe to the [`.render`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/narrative.py#L190) method of the nugget object, and the text will be rendered in the context of that data. To test this, let's create a copy of the dataframe and give all the artists a random rating."
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": 10,
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "import numpy as np\n",
426 | "np.random.seed(12345)\n",
427 | "\n",
428 | "fake_ratings = df.copy()\n",
429 | "fake_ratings['rating'] = np.random.rand(df.shape[0])"
430 | ]
431 | },
432 | {
433 | "cell_type": "markdown",
434 | "metadata": {},
435 | "source": [
436 | "Let's see who the top rated artist is in this new, fake dataset."
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": 11,
442 | "metadata": {},
443 | "outputs": [
444 | {
445 | "data": {
446 | "text/html": [
447 | "
\n",
448 | "\n",
461 | "
\n",
462 | " \n",
463 | "
\n",
464 | "
\n",
465 | "
category
\n",
466 | "
name
\n",
467 | "
rating
\n",
468 | "
votes
\n",
469 | "
\n",
470 | " \n",
471 | " \n",
472 | "
\n",
473 | "
6
\n",
474 | "
Actresses
\n",
475 | "
Bette Davis
\n",
476 | "
0.964515
\n",
477 | "
14
\n",
478 | "
\n",
479 | "
\n",
480 | "
0
\n",
481 | "
Actors
\n",
482 | "
Humphrey Bogart
\n",
483 | "
0.929616
\n",
484 | "
109
\n",
485 | "
\n",
486 | "
\n",
487 | "
8
\n",
488 | "
Actresses
\n",
489 | "
Ingrid Bergman
\n",
490 | "
0.748907
\n",
491 | "
52
\n",
492 | "
\n",
493 | "
\n",
494 | "
10
\n",
495 | "
Actors
\n",
496 | "
Charlie Chaplin
\n",
497 | "
0.747715
\n",
498 | "
76
\n",
499 | "
\n",
500 | "
\n",
501 | "
9
\n",
502 | "
Actors
\n",
503 | "
Spencer Tracy
\n",
504 | "
0.653570
\n",
505 | "
192
\n",
506 | "
\n",
507 | " \n",
508 | "
\n",
509 | "
"
510 | ],
511 | "text/plain": [
512 | " category name rating votes\n",
513 | "6 Actresses Bette Davis 0.964515 14\n",
514 | "0 Actors Humphrey Bogart 0.929616 109\n",
515 | "8 Actresses Ingrid Bergman 0.748907 52\n",
516 | "10 Actors Charlie Chaplin 0.747715 76\n",
517 | "9 Actors Spencer Tracy 0.653570 192"
518 | ]
519 | },
520 | "execution_count": 11,
521 | "metadata": {},
522 | "output_type": "execute_result"
523 | }
524 | ],
525 | "source": [
526 | "fake_ratings.sort_values('rating', ascending=False).head()"
527 | ]
528 | },
529 | {
530 | "cell_type": "markdown",
531 | "metadata": {},
532 | "source": [
533 | "Now, let's see if our original nugget is able to adapt to this new dataset."
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 12,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "data": {
543 | "text/plain": [
544 | "b' Bette Davis is the actor with the highest rating.'"
545 | ]
546 | },
547 | "execution_count": 12,
548 | "metadata": {},
549 | "output_type": "execute_result"
550 | }
551 | ],
552 | "source": [
553 | "nugget.render(fake_ratings)"
554 | ]
555 | },
556 | {
557 | "cell_type": "markdown",
558 | "metadata": {},
559 | "source": [
560 | "Clearly, that is false. Bette Davis is the _actress_ with the highest rating. To see what went wrong, let's take a look at the template again."
561 | ]
562 | },
563 | {
564 | "cell_type": "code",
565 | "execution_count": 13,
566 | "metadata": {},
567 | "outputs": [
568 | {
569 | "name": "stdout",
570 | "output_type": "stream",
571 | "text": [
572 | "{% set fh_args = {\"_sort\": [\"-rating\"]} %}\n",
573 | "{% set df = U.gfilter(orgdf, fh_args.copy()) %}\n",
574 | "{% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}\n",
575 | "{# Do not edit above this line. #}\n",
576 | "{{ df[\"name\"].iloc[0] }} is the {{ G.singular(df[\"category\"].iloc[-2]).lower() }} with the highest rating.\n"
577 | ]
578 | }
579 | ],
580 | "source": [
581 | "print(nugget.template)"
582 | ]
583 | },
584 | {
585 | "cell_type": "markdown",
586 | "metadata": {},
587 | "source": [
588 | "As we can see, the words 'actor' or 'actress' don't appear in the template. This means that the template-generator has correctly figured out that these words are dependent on the transformed dataset. However, it has not managed to determine the exact formula for this.\n",
589 | "\n",
590 | "Any token in the input text which is data-dependent, is called a [`Variable`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/narrative.py#L27). To see which words in a nugget are variables, take a look at the `.variables` attribute of the nugget."
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": 14,
596 | "metadata": {},
597 | "outputs": [
598 | {
599 | "data": {
600 | "text/plain": [
601 | "{James Stewart: {{ df[\"name\"].iloc[0] }},\n",
602 | " actor: {{ G.singular(df[\"category\"].iloc[-2]).lower() }}}"
603 | ]
604 | },
605 | "execution_count": 14,
606 | "metadata": {},
607 | "output_type": "execute_result"
608 | }
609 | ],
610 | "source": [
611 | "nugget.variables"
612 | ]
613 | },
614 | {
615 | "cell_type": "markdown",
616 | "metadata": {},
617 | "source": [
618 | "We see here that there are two tokens from the original text - `\"James Stewart\"` and `\"actor\"` - that have been identified as variables. However, the Python _expression_ for determining one of them is wrong. Whether the highest rated artist is an actor or an actress needs to be found from the `\"category\"` column of the first row.\n",
619 | "\n",
620 | "To fix this, we can use the [`.set_expr`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/narrative.py#L58) method of the respective variable. The `.set_expr` method accepts any valid Python expression as a string."
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 15,
626 | "metadata": {},
627 | "outputs": [],
628 | "source": [
629 | "var = nugget.get_var('actor')"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 16,
635 | "metadata": {},
636 | "outputs": [],
637 | "source": [
638 | "var.set_expr('df[\"category\"].iloc[0]')"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": 17,
644 | "metadata": {},
645 | "outputs": [
646 | {
647 | "data": {
648 | "text/plain": [
649 | "{{ G.singular(df[\"category\"].iloc[0]).lower() }}"
650 | ]
651 | },
652 | "execution_count": 17,
653 | "metadata": {},
654 | "output_type": "execute_result"
655 | }
656 | ],
657 | "source": [
658 | "var"
659 | ]
660 | },
661 | {
662 | "cell_type": "markdown",
663 | "metadata": {},
664 | "source": [
665 | "Now that we have fixed the variable, let's re-render the nugget on the fake dataset."
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": 18,
671 | "metadata": {},
672 | "outputs": [
673 | {
674 | "data": {
675 | "text/plain": [
676 | "b' Bette Davis is the actress with the highest rating.'"
677 | ]
678 | },
679 | "execution_count": 18,
680 | "metadata": {},
681 | "output_type": "execute_result"
682 | }
683 | ],
684 | "source": [
685 | "nugget.render(fake_ratings)"
686 | ]
687 | },
688 | {
689 | "cell_type": "markdown",
690 | "metadata": {},
691 | "source": [
692 | "----"
693 | ]
694 | },
695 | {
696 | "cell_type": "markdown",
697 | "metadata": {},
698 | "source": [
699 | "There is scope for yet more automation. Note that the last word in the text, \"rating\", matches the name of the column by which the dataframe has been sorted. Therefore, even that can be turned into a variable. Essentially, we want the template to render the name of whichever column is used to sort the data, in place of rating.\n",
700 | "\n",
701 | "New variables can be added to a nugget using the [`.add_var`](https://github.com/gramener/gramex-nlg/blob/dev/nlg/narrative.py#L236) method of the nugget object, as follows:"
702 | ]
703 | },
704 | {
705 | "cell_type": "code",
706 | "execution_count": 19,
707 | "metadata": {},
708 | "outputs": [],
709 | "source": [
710 | "var_token = text[-2] # The spacy token corresponding to \"rating\""
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": 20,
716 | "metadata": {},
717 | "outputs": [],
718 | "source": [
719 | "var_expr = 'fh_args[\"_sort\"][0]' # The Python expression to detect the sorted column"
720 | ]
721 | },
722 | {
723 | "cell_type": "code",
724 | "execution_count": 21,
725 | "metadata": {},
726 | "outputs": [
727 | {
728 | "data": {
729 | "text/plain": [
730 | "{% set fh_args = {\"_sort\": [\"-rating\"]} %}\n",
731 | "{% set df = U.gfilter(orgdf, fh_args.copy()) %}\n",
732 | "{% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}\n",
733 | "{# Do not edit above this line. #}\n",
734 | "{{ df[\"name\"].iloc[0] }} is the {{ G.singular(df[\"category\"].iloc[0]).lower() }} with the highest {{ fh_args[\"_sort\"][0] }}."
735 | ]
736 | },
737 | "execution_count": 21,
738 | "metadata": {},
739 | "output_type": "execute_result"
740 | }
741 | ],
742 | "source": [
743 | "nugget.add_var(var_token, expr=var_expr)\n",
744 | "nugget"
745 | ]
746 | },
747 | {
748 | "cell_type": "markdown",
749 | "metadata": {},
750 | "source": [
751 | "----\n",
752 | "Let us now test a scenario where we sort the dataframe by votes."
753 | ]
754 | },
755 | {
756 | "cell_type": "code",
757 | "execution_count": 22,
758 | "metadata": {},
759 | "outputs": [
760 | {
761 | "data": {
762 | "text/plain": [
763 | "{% set fh_args = {\"_sort\": [\"-votes\"]} %}\n",
764 | "{% set df = U.gfilter(orgdf, fh_args.copy()) %}\n",
765 | "{% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}\n",
766 | "{# Do not edit above this line. #}\n",
767 | "{{ df[\"name\"].iloc[0] }} is the {{ G.singular(df[\"category\"].iloc[0]).lower() }} with the highest {{ fh_args[\"_sort\"][0] }}."
768 | ]
769 | },
770 | "execution_count": 22,
771 | "metadata": {},
772 | "output_type": "execute_result"
773 | }
774 | ],
775 | "source": [
776 | "nugget.fh_args = {'_sort': ['-votes']}\n",
777 | "nugget"
778 | ]
779 | },
780 | {
781 | "cell_type": "code",
782 | "execution_count": 23,
783 | "metadata": {},
784 | "outputs": [
785 | {
786 | "data": {
787 | "text/plain": [
788 | "b' Spencer Tracy is the actor with the highest votes.'"
789 | ]
790 | },
791 | "execution_count": 23,
792 | "metadata": {},
793 | "output_type": "execute_result"
794 | }
795 | ],
796 | "source": [
797 | "nugget.render(df)"
798 | ]
799 | },
800 | {
801 | "cell_type": "markdown",
802 | "metadata": {},
803 | "source": [
804 | "---"
805 | ]
806 | },
807 | {
808 | "cell_type": "markdown",
809 | "metadata": {},
810 | "source": [
811 | "Now we know how to create templates from raw text, and how to assign tokens within the text as data-dependent variables. In forthcoming examples, we will explore:\n",
812 | "\n",
813 | "1. how to design more complex variable expressions - especially those that cannot be defined as short and simple Python strings\n",
814 | "2. how to create longer narratives by putting together different nuggets."
815 | ]
816 | }
817 | ],
818 | "metadata": {
819 | "kernelspec": {
820 | "display_name": "Python 3",
821 | "language": "python",
822 | "name": "python3"
823 | },
824 | "language_info": {
825 | "codemirror_mode": {
826 | "name": "ipython",
827 | "version": 3
828 | },
829 | "file_extension": ".py",
830 | "mimetype": "text/x-python",
831 | "name": "python",
832 | "nbconvert_exporter": "python",
833 | "pygments_lexer": "ipython3",
834 | "version": "3.6.8"
835 | }
836 | },
837 | "nbformat": 4,
838 | "nbformat_minor": 4
839 | }
840 |
--------------------------------------------------------------------------------
/nlg/__init__.py:
--------------------------------------------------------------------------------
import sys

# Package version string; keep in sync with setup.py.
__version__ = '0.1.3'

# Build-time guard: the build process presumably injects ``__NLG_SETUP__``
# before importing this package (NOTE(review): likely set by setup.py, e.g.
# via builtins — confirm). The try/except NameError form is deliberate: it
# also sees a flag injected into builtins, which a plain ``in globals()``
# test would miss and then shadow with False.
try:
    __NLG_SETUP__
except NameError:
    # Normal runtime import — no build flag present.
    __NLG_SETUP__ = False


if __NLG_SETUP__:
    # Mid-build: submodules may not be importable yet, so expose only the
    # version and note the partial import on stderr.
    sys.stderr.write('Partial import of nlg during the build process.\n')
else:
    # Full import: surface the public API at package level.
    from .search import templatize  # NOQA: F401
    from .grammar import get_gramopts
    # Mapping of available grammar operations, computed once at import time.
    grammar_options = get_gramopts()
    __all__ = ['templatize', 'grammar_options']
18 |
--------------------------------------------------------------------------------
/nlg/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramener/gramex-nlg/aa508f66b6ce38eeb3ea3deac96f458865b896ff/nlg/app/__init__.py
--------------------------------------------------------------------------------
/nlg/app/body.html:
--------------------------------------------------------------------------------
1 | {% set admin_kwargs = handler.kwargs.get('admin_kwargs', '') or {} %}
2 |
3 | {% from nlg.webapp import read_current_config, get_user_dir, is_user_authenticated %}
4 | {% set dsid = read_current_config(handler).get('dsid') %}
5 |
6 |
7 |
8 |
9 |
10 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | {% from nlg.webapp import get_dataset_files, get_narrative_config_files %}
20 | {% set NLG_DATASETS = get_dataset_files(handler) %}
21 | {% set NLG_NARRATIVES = get_narrative_config_files(handler) %}
22 | {% import os.path as op %}
23 |
24 |
25 |
26 | {#
27 |
28 | Note: this template is rarely called, because:
29 |
30 | - basehandler.py redirects to login_url if 401: UNAUTHORIZED
31 | - authhandler.py explicitly renders specific templates if 401: UNAUTHORIZED
32 |
33 | The rare cases where this is used are:
34 |
35 | - If an application explicitly raises a 401
36 | - If basehandler.py raises a 401
37 | - for an OTP request when user is not logged in
38 | - if the request is not GET/HEAD, it is not redirected to login_url
39 |
40 | #}
41 |
--------------------------------------------------------------------------------
/nlg/app/error/403.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Forbidden
8 |
15 |
16 |
17 |
You need access
18 | {% try %}{% set xsrf = handler.check_xsrf_cookie() or True %}{% except %}{% set xsrf = False %}{% end %}
19 | {% if handler.request.method not in {'GET', 'HEAD', 'OPTIONS'} and not xsrf %}
20 |
Your app sent a {{ handler.request.method }} request without an XSRF cookie.
21 | {% elif handler.current_user %}
22 |
You are logged in, but as a user that cannot access this page.