├── .gitignore
├── LICENSE
├── README.md
├── align.py
├── components.py
├── examples
├── wikinews
│ ├── README.md
│ ├── load.sh
│ ├── wikinews.cache
│ │ ├── meta.yaml
│ │ ├── mgr
│ │ │ ├── blocks
│ │ │ │ └── 140645734304016
│ │ │ │ │ ├── data.feather
│ │ │ │ │ └── meta.yaml
│ │ │ ├── columns
│ │ │ │ ├── BertscoreAligner:spacy:document:spacy:summary:bart-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── BertscoreAligner:spacy:document:spacy:summary:bart-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── BertscoreAligner:spacy:document:spacy:summary:pegasus-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── BertscoreAligner:spacy:document:spacy:summary:pegasus-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── BertscoreAligner:spacy:document:spacy:summary:reference
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── BertscoreAligner:spacy:summary:reference:spacy:summary:bart-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── BertscoreAligner:spacy:summary:reference:spacy:summary:bart-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── BertscoreAligner:spacy:summary:reference:spacy:summary:pegasus-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── BertscoreAligner:spacy:summary:reference:spacy:summary:pegasus-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── NGramAligner:spacy:document:spacy:summary:bart-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── NGramAligner:spacy:document:spacy:summary:bart-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── NGramAligner:spacy:document:spacy:summary:pegasus-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── NGramAligner:spacy:document:spacy:summary:pegasus-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── NGramAligner:spacy:document:spacy:summary:reference
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── NGramAligner:spacy:summary:reference:spacy:summary:bart-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── NGramAligner:spacy:summary:reference:spacy:summary:bart-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── NGramAligner:spacy:summary:reference:spacy:summary:pegasus-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── NGramAligner:spacy:summary:reference:spacy:summary:pegasus-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── StaticEmbeddingAligner:spacy:document:spacy:summary:bart-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── StaticEmbeddingAligner:spacy:document:spacy:summary:bart-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── StaticEmbeddingAligner:spacy:document:spacy:summary:pegasus-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── StaticEmbeddingAligner:spacy:document:spacy:summary:pegasus-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── StaticEmbeddingAligner:spacy:document:spacy:summary:reference
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── StaticEmbeddingAligner:spacy:summary:reference:spacy:summary:bart-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── StaticEmbeddingAligner:spacy:summary:reference:spacy:summary:bart-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── StaticEmbeddingAligner:spacy:summary:reference:spacy:summary:pegasus-cnndm
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── StaticEmbeddingAligner:spacy:summary:reference:spacy:summary:pegasus-xsum
│ │ │ │ │ ├── data.dill
│ │ │ │ │ ├── meta.yaml
│ │ │ │ │ └── state.dill
│ │ │ │ ├── document
│ │ │ │ │ └── state.dill
│ │ │ │ ├── preprocessed_document
│ │ │ │ │ └── state.dill
│ │ │ │ ├── preprocessed_summary:bart-cnndm
│ │ │ │ │ └── state.dill
│ │ │ │ ├── preprocessed_summary:bart-xsum
│ │ │ │ │ └── state.dill
│ │ │ │ ├── preprocessed_summary:pegasus-cnndm
│ │ │ │ │ └── state.dill
│ │ │ │ ├── preprocessed_summary:pegasus-xsum
│ │ │ │ │ └── state.dill
│ │ │ │ ├── preprocessed_summary:reference
│ │ │ │ │ └── state.dill
│ │ │ │ ├── spacy:document
│ │ │ │ │ ├── data.spacy
│ │ │ │ │ └── meta.yaml
│ │ │ │ ├── spacy:summary:bart-cnndm
│ │ │ │ │ ├── data.spacy
│ │ │ │ │ └── meta.yaml
│ │ │ │ ├── spacy:summary:bart-xsum
│ │ │ │ │ ├── data.spacy
│ │ │ │ │ └── meta.yaml
│ │ │ │ ├── spacy:summary:pegasus-cnndm
│ │ │ │ │ ├── data.spacy
│ │ │ │ │ └── meta.yaml
│ │ │ │ ├── spacy:summary:pegasus-xsum
│ │ │ │ │ ├── data.spacy
│ │ │ │ │ └── meta.yaml
│ │ │ │ ├── spacy:summary:reference
│ │ │ │ │ ├── data.spacy
│ │ │ │ │ └── meta.yaml
│ │ │ │ ├── summary:bart-cnndm
│ │ │ │ │ └── state.dill
│ │ │ │ ├── summary:bart-xsum
│ │ │ │ │ └── state.dill
│ │ │ │ ├── summary:pegasus-cnndm
│ │ │ │ │ └── state.dill
│ │ │ │ ├── summary:pegasus-xsum
│ │ │ │ │ └── state.dill
│ │ │ │ └── summary:reference
│ │ │ │ │ └── state.dill
│ │ │ └── meta.yaml
│ │ └── state.dill
│ └── wikinews.jsonl
└── xsum
│ └── load.sh
├── generation.py
├── join.py
├── preprocessing.py
├── quickstart.sh
├── requirements.txt
├── resources
├── jquery.color-2.1.2.min.js
├── summvis.css
└── summvis.js
├── summvis.py
├── utils.py
└── website
├── annotations.png
├── demo.gif
├── main-vis.jpg
├── title.png
└── triangle.png
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_STORE
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2021 SummVis
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SummVis
2 |
3 | SummVis is an open-source visualization tool that supports fine-grained analysis of summarization models, data, and evaluation
4 | metrics. Through its lexical and semantic visualizations, SummVis enables in-depth exploration across important dimensions such as factual consistency and abstractiveness.
5 |
6 | Authors: [Jesse Vig](https://twitter.com/jesse_vig)<sup>1</sup>,
7 | [Wojciech Kryściński](https://twitter.com/iam_wkr)<sup>1</sup>,
8 | [Karan Goel](https://twitter.com/krandiash)<sup>2</sup>,
9 | [Nazneen Fatema Rajani](https://twitter.com/nazneenrajani)<sup>1</sup>
10 | <sup>1</sup>[Salesforce Research](https://einstein.ai/) <sup>2</sup>[Stanford Hazy Research](https://hazyresearch.stanford.edu/)
11 |
12 | 📖 [Paper](https://arxiv.org/abs/2104.07605)
13 | 🎥 [Demo](https://vimeo.com/540429745)
14 |
15 |
16 |
17 |
18 |
19 | _We welcome issues for questions, suggestions, requests or bug reports._
20 |
21 | ## Table of Contents
22 | - [User guide](#user-guide)
23 | - [Installation](#installation)
24 | - [Quickstart](#quickstart)
25 | - [Load data into SummVis](#loading-data-into-summvis)
26 | - [Deploying SummVis remotely](#deploying-summvis-remotely)
27 | - [Citation](#citation)
28 | - [Acknowledgements](#acknowledgements)
29 |
30 | ## User guide
31 |
32 | ### Overview
33 | SummVis is a tool for analyzing abstractive summarization systems. It provides fine-grained insights on summarization
34 | models, data, and evaluation metrics by visualizing the relationships between source documents, reference summaries,
35 | and generated summaries, as illustrated in the figure below.
36 |
37 | 
38 |
39 | ### Interface
40 |
41 | The SummVis interface is shown below. The example displayed is the first record from the
42 | [CNN / Daily Mail](https://huggingface.co/datasets/cnn_dailymail) validation set.
43 |
44 | 
45 |
46 |
47 | #### Components
48 |
49 | **(a)** Configuration panel
50 | **(b)** Source document (or reference summary, depending on configuration)
51 | **(c)** Generated summaries (and/or reference summary, depending on configuration)
52 | **(d)** Scroll bar with global view of annotations
53 |
54 | #### Annotations
55 |
56 |
57 | **N-gram overlap:** Word sequences that overlap between the document on the left and
58 | the selected summary on the right. Underlines are color-coded by index of summary sentence.
59 | **Semantic overlap**: Words in the summary that are semantically close to one or more words in document on the left.
60 | **Novel words**: Words in the summary that do not appear in the document on the left.
61 | **Novel entities**: Entity words in the summary that do not appear in the document on the left.
62 |
63 | ### Limitations
64 | Currently only English text is supported. Extremely long documents may render slowly in the tool.
65 |
66 | ## Installation
67 | ```shell
68 | git clone https://github.com/robustness-gym/summvis.git
69 | cd summvis
70 | # Following line necessary to get pip > 21.3
71 | pip install --upgrade pip
72 | pip install -r requirements.txt
73 | ```
74 |
75 | ## Quickstart
76 |
77 | View an example from [WikiNews](examples/wikinews/README.md):
78 |
79 | ```shell
80 | streamlit run summvis.py -- --path examples/wikinews/wikinews.cache
81 | ```
82 |
83 |
84 | ## Loading data into SummVis
85 |
86 | ### If you have generated summaries:
87 |
88 | The following steps describe how to load source documents and associated precomputed summaries into the SummVis tool.
89 |
90 | **1. Download spaCy model**
91 | ```
92 | python -m spacy download en_core_web_lg
93 | ```
94 | This may take several minutes.
95 |
96 | **2. Create .jsonl file with the source document, reference summary and/or generated summaries in the following format:**
97 |
98 | ```
99 | {"document": "This is the first source document", "summary:reference": "This is the reference summary", "summary:testmodel1": "This is the summary for testmodel1", "summary:testmodel2": "This is the summary for testmodel2"}
100 | {"document": "This is the second source document", "summary:reference": "This is the reference summary", "summary:testmodel1": "This is the summary for testmodel1", "summary:testmodel2": "This is the summary for testmodel2"}
101 | ```
102 |
103 | The key for the reference summary must equal `summary:reference` and the key for any other summary must be of the form
104 | `summary:<model_name>`, e.g. `summary:BART`. The document and at least one summary (reference, other, or both) are required.
105 |
106 | We also provide [scripts to generate summaries](#if-you-do-not-have-generated-summaries) if you haven't done so already.
107 |
108 | **3. Preprocess .jsonl file**
109 |
110 | Run `preprocessing.py` to precompute all data required in the interface (running `spaCy`, lexical and semantic
111 | aligners) and save a cache file, which can be read directly into the tool. Note that this script may take some time to run
112 | (~5-15 seconds per example on a MacBook Pro for
113 | documents of typical length found in CNN/DailyMail or XSum), so you may want to start with a small subset of your dataset
114 | using the `--n_samples` argument (below). This will also be expedited by running on a GPU.
115 |
116 | ```shell
117 | python preprocessing.py \
118 | --workflow \
119 | --dataset_jsonl path/to/my_dataset.jsonl \
120 | --processed_dataset_path path/to/my_cache_file
121 | ```
122 |
123 | Additional options:
124 | `--n_samples <number_of_samples>`: Process the first `number_of_samples` samples only (recommended).
125 | `--no_clean`: Do not perform additional text cleaning that may remove newlines, etc.
126 |
127 | **4. Launch Streamlit app**
128 |
129 | ```shell
130 | streamlit run summvis.py -- --path path/to/my_cache_file_or_parent_directory
131 | ```
132 |
133 | Note that the additional `--` is not a mistake, and is required to pass command-line arguments in Streamlit.
134 |
135 | ### If you do NOT have generated summaries:
136 |
137 | Before running the steps above, you may run the additional steps below to generate summaries. You may also refer to the [sample
138 | end-to-end loading scripts](examples/) for [WikiNews](examples/wikinews/load.sh) (loaded from .jsonl file) and [XSum](examples/xsum/load.sh)
139 | (loaded from HuggingFace Datasets).
140 |
141 | **1. Create file with the source documents and optional reference summaries in the following format:**
142 |
143 | ```
144 | {"document": "This is the first source document", "summary:reference": "This is the reference summary"}
145 | {"document": "This is the second source document", "summary:reference": "This is the reference summary"}
146 | ```
147 |
148 | You may create a .jsonl format directly from a Huggingface dataset by running `preprocessing.py` with the `--standardize` flag:
149 |
150 | ```shell
151 | python preprocessing.py \
152 | --standardize \
153 | --dataset hf_dataset_name \
154 | --version hf_dataset_version (optional) \
155 | --split hf_dataset_split \
156 | --save_jsonl_path path/to/save_jsonl_file
157 | ```
158 |
159 | **2. Generate predictions**
160 |
161 | To use one of the **6 standard models** (`bart-xsum`, `bart-cnndm`, `pegasus-xsum`, `pegasus-cnndm`, `pegasus-newsroom`,
162 | `pegasus-multinews`):
163 | ```shell
164 | python generation.py --model model_abbrev --data_path path/to/jsonl_file
165 | ```
166 | where `model` is one of the above 6 model codes.
167 |
168 | To use **any Huggingface model**:
169 | ```shell
170 | python generation.py --model_name_or_path model_name_or_path --data_path path/to/jsonl_file
171 | ```
172 | where `model_name_or_path` is the name of a Huggingface model or a local path.
173 |
174 | Either of the above two commands will generate a prediction file named `<data_file_name>.<model>.predictions`
175 |
176 | **3. Join one or more prediction files (from previous step) with original dataset**
177 |
178 | ```shell
179 | python join.py \
180 | --data_path path/to/jsonl_file \
181 | --generation_paths \
182 | path/to/prediction_file_1 \
183 | path/to/prediction_file_2 \
184 | --output_path path/to/save_jsonl_file
185 | ```
186 |
187 | Once you complete these steps, you may proceed with the [final steps](#if-you-have-generated-summaries) to load your file into SummVis.
188 |
189 | ## Deploying SummVis remotely
190 |
191 | See these tutorials on deploying a Streamlit app to various cloud services (from [Streamlit docs](https://docs.streamlit.io/en/stable/streamlit_faq.html)):
192 |
193 | * [How to Deploy Streamlit to a Free Amazon EC2 instance](https://towardsdatascience.com/how-to-deploy-a-streamlit-app-using-an-amazon-free-ec2-instance-416a41f69dc3), by Rahul Agarwal
194 | * [Host Streamlit on Heroku](https://towardsdatascience.com/quickly-build-and-deploy-an-application-with-streamlit-988ca08c7e83), by Maarten Grootendorst
195 | * [Host Streamlit on Azure](https://towardsdatascience.com/deploying-a-streamlit-web-app-with-azure-app-service-1f09a2159743), by Richard Peterson
196 | * [Host Streamlit on 21YunBox](https://www.21yunbox.com/docs/#/deploy-streamlit), by Toby Lei
197 |
198 | ## Citation
199 |
200 | When referencing this repository, please cite [this paper](https://arxiv.org/abs/2104.07605):
201 |
202 | ```
203 | @misc{vig2021summvis,
204 | title={SummVis: Interactive Visual Analysis of Models, Data, and Evaluation for Text Summarization},
205 | author={Jesse Vig and Wojciech Kry{\'s}ci{\'n}ski and Karan Goel and Nazneen Fatema Rajani},
206 | year={2021},
207 | eprint={2104.07605},
208 | archivePrefix={arXiv},
209 | primaryClass={cs.CL},
210 | url={https://arxiv.org/abs/2104.07605}
211 | }
212 | ```
213 |
214 | ## Acknowledgements
215 |
216 | We thank [Michael Correll](http://correll.io) for his valuable feedback.
217 |
218 |
219 |
--------------------------------------------------------------------------------
/align.py:
--------------------------------------------------------------------------------
1 | import heapq
2 | import itertools
3 | from abc import ABC, abstractmethod
4 | from collections import defaultdict
5 | from operator import itemgetter
6 | from typing import List, Dict, Tuple
7 | from typing import Sequence
8 | from abc import ABC
9 |
10 | import numpy as np
11 | import torch
12 | from bert_score import BERTScorer
13 | from nltk import PorterStemmer
14 | from spacy.tokens import Doc, Span
15 | from toolz import itertoolz
16 | from transformers import AutoTokenizer
17 | from transformers.tokenization_utils_base import PaddingStrategy
18 |
19 |
class EmbeddingModel(ABC):
    """Interface for models that produce per-token embedding arrays for spaCy sentences."""

    @abstractmethod
    def embed(
        self,
        sents: List[Span]
    ):
        """Return a list of per-token embedding arrays, one entry per sentence in `sents`."""
        pass
28 |
class ContextualEmbedding(EmbeddingModel):
    """Token embeddings from a HuggingFace transformer, re-aligned to spaCy tokenization.

    Runs the wrapped model over batches of sentences and mean-pools the subword
    embeddings mapping to each spaCy token, so every spaCy token receives exactly
    one L2-normalized vector.
    """

    def __init__(self, model, tokenizer_name, max_length, batch_size=32):
        # model: HuggingFace model already placed on its target device.
        # tokenizer_name: name forwarded to SpacyHuggingfaceTokenizer (e.g. "roberta-large").
        # max_length: maximum subword sequence length accepted by the tokenizer.
        # batch_size: number of sentences encoded per forward pass.
        self.model = model
        self.tokenizer = SpacyHuggingfaceTokenizer(tokenizer_name, max_length)
        self._device = model.device
        self.batch_size = batch_size

    def embed(
        self,
        sents: List[Span]
    ):
        """Embed each sentence; returns one (num_spacy_tokens, hidden_dim) array per sentence."""
        spacy_embs_list = []
        for start_idx in range(0, len(sents), self.batch_size):
            batch = sents[start_idx: start_idx + self.batch_size]
            encoded_input, special_tokens_masks, token_alignments = self.tokenizer.batch_encode(batch)
            encoded_input = {k: v.to(self._device) for k, v in encoded_input.items()}
            with torch.no_grad():  # inference only; no gradients needed
                model_output = self.model(**encoded_input)
            embeddings = model_output[0].cpu()  # first element: last hidden state
            for embs, mask, token_alignment \
                    in zip(embeddings, special_tokens_masks, token_alignments):
                mask = torch.tensor(mask)
                embs = embs[mask == 0]  # Filter embeddings at special token positions
                spacy_embs = []
                for hf_idxs in token_alignment:
                    if hf_idxs is None:
                        # spaCy token with no subword counterpart (presumably truncated
                        # or unmapped — confirm in SpacyHuggingfaceTokenizer): zero vector.
                        pooled_embs = torch.zeros_like(embs[0])
                    else:
                        pooled_embs = embs[hf_idxs].mean(dim=0)  # Pool embeddings that map to the same spacy token
                    spacy_embs.append(pooled_embs.numpy())
                spacy_embs = np.stack(spacy_embs)
                spacy_embs = spacy_embs / np.linalg.norm(spacy_embs, axis=-1, keepdims=True)  # Normalize
                spacy_embs_list.append(spacy_embs)
        # Sanity check: exactly one pooled embedding per spaCy token in every sentence.
        for embs, sent in zip(spacy_embs_list, sents):
            assert len(embs) == len(sent)
        return spacy_embs_list
66 |
67 |
class StaticEmbedding(EmbeddingModel):
    """Per-token embeddings taken directly from spaCy's static word vectors."""

    def embed(
        self,
        sents: List[Span]
    ):
        """Return one (num_tokens, vector_dim) array of unit-normalized vectors per sentence."""
        embedded = []
        for sent in sents:
            vectors = []
            for token in sent:
                # `or 1` avoids division by zero for tokens whose vector norm is 0.
                vectors.append(token.vector / (token.vector_norm or 1))
            embedded.append(np.stack(vectors))
        return embedded
78 |
79 |
class Aligner(ABC):
    """Interface for computing token-level alignments from target documents to a source."""

    @abstractmethod
    def align(
        self,
        source: Doc,
        targets: Sequence[Doc]
    ) -> List[Dict]:
        """Compute alignment from summary tokens to doc tokens
        Args:
            source: Source spaCy document
            targets: Target spaCy documents
        Returns: List of alignments, one for each target document
            (each alignment maps a target token index to its matched source tokens)"""
        pass
93 |
94 |
class EmbeddingAligner(Aligner):
    """Aligns target (summary) tokens to source tokens by embedding similarity.

    A token pair counts as aligned when its similarity, rescaled by
    `baseline_val`, exceeds `threshold`; at most `top_k` source matches are
    kept per target token.
    """

    def __init__(
        self,
        embedding: EmbeddingModel,
        threshold: float,
        top_k: int,
        baseline_val: float = 0
    ):
        # embedding: model producing per-token vectors for all sentences.
        # threshold: minimum rescaled similarity for a pair to count as aligned.
        # top_k: maximum number of source matches retained per target token.
        # baseline_val: similarity baseline subtracted before rescaling (see _emb_sim_sparse).
        self.threshold = threshold
        self.top_k = top_k
        self.embedding = embedding
        self.baseline_val = baseline_val

    def align(
        self,
        source: Doc,
        targets: Sequence[Doc]
    ) -> List[Dict]:
        """Compute alignment from summary tokens to doc tokens with greatest semantic similarity
        Args:
            source: Source spaCy document
            targets: Target spaCy documents
        Returns: List of alignments, one for each target document
        """
        if len(source) == 0:
            return [{} for _ in targets]
        # Embed all sentences (source + every target) in a single call, then split
        # the results back out per document using the per-document sentence counts.
        all_sents = list(source.sents) + list(itertools.chain.from_iterable(target.sents for target in targets))
        chunk_sizes = [_iter_len(source.sents)] + \
                      [_iter_len(target.sents) for target in targets]
        all_sents_token_embeddings = self.embedding.embed(all_sents)
        chunked_sents_token_embeddings = _split(all_sents_token_embeddings, chunk_sizes)
        source_sent_token_embeddings = chunked_sents_token_embeddings[0]
        source_token_embeddings = np.concatenate(source_sent_token_embeddings)
        # Zero out stopword/punctuation vectors so they can never clear the threshold.
        for token_idx, token in enumerate(source):
            if token.is_stop or token.is_punct:
                source_token_embeddings[token_idx] = 0
        alignments = []
        for i, target in enumerate(targets):
            target_sent_token_embeddings = chunked_sents_token_embeddings[i + 1]
            target_token_embeddings = np.concatenate(target_sent_token_embeddings)
            for token_idx, token in enumerate(target):
                if token.is_stop or token.is_punct:
                    target_token_embeddings[token_idx] = 0
            # alignment: target token index -> list of (source token index, score).
            alignment = defaultdict(list)
            for score, target_idx, source_idx in self._emb_sim_sparse(
                target_token_embeddings,
                source_token_embeddings,
            ):
                alignment[target_idx].append((source_idx, score))
            # TODO used argpartition to get nlargest
            for j in list(alignment):
                alignment[j] = heapq.nlargest(self.top_k, alignment[j], itemgetter(1))
            alignments.append(alignment)
        return alignments

    def _emb_sim_sparse(self, embs_1, embs_2):
        """Return (score, idx_1, idx_2) triples for all pairs above the threshold."""
        # Dot product of (pre-normalized) embeddings, then affine rescale so that
        # baseline_val maps to 0 while a similarity of 1 stays at 1.
        sim = embs_1 @ embs_2.T
        sim = (sim - self.baseline_val) / (1 - self.baseline_val)
        keep = sim > self.threshold
        keep_idxs_1, keep_idxs_2 = np.where(keep)
        keep_scores = sim[keep]
        return list(zip(keep_scores, keep_idxs_1, keep_idxs_2))
158 |
159 |
class BertscoreAligner(EmbeddingAligner):
    """EmbeddingAligner backed by BERTScore's rescaled roberta-large embeddings."""

    def __init__(
        self,
        threshold,
        top_k
    ):
        """
        Args:
            threshold: Minimum rescaled similarity for a token match.
            top_k: Maximum number of matches kept per target token.
        """
        scorer = BERTScorer(lang="en", rescale_with_baseline=True)
        # Reuse the model BERTScore already loaded rather than loading a
        # second copy. 510 is presumably 512 minus the two special tokens of
        # roberta-large -- TODO confirm.
        model = scorer._model
        embedding = ContextualEmbedding(model, "roberta-large", 510)
        # Index 2 appears to be the F1 baseline of the (P, R, F) triple used
        # by BERTScore's rescaling -- verify against the bert-score package.
        baseline_val = scorer.baseline_vals[2].item()

        # Modernized from the legacy two-argument super() form.
        super().__init__(
            embedding, threshold, top_k, baseline_val
        )
174 |
175 |
class StaticEmbeddingAligner(EmbeddingAligner):
    """EmbeddingAligner backed by static (context-independent) word embeddings."""

    def __init__(
        self,
        threshold,
        top_k
    ):
        """
        Args:
            threshold: Minimum rescaled similarity for a token match.
            top_k: Maximum number of matches kept per target token.
        """
        embedding = StaticEmbedding()
        # Modernized from the legacy two-argument super() form; baseline_val
        # keeps its default of 0 (no rescaling for static embeddings).
        super().__init__(
            embedding, threshold, top_k
        )
186 |
187 |
class NGramAligner(Aligner):
    """Aligner that matches n-grams with identical stemmed, lowercased content."""

    def __init__(self):
        self.stemmer = PorterStemmer()

    def align(
        self,
        source: Doc,
        targets: List[Doc],
    ) -> List[Dict]:
        """Compute n-gram alignments from each target document to the source.

        Args:
            source: Source spaCy document
            targets: Target spaCy documents

        Returns: List of alignments, one per target, each mapping a matched
            (start, end) target token span to its aligned source spans.
        """
        alignments = []
        source_ngram_spans = self._get_ngram_spans(source)
        for target in targets:
            target_ngram_spans = self._get_ngram_spans(target)
            alignments.append(
                self._align_ngrams(target_ngram_spans, source_ngram_spans)
            )
        return alignments

    def _get_ngram_spans(
        self,
        doc: Doc,
    ):
        """Map each normalized n-gram of *doc* to the list of (start, end)
        token spans where it occurs (content tokens only)."""
        ngrams = []
        for sent in doc.sents:
            # Hoisted out of the n-loop: the content-token filter does not
            # depend on n. Also use len(sent) directly instead of
            # materializing list(sent).
            tokens = [t for t in sent if not (t.is_stop or t.is_punct)]
            for n in range(1, len(sent)):
                ngrams.extend(_ngrams(tokens, n))

        def ngram_key(ngram):
            # Normalize by stemming + lowercasing so inflected forms match.
            return tuple(self.stemmer.stem(token.text).lower() for token in ngram)

        key_to_ngrams = itertoolz.groupby(ngram_key, ngrams)
        key_to_spans = {}
        for k, grouped_ngrams in key_to_ngrams.items():
            key_to_spans[k] = [
                (ngram[0].i, ngram[-1].i + 1)
                for ngram in grouped_ngrams
            ]
        return key_to_spans

    def _align_ngrams(
        self,
        ngram_spans_1: Dict[Tuple[str], List[Tuple[int, int]]],
        ngram_spans_2: Dict[Tuple[str], List[Tuple[int, int]]]
    ) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
        """Align ngram spans between two documents

        Args:
            ngram_spans_1: Map from (normalized_token1, normalized_token2, ...) n-gram tuple to a list of token spans
                of format (start_pos, end_pos)
            ngram_spans_2: Same format as above, but for second text

        Returns: map from each (start, end) span in text 1 to list of aligned (start, end) spans in text 2
        """
        if not ngram_spans_1 or not ngram_spans_2:
            return {}
        max_span_end_1 = max(span[1] for span in itertools.chain.from_iterable(ngram_spans_1.values()))
        # Each token of text 1 may belong to at most one matched span.
        token_is_available_1 = [True] * max_span_end_1
        # Normalized n-grams occurring in both texts.
        matched_keys = list(set(ngram_spans_1.keys()) & set(ngram_spans_2.keys()))
        matched_keys.sort(key=len, reverse=True)  # Process n-grams from longest to shortest

        # Map from each matched span in text 1 to list of aligned spans in text 2
        alignment = defaultdict(list)
        for key in matched_keys:
            spans_1 = ngram_spans_1[key]
            spans_2 = ngram_spans_2[key]
            available_spans_1 = [span for span in spans_1 if all(token_is_available_1[slice(*span)])]
            matched_spans_1 = []
            if available_spans_1 and spans_2:
                # if ngram can be matched to available spans in both sequences
                for span in available_spans_1:
                    # It's possible that these newly matched spans may be overlapping with one another, so
                    # check that token positions still available (only one span allowed per token in text 1):
                    if all(token_is_available_1[slice(*span)]):
                        matched_spans_1.append(span)
                        token_is_available_1[slice(*span)] = [False] * (span[1] - span[0])
            for span1 in matched_spans_1:
                alignment[span1] = spans_2

        return alignment
267 |
268 |
class SpacyHuggingfaceTokenizer:
    """Tokenizes spaCy sentences with a HuggingFace tokenizer while tracking,
    for each spaCy token, which HF sub-token indices it produced."""

    def __init__(
        self,
        model_name,
        max_length
    ):
        # use_fast=False loads the slow (pure-Python) tokenizer implementation.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        # Maximum number of HF sub-tokens kept per sentence (see tokenize()).
        self.max_length = max_length

    def batch_encode(
        self,
        sents: List[Span]
    ):
        """Encode a batch of spaCy sentence spans for the HF model.

        Args:
            sents: spaCy sentence spans to encode.

        Returns: tuple (encoded, special_tokens_masks, token_alignments) where
            encoded holds padded 'input_ids' / 'attention_mask' tensors,
            special_tokens_masks flags special-token positions per sentence,
            and token_alignments maps each spaCy token to its HF token indices
            (None for tokens dropped by truncation).
        """
        token_alignments = []
        token_ids_list = []

        # Tokenize each sentence and special tokens.
        for sent in sents:
            hf_tokens, token_alignment = self.tokenize(sent)
            token_alignments.append(token_alignment)
            token_ids = self.tokenizer.convert_tokens_to_ids(hf_tokens)
            # First pass: add special tokens (BOS/EOS) but no padding yet --
            # the batch maximum length is unknown until all sentences are done.
            encoding = self.tokenizer.prepare_for_model(
                token_ids,
                add_special_tokens=True,
                padding=False,
            )
            token_ids_list.append(encoding['input_ids'])

        # Add padding
        max_length = max(map(len, token_ids_list))
        attention_mask = []
        input_ids = []
        special_tokens_masks = []
        for token_ids in token_ids_list:
            # Second pass: pad to the batch maximum. Special tokens were
            # already added above, hence add_special_tokens=False here.
            encoding = self.tokenizer.prepare_for_model(
                token_ids,
                padding=PaddingStrategy.MAX_LENGTH,
                max_length=max_length,
                add_special_tokens=False
            )
            input_ids.append(encoding['input_ids'])
            attention_mask.append(encoding['attention_mask'])
            special_tokens_masks.append(
                self.tokenizer.get_special_tokens_mask(
                    encoding['input_ids'],
                    already_has_special_tokens=True
                )
            )

        encoded = {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask)
        }
        return encoded, special_tokens_masks, token_alignments

    def tokenize(
        self,
        sent
    ):
        """Convert spacy sentence to huggingface tokens and compute the alignment"""
        hf_tokens = []
        token_alignment = []
        for i, token in enumerate(sent):
            # "Tokenize" each word individually, so as to track the alignment between spaCy/HF tokens
            # Prefix all tokens with a space except the first one in the sentence
            if i == 0:
                token_text = token.text
            else:
                token_text = ' ' + token.text
            start_hf_idx = len(hf_tokens)
            word_tokens = self.tokenizer.tokenize(token_text)
            end_hf_idx = len(hf_tokens) + len(word_tokens)
            # NOTE(review): `<` drops a word whose final sub-token would land
            # exactly at max_length; `<=` would still fit within max_length
            # tokens. Confirm whether the extra slot is reserved on purpose.
            if end_hf_idx < self.max_length:
                hf_tokens.extend(word_tokens)
                hf_idxs = list(range(start_hf_idx, end_hf_idx))
            else:
                # Truncated: this spaCy token has no HF counterpart.
                hf_idxs = None
            token_alignment.append(hf_idxs)
        return hf_tokens, token_alignment
348 |
349 |
350 | def _split(data, sizes):
351 | it = iter(data)
352 | return [[next(it) for _ in range(size)] for size in sizes]
353 |
354 |
355 | def _iter_len(it):
356 | return sum(1 for _ in it)
357 |
358 | # TODO set up batching
359 | # To get top K axis and value per row: https://stackoverflow.com/questions/42832711/using-np-argpartition-to-index-values-in-a-multidimensional-array
360 |
361 |
362 | def _ngrams(tokens, n):
363 | for i in range(len(tokens) - n + 1):
364 | yield tokens[i:i + n]
365 |
--------------------------------------------------------------------------------
/components.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from itertools import count
3 | from operator import itemgetter
4 | from pathlib import Path
5 | from typing import Dict, Optional
6 | from typing import List, Tuple, Union
7 |
8 | import htbuilder
9 | import streamlit as st
10 | from htbuilder import span, div, script, style, link, styles, HtmlElement, br
11 | from htbuilder.units import px
12 | from spacy.tokens import Doc
13 |
# Color palette cycled by get_color() so each summary sentence gets a distinct
# highlight color (looks like a ColorBrewer qualitative scheme -- TODO confirm).
palette = [
    "#66c2a5",
    "#fc8d62",
    "#8da0cb",
    "#e78ac3",
    "#a6d854",
    "#ffd92f",
    "#e5c494",
    "#b3b3b3",
]
# Gray presumably used for de-emphasized highlights; not referenced in this chunk.
inactive_color = "#BBB"
25 |
26 |
def local_stylesheet(path):
    """Return a <style> element containing the contents of a local CSS file.

    Args:
        path: Filesystem path of the CSS file.
    """
    # Explicit encoding: relying on the locale default can mis-read CSS
    # containing non-ASCII characters on some platforms.
    with open(path, encoding="utf-8") as f:
        css = f.read()
    return style()(
        css
    )
33 |
34 |
def remote_stylesheet(url):
    """Return a <link> element that loads a stylesheet from a remote URL.

    Args:
        url: URL of the stylesheet.
    """
    # Fixed: without rel="stylesheet" the browser does not treat the <link>
    # as a stylesheet at all.
    return link(
        rel="stylesheet",
        href=url
    )
39 |
40 |
def local_script(path):
    """Return a <script> element containing the contents of a local JS file.

    Args:
        path: Filesystem path of the JavaScript file.
    """
    # Explicit encoding: relying on the locale default can mis-read scripts
    # containing non-ASCII characters on some platforms.
    with open(path, encoding="utf-8") as f:
        code = f.read()
    return script()(
        code
    )
47 |
48 |
def remote_script(url):
    """Build a <script> tag that sources JavaScript from a remote URL."""
    return script(src=url)
53 |
54 |
def get_color(sent_idx):
    """Pick a highlight color for a sentence index, wrapping around the palette."""
    wrapped_idx = sent_idx % len(palette)
    return palette[wrapped_idx]
57 |
58 |
def hex_to_rgb(hex):
    """Convert a hex color string like "#66c2a5" into an (r, g, b) integer tuple."""
    digits = hex.replace("#", '')
    return tuple(int(digits[pos:pos + 2], 16) for pos in (0, 2, 4))
62 |
63 |
def color_with_opacity(hex_color, opacity):
    """Render a hex color as a CSS rgba() string with the given opacity."""
    r, g, b = hex_to_rgb(hex_color)
    return f"rgba({r},{g},{b},{opacity:.2f})"
67 |
68 |
class Component:
    """Base class for HTML components rendered through Streamlit."""

    def show(self, width=None, height=None, scrolling=True, **kwargs):
        """Render the component inside a Streamlit HTML frame.

        Args:
            width: Frame width in pixels, or None for the default.
            height: Frame height in pixels, or None for the default.
            scrolling: Whether the frame allows scrolling.
            **kwargs: CSS properties applied to a wrapping <div>.
        """
        out = div(style=styles(
            **kwargs
        ))(self.html())
        html = str(out)
        st.components.v1.html(html, width=width, height=height, scrolling=scrolling)

    def html(self):
        """Return the component's HTML element tree; subclasses must override.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        # Fixed: `raise NotImplemented` raised a TypeError (NotImplemented is
        # a constant, not an exception); NotImplementedError is the correct
        # way to mark an abstract method.
        raise NotImplementedError
80 |
81 |
82 | class MainView(Component):
83 |
    def __init__(
        self,
        document: Doc,
        summaries: List[Doc],
        semantic_alignments: Optional[List[Dict]],
        lexical_alignments: Optional[List[Dict]],
        layout: str,
        scroll: bool,
        gray_out_stopwords: bool
    ):
        """Store the data needed to render the document/summaries view.

        Args:
            document: Source spaCy document shown in the document pane.
            summaries: Candidate summaries as spaCy documents, one pane entry each.
            semantic_alignments: Per-summary maps of token-level semantic
                matches, or None to disable semantic highlighting.
            lexical_alignments: Per-summary maps of n-gram span matches, or
                None to disable lexical underlining.
            layout: Layout mode name (its interpretation is not visible in
                this part of the file -- see callers).
            scroll: Whether the document/summary panes get scrollable styling.
            gray_out_stopwords: Whether stopword tokens are rendered grayed out.
        """
        self.document = document
        self.summaries = summaries
        self.semantic_alignments = semantic_alignments
        self.lexical_alignments = lexical_alignments
        self.layout = layout
        self.scroll = scroll
        self.gray_out_stopwords = gray_out_stopwords
101 |
102 | def html(self):
103 |
104 | # Add document elements
105 | if self.document._.name == 'Document':
106 | document_name = 'Source Document'
107 | else:
108 | document_name = self.document._.name + ' summary'
109 | doc_header = div(
110 | id_="document-header"
111 | )(
112 | document_name
113 | )
114 | doc_elements = []
115 |
116 | # Add document content, which comprises multiple elements, one for each summary. Only the elment corresponding to
117 | # selected summary will be visible.
118 |
119 | mu = MultiUnderline()
120 |
121 | for summary_idx, summary in enumerate(self.summaries):
122 | token_idx_to_sent_idx = {}
123 | for sent_idx, sent in enumerate(summary.sents):
124 | for token in sent:
125 | token_idx_to_sent_idx[token.i] = sent_idx
126 | is_selected_summary = (summary_idx == 0) # By default, first summary is selected
127 |
128 | if self.semantic_alignments is not None:
129 | doc_token_idx_to_matches = defaultdict(list)
130 | semantic_alignment = self.semantic_alignments[summary_idx]
131 | for summary_token_idx, matches in semantic_alignment.items():
132 | for doc_token_idx, sim in matches:
133 | doc_token_idx_to_matches[doc_token_idx].append((summary_token_idx, sim))
134 | else:
135 | doc_token_idx_to_matches = {}
136 |
137 | token_elements = []
138 | for doc_token_idx, doc_token in enumerate(self.document):
139 | if doc_token.is_stop or doc_token.is_punct:
140 | classes = ["stopword"]
141 | if self.gray_out_stopwords:
142 | classes.append("grayed-out")
143 | el = span(
144 | _class=" ".join(classes)
145 | )(
146 | doc_token.text
147 | )
148 |
149 | else:
150 | matches = doc_token_idx_to_matches.get(doc_token_idx)
151 | if matches:
152 | summary_token_idx, sim = max(matches, key=itemgetter(1))
153 | sent_idx = token_idx_to_sent_idx[summary_token_idx]
154 | color_primary = get_color(sent_idx)
155 | highlight_color_primary = color_with_opacity(color_primary, sim)
156 | props = {
157 | 'data-highlight-id': str(doc_token_idx),
158 | 'data-primary-color': highlight_color_primary
159 | }
160 | match_classes = []
161 | for summary_token_idx, sim in matches:
162 | sent_idx = token_idx_to_sent_idx[summary_token_idx]
163 | match_classes.append(f"summary-highlight-{summary_idx}-{summary_token_idx}")
164 | color = color_with_opacity(get_color(sent_idx), sim)
165 | props[f"data-color-{summary_idx}-{summary_token_idx}"] = color
166 | props["data-match-classes"] = " ".join(match_classes)
167 | el = self._highlight(
168 | doc_token.text,
169 | highlight_color_primary,
170 | color_primary,
171 | match_classes + ["annotation-hidden"],
172 | **props
173 | )
174 | else:
175 | el = doc_token.text
176 | token_elements.append(el)
177 |
178 | spans = []
179 | if self.lexical_alignments is not None:
180 | lexical_alignment = self.lexical_alignments[summary_idx]
181 | for summary_span, doc_spans in lexical_alignment.items():
182 | summary_span_start, summary_span_end = summary_span
183 | span_id = f"{summary_idx}-{summary_span_start}-{summary_span_end}"
184 | sent_idx = token_idx_to_sent_idx[summary_span_start]
185 | for doc_span_start, doc_span_end in doc_spans:
186 | spans.append((
187 | doc_span_start,
188 | doc_span_end,
189 | sent_idx,
190 | get_color(sent_idx),
191 | span_id
192 | ))
193 | token_elements = mu.markup(token_elements, spans)
194 |
195 | classes = ["main-doc", "bordered"]
196 | if self.scroll:
197 | classes.append("scroll")
198 |
199 | main_doc = div(
200 | _class=" ".join(classes)
201 | )(
202 | token_elements
203 | ),
204 |
205 | classes = ["doc"]
206 | if is_selected_summary:
207 | classes.append("display")
208 | else:
209 | classes.append("nodisplay")
210 | doc_elements.append(
211 | div(
212 | **{
213 | "class": " ".join(classes),
214 | "data-index": summary_idx
215 | }
216 | )(
217 | main_doc,
218 | div(_class="proxy-doc"),
219 | div(_class="proxy-scroll")
220 | )
221 | )
222 |
223 | summary_title = "Summary"
224 | summary_header = div(
225 | id_="summary-header"
226 | )(
227 | summary_title,
228 | div(id="summary-header-gap"),
229 | )
230 |
231 | summary_items = []
232 | for summary_idx, summary in enumerate(self.summaries):
233 | token_idx_to_sent_idx = {}
234 | for sent_idx, sent in enumerate(summary.sents):
235 | for token in sent:
236 | token_idx_to_sent_idx[token.i] = sent_idx
237 |
238 | spans = []
239 | matches_ngram = [False] * len(list(summary))
240 | if self.lexical_alignments is not None:
241 | lexical_alignment = self.lexical_alignments[summary_idx]
242 | for summary_span in lexical_alignment.keys():
243 | start, end = summary_span
244 | matches_ngram[slice(start, end)] = [True] * (end - start)
245 | span_id = f"{summary_idx}-{start}-{end}"
246 | sent_idx = token_idx_to_sent_idx[start]
247 | spans.append((
248 | start,
249 | end,
250 | sent_idx,
251 | get_color(sent_idx),
252 | span_id
253 | ))
254 |
255 | if self.semantic_alignments is not None:
256 | semantic_alignment = self.semantic_alignments[summary_idx]
257 | else:
258 | semantic_alignment = {}
259 | token_elements = []
260 | for token_idx, token in enumerate(summary):
261 | if token.is_stop or token.is_punct:
262 | classes = ["stopword"]
263 | if self.gray_out_stopwords:
264 | classes.append("grayed-out")
265 | el = span(
266 | _class=" ".join(classes)
267 | )(
268 | token.text
269 | )
270 | else:
271 | classes = []
272 | if token.ent_iob_ in ('I', 'B'):
273 | classes.append("entity")
274 | if matches_ngram[token_idx]:
275 | classes.append("matches-ngram")
276 | matches = semantic_alignment.get(token_idx)
277 | if matches:
278 | top_match = max(matches, key=itemgetter(1))
279 | top_sim = max(top_match[1], 0)
280 | top_doc_token_idx = top_match[0]
281 | props = {
282 | "data-highlight-id": f"{summary_idx}-{token_idx}",
283 | "data-top-doc-highlight-id": str(top_doc_token_idx),
284 | "data-top-doc-sim": f"{top_sim:.2f}",
285 | }
286 | classes.extend([
287 | "annotation-hidden",
288 | f"summary-highlight-{summary_idx}-{token_idx}"
289 | ])
290 | sent_idx = token_idx_to_sent_idx[token_idx]
291 | el = self._highlight(
292 | token.text,
293 | color_with_opacity(get_color(sent_idx), top_sim),
294 | color_with_opacity(get_color(sent_idx), 1),
295 | classes,
296 | **props
297 | )
298 | else:
299 | if classes:
300 | el = span(_class=" ".join(classes))(token.text)
301 | else:
302 | el = token.text
303 | token_elements.append(el)
304 |
305 | token_elements = mu.markup(token_elements, spans)
306 |
307 | classes = ["summary-item"]
308 | if summary_idx == 0: # Default is for first summary to be selected
309 | classes.append("selected")
310 |
311 | summary_items.append(
312 | div(
313 | **{"class": ' '.join(classes), "data-index": summary_idx}
314 | )(
315 | div(_class="name")(summary._.name),
316 | div(_class="content")(token_elements)
317 | )
318 | )
319 | classes = ["summary-list", "bordered"]
320 | if self.scroll:
321 | classes.append("scroll")
322 | if self.lexical_alignments is not None:
323 | classes.append("has-lexical-alignment")
324 | if self.semantic_alignments is not None:
325 | classes.append("has-semantic-alignment")
326 | summary_list = div(
327 | _class=" ".join(classes)
328 | )(
329 | summary_items
330 | )
331 |
332 | annotation_key = \
333 | """
334 |