├── .gitignore ├── assets └── images │ ├── promo.png │ ├── sample_tree.png │ ├── tree_modified_structure.png │ └── promo_tree.svg ├── pytest.ini ├── constituent_treelib ├── __init__.py ├── errors.py ├── export.py └── core.py ├── requirements.txt ├── CITATION.cff ├── LICENSE ├── .github └── workflows │ ├── python-publish.yml │ └── python-package.yml ├── ctl_app.py ├── pyproject.toml ├── tests └── test_ctl_core.py ├── README.md └── Constituent_TreeLib_Quickstart.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .venv* 2 | .resources 3 | __pycache__ 4 | .pytest_cache 5 | .vscode -------------------------------------------------------------------------------- /assets/images/promo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Halvani/Constituent-Treelib/HEAD/assets/images/promo.png -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | pythonpath = . 3 | filterwarnings = 4 | ignore:.*torch_struct.distributions.TreeCRF.* -------------------------------------------------------------------------------- /assets/images/sample_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Halvani/Constituent-Treelib/HEAD/assets/images/sample_tree.png -------------------------------------------------------------------------------- /assets/images/tree_modified_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Halvani/Constituent-Treelib/HEAD/assets/images/tree_modified_structure.png -------------------------------------------------------------------------------- /constituent_treelib/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import warnings 3 | from constituent_treelib.core import * 4 | warnings.filterwarnings("ignore", message=".*torch_struct.distributions.TreeCRF.*") 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | benepar 2 | contractions 3 | langid 4 | huspacy 5 | nltk 6 | pdfkit 7 | pytest 8 | spacy 9 | streamlit 10 | torch 11 | Wand 12 | tokenizers 13 | transformers 14 | protobuf==3.20.3 15 | svgling 16 | ipython -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this library, please cite it as below." 3 | authors: 4 | - family-names: "Halvani" 5 | given-names: "Oren" 6 | orcid: "https://orcid.org/0000-0002-1460-9373" 7 | title: "Constituent Treelib - A Lightweight Python Library for Constructing, Processing, and Visualizing Constituent Trees." 
8 | version: 0.0.7 9 | doi: 10.5281/zenodo.10951644 10 | date-released: 2024-04-10 11 | url: "https://github.com/Halvani/constituent-treelib" -------------------------------------------------------------------------------- /constituent_treelib/errors.py: -------------------------------------------------------------------------------- 1 | class ParenthesesError(Exception): 2 | """ Raised when there is a mismatch between opening and closing parenthesis.""" 3 | pass 4 | 5 | 6 | class NoneOrEmptyBracketedTreeError(Exception): 7 | """ Raised when there a bracketed tree is either none or empty.""" 8 | pass 9 | 10 | 11 | class NLPPipelineError(Exception): 12 | """ Raised when there is an issue with the nlp pipeline (e.g., the benepar component is missing.)""" 13 | pass 14 | 15 | 16 | class LanguageError(Exception): 17 | """ Raised in case of language issues (e.g., a language mismatch between spaCy and benepar).""" 18 | pass 19 | 20 | 21 | class SentenceError(Exception): 22 | """ Raised when an invalid sentence is given.""" 23 | pass -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Oren Halvani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.GIT_TOKEN }} 40 | -------------------------------------------------------------------------------- /ctl_app.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import spacy 3 | import base64 4 | import streamlit as st 5 | from pathlib import Path 6 | from constituent_treelib import ConstituentTree, BracketedTree 7 | 8 | 9 | @st.experimental_singleton 10 | def get_nlp(): 11 | language = ConstituentTree.Language.English 12 | spacy_model_size = ConstituentTree.SpacyModelSize.Medium 13 | nlp = ConstituentTree.create_pipeline(language, spacy_model_size, download_models = False) 14 | return nlp 15 | 16 | 17 | nlp = get_nlp() 18 | 19 | st.title("*** Constituent Tree Playground ***") 20 | 21 | text = st.text_input("Enter your sentence...", "There is a constituent behind the tree!") 22 | 23 | tree = ConstituentTree(text, nlp) 24 | tree.export_tree("temp.svg") 25 | tree_svg = Path("temp.svg").read_text() 26 | #tree_svg = tree_svg.replace("SVPVBExtractNPNNSconstituentsPPINwithNPNNease.! -------------------------------------------------------------------------------- /constituent_treelib/export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import json 5 | import pdfkit 6 | from pathlib import Path 7 | from nltk import Tree 8 | from nltk.draw.tree import TreeView 9 | from nltk.treeprettyprinter import TreePrettyPrinter 10 | from typing import Dict 11 | 12 | 13 | def __to_dict(tree: Tree) -> Dict[str, str]: 14 | """Transforms the given tree into a nested dictionary. 15 | 16 | Args: 17 | tree: An nltk.Tree that should be transformed into a nested dictionary. 18 | 19 | Returns: 20 | A nested dictionary representation of the given tree. 21 | """ 22 | return {tree.label(): [__to_dict(t) if isinstance(t, Tree) else t for t in tree]} 23 | 24 | 25 | def export_figure(nltk_tree: Tree, destination_filepath: str, wkhtmltopdf_bin_filepath: str, verbose: bool, 26 | dpi: int, tree_style_nltk: bool) -> None: 27 | """ Exports the constructed constituent tree in various file formats (currently supported: 28 | [.pdf, .svg, .ps, .png, .jpg, .gif, .bmp, .psd, .eps, .tiff, .txt, .tex, .json, .yaml]. 29 | 30 | Args: 31 | nltk_tree: To parsed nltk.Tree that should be exported. 32 | 33 | destination_filepath: The file path to which the tree should be exported. In case of an image format, 34 | the resulting visualization will be cropped with respect to unnecessary margins. 35 | 36 | wkhtmltopdf_bin_filepath: To filepath to the rendering tool "wkhtmltopdf". Only required if the 37 | visualization of the constituent tree should be exported to a PDF file. If not already done, the tool 38 | wkhtmltopdf must first be downloaded and installed from https://wkhtmltopdf.org before the visualization 39 | can be exported. 
40 | 41 | dpi: Specifies the desired resolution. A DPI value of 300 is considered a good standard for 42 | printable files. 43 | 44 | tree_style_nltk: If set to True, the classic NLTK style will be used to visualize the nltk.Tree. 45 | 46 | verbose: If set to True, a short message about whether the output file creation was 47 | successful is displayed. 48 | """ 49 | 50 | # Convert the nltk.Tree into an SVG representation. 51 | svg_obj = nltk_tree._repr_svg_() 52 | 53 | if tree_style_nltk: 54 | svg_obj = TreePrettyPrinter(nltk_tree).svg(nodecolor='black', leafcolor='black', funccolor='black') 55 | 56 | extension = Path(destination_filepath).suffix.lower() 57 | try: 58 | if extension == '.ps': 59 | TreeView(nltk_tree)._cframe.print_to_file(destination_filepath) 60 | 61 | elif extension == '.txt': 62 | tree_as_text = TreePrettyPrinter(nltk_tree).text() 63 | Path(destination_filepath).write_text(data=tree_as_text, encoding='utf-8') 64 | 65 | elif extension == '.tex': 66 | # Build a minimal working tex file. 67 | tex_head = '\\documentclass{article}\n\\usepackage[utf8]{inputenc}\n\\usepackage{qtree}\n' 68 | tex_body = '\\begin{document}\n' + nltk_tree.pformat_latex_qtree() + '\n\\end{document}' 69 | tex_file = f'{tex_head}{tex_body}' 70 | 71 | Path(destination_filepath).write_text(data=tex_file, encoding='utf-8') 72 | 73 | elif extension == '.json': 74 | tree_dic = __to_dict(nltk_tree) 75 | json_str = json.dumps(tree_dic, indent=1) 76 | Path(destination_filepath).write_text(data=json_str, encoding='utf-8') 77 | 78 | elif extension == '.svg': 79 | Path(destination_filepath).write_text(svg_obj, encoding='utf-8') 80 | 81 | elif extension in ['.jpg', '.png', '.gif', '.bmp', '.eps', '.psd', '.tiff', '.yaml']: 82 | # In case that ImageMagick is not installed, the Python binding "wand" raises an exception and returns the 83 | # appropriate URL for the version of ImageMagick that matches the current operating system. 84 | # After downloading and installing ImageMagick, "wand" can be used to export the visualization. 85 | # Take a look at the docs https://docs.wand-py.org for further information. 86 | from wand.api import library 87 | from wand.image import Image 88 | 89 | with Image(blob=svg_obj.encode('utf-8'), resolution=dpi) as image: 90 | image.save(filename=destination_filepath) 91 | 92 | elif extension == '.pdf': 93 | msg_invalid_wkhtmltopdf_path = 'A valid path to the wkhtmltopdf binary must be provided in order to export ' \ 94 | 'the parsed nltk.Tree into a pdf file.' 95 | 96 | # The path to the wkhtmltopdf binary was not specified (None). 97 | if wkhtmltopdf_bin_filepath is None: 98 | # In case of a Windows OS, an attempt is made to locate the path of the wkhtmltopdf binary by looking up 99 | # the default installation directory ("Program Files/wkhtmltopdf") 100 | if sys.platform == 'win32': 101 | wkhtmltopdf_bin_filepath = Path(os.environ.get('ProgramFiles'), 102 | 'wkhtmltopdf', 103 | 'bin', 104 | 'wkhtmltopdf.exe') 105 | 106 | # Check if the default path to the wkhtmltopdf binary actually exists. 107 | if not Path(wkhtmltopdf_bin_filepath).exists(): 108 | raise AssertionError( 109 | 'The wkhtmltopdf binary (e.g., wkhtmltopdf.exe on a Windows OS) could not be found ' 110 | 'under the default installation directory ("Program Files/wkhtmltopdf"). ' 111 | 'If not installed yet, you should download it first from https://wkhtmltopdf.org. ' 112 | 'Note, you only need to install the program itself and provide this method the path where ' 113 | 'it can be found. 
Setting environment variables is not required.') 114 | 115 | # If the current OS is not Windows, the path to the wkhtmltopdf binary must be specified manually. 116 | else: 117 | raise AssertionError(msg_invalid_wkhtmltopdf_path) 118 | 119 | # In case that a path to the wkhtmltopdf binary has been provided, check if it is valid. 120 | else: 121 | if len(wkhtmltopdf_bin_filepath.strip()) == 0 or not Path(wkhtmltopdf_bin_filepath).exists(): 122 | raise AssertionError(msg_invalid_wkhtmltopdf_path) 123 | 124 | # Determine visible size of the SVG object. 125 | height = int(re.search(r"height=\"([0-9]+)", svg_obj).groups(1)[0]) 126 | width = int(re.search(r"width=\"([0-9]+)", svg_obj).groups(1)[0]) 127 | height = str((height + 10) * dpi / 96) + "px" 128 | width = str(width * dpi / 96) + "px" 129 | 130 | # Default options for the rendering process. All options of wkhtmltopdf can be looked up under' 131 | # https://wkhtmltopdf.org/usage/wkhtmltopdf.txt 132 | options = { 133 | 'dpi': str(dpi), 134 | 'page-size': 'Letter', 135 | 'margin-top': '0mm', 136 | 'margin-bottom': '0mm', 137 | 'margin-left': '0mm', 138 | 'margin-right': '0mm', 139 | 'page-width': width, 140 | 'page-height': height, 141 | 'encoding': "UTF-8", 142 | 'disable-smart-shrinking': None 143 | } 144 | 145 | config = pdfkit.configuration(wkhtmltopdf=str(wkhtmltopdf_bin_filepath)) 146 | pdfkit.from_string(svg_obj, output_path=destination_filepath, configuration=config, options=options) 147 | else: 148 | raise ValueError( 149 | "Currently, only the following file formats are supported " 150 | "[.pdf, .svg, .ps, .png, .jpg, .gif, .bmp, .psd, .eps, .tiff, .txt, .tex, .json, .yaml]") 151 | 152 | except Exception as e: 153 | raise Exception(f'The specified {extension[1:].upper()}-file could not be saved. Error: {e}') 154 | 155 | if verbose: 156 | print(f'{extension[1:].upper()}-file successfully saved to: {destination_filepath}') 157 | -------------------------------------------------------------------------------- /tests/test_ctl_core.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import spacy 5 | import pytest 6 | import inspect 7 | import hashlib 8 | import unittest 9 | 10 | # Import CTL from the parent directory 11 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 12 | parentdir = os.path.dirname(currentdir) 13 | sys.path.insert(0, parentdir) 14 | from constituent_treelib import ConstituentTree, BracketedTree, Language, Structure 15 | from constituent_treelib.errors import * 16 | 17 | 18 | class TestBracketedTree(unittest.TestCase): 19 | def test_error_none_bracketed_tree(self): 20 | with pytest.raises(NoneOrEmptyBracketedTreeError): 21 | BracketedTree(bracketed_tree_string=None) 22 | 23 | def test_error_empty_tree(self): 24 | with pytest.raises(NoneOrEmptyBracketedTreeError): 25 | BracketedTree(bracketed_tree_string="") 26 | 27 | def test_error_no_matching_closing_opening_parentheses(self): 28 | with pytest.raises(ParenthesesError): 29 | bracketed_tree_string = "(S (NP (PRP I)) (VP (VBP love) (NP (NNS cookies))) (. !)" 30 | BracketedTree(bracketed_tree_string) 31 | 32 | def test_error_too_many_closing_parentheses(self): 33 | with pytest.raises(ParenthesesError): 34 | bracketed_tree_string = "(S (NP (PRP I)) (VP (VBP love) (NP (NNS cakes) (CC and) (NNS cookies))) (. 
!)))" 35 | BracketedTree(bracketed_tree_string) 36 | 37 | def test_remove_postag_nodes(self): 38 | bracketed_tree_string = "(S (NP (PRP I)) (VP (VBP love) (NP (NNS cookies))) (. !))" 39 | bracketed_tree_string_without_postags = BracketedTree.remove_postag_nodes(bracketed_tree_string) 40 | assert bracketed_tree_string_without_postags == "(S (NP I) (VP love (NP cookies)) !)" 41 | 42 | def test_remove_token_leaves(self): 43 | bracketed_tree_string = "(S (NP (PRP I)) (VP (VBP love) (NP (NNS cookies))) (. !))" 44 | bracketed_tree = BracketedTree(bracketed_tree_string) 45 | bracketed_tree_string_without_token_leaves = BracketedTree.remove_token_leaves(bracketed_tree.nltk_tree) 46 | assert bracketed_tree_string_without_token_leaves == "(S (NP (PRP)) (VP (VBP) (NP (NNS))) (.))" 47 | 48 | 49 | class TestConstituentTree(unittest.TestCase): 50 | nlp = None 51 | defect_nlp = None 52 | unnecessary_components = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"] 53 | 54 | def setUp(self): 55 | spacy_model_en_small = "en_core_web_sm" 56 | spacy_model_en_medium = "en_core_web_md" 57 | benepar_model_fr = "benepar_fr2" 58 | benepar_model_en_small = "benepar_en3" 59 | 60 | ConstituentTree.download(spacy_model_en_small, benepar_model_fr) 61 | ConstituentTree.download(spacy_model_en_medium, benepar_model_en_small) 62 | 63 | self.defect_nlp = spacy.load("en_core_web_sm", disable=self.unnecessary_components) 64 | self.defect_nlp.add_pipe("sentencizer") 65 | self.nlp = ConstituentTree.create_pipeline(download_models=False) 66 | 67 | def test_error_nlp_pipeline_none(self): 68 | with pytest.raises(NLPPipelineError): 69 | sentence = "I will not instantiate a ConstituentTree object without an nlp pipeline ever again." 70 | ConstituentTree(sentence) 71 | 72 | def test_error_nlp_pipeline_invalid_type(self): 73 | with pytest.raises(NLPPipelineError): 74 | sentence = "I will not instantiate a ConstituentTree object with an invalid nlp pipeline ever again." 75 | ConstituentTree(sentence, nlp="") 76 | 77 | def test_error_nlp_pipeline_without_benepar_component(self): 78 | with pytest.raises(NLPPipelineError): 79 | sentence = "I will not instantiate a ConstituentTree object with an invalid nlp pipeline ever again." 80 | ConstituentTree(sentence, nlp=self.defect_nlp) 81 | 82 | def test_error_nlp_pipeline_models_sentence_language_mismatch(self): 83 | with pytest.raises(LanguageError): 84 | sentence = "Huch, das war jetzt nicht gewollt." 85 | ConstituentTree(sentence, nlp=self.nlp) 86 | 87 | def test_spacy_pos_tagging(self): 88 | doc = self.nlp("Today was a good day!") 89 | assert ['NN', 'VBD', 'DT', 'JJ', 'NN', '.'] == [t.tag_ for t in doc] 90 | 91 | def test_error_sentence_none(self): 92 | with pytest.raises(SentenceError): 93 | sentence = None 94 | ConstituentTree(sentence, nlp=self.nlp) 95 | 96 | def test_tree_parsing(self): 97 | sentence = "I love cakes and cookies!" 98 | tree = ConstituentTree(sentence, self.nlp) 99 | bracketed_sentence = "(S (NP (PRP I)) (VP (VBP love) (NP (NNS cakes) (CC and) (NNS cookies))) (. !))" 100 | assert tree.to_bracketed_tree_string() == bracketed_sentence 101 | 102 | def test_create_tree_from_bracketed_string(self): 103 | bracketed_tree_string = "(S (NP (PRP You)) (VP (MD must) (VP (VB construct) (NP (JJ additional) " \ 104 | "(NNS pylons)))) (. !))" 105 | bracketed_tree = BracketedTree(bracketed_tree_string) 106 | tree_from_bracketed = ConstituentTree(sentence=bracketed_tree, nlp=self.nlp) 107 | sentence = "You must construct additional pylons !" 
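# The leaves() call below is expected to reconstruct the tokens of the parsed tree as a single space-separated string.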
108 | assert tree_from_bracketed.leaves(tree_from_bracketed.nltk_tree) == sentence 109 | 110 | def test_error_multiple_sentences(self): 111 | with pytest.raises(SentenceError): 112 | sentence = "I love cakes and cookies! I don't like eggplants." 113 | ConstituentTree(sentence, nlp=self.nlp) 114 | 115 | def test_extracted_phrases(self): 116 | sentence = "Albert Einstein was a German-born theoretical physicist." 117 | tree = ConstituentTree(sentence, self.nlp) 118 | expected_output = { 119 | "NP": ["Albert Einstein", "a German - born theoretical physicist"], 120 | "S": ["Albert Einstein was a German - born theoretical physicist ."], 121 | "ADJP": ["German - born"], 122 | "VP": ["was a German - born theoretical physicist"]} 123 | assert tree.extract_all_phrases() == expected_output 124 | 125 | def test_extracted_phrases_tree_without_token_leaves(self): 126 | sentence = "Give it all you've got!" 127 | tree_without_token_leaves = ConstituentTree(sentence, self.nlp, structure=Structure.WithoutTokenLeaves) 128 | postag_phrases = {'S': ['VB PRP DT PRP VBP VBN .', 'PRP VBP VBN'], 129 | 'NP': ['DT PRP VBP VBN'], 130 | 'VP': ['VB PRP DT PRP VBP VBN', 'VBP VBN']} 131 | assert tree_without_token_leaves.extract_all_phrases() == postag_phrases 132 | 133 | def contraction_expansion(self): 134 | sentence = "I haven't the foggiest idea what you're talking about!" 135 | tree = ConstituentTree(sentence, self.nlp, expand_contractions=True) 136 | 137 | nc_text_only = tree.leaves(tree.nltk_tree, ConstituentTree.NodeContent.Text) 138 | nc_postag_only = tree.leaves(tree.nltk_tree, ConstituentTree.NodeContent.Pos) 139 | nc_combined = tree.leaves(tree.nltk_tree, ConstituentTree.NodeContent.Combined) 140 | 141 | true_text_only = "I have not the foggiest idea what you are talking about !" 142 | true_postag_only = "PRP VBP RB DT JJS NN WP PRP VBP VBG IN ." 143 | true_combined = "I_PRP have_VBP not_RB the_DT foggiest_JJS idea_NN what_WP " \ 144 | "you_PRP are_VBP talking_VBG about_IN !_." 145 | assert (nc_text_only == true_text_only and nc_postag_only == true_postag_only and nc_combined == true_combined) 146 | 147 | def test_tree_structure_without_token_leaves(self): 148 | sentence = "Let's test the improvements, shall we?" 149 | tree_without_token_leaves = ConstituentTree(sentence, self.nlp, structure=Structure.WithoutTokenLeaves) 150 | bracketed_string = "(SQ (S (VP VB (S (NP PRP) (VP VB (NP DT NNS))))) , MD (NP PRP) .)" 151 | assert tree_without_token_leaves.to_bracketed_tree_string() == bracketed_string 152 | 153 | def test_tree_structure_without_postag_nodes(self): 154 | sentence = "Let's test the improvements, shall we?" 155 | tree_without_postag_nodes = ConstituentTree(sentence, self.nlp, structure=Structure.WithoutPostagNodes) 156 | bracketed_string = "(SQ (S (VP Let (S (NP 's) (VP test (NP the improvements))))) , shall (NP we) ?)" 157 | assert tree_without_postag_nodes.to_bracketed_tree_string() == bracketed_string 158 | 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | <!-- Centered header: Constituent-Treelib logo and project badges -->
14 | 15 | # Constituent Treelib (CTL) 16 | A lightweight Python library for constructing, processing, and visualizing constituent trees. 17 | 18 | ## Description 19 | CTL is a lightweight Python library that offers a convenient way to parse sentences into constituent trees, modify them according to their structure, and visualize and export them into various [file formats](#Export_visualization). In addition, you can extract phrases according to their phrasal categories (which can be used, e.g., as features for various NLP tasks), validate already parsed sentences in bracket notation, or convert them back into sentences. 20 | 21 | CTL is built on top of **benepar** (*Berkeley Neural Parser*) as well as the two well-known NLP frameworks **spaCy** and **NLTK**. Here, spaCy is used for tokenization and sentence segmentation, while benepar performs the actual parsing of the sentences. NLTK, in turn, provides the fundamental data structure for storing and processing the parsed sentences. 22 | 23 | To gain a clearer picture of what a constituent tree looks like, consider the following example. Let *S* denote the sentence...
24 | ``` 25 | "Isaac Asimov was an American writer and professor of biochemistry at Boston University." 26 | ``` 27 | 28 | This sentence can be parsed into a bracketed tree string representation (shown below in *Penn Treebank* style): 29 | ``` 30 | (S 31 | (NP (NNP Isaac) (NNP Asimov)) 32 | (VP 33 | (VBD was) 34 | (NP 35 | (NP (DT an) (JJ American) (NN writer) (CC and) (NN professor)) 36 | (PP (IN of) (NP (NN biochemistry))) 37 | (PP (IN at) (NP (NNP Boston) (NNP University))))) 38 | (. .)) 39 | ``` 40 | 41 | which represents the actual constituent tree. However, since this notation is not really easy to read, we can turn it into a nicer visualization using - guess what - CTL! Once we have parsed and visualized the tree, we can export it to a desired format, here, for example, as a PNG file: 42 | 43 |
44 | ![(S (NP (NNP Isaac) (NNP Asimov)) (VP (VBD was) (NP (NP (DT an) (JJ American) (NN writer) (CC and) (NN professor)) (PP (IN of) (NP (NN biochemistry))) (PP (IN at) (NP (NNP Boston) (NNP University))))) (. .))](assets/images/sample_tree.png) 56 |
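To make this concrete, here is a minimal sketch of how such a PNG can be produced with CTL (the full API is introduced in the Quickstart below; the output file name is arbitrary, and PNG export additionally requires ImageMagick, see the non-Python dependencies):

```python
from constituent_treelib import ConstituentTree, Language

# Create the spaCy + benepar pipeline once (the required models are downloaded on demand).
nlp = ConstituentTree.create_pipeline(Language.English, ConstituentTree.SpacyModelSize.Medium)

# Parse the sentence and export the resulting constituent tree as a PNG file.
sentence = "Isaac Asimov was an American writer and professor of biochemistry at Boston University."
tree = ConstituentTree(sentence, nlp)
tree.export_tree(destination_filepath="sample_tree.png", verbose=True)
```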
57 | 58 | In case you grew up in the Usenet era, you might prefer the classic ASCII-ART look... 59 | 60 | ``` 61 | S 62 | __________________________________|____________________________________________________________ 63 | | VP | 64 | | _________|____________________ | 65 | | | NP | 66 | | | ____________|________________________________ | 67 | | | | PP PP | 68 | | | | ___|_______ ____|_____ | 69 | NP | NP | NP | NP | 70 | ____|____ | _____________|____________ | | | _____|______ | 71 | NNP NNP VBD DT JJ NN CC NN IN NN IN NNP NNP . 72 | | | | | | | | | | | | | | | 73 | Isaac Asimov was an American writer and professor of biochemistry at Boston University . 74 | ``` 75 | Regardless of which format is considered, the underlying representation[^1] shows three aspects of the structure of *S*: 76 | - Linear order of the words and their part-of-speech: ``NNP = Isaac``, ``NNP = Asimov``, ``VBD = was``, ... 77 | - Groupings of the words and their part-of-speech into phrases: ``NP = Isaac Asimov``, ``VP = an American writer and professor``, ``PP = of biochemistry`` and ``PP = at Boston University`` 78 | - Hierarchical structure of the phrases: ``S``, ``VP``, ``NP`` and ``PP`` 79 | 80 | 81 | ## Applications 82 | Constituent trees offer a wide range of applications including: 83 | - Analysis and comparison of sentence structures between different languages for (computational) linguists 84 | - Extracting phrasal features for certain NLP tasks (e.g., Machine Translation, Information Extraction, Paraphrasing, Stylometry, Deception Detection or Natural Language Watermarking) 85 | - Using the resulting representations as an input to train GNNs for specific tasks (e.g., Chemical–Drug Relation Extraction or Semantic Role Labeling) 86 | 87 | 88 | ## Features 89 | - Easy construction of constituent trees from raw or already processed sentences 90 | - Converting parsed constituent trees back into sentences 91 | - Convenient export of tree visualizations into various [file formats](#Export_visualization) 92 | - Extraction of phrases according to their phrasal categories 93 | - Manipulation of the tree structure (without inner postag nodes or without token leaves) 94 | - Multilingual (currently CTL supports [eight languages](#Available_models_and_languages)) 95 | - Automatic NLP pipeline creation (loads and installs the benepar + spaCy models on demand) 96 | - No API dependency (after downloading the models CTL can be used completely offline) 97 | - Extensively documented source code 98 | 99 | 100 | ## No Code Demo 101 | In case you just want to play around with CTL, there is a minimally functional Streamlit app that will be gradually extended. To run the demo, please first install Streamlit via: `pip install streamlit`. Afterwards, you can call the app from the command line as follows: `streamlit run ctl_app.py` 102 | 103 | 104 | ## Installation 105 | The easiest way to install CTL is to use pip, where you can choose between (1) the PyPI[^2] repository and (2) this repository. 106 | 107 | - (1) ```pip install constituent-treelib``` 108 | 109 | - (2) ```pip install git+https://github.com/Halvani/constituent_treelib.git ``` 110 | 111 | The latter will pull and install the latest commit from this repository as well as the required Python dependencies. 112 | 113 | ### Non-Python dependencies: 114 | CTL also relies on two open-source tools to export constituent trees into various file formats: 115 | 116 | - To export the constituent tree into a PDF, the command line tool **wkhtmltopdf** is required. 
Once downloaded and installed, the path to the wkhtmltopdf binary must be passed to the export function. 117 | 118 | - To export the constituent tree into the file formats JPG, PNG, GIF, BMP, EPS, PSD, TIFF and YAML, the software suite **ImageMagick** is required. 119 | 120 | 121 | ## Quickstart 122 | Below you can find several examples of the core functionality of CTL. More examples can be found in the jupyter notebook demo. 123 | 124 | 125 | ### Creating an NLP pipeline 126 | To instantiate a ``ConstituentTree`` object, CTL requires a spaCy-based NLP pipeline that incorporates a benepar component. Although you can set up this pipeline yourself, it is recommended (and more convenient) to let CTL do it for you automatically via the ``create_pipeline()`` method. Given the desired [language](#Available_models_and_languages), this method creates the NLP pipeline and also downloads[^3] the corresponding spaCy and benepar models, if requested. The following code shows an example of this: 127 | ```python 128 | from constituent_treelib import ConstituentTree, BracketedTree, Language, Structure 129 | 130 | # Define the language for the sentence as well as for the spaCy and benepar models 131 | language = Language.English 132 | 133 | # Define which specific SpaCy model should be used (default is Medium) 134 | spacy_model_size = ConstituentTree.SpacyModelSize.Medium 135 | 136 | # Create the pipeline (note, the required models will be downloaded and installed automatically) 137 | nlp = ConstituentTree.create_pipeline(language, spacy_model_size) 138 | 139 | >>> ✔ Download and installation successful 140 | >>> You can now load the package via spacy.load('en_core_web_md') 141 | 142 | >>> [nltk_data] Downloading package benepar_en3 to 143 | >>> [nltk_data] [..] \nltk_data... 144 | >>> [nltk_data] Unzipping models\benepar_en3.zip. 145 | ``` 146 | 147 | ### Define a sentence 148 | Next, we instantiate a ``ConstituentTree`` object and pass it the created NLP pipeline along with a sentence to parse, e.g. the memorable quote *"You must construct additional pylons!"*[^4]. Rather than a raw sentence, ``ConstituentTree`` also accepts an already parsed sentence wrapped as a BracketedTree object, or alternatively in the form of an NLTK tree. The following example illustrates all three options: 149 | ```python 150 | # Raw sentence 151 | sentence = 'You must construct additional pylons!' 152 | 153 | # Parsed sentence wrapped as a BracketedTree object 154 | bracketed_tree_string = '(S (NP (PRP You)) (VP (MD must) (VP (VB construct) (NP (JJ additional) (NNS pylons)))) (. 
!))' 155 | sentence = BracketedTree(bracketed_tree_string) 156 | 157 | # Parsed sentence in the form of an NLTK tree 158 | from nltk import Tree 159 | 160 | sentence = Tree('S', [Tree('NP', [Tree('PRP', ['You'])]), Tree('VP', [Tree('MD', ['must']), Tree('VP', [Tree('VB', ['construct']), Tree('NP', [Tree('JJ', ['additional']), Tree('NNS', ['pylons'])])])]), Tree('.', ['!'])]) 161 | 162 | tree = ConstituentTree(sentence, nlp) 163 | ``` 164 | 165 | ### Modified tree structure 166 | CTL allows you to modify the structure of the tree by either: 167 | 168 | - Eliminating **inner postag nodes** (tree contains now phrasal categories as inner nodes and tokens as leaves) 169 | 170 | - Eliminating **token leaves** (tree contains now phrasal categories as inner nodes and postags as leaves) 171 | 172 | ```python 173 | without_token_leaves = ConstituentTree(sentence, nlp, Structure.WithoutTokenLeaves) 174 | 175 | without_inner_postag_nodes = ConstituentTree(sentence, nlp, Structure.WithoutPostagNodes) 176 | ``` 177 | The result... 178 | 179 | ![(S (NP You) (VP must (VP construct (NP additional pylons))) !)](assets/images/tree_modified_structure.png) 180 | 181 | Modified tree structures offer several benefits. One of them, for example, is saving space when using the visualizations in papers. Eliminating the inner postag nodes (shown on the right) reduces the tree height from level 5 to 4. Another useful application arises from the elimination of token leaves, which will be discussed in more detail in the following section. 182 | 183 | ### Extract phrases 184 | Once we have created ``tree``, we can now extract phrases according to given phrasal categories e.g., verb phrases: 185 | ```python 186 | phrases = tree.extract_all_phrases() 187 | print(phrases) 188 | 189 | >>> {'S': ['You must construct additional pylons !'], 190 | >>> 'VP': ['must construct additional pylons', 'construct additional pylons'], 191 | >>> 'NP': ['additional pylons']} 192 | 193 | # Only verb phrases.. 194 | print(phrases['VP']) 195 | 196 | >>> ['must construct additional pylons', 'construct additional pylons'] 197 | ``` 198 | 199 | As can be seen here, the second verb phrase is contained in the former. To avoid this, we can instruct the method to disregard nested phrases: 200 | ```python 201 | non_nested_phrases = tree.extract_all_phrases(avoid_nested_phrases=True) 202 | print(non_nested_phrases['VP']) 203 | 204 | >>> ['must construct additional pylons'] 205 | ``` 206 | If you want to extract phrases, but are more interested in their postag representation than the actual words/tokens, you can apply the same function to the modified tree... 207 | ```python 208 | pos_phrases = without_token_leaves.extract_all_phrases() 209 | print(pos_phrases) 210 | 211 | >>> {'S': ['PRP MD VB JJ NNS .'], 212 | >>> 'NP': ['JJ NNS'], 213 | >>> 'VP': ['MD VB JJ NNS', 'VB JJ NNS']} 214 | ``` 215 | This is especially helpful when investigating the writing style of authors. 216 | 217 | 218 | 219 | ### Export the tree 220 | CTL offers you to export a constituent tree into various **file formats**, which are listed below. Most of these formats result in a visualization of the tree, while the remaining file formats are used for data exchange. 221 | 222 |
223 | <details> <summary>Show supported file formats...</summary> 224 | 225 | | Extension | Description | Output | 226 | | --- | --- | --- | 227 | | **PDF** | *Portable Document Format* | Vector graphic | 228 | | **SVG** | *Scalable Vector Graphics* | Vector graphic | 229 | | **EPS** | *Encapsulated PostScript* | Vector graphic | 230 | | **JPG** | *Joint Photographic Experts Group* | Raster image | 231 | | **PNG** | *Portable Network Graphics* | Raster image | 232 | | **GIF** | *Graphics Interchange Format* | Raster image | 233 | | **BMP** | *Bitmap* | Raster image | 234 | | **PSD** | *Photoshop Document* | Raster image | 235 | | **TIFF** | *Tagged Image File Format* | Raster image | 236 | | **JSON** | *JavaScript Object Notation* | Data exchange format | 237 | | **YAML** | *Yet Another Markup Language* | Data exchange format | 238 | | **TXT** | *Plain text* | Pretty-print text visualization | 239 | | **TEX** | *LaTeX document* | LaTeX typesetting | 240 | 241 | </details>
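The target format is derived from the file extension of ``destination_filepath``. As a brief illustration (a sketch; the file names are arbitrary), the same tree can also be written to a vector graphic or to a data exchange format:

```python
# SVG: vector graphic, written directly without any external tools
tree.export_tree(destination_filepath='my_tree.svg', verbose=True)

# JSON: nested dictionary representation of the tree
tree.export_tree(destination_filepath='my_tree.json', verbose=True)
```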
242 | 243 | The following example shows an export of the tree into a PDF file: 244 | 245 | ```python 246 | tree.export_tree(destination_filepath='my_tree.pdf', verbose=True) 247 | 248 | >>> PDF - file successfully saved to: my_tree.pdf 249 | ``` 250 | 251 | In the case of raster/vector images, CTL automatically removes unnecessary margins with respect to the resulting visualizations. This is particularly useful if the visualizations are to be used in papers. 252 | 253 | 254 | 255 | ## Available models and languages 256 | CTL currently supports eight languages: English, German, French, Polish, Hungarian, Swedish, Chinese and Korean. The performance of the respective models can be looked up in the benepar repository. 257 | 258 | ## CTL in the Research Landscape 259 | CTL has been used in several research works published at leading conferences, including EMNLP 2025, ICLR 2024 and ACL 2024: 260 | 261 | - Meinan Liu, Yunfang Dong, Xixian Liao, and Bonnie Webber. 2025. **[Multi-token Mask-filling and Implicit Discourse Relations](https://aclanthology.org/2025.findings-emnlp.670/)**. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 12546–12560, Suzhou, China. Association for Computational Linguistics. 262 | 263 | - Mulligan, Karl, and Kyle Rawlins. **[Analyzing naturally-sourced Questions Under Discussion](https://journals.linguisticsociety.org/proceedings/index.php/ELM/article/view/5828)**. Experiments in Linguistic Meaning, vol. 3, 24 Jan 2025. 264 | 265 | - Judita Preiss **[Hybrid Approach to Literature-Based Discovery: Combining Traditional Methods with LLMs](https://www.mdpi.com/2076-3417/15/16/8785/pdf?version=1754653754)**. Appl. Sci. 2025, 15, 8785. 266 | 267 | - Yuang Li, Jiaxin Guo, Min Zhang, Ma Miaomiao, Zhiqiang Rao, Weidong Zhang, Xianghui He, Daimeng Wei, and Hao Yang. 2024. **[Pause-Aware Automatic Dubbing using LLM and Voice Cloning](https://aclanthology.org/2024.iwslt-1.2/)**. In Proceedings of the 21st International Conference on Spoken Language Translation (IWSLT 2024), pages 12–16, Bangkok, Thailand (in-person and online). Association for Computational Linguistics. 268 | 269 | - Tanvir Mahmud, D. Marculescu, **[Weakly-supervised Audio Separation via Bi-modal Semantic Similarity](https://arxiv.org/abs/2404.01740#)**, in: The Twelfth International Conference on Learning Representations, 2024. 270 | 271 | 272 | ## License 273 | The code and the jupyter notebook demo of CTL are released under the MIT License. See LICENSE for further details. 274 | 275 | 276 | ## Citation 277 | If you find this repository helpful, please invest a few minutes and cite it in your paper/project: 278 | ```bibtex 279 | @software{Halvani_Constituent_Treelib:2024, 280 | author = {Halvani, Oren}, 281 | title = {{Constituent Treelib - A Lightweight Python Library for Constructing, Processing, and Visualizing Constituent Trees.}}, 282 | doi = {10.5281/zenodo.10951644}, 283 | month = apr, 284 | url = {https://github.com/Halvani/constituent-treelib}, 285 | version = {0.0.7}, 286 | year = {2024} 287 | } 288 | ``` 289 | Please also give credit to the authors of benepar and cite their work. In science, the principle is: **give and take**.. 290 | 291 | 292 | [^1]: Note, if you are not familiar with the bracket labels of constituent trees, have a look at the following Gist 293 | or alternatively this website. 294 | 295 | [^2]: It's recommended to install CTL from PyPI (*Python Package Index*). 
However, if you want to benefit from the latest update of CTL, you should use this repository instead, since I will only update PyPi at irregular intervals. 296 | 297 | [^3]: After the models have been downloaded, they are cached so that there are no redundant downloads when the method is called again. However, loading and initializing the spaCy and benepar models can take a while, so it makes sense to invoke the ``create_pipeline()`` method only once if you want to process multiple sentences. 298 | 299 | [^4]: https://knowyourmeme.com/memes/you-must-construct-additional-pylons -------------------------------------------------------------------------------- /constituent_treelib/core.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import nltk 4 | import spacy 5 | import benepar 6 | import huspacy 7 | import contractions 8 | from nltk import Tree 9 | from pathlib import Path 10 | import importlib.resources 11 | from enum import Enum, auto 12 | from langid.langid import LanguageIdentifier, model 13 | from typing import List, Dict, Set, Union, Generator 14 | 15 | # Package imports 16 | from .errors import * 17 | 18 | 19 | class Structure(Enum): 20 | Complete = auto() 21 | WithoutTokenLeaves = auto() 22 | WithoutPostagNodes = auto() 23 | 24 | 25 | class Language(Enum): 26 | English = auto() 27 | German = auto() 28 | French = auto() 29 | Polish = auto() 30 | Hungarian = auto() 31 | Swedish = auto() 32 | Chinese = auto() 33 | Korean = auto() 34 | Unsupported = auto() 35 | 36 | 37 | class BracketedTree: 38 | nltk_tree = None 39 | locations_of_parentheses = None 40 | bracketed_tree_string = None 41 | 42 | def __init__(self, bracketed_tree_string: str, structure: Structure = Structure.Complete) -> None: 43 | """Validates the given bracketed tree string and, in case of success, constructs the internal nltk_tree that 44 | represents the data structure of the constituent tree. If the validation has not succeeded, an appropriate 45 | error is raised. 46 | 47 | Args: 48 | bracketed_tree_string: A constituent tree represented in a bracketed tree notation. 49 | 50 | structure: The desired structure of the tree. By default, inner postag nodes and tokens, which 51 | represent the tree leaves, are present (Structure.Complete). Alternatively, the postag nodes can 52 | be removed (Structure.WithoutPostagNodes) or the tokens i.e. the tree leaves (Structure.WithoutTokenLeaves). 53 | """ 54 | 55 | msg_base_tree_error = "Could not create a BracketedTree instance." 56 | msg_none_or_empty_tree = f"{msg_base_tree_error} The given bracketed tree string is either None or empty." 57 | msg_parentheses_mismatch = f"{msg_base_tree_error} The given bracketed string does not match in terms of " \ 58 | f"opening and closing parentheses." 59 | msg_invalid_nltk_tree = f"{msg_base_tree_error} The given bracketed tree string could not be loaded as an " \ 60 | f"nltk.Tree." 61 | 62 | # Ensure that the given bracketed tree string is neither None nor empty. 63 | if bracketed_tree_string is not None and len(bracketed_tree_string) > 0: 64 | bracketed_tree_string = bracketed_tree_string.strip() 65 | self.bracketed_tree_string = bracketed_tree_string 66 | else: 67 | raise NoneOrEmptyBracketedTreeError(msg_none_or_empty_tree) 68 | 69 | # Ensure that for each opening parenthesis there is a corresponding closing parenthesis 70 | # and store their locations. 
71 | try: 72 | self.locations_of_parentheses = BracketedTree.parentheses_locations(self.bracketed_tree_string) 73 | except Exception as e: 74 | raise ParenthesesError(f"{msg_parentheses_mismatch} Error: {e}") 75 | 76 | # Check whether the given bracketed tree string has a valid structure. 77 | if self.valid_structure(): 78 | try: 79 | # Ensure the given bracketed tree string can be loaded as a valid nltk.Tree. 80 | self.nltk_tree = Tree.fromstring(self.bracketed_tree_string) 81 | except Exception as e: 82 | raise ParenthesesError(f"{msg_invalid_nltk_tree} Error: {e}") 83 | 84 | # Further nodes at the end are present. An attempt is made to correct the tree structure. 85 | else: 86 | root_node_closing_parenthesis_index = self.locations_of_parentheses[0] + 1 87 | punctuation_fragment = self.bracketed_tree_string[root_node_closing_parenthesis_index:].strip() 88 | 89 | # Cut off last closing parenthesis and insert it under the root node. 90 | reorganized_sentence = self.bracketed_tree_string[:root_node_closing_parenthesis_index - 1] 91 | self.bracketed_tree_string = f"{reorganized_sentence} {punctuation_fragment})" 92 | self.nltk_tree = Tree.fromstring(self.bracketed_tree_string) 93 | 94 | # Modify the tree structure by removing inner postag nodes. 95 | if structure == Structure.WithoutPostagNodes: 96 | self.bracketed_tree_string = BracketedTree.remove_postag_nodes(self.bracketed_tree_string) 97 | self.nltk_tree = Tree.fromstring(self.bracketed_tree_string) 98 | 99 | # Modify the tree structure by removing the token leaves. 100 | elif structure == Structure.WithoutTokenLeaves: 101 | self.bracketed_tree_string = BracketedTree.remove_token_leaves(self.nltk_tree) 102 | # Since there are no more token leaves present, the current postag nodes (tree objects) must be transformed 103 | # into terminals (strings) otherwise, the semantics of some methods within the ConstituentTree 104 | # class will change. 105 | self.nltk_tree = BracketedTree.tree_leaves_to_terminal_leaves(Tree.fromstring(self.bracketed_tree_string)) 106 | 107 | def valid_structure(self) -> bool: 108 | """Validates the structure of the bracketed tree string with regard to nodes that occur after the root node. 109 | When parsing German sentences, for example, it can happen on the part of benepar that the bracketed tree string 110 | is fragmented into several non-nested constituents, e.g. (S (NP ...)) ($. ?). If this case occurs, the internal 111 | nltk_tree cannot be constructed due to a read error: "ValueError: Tree.read(): expected 'end-of-string'...". 112 | The goal of this function is therefore to check if such a fragmentation exists, so that BracketedTree can 113 | automatically take care of it by reorganizing the structure of the bracketed tree string in a later step. 114 | 115 | Returns: 116 | Decision on whether the structure of the bracketed tree string is valid. 117 | This is the case if no further nodes occur after the closing parenthesis of the root node. 118 | """ 119 | root_node_ending_par_loc = self.locations_of_parentheses[0] 120 | return len( 121 | [par_loc for par_loc in self.locations_of_parentheses.keys() if par_loc > root_node_ending_par_loc]) == 0 122 | 123 | @staticmethod 124 | def tokenize(tree: Tree) -> Generator[str, None, None]: 125 | """ Tokenizes a given tree into phrasal categories, postags as well as opening and closing parentheses 126 | 127 | Args: 128 | tree: An nltk.Tree that should be tokenized. 
129 | 130 | Returns: 131 | An iterator that produces a sequence of phrasal categories, postags, opening and closing parentheses. 132 | """ 133 | if isinstance(tree, Tree): 134 | yield "(" 135 | yield tree.label() 136 | for node in tree: 137 | yield from BracketedTree.tokenize(node) 138 | yield ")" 139 | 140 | @staticmethod 141 | def remove_postag_nodes(bracketed_tree_string: str) -> str: 142 | """ Changes the structure of the constituent tree by removing all postag nodes from the given 143 | bracketed tree string. 144 | 145 | Args: 146 | bracketed_tree_string: A constituent tree represented in a bracketed tree notation. 147 | 148 | Returns: 149 | Reorganized bracketed tree string in which all postag nodes are removed. 150 | """ 151 | 152 | tree = Tree.fromstring(bracketed_tree_string) 153 | postag_nodes = tree.subtrees(lambda t: t.height() == 2 and len(t.leaves()) == 1) 154 | 155 | for p in postag_nodes: 156 | bracketed_tree_string = bracketed_tree_string.replace(p.pformat(), p.leaves()[0]) 157 | return bracketed_tree_string 158 | 159 | @staticmethod 160 | def remove_token_leaves(tree: Tree) -> str: 161 | """ Changes the structure of the constituent tree by removing all token leaves from the given 162 | bracketed tree string. 163 | 164 | Args: 165 | tree: An nltk.Tree from where all token leaves should be removed. 166 | 167 | Returns: 168 | Reorganized bracketed tree string in which all token leaves are removed. 169 | """ 170 | 171 | tokenized_tree = list(BracketedTree.tokenize(tree)) 172 | bracketed_tree_string = " ".join(tokenized_tree).replace("( ", "(").replace(" )", ")") 173 | return bracketed_tree_string 174 | 175 | @staticmethod 176 | def tree_leaves_to_terminal_leaves(tree: Tree): 177 | """ Transforms tree leaves into terminals (strings). 178 | 179 | Returns: 180 | A tree where all leaves (represented by tree objects) are converted to terminals (strings). 181 | """ 182 | return Tree(tree._label, map(BracketedTree.tree_leaves_to_terminal_leaves, tree)) if len(tree) else tree._label 183 | 184 | @staticmethod 185 | def parentheses_locations(bracketed_tree_string: str) -> Dict[int, int]: 186 | """Validates that for each opening parenthesis in the given bracketed tree string there is a corresponding 187 | closing parenthesis. If this is not the case appropriate exceptions are raised. 188 | 189 | Args: 190 | bracketed_tree_string: A constituent tree represented in bracketed tree notation. 191 | 192 | Returns: 193 | A dictionary that holds all locations of all opening and closing parentheses. 
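Example (illustrative): parentheses_locations("(S (NP I))") yields {3: 8, 0: 9}, i.e., the parenthesis opened at index 3 closes at index 8 and the one opened at index 0 closes at index 9.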
194 | """ 195 | 196 | stack = [] 197 | locations = dict() 198 | for index, p in enumerate(bracketed_tree_string): 199 | if p == "(": 200 | stack.append(index) 201 | elif p == ")": 202 | if stack: 203 | locations[stack.pop()] = index 204 | else: 205 | raise ParenthesesError(f"Too many closing parentheses located at stack index: {index}.") 206 | if stack: 207 | raise ParenthesesError(f"No matching closing parenthesis to opening parenthesis at " 208 | f"stack index: {stack.pop()}.") 209 | return locations 210 | 211 | 212 | class ConstituentTree: 213 | nlp = None 214 | lang_dict = {"en": Language.English, 215 | "de": Language.German, 216 | "fr": Language.French, 217 | "zh": Language.Chinese, 218 | "hu": Language.Hungarian, 219 | "ko": Language.Korean, 220 | "pl": Language.Polish, 221 | "sv": Language.Swedish} 222 | 223 | class NodeContent(Enum): 224 | Text = auto() 225 | Pos = auto() 226 | Combined = auto() 227 | 228 | class SpacyModelSize(Enum): 229 | Small = auto() 230 | Medium = auto() 231 | Large = auto() 232 | Transformer = auto() 233 | 234 | class BeneparEnglishModel(Enum): 235 | EN3 = auto() 236 | EN3Large = auto() 237 | EN3WSJ = auto() 238 | 239 | def __init__(self, sentence: Union[str, BracketedTree, Tree], nlp: spacy.Language = None, 240 | structure: Structure = Structure.Complete, expand_contractions: bool = False, 241 | create_pipeline: bool = False) -> None: 242 | """Performs all necessary steps and validations to create the ConstituentTree object. 243 | 244 | Args: 245 | sentence: The sentence that should be parsed into a constituent tree. 246 | 247 | nlp: The fundamental spaCy-based NLP pipeline, which incorporates a benepar component. If the NLP 248 | pipeline is not explicitly specified, you can call the create_pipeline() method in this class to create it 249 | automatically. Note, however, that loading the underlying models for this pipeline can take a while, 250 | so it makes sense to invoke the method only once. 251 | 252 | structure: The desired structure of the tree. By default, inner postag nodes and tokens, 253 | which represent the tree leaves, are present (Structure.Complete). Alternatively, the postag nodes can 254 | be removed (Structure.WithoutPostagNodes) or the token leaves (Structure.WithoutTokenLeaves). 255 | 256 | expand_contractions: If set to True, contractions within the sentence are expanded (e.g., I'm --> I am). 257 | Note that contraction expansion is only supported for English. 258 | 259 | create_pipeline: If set to True (and no NLP pipeline is given), the NLP pipeline is created automatically. 260 | This variant of pipeline creation is mainly recommended for demo purposes or if you only want to process a 261 | single sentence. If, on the other hand, you want to process more than a single sentence and thus 262 | instantiate multiple ConstituentTree objects, it is strongly recommended to create the pipeline outside 263 | the ConstituentTree constructor via the create_pipeline() method and pass it to the constructor using the 264 | nlp parameter. 265 | """ 266 | 267 | if sentence is None or isinstance(sentence, str) and len(sentence.strip()) == 0: 268 | raise SentenceError("The given sentence is either none or empty. Please provide a valid sentence in order " 269 | "to instantiate a ConstituentTree object.") 270 | 271 | # Load the language detector model. 272 | self.lang_det = LanguageIdentifier.from_modelstring(model, norm_probs=True) 273 | 274 | # Detect the language of the given sentence in order to load the correct spaCy and benepar models. 
275 | detected_language = Language.Unsupported 276 | 277 | if isinstance(sentence, str): 278 | detected_language = self.detect_language(sentence) 279 | elif isinstance(sentence, BracketedTree): 280 | extracted_sentence = " ".join(sentence.nltk_tree.leaves()) 281 | detected_language = self.detect_language(extracted_sentence) 282 | elif isinstance(sentence, Tree): 283 | extracted_sentence = " ".join(sentence.leaves()) 284 | detected_language = self.detect_language(extracted_sentence) 285 | 286 | supported_languages = [e.name for e in Language if e.name != "Unsupported"] 287 | if detected_language == Language.Unsupported: 288 | raise LanguageError(f"The detected language of the given sentence is not supported. " 289 | f"Currently, ConstituentTree only supports: {', '.join(supported_languages[:-1])} " 290 | f"and {supported_languages[-1]}.") 291 | else: 292 | self.sentence_language = detected_language 293 | 294 | # To process the tree correctly it is required to know which structure has been chosen. 295 | self.structure = structure 296 | 297 | # No nlp pipeline has been provided. 298 | if nlp is None: 299 | # Create the pipeline on request. 300 | if create_pipeline: 301 | self.nlp = ConstituentTree.create_pipeline(language=self.sentence_language) 302 | 303 | # Unacceptable condition: Pipeline not given and its creation was not requested. 304 | else: 305 | raise NLPPipelineError("To instantiate a ConstituentTree object, a spaCy NLP pipeline must be provided " 306 | "beforehand, which contains a benepar component. Consider using the " 307 | "create_pipeline() method to create a valid nlp pipeline.") 308 | 309 | # An nlp pipeline has been provided. Ensure if it is valid. 310 | else: 311 | # The pipeline constitutes a spaCy pipeline. 312 | if isinstance(nlp, spacy.Language): 313 | # Detect the langauge of the spaCy model. 314 | spacy_lang = nlp.config["nlp"]["lang"] 315 | spacy_lang = self.lang_dict[spacy_lang] if spacy_lang in self.lang_dict else Language.Unsupported 316 | 317 | if "benepar" in nlp.component_names: 318 | # Detect the langauge of the benepar model. 319 | benepar_lang = nlp.config["components"]["benepar"]["model"] 320 | benepar_lang = re.findall("_[a-z]{2}", benepar_lang)[0][1:] 321 | benepar_lang = self.lang_dict[ 322 | benepar_lang] if benepar_lang in self.lang_dict else Language.Unsupported 323 | 324 | # Unacceptable condition: pipeline does not contain a benepar component. 325 | else: 326 | raise NLPPipelineError("The given nlp pipeline does not contain a benepar component which is " 327 | "required to parse the sentence. Consider using the create_pipeline() " 328 | "method to create a valid nlp pipeline.") 329 | 330 | # Unacceptable condition: spacy and benepar models do not match with regard to the underlying language. 331 | if spacy_lang != benepar_lang: 332 | raise LanguageError(f"There is a mismatch regarding the languages of the spaCy and benepar models " 333 | f"within the given nlp pipeline (spaCy --> [{spacy_lang.name}], whereas " 334 | f"benepar --> [{benepar_lang.name}]. Consider using the create_pipeline() " 335 | f"method to create a valid nlp pipeline.") 336 | 337 | # Unacceptable condition: language of the sentence does not match the one of the spacy & benepar models. 338 | elif spacy_lang == benepar_lang and spacy_lang != self.sentence_language: 339 | raise LanguageError(f"There is a mismatch regarding the language of the given " 340 | f"sentence [{self.sentence_language.name}] and the language of the spaCy and " 341 | f"benepar models [{spacy_lang.name}]. 
You must either provide a " 342 | f"sentence in {self.sentence_language.name} or use an nlp pipeline that " 343 | f"integrates {spacy_lang.name} spaCy and benepar models.") 344 | 345 | # All minimum requirements were successfully met. Set the internal nlp pipeline. 346 | else: 347 | self.nlp = nlp 348 | 349 | # Unacceptable condition: The pipeline does not constitute a valid spaCy pipeline. 350 | else: 351 | raise NLPPipelineError("The pipeline does not constitute a valid spaCy pipeline. Consider using " 352 | "the create_pipeline() method to create a valid nlp pipeline.") 353 | 354 | # The sentence represents an nltk.Tree. Hence, no further processing is required. 355 | if isinstance(sentence, Tree): 356 | self.nltk_tree = sentence 357 | 358 | # The sentence represents a BracketedTree object that holds a constituent tree in a bracketed notation. 359 | # In this case we use its internal nltk.Tree object. 360 | elif isinstance(sentence, BracketedTree): 361 | self.nltk_tree = sentence.nltk_tree 362 | 363 | # The sentence represents a string (we assume that it constitutes a natural sentence). 364 | elif isinstance(sentence, str) and len(sentence.strip()) > 0: 365 | # Remove multiple spaces at the end of the sentence, otherwise benepar will throw an exception. 366 | sentence = sentence.strip() 367 | 368 | # Expand contractions if requested. Note, currently only English is supported! 369 | if expand_contractions and self.sentence_language == Language.English: 370 | sentence = contractions.fix(sentence, leftovers=True, slang=True) 371 | 372 | # Parse the given sentence using benepar. 373 | bracketed_tree_string = self.parse_sentence(sentence) 374 | 375 | # Instantiate a BracketedTree object to internally check if the sentence can be parsed correctly. 376 | bracketed_tree = BracketedTree(bracketed_tree_string, structure=structure) 377 | 378 | # The bracketed tree string has been successfully validated. Use its internal nltk.Tree object. 379 | self.nltk_tree = bracketed_tree.nltk_tree 380 | 381 | # The given sentence is neither a string nor a valid nltk.Tree or a BracketedTree object. 382 | # Therefore, we cannot further proceed. 383 | else: 384 | raise SentenceError(f"To instantiate a ConstituentTree object, a non-empty sentence object " 385 | f"(a string, an nltk.Tree or a BracketedTree) must be provided. Type of the " 386 | f"given sentence: {type(sentence).__name__}.") 387 | 388 | 389 | def detect_language(self, text: str, append_proba: bool = False, round_precision: int = 3, top_k_matches: int = 1): 390 | """Detects the language of the given text using the pythob lib langid. 391 | 392 | Args: 393 | text: The text whose language is to be detected. 394 | 395 | append_proba: The probability regarding the detected language. 396 | 397 | round_precision: The accuracy of the probability to be rounded. 398 | 399 | top_k_matches: Number of k most likely detected languages. By default (k=1), the language with the 400 | highest detection probability is returned. 401 | 402 | Returns: 403 | The language of the given text (optionally, the top-k detected languages and the detection probability). 404 | """ 405 | 406 | predictions = self.lang_det.rank(text) 407 | 408 | if top_k_matches > len(predictions): 409 | raise ValueError(f"The given 'top_k_matches' exceeds the number of langid's known languages. 
" 410 | "Consider: top_k_matches < {len(predictions)}.") 411 | 412 | predictions = predictions[0:top_k_matches] 413 | result = [] 414 | 415 | for lang, proba in predictions: 416 | lang = self.lang_dict[lang] if lang in self.lang_dict else Language.Unsupported 417 | proba = round(proba, round_precision) 418 | result.append((lang, proba)) if append_proba else result.append(lang) 419 | return result[0] if top_k_matches == 1 else result 420 | 421 | def detect_spacy_langauge(self, nlp: spacy.Language = None): 422 | """ Translates the language identifier of the internal spaCy pipeline into a corresponding 423 | ConstituentTreelib.Langauge object. 424 | 425 | Returns: 426 | A ConstituentTreelib.Langauge object that represents the language of the spaCy pipeline. 427 | """ 428 | lang = nlp.config["nlp"]["lang"] 429 | return self.lang_dict[lang] if lang in self.lang_dict else Language.Unsupported 430 | 431 | def detect_benepar_langauge(self, nlp: spacy.Language = None): 432 | """ Translates the language identifier of the internal benepar component into a corresponding 433 | ConstituentTreelib.Langauge object. 434 | 435 | Returns: 436 | A ConstituentTreelib.Langauge object that represents the language of the spaCy pipeline. 437 | """ 438 | lang = nlp.config["components"]["benepar"]["model"] 439 | lang = re.findall("_[a-z]{2}", lang)[0][1:] 440 | return self.lang_dict[lang] if lang in self.lang_dict else Language.Unsupported 441 | 442 | def parse_sentence(self, sentence: str) -> str: 443 | """Parses the given sentence into constituents using the benepar component within the spaCy nlp pipeline. 444 | 445 | Args: 446 | sentence: The raw sentence that should be parsed into a bracketed tree string. 447 | 448 | Returns: 449 | The bracketed tree string representation of the initialized sentence. 450 | """ 451 | 452 | doc = self.nlp(sentence) 453 | if len(list(doc.sents)) == 1: 454 | return list(doc.sents)[0]._.parse_string 455 | else: 456 | raise SentenceError("The given 'sentence' contains more than one sentence. " 457 | "A ConstituentTree object can process only one sentence at a time.") 458 | 459 | def to_bracketed_tree_string(self, margin: int = 70, indent: int = 0, node_separator: str = "", 460 | parentheses: str = "()", quotes: bool = False, pretty_print: bool = False) -> str: 461 | """Constructs the bracketed tree string representation of the ConstituentTree object. 462 | 463 | Args: 464 | margin: The right margin at which to do line-wrapping. 465 | 466 | indent: The indentation level at which printing begins. 467 | This number is used to decide how far to indent subsequent lines. 468 | 469 | node_separator: A string that is used to separate the node from the children. 470 | E.g., the default value ``':'`` gives trees like ``(S: (NP: I) (VP: (V: saw) (NP: it)))``. 471 | 472 | parentheses: The type of parentheses to be used for the bracketed tree string. 473 | 474 | quotes: If set to True, all the leaves (i.e. terminal symbols) of the tree will be quoted. 475 | 476 | pretty_print: If set to True, the bracketed tree string will be formatted in a pretty-print style 477 | using indentation. 478 | 479 | Returns: 480 | A bracketed tree string representation of the constructed ConstituentTree object. 
481 | """ 482 | 483 | pp_bracketed_tree_string = self.nltk_tree.pformat(margin=margin, 484 | indent=indent, 485 | nodesep=node_separator, 486 | parens=parentheses, 487 | quotes=quotes) 488 | 489 | return pp_bracketed_tree_string if pretty_print else re.sub(r"\s{2,}", " ", pp_bracketed_tree_string) 490 | 491 | def __str__(self, **kwargs) -> str: 492 | """Allows to print the ConstituentTree object in a pretty-print style. 493 | 494 | Returns: 495 | A pretty-print bracketed tree string representation of the constructed ConstituentTree object. 496 | """ 497 | return self.to_bracketed_tree_string(**kwargs, pretty_print=True) 498 | 499 | def _repr_svg_(self) -> str: 500 | return self.nltk_tree._repr_svg_() 501 | 502 | @staticmethod 503 | def download(spacy_model_id: str, benepar_model_id: str, quiet: bool = False) -> None: 504 | """Downloads the spaCy and benepar models according to their IDs. 505 | 506 | Args: 507 | spacy_model_id: The ID of the spaCy model. 508 | 509 | benepar_model_id: The ID of the benepar model. 510 | 511 | quiet: When set to True, no pip installation output is printed. 512 | """ 513 | 514 | # Currently, spaCy does not offer any models of its own for Hungarian. Therefore, the huspacy package is used 515 | # to download respective models with an alternative downloader. 516 | if spacy_model_id.startswith("hu"): 517 | huspacy.download(spacy_model_id) 518 | 519 | # In case of any other supported language, the default spaCy downloader is used. 520 | else: 521 | # Initiate the download only if it is really necessary (i.e. if the language pack is not installed). 522 | if not spacy.util.is_package(spacy_model_id): 523 | print(f"The spaCy model: '{spacy_model_id}' was not found. Download is initiated...") 524 | # Suppress pip installation messages on request. 525 | if quiet: 526 | spacy.cli.download(spacy_model_id, False, False, "--quiet") 527 | else: 528 | spacy.cli.download(spacy_model_id, False, False) 529 | 530 | # Create the nltk_data path if it does not exist. In this path the benepar models will be saved. 531 | nltk_data_dir = Path(sys.exec_prefix, "share", "nltk_data") 532 | if not Path.exists(nltk_data_dir): 533 | nltk_data_dir.mkdir(parents=True, exist_ok=True) 534 | 535 | # Append the path to the nltk.data environment if it does not exist. 536 | if not str(nltk_data_dir) in nltk.data.path: 537 | nltk.data.path.append(str(nltk_data_dir)) 538 | 539 | benepar.download(benepar_model_id, download_dir=nltk_data_dir, quiet=quiet) 540 | 541 | @staticmethod 542 | def create_pipeline(language: Language = Language.English, spacy_model_size: SpacyModelSize = SpacyModelSize.Small, 543 | benepar_english_model: BeneparEnglishModel = BeneparEnglishModel.EN3, 544 | download_models: bool = True, quiet: bool = False) -> spacy.Language: 545 | """Constructs the fundamental nlp pipeline for the given language that consists of a spaCy pipeline that 546 | incorporates the benepar component. 547 | 548 | Args: 549 | language: The language of the text to be parsed. Depending on the specified language, the respective 550 | models are assembled for the nlp pipeline. Unless otherwise specified, the default language is set 551 | to English. 552 | 553 | spacy_model_size: The desired model size. Depending on the language, a variable number of models are 554 | available, which can be looked up at https://spacy.io/models. 555 | 556 | benepar_english_model: The desired benepar model for English (this is the only language for which 557 | multiple models are provided by the benepar developers). 
A description of these models can be 558 | looked up at https://github.com/nikitakit/self-attentive-parser#available-models. 559 | 560 | download_models: When set to True, an attempt is made to automatically download the required spaCy and 561 | benepar models. Otherwise, it is assumed that the corresponding models are already installed and ready 562 | for use. In case you want to download the models manually, there are several possibilities. 563 | Regarding spaCy, the first possibility is to invoke the spaCy module from the command line or a notebook: 564 | "!python -m spacy download X". Alternatively, you can download the model from within Python via 565 | "spacy.cli.download(X)", where in both cases X denotes the name of the model (e.g., "en_core_web_sm" 566 | for the small English model). All available spaCy models are listed at https://spacy.io/models. 567 | Regarding benepar, the desired model can be downloaded via benepar.download(X), 568 | where again X denotes the name of the model. All available benepar models 569 | are listed at https://github.com/nikitakit/self-attentive-parser#available-models. 570 | 571 | quiet: When set to True, no pip installation output is printed. 572 | 573 | Returns: 574 | A spaCy-based nlp pipeline which incorporates the benepar component. This pipeline is mandatory to 575 | instantiate a ConstituentTree object. 576 | """ 577 | 578 | def err_models_not_downloaded(model_id: str, framework: str, error_trace: str) -> str: 579 | return f"It seems that the '{model_id}' model has not been downloaded or installed yet.\n" \ 580 | f"Consider calling create_pipeline() with 'download_models = True' to solve this issue.\n\n" \ 581 | f"Original error message received from {framework}: {error_trace}" 582 | 583 | msg_error_spacy_model_na = f"Unfortunately, a {str(spacy_model_size.name).lower()} spaCy model is not yet " \ 584 | f"available for {language.name}. Therefore, consider switching to another " \ 585 | f"existing model." 
586 | 587 | if language == Language.English: 588 | spacy_english_models = {ConstituentTree.SpacyModelSize.Small: "en_core_web_sm", 589 | ConstituentTree.SpacyModelSize.Medium: "en_core_web_md", 590 | ConstituentTree.SpacyModelSize.Large: "en_core_web_lg", 591 | ConstituentTree.SpacyModelSize.Transformer: "en_core_web_trf"} 592 | 593 | benepar_english_models = {ConstituentTree.BeneparEnglishModel.EN3: "benepar_en3", 594 | ConstituentTree.BeneparEnglishModel.EN3Large: "benepar_en3_large", 595 | ConstituentTree.BeneparEnglishModel.EN3WSJ: "benepar_en3_wsj"} 596 | 597 | spacy_model = spacy_english_models[spacy_model_size] 598 | benepar_model = benepar_english_models[benepar_english_model] 599 | 600 | elif language == Language.German: 601 | spacy_german_models = {ConstituentTree.SpacyModelSize.Small: "de_core_news_sm", 602 | ConstituentTree.SpacyModelSize.Medium: "de_core_news_md", 603 | ConstituentTree.SpacyModelSize.Large: "de_core_news_lg", 604 | ConstituentTree.SpacyModelSize.Transformer: "de_dep_news_trf"} 605 | 606 | spacy_model = spacy_german_models[spacy_model_size] 607 | benepar_model = "benepar_de2" 608 | 609 | elif language == Language.French: 610 | spacy_french_models = {ConstituentTree.SpacyModelSize.Small: "fr_core_news_sm", 611 | ConstituentTree.SpacyModelSize.Medium: "fr_core_news_md", 612 | ConstituentTree.SpacyModelSize.Large: "fr_core_news_lg", 613 | ConstituentTree.SpacyModelSize.Transformer: "fr_dep_news_trf"} 614 | 615 | spacy_model = spacy_french_models[spacy_model_size] 616 | benepar_model = "benepar_fr2" 617 | 618 | elif language == Language.Polish: 619 | # Note, spaCy does not offer a transformer-based model for Polish yet. 620 | if spacy_model_size == ConstituentTree.SpacyModelSize.Transformer: 621 | raise ValueError(msg_error_spacy_model_na) 622 | 623 | spacy_polish_models = {ConstituentTree.SpacyModelSize.Small: "pl_core_news_sm", 624 | ConstituentTree.SpacyModelSize.Medium: "pl_core_news_md", 625 | ConstituentTree.SpacyModelSize.Large: "pl_core_news_lg"} 626 | 627 | spacy_model = spacy_polish_models[spacy_model_size] 628 | benepar_model = "benepar_pl2" 629 | 630 | elif language == Language.Hungarian: 631 | # Note, huspacy does not offer a small model for Hungarian yet. 632 | if spacy_model_size == ConstituentTree.SpacyModelSize.Small: 633 | raise ValueError(msg_error_spacy_model_na) 634 | 635 | spacy_hungarian_models = {ConstituentTree.SpacyModelSize.Medium: "hu_core_news_md", 636 | ConstituentTree.SpacyModelSize.Large: "hu_core_news_lg", 637 | ConstituentTree.SpacyModelSize.Transformer: "hu_core_news_trf"} 638 | 639 | spacy_model = spacy_hungarian_models[spacy_model_size] 640 | benepar_model = "benepar_hu2" 641 | 642 | elif language == Language.Swedish: 643 | # Note, spaCy does not offer a transformer-based model for Swedish yet. 
644 | if spacy_model_size == ConstituentTree.SpacyModelSize.Transformer: 645 | raise ValueError(msg_error_spacy_model_na) 646 | 647 | spacy_swedish_models = {ConstituentTree.SpacyModelSize.Small: "sv_core_news_sm", 648 | ConstituentTree.SpacyModelSize.Medium: "sv_core_news_md", 649 | ConstituentTree.SpacyModelSize.Large: "sv_core_news_lg"} 650 | 651 | spacy_model = spacy_swedish_models[spacy_model_size] 652 | benepar_model = "benepar_sv2" 653 | 654 | elif language == Language.Chinese: 655 | spacy_chinese_models = {ConstituentTree.SpacyModelSize.Small: "zh_core_web_sm", 656 | ConstituentTree.SpacyModelSize.Medium: "zh_core_web_md", 657 | ConstituentTree.SpacyModelSize.Large: "zh_core_web_lg", 658 | ConstituentTree.SpacyModelSize.Transformer: "zh_core_web_trf"} 659 | 660 | spacy_model = spacy_chinese_models[spacy_model_size] 661 | benepar_model = "benepar_zh2" 662 | 663 | elif language == Language.Korean: 664 | # Note, spaCy does not offer a transformer-based model for Korean yet. 665 | if spacy_model_size == ConstituentTree.SpacyModelSize.Transformer: 666 | raise ValueError(msg_error_spacy_model_na) 667 | 668 | spacy_korean_models = {ConstituentTree.SpacyModelSize.Small: "ko_core_news_sm", 669 | ConstituentTree.SpacyModelSize.Medium: "ko_core_news_md", 670 | ConstituentTree.SpacyModelSize.Large: "ko_core_news_lg"} 671 | 672 | spacy_model = spacy_korean_models[spacy_model_size] 673 | benepar_model = "benepar_ko2" 674 | 675 | else: 676 | raise LanguageError("Unsupported language.") 677 | 678 | # Download the models only if requested. 679 | if download_models: 680 | ConstituentTree.download(spacy_model, benepar_model, quiet) 681 | 682 | try: 683 | nlp = spacy.load(spacy_model, 684 | disable=["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]) 685 | nlp.add_pipe("sentencizer") 686 | nlp.add_pipe("benepar", config={"model": benepar_model}) 687 | return nlp 688 | 689 | except OSError as e: 690 | print(err_models_not_downloaded(model_id=spacy_model, framework="spaCy", error_trace=e)) 691 | 692 | except LookupError as e: 693 | print(err_models_not_downloaded(model_id=benepar_model, framework="benepar", error_trace=e)) 694 | 695 | def _extract_phrases(self, tree: Tree, phrasal_category: str, min_words_in_phrases: int = 2) -> List[list]: 696 | """Extracts phrases according to a given phrasal category from an nltk.Tree in a recursive manner. 697 | 698 | Args: 699 | tree: An nltk.Tree from where the phrases should be extracted. 700 | 701 | phrasal_category: The desired category for the phrases that should be extracted. 702 | A comprehensive list of phrasal categories for English, German, French and other languages can be 703 | looked up at https://dkpro.github.io/dkpro-core/releases/2.2.0/docs/tagset-reference.html 704 | 705 | min_words_in_phrases: Minimum number of words each extracted phrase should contain. 706 | 707 | Returns: 708 | A list of all phrases in the tree that belong to the given phrasal category. 
709 | """ 710 | 711 | phrases = [] 712 | if tree.label() == phrasal_category: 713 | phrases.append(tree.copy(True)) 714 | 715 | for child in tree: 716 | if isinstance(child, Tree): 717 | temp = self._extract_phrases(child, phrasal_category, min_words_in_phrases) 718 | if len(temp) > 0: 719 | phrases.extend(temp) 720 | 721 | if min_words_in_phrases >= 2: 722 | return [p for p in phrases if len(p) >= min_words_in_phrases] 723 | else: 724 | return [p for p in phrases] 725 | 726 | def leaves(self, tree: Tree, content_type: NodeContent = NodeContent.Text) -> str: 727 | """Extracts all leaves (= terminal symbols) from the given (sub)tree according to the desired content type, 728 | which can be the text itself, its corresponding part-of-speech or a combination of both. 729 | 730 | Args: 731 | tree: A (sub)tree from where the leaves should be extracted. 732 | 733 | content_type: The desired content type: token (NodeContent.Text), postag (NodeContent.Pos) or 734 | a combination of both (NodeContent.Combined). 735 | 736 | Returns: 737 | A concatenated string that includes all leaves. Here *string* represents either a sentence in case that 738 | the tree is complete or a phrase in case that it represents a subtree. 739 | """ 740 | 741 | # Depending on the current structure of the tree, it must be ensured that the correct leaves are returned. 742 | # In cases where specific postag/token leaves are missing, appropriate exceptions are raised. 743 | if content_type == self.NodeContent.Text: 744 | if self.structure in [Structure.Complete, Structure.WithoutPostagNodes]: 745 | return " ".join([w for w in tree.leaves()]) 746 | elif self.structure == Structure.WithoutTokenLeaves: 747 | raise ValueError("The leaves of the current tree contain only postags. " 748 | "Hence, there are no tokens to return.") 749 | 750 | elif content_type == self.NodeContent.Pos: 751 | if self.structure == Structure.WithoutTokenLeaves: 752 | return " ".join([w for w in tree.leaves()]) 753 | elif self.structure == Structure.Complete: 754 | return " ".join([w[1] for w in tree.pos()]) 755 | elif self.structure == Structure.WithoutPostagNodes: 756 | raise ValueError("The leaves of the current tree contain only tokens. " 757 | "Hence, there are no postags to return.") 758 | 759 | elif content_type == self.NodeContent.Combined: 760 | if self.structure == Structure.Complete: 761 | return " ".join([f"{w[0]}_{w[1]}" for w in tree.pos()]) 762 | elif self.structure == Structure.WithoutTokenLeaves: 763 | raise ValueError("The leaves of the current tree contain only postags. " 764 | "Hence, there are no tokens to combine.") 765 | elif self.structure == Structure.WithoutPostagNodes: 766 | raise ValueError("The leaves of the current tree contain only tokens. " 767 | "Hence, there are no postags to combine.") 768 | 769 | def extract_all_phrasal_categories(self) -> Set[str]: 770 | """Extracts all available phrasal categories from the tree. 771 | 772 | Returns: 773 | A set of all phrasal categories occurring in the tree. 774 | """ 775 | 776 | # In case of Structure.WithoutPostagNodes we can simply extract the phrasal categories from the tokenized tree. 
777 | if self.structure == Structure.Complete: 778 | return set([str(p.lhs()) for p in self.nltk_tree.productions() if p.is_nonlexical() and p.lhs()]) 779 | elif self.structure == Structure.WithoutPostagNodes: 780 | return set(BracketedTree.tokenize(self.nltk_tree)) - set(["(", ")"]) 781 | else: 782 | return {n.label() for n in self.nltk_tree.subtrees(lambda n: n.height() > 1)} 783 | 784 | def extract_all_phrases(self, min_words_in_phrases: int = 2, avoid_nested_phrases: bool = False, 785 | content: NodeContent = NodeContent.Text) -> Dict[str, List[str]]: 786 | """Extracts all phrases from the tree and the categories they belong to. 787 | 788 | Args: 789 | min_words_in_phrases: Minimum number of words each extracted phrase should contain. 790 | 791 | avoid_nested_phrases: If set to True, nested subtrees of the same phrasal category X will be ignored. 792 | In other words, only the longest phrase of the category X will be returned. For example, let's say the tree 793 | contains the following nested noun phrases 'NP': ['a limited number of Single Game Tickets', 794 | 'a limited number']. In case of avoid_nested_phrases=True only the longer noun phrase will be extracted. 795 | 796 | content: The respective contents of the leaves of the nltk.Tree to be returned, which can be the 797 | word itself (NodeContent.Text), its part-of-speech (NodeContent.Pos) or a combination of both (NodeContent.Combined). 798 | 799 | Returns: 800 | A dictionary of all phrases extracted from the tree. Here, the keys represent the phrasal 801 | categories while the values contain all phrases that belong to these categories. 802 | """ 803 | 804 | # Depending on the current tree structure, set the available node content. 805 | if self.structure == Structure.WithoutTokenLeaves: 806 | content = self.NodeContent.Pos 807 | elif self.structure == Structure.WithoutPostagNodes: 808 | content = self.NodeContent.Text 809 | 810 | available_phrasal_categories = self.extract_all_phrasal_categories() 811 | all_phrases_by_category = dict.fromkeys(available_phrasal_categories) 812 | 813 | for phrasal_category in available_phrasal_categories: 814 | list_of_phrases = self._extract_phrases(self.nltk_tree, phrasal_category, min_words_in_phrases) 815 | 816 | list_of_x = [] 817 | if len(list_of_phrases) > 0: 818 | for phrase in list_of_phrases: 819 | list_of_x.append(self.leaves(phrase, content)) 820 | 821 | if avoid_nested_phrases and len(list_of_x) > 1: 822 | result = [] 823 | phrases_by_length = sorted(list_of_x, key=len, reverse=True) 824 | longest = phrases_by_length[0] 825 | result.append(longest) 826 | 827 | for p in phrases_by_length[1:]: 828 | if p not in longest: 829 | result.append(p) 830 | list_of_x.clear() 831 | list_of_x = result 832 | 833 | if len(list_of_x) > 0: 834 | all_phrases_by_category[phrasal_category] = list_of_x 835 | 836 | # Ensure only existing phrases are returned. 837 | all_phrases_by_category = {p: list_of_x for p, list_of_x in all_phrases_by_category.items() if 838 | list_of_x is not None} 839 | return all_phrases_by_category 840 | 841 | def export_tree(self, destination_filepath: str, wkhtmltopdf_bin_filepath: str = None, 842 | tree_style_nltk: bool = False, dpi: int = 300, verbose: bool = False) -> None: 843 | """ Exports the constructed constituent tree in various file formats. Currently supported: 844 | [.pdf, .svg, .ps, .png, .jpg, .gif, .bmp, .psd, .eps, .tiff, .txt, .tex, .json, .yaml]. 845 | 846 | Args: 847 | destination_filepath: The destination path to which the tree should be exported. 
In case of an image format, the resulting visualization will be cropped to remove unnecessary margins. 849 | 850 | wkhtmltopdf_bin_filepath: The filepath to the rendering tool "wkhtmltopdf". Only required if the 851 | visualization of the constituent tree should be exported to a PDF file. If not already done, the tool 852 | wkhtmltopdf must first be downloaded and installed from https://wkhtmltopdf.org before the visualization 853 | can be exported. 854 | 855 | dpi: Specifies the desired resolution. A DPI value of 300 is considered a good standard for 856 | printable files. 857 | 858 | tree_style_nltk: If set to True, the classic NLTK style will be used to visualize the nltk.Tree. 859 | 860 | verbose: If set to True, a short message about whether the output file creation was 861 | successful is displayed. 862 | """ 863 | from .export import export_figure 864 | export_figure(self.nltk_tree, 865 | destination_filepath=destination_filepath, 866 | wkhtmltopdf_bin_filepath=wkhtmltopdf_bin_filepath, 867 | verbose=verbose, 868 | dpi=dpi, 869 | tree_style_nltk=tree_style_nltk) 870 | -------------------------------------------------------------------------------- /Constituent_TreeLib_Quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#
Constituent Treelib (CTL) Demo
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Necessary imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import torch\n", 24 | "from nltk import Tree" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Handle GPU issue" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "In case you have a weak GPU (e.g., 2gb), it is highly recomended to use the CPU instead, in order to prevent an out-of-memory exception caused by torch that is used within benepar. To force the CPU usage, we can override the \"cuda.is_available\" function. However, if you have a more powerful GPU, leave the following lines commented out." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# import torch \n", 48 | "# torch.cuda.is_available = lambda : False" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Import the CTL library" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "from constituent_treelib import ConstituentTree, BracketedTree, Language, Structure" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Create the NLP pipeline" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "To instantiate a ConstituentTree object, a spaCy-based NLP pipeline that incorporates a benepar component is mandatory. CTL offers two possibilities to create this pipeline...\n", 79 | "- ...dynamically by instantiating the ConstituentTree object using the parameter create_pipeline\n", 80 | "- ...statically by creating the pipeline outside the ConstituentTree constructor via the create_pipeline() method and pass it to the constructor using the nlp parameter\n", 81 | "\n", 82 | "Note that the first option is mainly recommended for demo purposes or if you only want to process a single sentence.\n", 83 | "If, on the other hand, you want to process more than a single sentence and thus instantiate multiple ConstituentTree objects, it is strongly recommended to create the pipeline X outside and invoke it when instantiating the ConstituentTree object via ConstituentTree(nlp=X). \n", 84 | "\n", 85 | "In the following both possibilities are shown." 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "#### Static NLP pipelined creation" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "The spaCy model: 'en_core_web_md' was not found. 
Download is initiated...\n", 105 | "✔ Download and installation successful\n", 106 | "You can now load the package via spacy.load('en_core_web_md')\n" 107 | ] 108 | }, 109 | { 110 | "name": "stderr", 111 | "output_type": "stream", 112 | "text": [ 113 | "[nltk_data] Downloading package benepar_en3 to\n", 114 | "[nltk_data] D:\\install\\Python39\\share\\nltk_data...\n", 115 | "[nltk_data] Unzipping models\\benepar_en3.zip.\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "# Define the language for the sentence as well as for the spaCy and benepar models\n", 121 | "language = Language.English\n", 122 | "\n", 123 | "# Define which specific SpaCy model should be used (default is Medium)\n", 124 | "spacy_model_size = ConstituentTree.SpacyModelSize.Medium\n", 125 | "\n", 126 | "# Create the pipeline (note, the required models will be downloaded and installed automatically)\n", 127 | "nlp = ConstituentTree.create_pipeline(language, spacy_model_size)\n", 128 | "\n", 129 | "# In case you want to hide pip installation outputs...\n", 130 | "# nlp = ConstituentTree.create_pipeline(language, spacy_model_size, quiet=True)\n", 131 | "\n", 132 | "# If you already downloaded and installed the models, you can save validation overhead by informing CTL\n", 133 | "# nlp = ConstituentTree.create_pipeline(language, spacy_model_size, download_models = False)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "#### Dynamic NLP pipelined creation" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 5, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stderr", 150 | "output_type": "stream", 151 | "text": [ 152 | "[nltk_data] Downloading package benepar_en3 to\n", 153 | "[nltk_data] D:\\install\\Python39\\share\\nltk_data...\n", 154 | "[nltk_data] Package benepar_en3 is already up-to-date!\n", 155 | "You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" 156 | ] 157 | }, 158 | { 159 | "data": { 160 | "image/svg+xml": [ 161 | "SNPNNConstituencyNNparsingVPVBZfocusesPPINonSVPVBGbuildingNPDTaNNparseNNtreeSVPVBGusingNPDTaJJprobabilisticADJPNNcontextHYPH-JJfreeNNgrammar.." 162 | ], 163 | "text/plain": [ 164 | "" 165 | ] 166 | }, 167 | "execution_count": 5, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "sentence_dyn = \"Constituency parsing focuses on building a parse tree using a probabilistic context-free grammar.\"\n", 174 | "tree_dyn = ConstituentTree(sentence_dyn, create_pipeline=True) \n", 175 | "\n", 176 | "# Now the NLP pipeline is integrated within the ConstituentTree object and is ready to use...\n", 177 | "tree_dyn" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "#### Instead of creating the NLP pipeline automatically via create_pipeline(), you can also create it manually" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 6, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "You're using a T5TokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" 197 | ] 198 | }, 199 | { 200 | "data": { 201 | "image/svg+xml": [ 202 | "SNPPRPYouVPMDmustVPVBconstructNPJJadditionalNNSpylons.!" 203 | ], 204 | "text/plain": [ 205 | "" 206 | ] 207 | }, 208 | "execution_count": 6, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "import spacy\n", 215 | "import benepar\n", 216 | "\n", 217 | "# We use English models by default..\n", 218 | "spacy_model = \"en_core_web_md\"\n", 219 | "benepar_model = \"benepar_en3\" \n", 220 | "\n", 221 | "nlp_pipeline = spacy.load(spacy_model, disable=[\"tok2vec\", \"tagger\", \"parser\", \"ner\", \"attribute_ruler\", \"lemmatizer\"])\n", 222 | "nlp_pipeline.add_pipe(\"sentencizer\")\n", 223 | "nlp_pipeline.add_pipe(\"benepar\", config={\"model\": benepar_model})\n", 224 | "\n", 225 | "sentence = \"You must construct additional pylons!\"\n", 226 | "tree_using_own_nlp_pipeline = ConstituentTree(sentence, nlp_pipeline) \n", 227 | "tree_using_own_nlp_pipeline" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## Define some test sentences" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "#### English" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 7, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "sentences = [\n", 251 | " \"You must construct additional pylons!\",\n", 252 | " \"I'm on my way to New York while they're flying to Tel Aviv.\",\n", 253 | " \"Isaac Asimov was an American writer and professor of biochemistry at Boston University.\",\n", 254 | " \"Kubrick's films typically involve expressions of an inner struggle, examined from different perspectives.\",\n", 255 | " \"Full Metal Jacket is a war drama film directed and produced by Stanley Kubrick.\",\n", 256 | " \"Stanley Getz was an American jazz saxophonist.\",\n", 257 | " \"The bridge was unfinished when it collapsed.\",\n", 258 | " \"The 2022 season is underway and there are a limited number of Single Game Tickets on sale now!\",\n", 259 | " \"And with no Wild Card possibilities for either team, the game is essentially a winner-take-all endeavor.\"\n", 260 | "]" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "#### German" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 8, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "# sentences = [\n", 277 | "# \"In einer Gaspipeline in Litauen hat es eine Explosion gegeben.\",\n", 278 | "# \"Für Fragen zu Freiwilligendiensten, nutzen sie bitte unser Forum!\",\n", 279 | "# \"Der Künstler verlegt seit 30 Jahren Stolpersteine, die er zur Erinnerung an die Opfer des Nationalsozialismus Häusern platziert.\"\n", 280 | "# \"Damit erlangen schützenswürdige Kundendaten in den Geschäfts- und Serviceprozessen der Wertschöpfungskette im Bereich Automotive eine immer größere Bedeutung.\",\n", 281 | "# \"Die USA haben mit ihrem Investitionsprogramm für Klimaschutz reichlich Unmut der EU auf sich gezogen.\",\n", 282 | "# \"Ebenso empfehlenswert ist das Lesen einer Tageszeitung des Landes.\", \n", 283 | "# \"Wie viel wird pro Jahr ungefähr weltweit benötigt?\"\n", 284 | "# ]" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 
290 | "source": [ 291 | "#### French" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 9, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "# sentences = [\n", 301 | "# \"Nous irons plus tard au théâtre.\",\n", 302 | "# \"Pablo Ruiz Picasso était un peintre, dessinateur, sculpteur et graphiste espagnol.\", \n", 303 | "# \"Découvrez une belle sélection d’évènements pour fêter la nouvelle année en partenariat avec Party.\"\n", 304 | "# ]" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "#### Swedish" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 10, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "# sentences = [\n", 321 | "# \"Vilken vacker skog!\",\n", 322 | "# \"Det var mycket åska och blixtar i går!\",\n", 323 | "# \"För den närmaste veckan finns ingen uppenbar risk för fjärrtransport.\"\n", 324 | "# ]" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "#### Polish" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 11, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "# sentences = [\n", 341 | "# \"W dodatku szczerze wierzy, że w tej wojnie stawką jest istnienie Rosji.\",\n", 342 | "# \"Poproszę pięć kilo ziemniaków.\",\n", 343 | "# \"Przepraszam, ale nie rozumiem.\"\n", 344 | "# ]" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "#### Hungarian" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 12, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "# sentences = [\n", 361 | "# \"A pizza tényleg kiváló volt!\",\n", 362 | "# \"Vannak kisebb és kiszámíthatatlan kivételek a szabály alól.\",\n", 363 | "# \"Ezt azért tesszük, hogy javítsuk és finanszírozzuk szolgáltatásainkat.\" \n", 364 | "# ]" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "#### Chinese " 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 13, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "# sentences = [\n", 381 | "# \"你好吗?\",\n", 382 | "# \"很高兴见到你。\",\n", 383 | "# \"不好意思, 我没听懂。\",\n", 384 | "# \"请再说一遍。\" \n", 385 | "# ]" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "#### Korean" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 14, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "# sentences = [\n", 402 | "# \"말을 냇가에 끌고 갈 수는 있어도 억지로 물을 먹일 수는 없다\",\n", 403 | "# \"반갑습니다\", \n", 404 | "# \"잘 지내세요?\",\n", 405 | "# \"그 집은 한국에서 지어졌어요\"\n", 406 | "# ]" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "## Instantiate a ConstituentTree object" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "#### ... from a raw sentence" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 15, 426 | "metadata": { 427 | "scrolled": false 428 | }, 429 | "outputs": [ 430 | { 431 | "name": "stderr", 432 | "output_type": "stream", 433 | "text": [ 434 | "You're using a T5TokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" 435 | ] 436 | }, 437 | { 438 | "data": { 439 | "image/svg+xml": [ 440 | "SNPNNPIsaacNNPAsimovVPVBDwasNPNPDTanJJAmericanNNwriterCCandNNprofessorPPINofNPNNbiochemistryPPINatNPNNPBostonNNPUniversity.." 441 | ], 442 | "text/plain": [ 443 | "" 444 | ] 445 | }, 446 | "execution_count": 15, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "tree = ConstituentTree(sentence=sentences[2], nlp=nlp) #, structure=Structure.WithoutTokenLeaves) \n", 453 | "tree" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "#### ... from a bracketed tree string (wrapped as a BracketedTree object)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 16, 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "image/svg+xml": [ 471 | "SNPPRPYouVPMDmustVPVBconstructNPJJadditionalNNSpylons.!" 472 | ], 473 | "text/plain": [ 474 | "" 475 | ] 476 | }, 477 | "execution_count": 16, 478 | "metadata": {}, 479 | "output_type": "execute_result" 480 | } 481 | ], 482 | "source": [ 483 | "bracketed_tree_string = \"(S (NP (PRP You)) (VP (MD must) (VP (VB construct) (NP (JJ additional) (NNS pylons)))) (. !))\"\n", 484 | "bracketed_tree = BracketedTree(bracketed_tree_string)\n", 485 | "tree_from_bracketed = ConstituentTree(sentence=bracketed_tree, nlp=nlp)\n", 486 | "tree_from_bracketed" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "#### ... from an nltk.Tree object" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 17, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "data": { 503 | "image/svg+xml": [ 504 | "SNPPRPYouVPMDmustVPVBconstructNPJJadditionalNNSpylons.!" 505 | ], 506 | "text/plain": [ 507 | "" 508 | ] 509 | }, 510 | "execution_count": 17, 511 | "metadata": {}, 512 | "output_type": "execute_result" 513 | } 514 | ], 515 | "source": [ 516 | "nltk_tree = Tree('S', [Tree('NP', [Tree('PRP', ['You'])]), Tree('VP', [Tree('MD', ['must']), Tree('VP', [Tree('VB', ['construct']), Tree('NP', [Tree('JJ', ['additional']), Tree('NNS', ['pylons'])])])]), Tree('.', ['!'])])\n", 517 | "tree_from_nltk = ConstituentTree(sentence=nltk_tree, nlp=nlp) \n", 518 | "tree_from_nltk" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "## Expand contractions (e.g., *he's* $\\rightarrow$ *he is*, *they're* $\\rightarrow$ *they are*, etc.)" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 18, 531 | "metadata": { 532 | "scrolled": true 533 | }, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "image/svg+xml": [ 538 | "SNPPRPIVPVBPamPPINonNPNPPRP$myNNwayPPINtoNPNNPNewNNPYorkSBARINwhileSNPPRPtheyVPVBPareVPVBGflyingPPINtoNPNNPTelNNPAviv.." 
539 | ], 540 | "text/plain": [ 541 | "" 542 | ] 543 | }, 544 | "execution_count": 18, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "tree_expanded_contractions = ConstituentTree(sentence=sentences[1], nlp=nlp, expand_contractions=True)\n", 551 | "tree_expanded_contractions" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "## Select desired tree structure" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "#### Without token leaves (tree contains now phrasal categories as inner nodes and postags as leaves)" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 19, 571 | "metadata": {}, 572 | "outputs": [ 573 | { 574 | "data": { 575 | "image/svg+xml": [ 576 | "SNPPRPVPMDVPVBNPJJNNS." 577 | ], 578 | "text/plain": [ 579 | "" 580 | ] 581 | }, 582 | "execution_count": 19, 583 | "metadata": {}, 584 | "output_type": "execute_result" 585 | } 586 | ], 587 | "source": [ 588 | "tree_without_token_leaves = ConstituentTree(sentences[0], nlp, Structure.WithoutTokenLeaves)\n", 589 | "tree_without_token_leaves" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "#### Without postag nodes (tree contains now phrasal categories as inner nodes and tokens as leaves)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 20, 602 | "metadata": {}, 603 | "outputs": [ 604 | { 605 | "data": { 606 | "image/svg+xml": [ 607 | "SNPYouVPmustVPconstructNPadditionalpylons!" 608 | ], 609 | "text/plain": [ 610 | "" 611 | ] 612 | }, 613 | "execution_count": 20, 614 | "metadata": {}, 615 | "output_type": "execute_result" 616 | } 617 | ], 618 | "source": [ 619 | "tree_without_postag_nodes = ConstituentTree(sentences[0], nlp, Structure.WithoutPostagNodes)\n", 620 | "tree_without_postag_nodes" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "## Tree representations " 628 | ] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": {}, 633 | "source": [ 634 | "#### Plot SVG representation of the internal NLTK tree" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 21, 640 | "metadata": {}, 641 | "outputs": [ 642 | { 643 | "data": { 644 | "image/svg+xml": [ 645 | "SNPNNPIsaacNNPAsimovVPVBDwasNPNPDTanJJAmericanNNwriterCCandNNprofessorPPINofNPNNbiochemistryPPINatNPNNPBostonNNPUniversity.." 646 | ], 647 | "text/plain": [ 648 | "" 649 | ] 650 | }, 651 | "execution_count": 21, 652 | "metadata": {}, 653 | "output_type": "execute_result" 654 | } 655 | ], 656 | "source": [ 657 | "tree" 658 | ] 659 | }, 660 | { 661 | "cell_type": "markdown", 662 | "metadata": {}, 663 | "source": [ 664 | "#### Pretty-print bracketed tree string representation" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 22, 670 | "metadata": { 671 | "scrolled": true 672 | }, 673 | "outputs": [ 674 | { 675 | "name": "stdout", 676 | "output_type": "stream", 677 | "text": [ 678 | "(S\n", 679 | " (NP (NNP Isaac) (NNP Asimov))\n", 680 | " (VP\n", 681 | " (VBD was)\n", 682 | " (NP\n", 683 | " (NP (DT an) (JJ American) (NN writer) (CC and) (NN professor))\n", 684 | " (PP (IN of) (NP (NN biochemistry)))\n", 685 | " (PP (IN at) (NP (NNP Boston) (NNP University)))))\n", 686 | " (. 
.))\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "print(tree) " 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "#### ASCII art" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 23, 704 | "metadata": {}, 705 | "outputs": [ 706 | { 707 | "name": "stdout", 708 | "output_type": "stream", 709 | "text": [ 710 | " S \n", 711 | " __________________________________|____________________________________________________________ \n", 712 | " | VP | \n", 713 | " | _________|____________________ | \n", 714 | " | | NP | \n", 715 | " | | ____________|________________________________ | \n", 716 | " | | | PP PP | \n", 717 | " | | | ___|_______ ____|_____ | \n", 718 | " NP | NP | NP | NP | \n", 719 | " ____|____ | _____________|____________ | | | _____|______ | \n", 720 | " NNP NNP VBD DT JJ NN CC NN IN NN IN NNP NNP . \n", 721 | " | | | | | | | | | | | | | | \n", 722 | "Isaac Asimov was an American writer and professor of biochemistry at Boston University . \n", 723 | "\n" 724 | ] 725 | } 726 | ], 727 | "source": [ 728 | "tree.nltk_tree.pretty_print()" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "#### LATEX code" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": 24, 741 | "metadata": {}, 742 | "outputs": [ 743 | { 744 | "data": { 745 | "text/plain": [ 746 | "'\\\\Tree [.S\\n [.NP [.NNP Isaac ] [.NNP Asimov ] ]\\n [.VP\\n [.VBD was ]\\n [.NP\\n [.NP\\n [.DT an ]\\n [.JJ American ]\\n [.NN writer ]\\n [.CC and ]\\n [.NN professor ] ]\\n [.PP [.IN of ] [.NP [.NN biochemistry ] ] ]\\n [.PP [.IN at ] [.NP [.NNP Boston ] [.NNP University ] ] ] ] ]\\n [.. . ] ]'" 747 | ] 748 | }, 749 | "execution_count": 24, 750 | "metadata": {}, 751 | "output_type": "execute_result" 752 | } 753 | ], 754 | "source": [ 755 | "tree.nltk_tree.pformat_latex_qtree()" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "## Export visualization" 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "CTL relies on two two open-source tools to export the generated constituent tree into the following file formats:\n", 770 | "\n", 771 | "1.) **PDF**: For this the command line tool *wkhtmltopdf* is required: https://wkhtmltopdf.org/downloads.html
\n", 772 | "Once downloaded and installed, the path to the wkhtmltopdf binary must be passed to the export function. In case of a Windows OS, an attempt is made to locate the path of the wkhtmltopdf binary by looking up the default installation directory (\"Program Files/wkhtmltopdf\"). \n", 773 | "\n", 774 | "2.) **JPG, PNG, GIF, BMP, EPS, PSD, TIFF and YAML:** For these the software suite *ImageMagick* is required: https://imagemagick.org/script/download.php
Here, an attempt is made to locate the path of the software suite by looking up the environment variables. Hence, an explicit path to the binary is not required." 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 25, 780 | "metadata": {}, 781 | "outputs": [ 782 | { 783 | "name": "stdout", 784 | "output_type": "stream", 785 | "text": [ 786 | "PDF-file successfully saved to: my_tree.pdf\n" 787 | ] 788 | } 789 | ], 790 | "source": [ 791 | "tree.export_tree(destination_filepath='my_tree.pdf', verbose=True)" 792 | ] 793 | }, 794 | { 795 | "cell_type": "markdown", 796 | "metadata": {}, 797 | "source": [ 798 | "## Extract content fron tree leaves" 799 | ] 800 | }, 801 | { 802 | "cell_type": "markdown", 803 | "metadata": {}, 804 | "source": [ 805 | "#### Text tokens" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 26, 811 | "metadata": {}, 812 | "outputs": [ 813 | { 814 | "data": { 815 | "text/plain": [ 816 | "'Isaac Asimov was an American writer and professor of biochemistry at Boston University .'" 817 | ] 818 | }, 819 | "execution_count": 26, 820 | "metadata": {}, 821 | "output_type": "execute_result" 822 | } 823 | ], 824 | "source": [ 825 | "tree.leaves(tree.nltk_tree, ConstituentTree.NodeContent.Text)" 826 | ] 827 | }, 828 | { 829 | "cell_type": "markdown", 830 | "metadata": {}, 831 | "source": [ 832 | "#### POS tags" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": 27, 838 | "metadata": {}, 839 | "outputs": [ 840 | { 841 | "data": { 842 | "text/plain": [ 843 | "'NNP NNP VBD DT JJ NN CC NN IN NN IN NNP NNP .'" 844 | ] 845 | }, 846 | "execution_count": 27, 847 | "metadata": {}, 848 | "output_type": "execute_result" 849 | } 850 | ], 851 | "source": [ 852 | "tree.leaves(tree.nltk_tree, ConstituentTree.NodeContent.Pos) " 853 | ] 854 | }, 855 | { 856 | "cell_type": "markdown", 857 | "metadata": {}, 858 | "source": [ 859 | "#### Combination of both" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": 28, 865 | "metadata": { 866 | "scrolled": true 867 | }, 868 | "outputs": [ 869 | { 870 | "data": { 871 | "text/plain": [ 872 | "'Isaac_NNP Asimov_NNP was_VBD an_DT American_JJ writer_NN and_CC professor_NN of_IN biochemistry_NN at_IN Boston_NNP University_NNP ._.'" 873 | ] 874 | }, 875 | "execution_count": 28, 876 | "metadata": {}, 877 | "output_type": "execute_result" 878 | } 879 | ], 880 | "source": [ 881 | "tree.leaves(tree.nltk_tree, ConstituentTree.NodeContent.Combined)" 882 | ] 883 | }, 884 | { 885 | "cell_type": "markdown", 886 | "metadata": {}, 887 | "source": [ 888 | "## Extract phrases" 889 | ] 890 | }, 891 | { 892 | "cell_type": "markdown", 893 | "metadata": {}, 894 | "source": [ 895 | "#### Phrasal categories " 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": 29, 901 | "metadata": {}, 902 | "outputs": [ 903 | { 904 | "data": { 905 | "text/plain": [ 906 | "{'NP', 'PP', 'S', 'VP'}" 907 | ] 908 | }, 909 | "execution_count": 29, 910 | "metadata": {}, 911 | "output_type": "execute_result" 912 | } 913 | ], 914 | "source": [ 915 | "tree.extract_all_phrasal_categories()" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": {}, 921 | "source": [ 922 | "#### All phrases (including nested)" 923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 30, 928 | "metadata": {}, 929 | "outputs": [ 930 | { 931 | "name": "stdout", 932 | "output_type": "stream", 933 | "text": [ 934 | "NP ['Isaac Asimov', 'an American writer and 
professor of biochemistry at Boston University', 'an American writer and professor', 'Boston University']\n", 935 | "VP ['was an American writer and professor of biochemistry at Boston University']\n", 936 | "S ['Isaac Asimov was an American writer and professor of biochemistry at Boston University .']\n", 937 | "PP ['of biochemistry', 'at Boston University']\n" 938 | ] 939 | } 940 | ], 941 | "source": [ 942 | "all_phrases = tree.extract_all_phrases(avoid_nested_phrases=False, min_words_in_phrases=1)\n", 943 | "for phrasal_category, phrases in all_phrases.items():\n", 944 | " print(phrasal_category, phrases)" 945 | ] 946 | }, 947 | { 948 | "cell_type": "markdown", 949 | "metadata": {}, 950 | "source": [ 951 | "#### All phrases (avoiding nested)" 952 | ] 953 | }, 954 | { 955 | "cell_type": "code", 956 | "execution_count": 31, 957 | "metadata": {}, 958 | "outputs": [ 959 | { 960 | "name": "stdout", 961 | "output_type": "stream", 962 | "text": [ 963 | "NP ['an American writer and professor of biochemistry at Boston University', 'Isaac Asimov']\n", 964 | "VP ['was an American writer and professor of biochemistry at Boston University']\n", 965 | "S ['Isaac Asimov was an American writer and professor of biochemistry at Boston University .']\n", 966 | "PP ['at Boston University', 'of biochemistry']\n" 967 | ] 968 | } 969 | ], 970 | "source": [ 971 | "all_phrases = tree.extract_all_phrases(avoid_nested_phrases=True, min_words_in_phrases=1)\n", 972 | "for phrasal_category, phrases in all_phrases.items():\n", 973 | " print(phrasal_category, phrases)" 974 | ] 975 | }, 976 | { 977 | "cell_type": "markdown", 978 | "metadata": {}, 979 | "source": [ 980 | "#### Only noun phrases" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 32, 986 | "metadata": {}, 987 | "outputs": [ 988 | { 989 | "name": "stdout", 990 | "output_type": "stream", 991 | "text": [ 992 | "['an American writer and professor of biochemistry at Boston University', 'Isaac Asimov']\n" 993 | ] 994 | } 995 | ], 996 | "source": [ 997 | "phrases = tree.extract_all_phrases(avoid_nested_phrases=True)\n", 998 | "noun_phrases = phrases['NP']\n", 999 | "\n", 1000 | "print(noun_phrases)" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "markdown", 1005 | "metadata": {}, 1006 | "source": [ 1007 | "#### All phrases of the tree without token leaves" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": 33, 1013 | "metadata": {}, 1014 | "outputs": [ 1015 | { 1016 | "name": "stdout", 1017 | "output_type": "stream", 1018 | "text": [ 1019 | "NP ['JJ NNS']\n", 1020 | "VP ['MD VB JJ NNS', 'VB JJ NNS']\n", 1021 | "S ['PRP MD VB JJ NNS .']\n" 1022 | ] 1023 | } 1024 | ], 1025 | "source": [ 1026 | "all_phrases = tree_without_token_leaves.extract_all_phrases(avoid_nested_phrases=False, min_words_in_phrases=1)\n", 1027 | "for phrasal_category, phrases in all_phrases.items():\n", 1028 | " print(phrasal_category, phrases)" 1029 | ] 1030 | } 1031 | ], 1032 | "metadata": { 1033 | "kernelspec": { 1034 | "display_name": "Python 3 (ipykernel)", 1035 | "language": "python", 1036 | "name": "python3" 1037 | }, 1038 | "language_info": { 1039 | "codemirror_mode": { 1040 | "name": "ipython", 1041 | "version": 3 1042 | }, 1043 | "file_extension": ".py", 1044 | "mimetype": "text/x-python", 1045 | "name": "python", 1046 | "nbconvert_exporter": "python", 1047 | "pygments_lexer": "ipython3", 1048 | "version": "3.9.13" 1049 | }, 1050 | "varInspector": { 1051 | "cols": { 1052 | "lenName": 16, 1053 | "lenType": 16, 1054 | "lenVar": 
40 1055 | }, 1056 | "kernels_config": { 1057 | "python": { 1058 | "delete_cmd_postfix": "", 1059 | "delete_cmd_prefix": "del ", 1060 | "library": "var_list.py", 1061 | "varRefreshCmd": "print(var_dic_list())" 1062 | }, 1063 | "r": { 1064 | "delete_cmd_postfix": ") ", 1065 | "delete_cmd_prefix": "rm(", 1066 | "library": "var_list.r", 1067 | "varRefreshCmd": "cat(var_dic_list()) " 1068 | } 1069 | }, 1070 | "types_to_exclude": [ 1071 | "module", 1072 | "function", 1073 | "builtin_function_or_method", 1074 | "instance", 1075 | "_Feature" 1076 | ], 1077 | "window_display": false 1078 | } 1079 | }, 1080 | "nbformat": 4, 1081 | "nbformat_minor": 2 1082 | } 1083 | --------------------------------------------------------------------------------
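Usage sketch: a minimal end-to-end example distilled from the quickstart notebook above; the example sentence, the chosen model size and the output filename are illustrative assumptions, not library defaults.

# Minimal sketch based on the quickstart notebook; assumes the English models have been
# (or can be) downloaded and that wkhtmltopdf is installed for the PDF export step.
from constituent_treelib import ConstituentTree, Language

# Build the spaCy + benepar pipeline once and reuse it for every sentence to be parsed.
nlp = ConstituentTree.create_pipeline(Language.English, ConstituentTree.SpacyModelSize.Medium)

# Parse a single sentence into a constituent tree.
tree = ConstituentTree("You must construct additional pylons!", nlp=nlp)

# Extract the longest phrase per category and export the visualization as a PDF.
phrases = tree.extract_all_phrases(avoid_nested_phrases=True, min_words_in_phrases=1)
print(phrases)
tree.export_tree(destination_filepath="my_tree.pdf", verbose=True)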