├── pics ├── llm-addons.jpg ├── mm-project.png ├── first_atlas.jpg ├── causal-bridges.png ├── llm-superstitious.png ├── film_script_template.jpg ├── my_first_big_atlas.jpeg ├── universal-pos-tags.jpg ├── causal-bridges-captioned.png ├── spacy-named-entity-tags.png └── nltk_tags.html ├── m_scripts_dag_atlas ├── up.pkl ├── wall-e.pkl └── toy-story.pkl ├── white_paper ├── harris-e-mc2.jpg ├── mappa_mundi.pdf ├── shark-attacks.png ├── mappa_mundi_V2.pdf ├── nacc-nrej-plane.png ├── crossing-bridges.png ├── references.bib └── bayesuvius.sty ├── short_stories_dag_atlas ├── wiltons-holiday.pkl ├── bill-the-bloodhound.pkl └── extricating-young-gussie.pkl ├── spell_checking_test.txt ├── resources.txt ├── miscellaneous ├── nlp-environmental-variables.txt ├── starting-stanford-coreNLP-server.txt ├── predictions.txt.conj ├── parser-choices.py ├── WALL-E-quote-summarized-by-chatgpt.txt └── testing-stan-parser.py ├── requirements.txt ├── openie6_translation_test1.txt ├── README.md ├── MIT-License.txt ├── simp_deprecated ├── simp_spacy-claucy.py ├── simp_openie.py ├── simp_openie6-old.py ├── simp_spacy1.py ├── simp_spacy4.py ├── simp_spacy2.py ├── simp_stanford2.py └── simp_stanford.py ├── similarity_bert.py ├── Node.py ├── globals.py ├── simplifying_test.txt ├── similarity.py ├── similarity_deprecated ├── similarity_spacy2.py └── similarity_spacy.py ├── utils.py ├── WordGuesser.py ├── BatchSimilarity.py ├── simp_spacy3.py ├── similarity_nltk.py ├── simplifying.py ├── downloading_imsdb.py ├── simp_openie6.py ├── jupyter_notebooks └── SUMMARY.ipynb ├── post_cleaning.py ├── stopwords.py ├── DagAtlas.py ├── cleaning.py ├── Dag.py └── spell_checking.py /pics/llm-addons.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/llm-addons.jpg -------------------------------------------------------------------------------- /pics/mm-project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/mm-project.png -------------------------------------------------------------------------------- /pics/first_atlas.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/first_atlas.jpg -------------------------------------------------------------------------------- /pics/causal-bridges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/causal-bridges.png -------------------------------------------------------------------------------- /m_scripts_dag_atlas/up.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/m_scripts_dag_atlas/up.pkl -------------------------------------------------------------------------------- /pics/llm-superstitious.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/llm-superstitious.png -------------------------------------------------------------------------------- /pics/film_script_template.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/film_script_template.jpg 
-------------------------------------------------------------------------------- /pics/my_first_big_atlas.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/my_first_big_atlas.jpeg -------------------------------------------------------------------------------- /pics/universal-pos-tags.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/universal-pos-tags.jpg -------------------------------------------------------------------------------- /white_paper/harris-e-mc2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/harris-e-mc2.jpg -------------------------------------------------------------------------------- /white_paper/mappa_mundi.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/mappa_mundi.pdf -------------------------------------------------------------------------------- /white_paper/shark-attacks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/shark-attacks.png -------------------------------------------------------------------------------- /m_scripts_dag_atlas/wall-e.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/m_scripts_dag_atlas/wall-e.pkl -------------------------------------------------------------------------------- /white_paper/mappa_mundi_V2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/mappa_mundi_V2.pdf -------------------------------------------------------------------------------- /white_paper/nacc-nrej-plane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/nacc-nrej-plane.png -------------------------------------------------------------------------------- /m_scripts_dag_atlas/toy-story.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/m_scripts_dag_atlas/toy-story.pkl -------------------------------------------------------------------------------- /pics/causal-bridges-captioned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/causal-bridges-captioned.png -------------------------------------------------------------------------------- /pics/spacy-named-entity-tags.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/spacy-named-entity-tags.png -------------------------------------------------------------------------------- /white_paper/crossing-bridges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/crossing-bridges.png -------------------------------------------------------------------------------- /short_stories_dag_atlas/wiltons-holiday.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/short_stories_dag_atlas/wiltons-holiday.pkl -------------------------------------------------------------------------------- /short_stories_dag_atlas/bill-the-bloodhound.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/short_stories_dag_atlas/bill-the-bloodhound.pkl -------------------------------------------------------------------------------- /short_stories_dag_atlas/extricating-young-gussie.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/short_stories_dag_atlas/extricating-young-gussie.pkl -------------------------------------------------------------------------------- /spell_checking_test.txt: -------------------------------------------------------------------------------- 1 | Poul , Poul , Paul 2 | caesar caesar ceesar caisar 3 | Hooww are youu Judy , Poul . Judy 4 | fitnes fitness 5 | how haves you bein ? been 6 | leter beautifull adress addres 7 | letter beautiful address 8 | tomatos 9 | -------------------------------------------------------------------------------- /resources.txt: -------------------------------------------------------------------------------- 1 | python -m spacy download en_core_web_lg 2 | python -m spacy download en_core_web_sm 3 | python -m spacy download en_core_web_trf 4 | 5 | python -m coreferee install en 6 | 7 | python -m nltk.downloader popular # this includes wordnet -------------------------------------------------------------------------------- /miscellaneous/nlp-environmental-variables.txt: -------------------------------------------------------------------------------- 1 | CLASSPATH 2 | C:\NLP\stanford-parser-full-2018-02-27;C:\NLP\stanford-postagger-full-2015-12-09;C:\NLP\stanford-ner-2015-12-09 3 | 4 | STANFORD_MODELS 5 | C:\NLP\stanford-ner-2015-12-09\classifiers;C:\NLP\stanford-postagger-full-2015-12-09\models 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz 2 | ipython 3 | Pillow 4 | contractions 5 | spacy 6 | Unidecode 7 | nltk 8 | # pycorenlp~=0.3.0 9 | # coreferee~=1.4.0 10 | # anytree~=2.8.0 11 | numpy 12 | requests 13 | beautifulsoup4 14 | python-slugify 15 | # claucy~=0.0.2.0 16 | pyspellchecker 17 | sentence-transformers -------------------------------------------------------------------------------- /miscellaneous/starting-stanford-coreNLP-server.txt: -------------------------------------------------------------------------------- 1 | Reference: 2 | https://towardsdatascience.com/natural-language-processing-using-stanfords-corenlp-d9e64c1e1024 3 | 4 | # check java installed properly 5 | java -version 6 | 7 | # starting server 8 | # cd to folder with stanford java code 9 | cd /StanfordParser/stanford-corenlp-4.5.4/ 10 | 11 | # no need to deactivate conda virtual environment 12 | 13 | # start server (IMPORTANT: make sure this is one line) 14 | java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 5000 15 | -------------------------------------------------------------------------------- /openie6_translation_test1.txt: -------------------------------------------------------------------------------- 1 | The man , who had never liked the words `` 
booby '' and `` boobyhatch , '' and who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment . 2 | The man , '' , thought for a moment . 3 | The man , who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment . 4 | The man , who had never liked the words `` booby , thought for a moment . 5 | The man , who had never liked the words `` boobyhatch , thought for a moment . 6 | 7 | I love Luciano Pavarotti and Jose Carreras . 8 | I love Luciano Pavarotti . 9 | I love Jose Carreras . 10 | 11 | -------------------------------------------------------------------------------- /miscellaneous/predictions.txt.conj: -------------------------------------------------------------------------------- 1 | The man , who had never liked the words `` booby '' and `` boobyhatch , '' and who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment . 2 | The man , '' , thought for a moment . 3 | The man , who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment . 4 | The man , who had never liked the words `` booby , thought for a moment . 5 | The man , who had never liked the words `` boobyhatch , thought for a moment . 6 | 7 | I love Luciano Pavarotti and Jose Carreras . 8 | I love Luciano Pavarotti . 9 | I love Jose Carreras . 10 | 11 | -------------------------------------------------------------------------------- /miscellaneous/parser-choices.py: -------------------------------------------------------------------------------- 1 | # 1. nltk 2 | # NLTK was unable to find stanford-parser\.jar! Set the CLASSPATH 3 | # environment variable. 4 | # https://stackoverflow.com/questions/13883277/how-to-use-stanford-parser-in-nltk-using-python 5 | 6 | from nltk.parse.stanford import StanfordParser 7 | parser = StanfordParser() 8 | 9 | from nltk.parse.stanford import GenericStanfordParser 10 | parser = GenericStanfordParser() 11 | 12 | # 2. nltk.parse.corenlp 13 | # AttributeError: 'CoreNLPParser' object has no attribute 'tagged_parse'https://stackoverflow.com/questions/39320782/corenlp-provide-pos-tags 14 | import nltk 15 | from nltk.parse.corenlp import CoreNLPParser 16 | parser = CoreNLPParser(url='http://localhost:9000') 17 | 18 | 19 | # 3. pycorenlp 20 | from pycorenlp import StanfordCoreNLP 21 | nlp = StanfordCoreNLP('http://localhost:9000') 22 | parser = nlp.parse() 23 | 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mappa_Mundi 2 | 3 | Welcome to Mappa_Mundi (MM)! 4 | 5 | MM is a method that combines seamlessly 6 | Large Language Models (LLM) 7 | and Causal Inference (CI). 8 | 9 | The MM software does causal DEFT 10 | (causal DAG Extraction From Text). 11 | We store each extracted DAG in a separate file, and we put 12 | all DAG files in a directory 13 | that we call 14 | a DAG Atlas. 
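(Illustrative aside, not part of the original README: each atlas entry is one pickled DAG per movie script or short story. Assuming the `.pkl` files hold pickled `Dag` objects, a single entry can be inspected roughly as below; `dag.nodes` is the attribute that BatchSimilarity.py relies on, the rest is a sketch.)

    import pickle

    # Run from the repo root so that Dag.py is importable when unpickling.
    with open("m_scripts_dag_atlas/up.pkl", "rb") as f:
        dag = pickle.load(f)
    print(len(dag.nodes))  # number of simplified clauses extracted from "Up"
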
15 | 16 | I discuss the software in 17 | detail in this white paper: 18 | * [Version 1](https://github.com/rrtucci/mappa_mundi/blob/master/white_paper/mappa_mundi.pdf) 19 | * [Version 2](https://github.com/rrtucci/mappa_mundi/blob/master/white_paper/mappa_mundi_V2.pdf) 20 | 21 | ![LLM add-ons](pics/llm-addons.jpg) 22 | 23 | ![LLM are supertitious](https://github.com/rrtucci/mappa_mundi/blob/master/pics/llm-superstitious.png) 24 | 25 | ![Mappa Mundi Causal Bridges](https://github.com/rrtucci/mappa_mundi/blob/master/pics/causal-bridges-captioned.png) 26 | 27 | ![Mappa Mundi Project](https://github.com/rrtucci/mappa_mundi/blob/master/pics/mm-project.png) 28 | 29 | ![My First Big Atlas](pics/my_first_big_atlas.jpeg) 30 | 31 | 32 | -------------------------------------------------------------------------------- /MIT-License.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Robert R. Tucci 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /simp_deprecated/simp_spacy-claucy.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | Refs: 7 | https://spacy.io/usage/spacy-101/ 8 | 9 | https://github.com/mmxgn/spacy-clausieec13/splitting-sentences-into-clauses 10 | """ 11 | from globals import * 12 | import spacy 13 | import claucy 14 | 15 | nlp = spacy.load('en_core_web_sm') 16 | claucy.add_to_pipe(nlp) 17 | 18 | 19 | def simplify_ztz(sentence, verbose=False): 20 | """ 21 | This method simplifies the sentence `sentence`. It returns a list of 22 | simple sentences extracted from the input sentence. 
23 | 24 | Parameters 25 | ---------- 26 | sentence: str 27 | verbose: bool 28 | kwargs: dict[] 29 | 30 | Returns 31 | ------- 32 | list[str] 33 | 34 | """ 35 | 36 | doc = nlp(sentence.strip()) 37 | if doc._.clauses: 38 | propositions = doc._.clauses[0].to_propositions(as_text=True) 39 | else: 40 | propositions = [sentence] 41 | if verbose: 42 | print(sentence.strip()) 43 | print(propositions) 44 | return propositions 45 | -------------------------------------------------------------------------------- /similarity_bert.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains a function `ztz_similarity(ztz1, ztz2)` 4 | that returns the similarity of sentences `ztz1` and `ztz2`. 5 | ztz = sentence 6 | 7 | It uses 8 | 9 | Ref: 10 | 1. https://www.sbert.net/ 11 | 2. https://huggingface.co/tasks/sentence-similarity 12 | 3. https://towardsdatascience.com/bert-for-measuring-text-similarity 13 | -eec91c6bf9e1 14 | """ 15 | from sklearn.metrics.pairwise import cosine_similarity 16 | 17 | 18 | def ztz_similarity(ztz1, ztz2, **kwargs): 19 | """ 20 | This method returns the similarity between sentences `ztz1` and `ztz2`. 21 | The similarity is measured as odds of a probability, so it ranges from 0 22 | to infinity. 23 | 24 | Parameters 25 | ---------- 26 | ztz1: str 27 | ztz2: str 28 | kwargs: dict[] 29 | 30 | Returns 31 | ------- 32 | float 33 | 34 | """ 35 | model = kwargs['model'] 36 | embedding_1 = model.encode(ztz1) 37 | embedding_2 = model.encode(ztz2) 38 | 39 | prob = cosine_similarity([embedding_1], [embedding_2])[0, 0] 40 | if prob < 0: 41 | # print("neg. prob.=", prob) 42 | # print(ztz1) 43 | # print(ztz2) 44 | prob = 0 45 | odds = prob / (1 - prob) if prob < 1 else 1e5 46 | return round(odds, 3) 47 | -------------------------------------------------------------------------------- /miscellaneous/WALL-E-quote-summarized-by-chatgpt.txt: -------------------------------------------------------------------------------- 1 | Consider the following quote: 2 | "A range of mountains takes form in the haze. 3 | Moving closer. 4 | The mountains are piles of TRASH. 5 | The entire surface is nothing but waste. 6 | 7 | '..We're gonna find adventure in the evening air...' 8 | 9 | A silhouetted city in the distance. 10 | What looks like skyscrapers turns into trash. 11 | Thousands of neatly stacked CUBES OF TRASH, stories high. 12 | Rows and rows of stacked cubes, like city avenues. 13 | They go on for miles." 14 | 15 | Give an enumerated list summarizing this quote. 16 | Each list item is a simple declarative sentence 17 | consisting of a subject phrase and a verb phrase. 18 | The list items are in chronological order of occurrence 19 | in the quote. 20 | 21 | 1. A range of mountains appears in the haze. 22 | 2. The narrator moves closer. 23 | 3. The mountains are revealed to be piles of trash. 24 | 4. The entire surface is described as nothing but waste. 25 | 5. The narrator speaks of finding adventure in the evening air. 26 | 6. A silhouetted city appears in the distance. 27 | 7. The skyscrapers turn out to be made of trash. 28 | 8. Thousands of neatly stacked cubes of trash are seen, stories high. 29 | 9. Rows and rows of stacked cubes are seen, resembling city avenues. 30 | 10. The expanse of trash goes on for miles. 
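(Aside: the prompt above was written by hand. A minimal sketch of how the same prompt could be assembled programmatically is shown below; the instruction text is copied from this file, while the function name is made up and the LLM call that would consume the prompt is deliberately left out.)

    # Sketch only: build the summarization prompt used above for an arbitrary quote.
    PROMPT_TAIL = (
        "Give an enumerated list summarizing this quote.\n"
        "Each list item is a simple declarative sentence\n"
        "consisting of a subject phrase and a verb phrase.\n"
        "The list items are in chronological order of occurrence\n"
        "in the quote.\n"
    )

    def build_summary_prompt(quote):
        # `quote` would be an excerpt of a movie script, like the WALL-E scene above.
        return 'Consider the following quote:\n"' + quote.strip() + '"\n\n' + PROMPT_TAIL
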
31 | -------------------------------------------------------------------------------- /simp_deprecated/simp_openie.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | References 7 | 8 | 1. https://stanfordnlp.github.io/CoreNLP/openie.html#api 9 | # Default value of openie.affinity_probability_cap was 1/3. 10 | 2. https://pypi.org/project/stanford-openie/ 11 | 12 | 2. https://stanfordnlp.github.io/CoreNLP/demo.html 13 | 14 | """ 15 | from openie import StanfordOpenIE 16 | 17 | properties = { 18 | 'openie.triple.all_nominals': True, 19 | 'openie.triple.strict': False, 20 | 'openie.splitter.nomodel': True, 21 | 'openie.affinity_probability_cap': 1/ 3 22 | } 23 | client = StanfordOpenIE(properties=properties) 24 | 25 | 26 | def simplify_ztz(sentence, verbose=False): 27 | """ 28 | This method simplifies the sentence `sentence`. 29 | 30 | Parameters 31 | ---------- 32 | sentence: str 33 | verbose: bool 34 | 35 | Returns 36 | ------- 37 | str 38 | 39 | """ 40 | ztz_list = [] 41 | for triple in client.annotate(sentence): 42 | ztz_list.append(triple['subject'] + " " + 43 | triple['relation'] + " " + 44 | triple['object']) 45 | if verbose: 46 | print(sentence.strip()) 47 | print(ztz_list) 48 | return ztz_list 49 | 50 | 51 | -------------------------------------------------------------------------------- /simp_deprecated/simp_openie6-old.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | It is called within a jupyter notebook at Google colab 7 | https://colab.research.google.com/drive/1S2EWOGkoCgjfOJzTRJ7PLeu4T8SBwhlF?usp=sharing 8 | 9 | Refs: 10 | 11 | 1. https://github.com/dair-iitd/CaRB 12 | 13 | 2. https://github.com/dair-iitd/imojie 14 | 15 | 3. https://github.com/dair-iitd/openie6 16 | 17 | """ 18 | import subprocess 19 | from globals import * 20 | 21 | def simplify_ztz(sentence, verbose=False): 22 | """ 23 | This method simplifies the sentence `sentence`. It returns a list of 24 | simple sentences extracted from the input sentence. 
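    For instance, the conjunction-splitting behaviour expected from Openie6
    is the one recorded in openie6_translation_test1.txt (illustrative; the
    exact strings come from the Openie6 model):

        simplify_ztz("I love Luciano Pavarotti and Jose Carreras .")
        # -> ["I love Luciano Pavarotti .",
        #     "I love Jose Carreras ."]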
25 | 26 | Parameters 27 | ---------- 28 | sentence: str 29 | verbose: bool 30 | kwargs: dict[] 31 | 32 | Returns 33 | ------- 34 | list[str] 35 | 36 | """ 37 | 38 | with open("../openie6_sentences.txt", "w") as f: 39 | f.write(sentence) 40 | 41 | gpu_command = \ 42 | "cd ../openie6 && CUDA_DEVICE_ORDER=PCI_BUS_ID " \ 43 | "CUDA_VISIBLE_DEVICES=0 " \ 44 | "PYTHONPATH=imojie:imojie/allennlp:imojie" \ 45 | "/pytorch_transformers:$PYTHONPATH python run.py " \ 46 | "--save models/conj_model --mode predict " \ 47 | "--inp ../openie6_sentences.txt --batch_size 1 " \ 48 | "--model_str bert-large-cased --task conj " \ 49 | "--gpus 1 --out ../openie6_predictions.txt" 50 | 51 | cpu_command = gpu_command.replace("--gpus 1", "--gpus 0") 52 | 53 | if USE_GPU: 54 | subprocess.Popen(gpu_command, shell=True) 55 | else: 56 | subprocess.Popen(cpu_command, shell=True) 57 | 58 | ztz_list = [] 59 | with open("../openie6_predictions.txt.conj", "r") as f: 60 | for line in f: 61 | ztz_list.append(line) 62 | # ztz_list has full sentence in first row 63 | return ztz_list[1:] 64 | -------------------------------------------------------------------------------- /Node.py: -------------------------------------------------------------------------------- 1 | class Node: 2 | """ 3 | 4 | This is a very simple class that holds the `time` and `place` of each 5 | node. 6 | 7 | Each simplified clause becomes a node of the DAG. 8 | 9 | For brevity, let us refer to time as `t` and place as `x`. Previously, 10 | we put each full sentence of the movie script into one row of a file. 11 | Then each sentence was replaced by zero, one, two, or more simplified 12 | clauses, separated by separator-tokens. If a simplified clause ( i.e., 13 | node) appears at the row $t$ of the file (counting starting with 0), 14 | then we say that the node occurs at time $t$. If a simplified clause 15 | appears after zero separator-tokens, we say $x=0$ for it. If it appears 16 | after one separator-token, we say $x=1$ for it, and so forth. Hence each 17 | node ( i.e., simplified clause) can be labeled by its $(t, x)$ coordinates. 18 | 19 | Attributes 20 | ---------- 21 | place: int 22 | time: int 23 | """ 24 | 25 | def __init__(self, time, place): 26 | """ 27 | Constructor 28 | 29 | Parameters 30 | ---------- 31 | time: int 32 | place: int 33 | """ 34 | self.time = time 35 | self.place = place 36 | assert time >= 0 and place >= 0 37 | 38 | def coords(self): 39 | """ 40 | This method returns the coordinates of self as a tuple. 41 | 42 | Returns 43 | ------- 44 | tuple(int, int) 45 | 46 | """ 47 | return (self.time, self.place) 48 | 49 | 50 | def node_str(node): 51 | """ 52 | This method returns a string for Node `node`. 53 | 54 | Parameters 55 | ---------- 56 | node: Node 57 | 58 | Returns 59 | ------- 60 | str 61 | 62 | """ 63 | return "(" + str(node.time) + "," + str(node.place) + ")" 64 | 65 | 66 | def arrow_str(arrow): 67 | """ 68 | This method returns a string for an arrow `arrow` 69 | 70 | Parameters 71 | ---------- 72 | arrow: tuple[Node, Node] 73 | 74 | Returns 75 | ------- 76 | str 77 | 78 | """ 79 | return node_str(arrow[0]) + "->" + node_str(arrow[1]) 80 | -------------------------------------------------------------------------------- /globals.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains all the global variables used by Mappa Mundi (MM). 
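    Most entries are plain constants. The string-valued switches such as
    ZTZ_SIMPLIFIER and SIMI_DEF name modules that other files load
    dynamically, e.g. (pattern copied from similarity.py and
    BatchSimilarity.py):

        from globals import *
        import importlib as imp

        simi_def = imp.import_module(SIMI_DEF)  # e.g. "similarity_bert"
        # simi_def.ztz_similarity(ztz1, ztz2, **kwargs) can then be called
        # without caring which implementation was selected here.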
4 | 5 | """ 6 | 7 | BASE_URL = "https://imsdb.com" 8 | 9 | M_SCRIPTS_DIR = "m_scripts" 10 | CLEAN_DIR = "m_scripts_clean" 11 | CLEAN_RD_DIR = "m_scripts_clean_rd" 12 | SPELL_DIR = "m_scripts_spell" 13 | SPELL_RD_DIR = "m_scripts_spell_rd" 14 | SIMP_DIR = "m_scripts_simp" 15 | SIMP_RD_DIR = "m_scripts_simp_rd" 16 | POST_CLEAN_DIR = "m_scripts_post_clean" 17 | POST_CLEAN_RD_DIR = "m_scripts_post_clean_rd" 18 | DAG_DIR = "m_scripts_dag_atlas" 19 | DAG_RD_DIR = "m_scripts_dag_atlas_rd" 20 | 21 | # ZTZ_SIMPLIFIER = "simp_stanford" 22 | # ZTZ_SIMPLIFIER = "simp_spacy_claucy" 23 | # ZTZ_SIMPLIFIER = "simp_spacy1" 24 | # ZTZ_SIMPLIFIER = "simp_spacy2" 25 | # ZTZ_SIMPLIFIER = "simp_spacy3" # originally recommended 26 | ZTZ_SIMPLIFIER = "simp_openie6" # recommended 27 | 28 | # SIMI_DEF = "similarity_spacy" 29 | # SIMI_DEF = "similarity_spacy2" 30 | # SIMI_DEF = "similarity_nltk" # originally recommended 31 | SIMI_DEF = "similarity_bert" # recommended 32 | 33 | # good threshold values gleaned from similarity.py examples 34 | # SIMI_THRESHOLD = 2.2 for NLTK 35 | # SIMI_THRESHOLD = 2.69 for SpaCy 36 | SIMI_THRESHOLD = 2 # for bert, recommended 37 | 38 | ZTZ_SEPARATOR = "[%@!]" 39 | 40 | SPELLING_CORRECTION_RISK = 1e-8 41 | 42 | # POS (part of speech) in stopwords.py 43 | # ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 44 | # 'NOUN', 'NUM', 'PART', 'PRON', 'PUNCT', 'SCONJ', 'VERB'] 45 | 46 | # To see full list of POS, see jpg in pics folder 47 | 48 | # ADP (adposition) are mostly prepositions 49 | # AUX contains verbs like 'is' 50 | # DET (determiner) contains 'whose' 51 | # NUM contains number words like 'three' 52 | # PART (particle) contains 'not' 53 | 54 | RETAINED_POS = ['ADJ', 'ADV', 'NOUN', 'VERB'] 55 | 56 | # See stopwords.py 57 | # RETAINED_STOPWORD_POS should be subset of RETAINED_POS 58 | # RETAINED_STOPWORD_POS = RETAINED_POS 59 | RETAINED_STOPWORD_POS = [] # recommended 60 | 61 | USE_GPU = True 62 | 63 | class color: 64 | PURPLE = '\033[95m' 65 | CYAN = '\033[96m' 66 | DARKCYAN = '\033[36m' 67 | BLUE = '\033[94m' 68 | GREEN = '\033[92m' 69 | YELLOW = '\033[93m' 70 | RED = '\033[91m' 71 | BOLD = '\033[1m' 72 | UNDERLINE = '\033[4m' 73 | END = '\033[0m' -------------------------------------------------------------------------------- /miscellaneous/testing-stan-parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: 3 | https://towardsdatascience.com/natural-language-processing-using-stanfords-corenlp-d9e64c1e1024 4 | 5 | """ 6 | import os 7 | import subprocess 8 | version = subprocess.check_output( 9 | ['java', '-version'], stderr=subprocess.STDOUT) 10 | print("java version=\t", version) 11 | print("CLASSPATH=\t", os.environ['CLASSPATH']) 12 | print("STANFORD_MODELS=\t", os.environ['STANFORD_MODELS']) 13 | print("JAVA_HOME=\t", os.environ['JAVA_HOME']) 14 | 15 | def main1(): 16 | from pycorenlp import StanfordCoreNLP 17 | nlp = StanfordCoreNLP('http://localhost:9000') 18 | 19 | text = "This movie was actually neither that funny, nor super witty. The movie was meh. I liked watching that movie. If I had a choice, I would not watch that movie areps." 
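    # The annotate() call below posts `text` to the local CoreNLP server
    # (launched as described in starting-stanford-coreNLP-server.txt, on
    # localhost:9000) and requests the sentiment, ner and pos annotators.
    # With outputFormat 'json' the reply is typically parsed into a dict;
    # its exact fields depend on the annotators requested.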
20 | result = nlp.annotate(text, 21 | properties={ 22 | 'annotators': 'sentiment, ner, pos', 23 | 'outputFormat': 'json', 24 | 'timeout': 1000, 25 | }) 26 | print(result) 27 | 28 | def main2(): 29 | #ttps://www.nltk.org/api/nltk.parse.corenlp.html 30 | import nltk 31 | from nltk.parse.corenlp import CoreNLPParser 32 | 33 | # Start the CoreNLP server 34 | # nltk.download('punkt') 35 | # nltk.download('corenlp') 36 | parser = CoreNLPParser(url='http://localhost:9000') 37 | 38 | # Parse a sentence 39 | sentence = "The quick brown fox jumps over the lazy dog." 40 | parse_tree = list( 41 | parser.parse(sentence.split()) 42 | )[0] 43 | print(parse_tree) 44 | 45 | def main3(): 46 | import nltk 47 | from nltk.parse.corenlp import CoreNLPParser 48 | 49 | # Start the CoreNLP server 50 | parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos') 51 | 52 | # Parse a tagged sentence 53 | tagged_sentence = [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), 54 | ('fox', 'NN'), 55 | ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), 56 | ('lazy', 'JJ'), 57 | ('dog', 'NN'), ('.', '.')] 58 | # parse_tree = list(parser.parse(tagged_sentence))[0] 59 | # print(parse_tree) 60 | parser.parse(tagged_sentence) 61 | 62 | if __name__ == "__main__": 63 | # main1() 64 | main2() 65 | # main3() # doesn't work 66 | 67 | -------------------------------------------------------------------------------- /simplifying_test.txt: -------------------------------------------------------------------------------- 1 | Robert, who lives nearby, was walking his dog. 2 | While eating food Ram is singing a song . 3 | After she ate the cake , Emma visited Tony in his room . 4 | If she is singing then I will sing . 5 | Melanie bought a Batman game for $ 6.95 , a strategy game for $ 7.90 , and a Superman game for $ 7.73 . 6 | Bag A contains 3 white and 2 blue marbles . 7 | Ram has two apples and five bananas . 8 | Ram and Shyam are two brothers . 9 | Ram is a boy and Sita is a girl . 10 | Ram is a boy who is six years old . 11 | Ram eats a banana and an apple but sings a song . 12 | He washed cars over the weekend and now has 86 dollars . 13 | While playing piano Ram is singing a song in a room and Shyam is playing violin . 14 | You are a boy, and Sita is a girl . 15 | Ram sold 6 balls at 10 a.m and 7 balls at 11 a.m . 16 | The restaurant sold 6 slices of pie during the day and 7 slices of pie during the night . 17 | Sam's dad gave Sam 39 nickels and 31 quarters . 18 | Park workers will plant 41 dogwood trees today and 20 dogwood trees tomorrow . 19 | Dan picked 9 limes and gave Sara 4 of the limes . 20 | This year Diane bought some new hives and increased Diane's honey harvest by 6085 pounds . 21 | Sara had 4 quarters and 8 dimes in Sara's bank . 22 | Mike found 6 seashells and 4 starfishes but 4 of the seashells were broken . 23 | Jessica grew 35 watermelons and 30 carrots but the rabbits ate 27 watermelons . 24 | Dan bought a clarinet for $ 130.30 , and a song book which was $ 11.24 . 25 | There are 2 maple trees and 5 popular trees currently in the park . 26 | Dan 's cat had kittens and 5 had spots . 27 | This year, 712261 male salmon and 259378 female salmon, returned to their rivers . 28 | Each day , the polar bear at Richmond 's zoo eats 0.2 bucket of trout and 0.4 bucket of salmon . 29 | While eating food and drinking water Ram is singing a song . 30 | He is eating food and she is playing and they are fighting . 31 | Ram is playing guitar while talking to Sita . 32 | He is playing and she is crying but they are singing . 
33 | The embattled Major government survived a crucial vote on coal pits closure as its last-minute concessions curbed the extent of Tony revolt over an issue that generated unusual heat in the House of Commons and brought the miners to London streets. 34 | When Sam is eating food, Alice is singing a song. 35 | Talwinder Singh,who masterminded the Kanishka crash in 1998, was killed in a fierce two hour counter. 36 | Because I was late, I became angry. -------------------------------------------------------------------------------- /similarity.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file has functions to test the function `ztz_similarity(str1, str2)` 4 | which measures the similarity of two sentences `ztz1` and `ztz2`. 5 | `ztz_similarity()` has been implemented 4 different ways, in separate files 6 | 7 | 1. similarity_bert.py (Recommended) 8 | Uses BERT and sentence-transformers 9 | 10 | 2. similarity_nltk.py 11 | Uses NLTK + WordNet 12 | 13 | 3. similarity_spacy.py 14 | Uses SpaCy + WordVec 15 | 16 | 4. similarity_spacy2.py 17 | Attempt to use SpaCy + WordNet 18 | 19 | """ 20 | from globals import * 21 | import importlib as imp 22 | from sentence_transformers import SentenceTransformer 23 | 24 | simi_def = imp.import_module(SIMI_DEF) 25 | 26 | 27 | def print_simi_12(str1, str2, **kwargs): 28 | """ 29 | Prints similarity of `str1` and `str2`. 30 | 31 | Parameters 32 | ---------- 33 | str1: str 34 | str2: str 35 | 36 | Returns 37 | ------- 38 | None 39 | 40 | """ 41 | print() 42 | print("1.", str1) 43 | print("2.", str2) 44 | simi12 = simi_def.ztz_similarity(str1, str2, **kwargs) 45 | simi21 = simi_def.ztz_similarity(str2, str1, **kwargs) 46 | print("simi(1, 2)=", str(simi12)) 47 | print("simi(2, 1)=", str(simi21)) 48 | 49 | 50 | if __name__ == "__main__": 51 | def main1(): 52 | if SIMI_DEF == "similarity_bert": 53 | model = SentenceTransformer('all-MiniLM-L6-v2') 54 | else: 55 | model = None 56 | print("************ simi definition from:", SIMI_DEF) 57 | 58 | ztzs = [ 59 | "Dogs are awesome.", 60 | "Some gorgeous creatures are felines.", 61 | "Dolphins are swimming mammals.", 62 | "Cats are beautiful animals.", 63 | "Cats are beauti animals.", 64 | ] 65 | 66 | focus_ztz = "Cats are beautiful animals." 67 | for ztz in ztzs: 68 | print_simi_12(focus_ztz, ztz, model=model) 69 | 70 | 71 | def main2(): 72 | if SIMI_DEF == "similarity_bert": 73 | model = SentenceTransformer('all-MiniLM-L6-v2') 74 | else: 75 | model = None 76 | print("************ simi definition from:", SIMI_DEF) 77 | word1, word2 = "apple", "horse" 78 | print_simi_12(word1, word2, model=model) 79 | print_simi_12("Paul", "John", model=model) 80 | 81 | ztz1 = "The cat sat on the mat." 82 | ztz2 = "The dog lay on the rug." 83 | print_simi_12(ztz1, ztz2, model=model) 84 | 85 | 86 | main1() 87 | main2() 88 | -------------------------------------------------------------------------------- /pics/nltk_tags.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 |  1. CC: Coordinating conjunction
7 |  2. CD: Cardinal number
8 |  3. DT: Determiner
9 |  4. EX: Existential there
10 |  5. FW: Foreign word
11 |  6. IN: Preposition or subordinating conjunction
12 |  7. JJ: Adjective
13 |  8. JJR: Adjective, comparative
14 |  9. JJS: Adjective, superlative
15 | 10. LS: List item marker
16 | 11. MD: Modal
17 | 12. NN: Noun, singular or mass
18 | 13. NNS: Noun, plural
19 | 14. NNP: Proper noun, singular
20 | 15. NNPS: Proper noun, plural
21 | 16. PDT: Predeterminer
22 | 17. POS: Possessive ending
23 | 18. PP: Prepositional Phrase
24 | 19. PRP: Personal pronoun
25 | 20. RB: Adverb
26 | 21. RBR: Adverb, comparative
27 | 22. RBS: Adverb, superlative
28 | 23. RP: Particle
29 | 24. S: Simple declarative clause
30 | 25. SBAR: Clause introduced by a (possibly empty) subordinating conjunction
31 | 26. SBARQ: Direct question introduced by a wh-word or a wh-phrase.
32 | 27. SINV: Inverted declarative sentence, i.e. one in which the subject follows the tensed verb or modal.
33 | 28. SQ: Inverted yes/no question, or main clause of a wh-question, following the wh-phrase in SBARQ.
34 | 29. SYM: Symbol
35 | 30. VB: Verb, base form
36 | 31. VBD: Verb, past tense
37 | 32. VBG: Verb, gerund or present participle
38 | 33. VBN: Verb, past participle
39 | 34. VBP: Verb, non-3rd person singular present
40 | 35. VBZ: Verb, 3rd person singular present
41 | 36. VP: Verb Phrase
42 | 37. WDT: Wh-determiner
43 | 38. WP: Wh-pronoun
44 | 39. WP$: Possessive wh-pronoun
45 | 40. WRB: Wh-adverb
46 |
47 | 48 | -------------------------------------------------------------------------------- /similarity_deprecated/similarity_spacy2.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains a function `ztz_similarity(ztz1, ztz2)` 4 | that returns the similarity of sentences `ztz1` and `ztz2`. 5 | ztz = sentence 6 | 7 | It uses SpaCy + WordNet 8 | 9 | Ref: 10 | 11 | """ 12 | import spacy 13 | import nltk 14 | from nltk.corpus import wordnet as wn 15 | from globals import * 16 | from itertools import product 17 | from collections import defaultdict 18 | from time import time 19 | 20 | nlp = spacy.load("en_core_web_sm") 21 | 22 | 23 | def ztz_similarity(ztz1, ztz2, **kwargs): 24 | """ 25 | This method returns the similarity between sentences `ztz1` and `ztz2`. 26 | The similarity is measured as odds of a probability, so it ranges from 0 27 | to infinity. 28 | 29 | Parameters 30 | ---------- 31 | ztz1: str 32 | ztz2: str 33 | 34 | Returns 35 | ------- 36 | float 37 | 38 | """ 39 | do_time = False 40 | if do_time: 41 | print("similarity begins", time()) 42 | doc1 = nlp(ztz1) 43 | doc2 = nlp(ztz2) 44 | sp_tokens1 = [token1 for token1 in doc1 \ 45 | if token1.pos_ in RETAINED_POS] 46 | sp_tokens2 = [token2 for token2 in doc2 \ 47 | if token2.pos_ in RETAINED_POS] 48 | all_ss1 = [] 49 | for token1 in sp_tokens1: 50 | if wn.synsets(token1.text): 51 | ss1 = wn.synsets(token1.text)[0] 52 | all_ss1.append(ss1) 53 | 54 | all_ss2 = [] 55 | for token2 in sp_tokens2: 56 | if wn.synsets(token2.text): 57 | ss2 = wn.synsets(token2.text)[0] 58 | all_ss2.append(ss2) 59 | ss_pair_to_simi = defaultdict(lambda: 0) 60 | if do_time: 61 | print("beginning of path_similarity()", time()) 62 | for ss1, ss2 in product(all_ss1, all_ss2): 63 | simi = ss1.path_similarity(ss2) 64 | if simi is not None: 65 | ss_pair_to_simi[(ss1, ss2)] = simi 66 | 67 | score1 = 0.0 68 | count1 = 0 69 | for ss1 in all_ss1: 70 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss2 in all_ss2] 71 | if simi_list: 72 | best_score = max(simi_list) 73 | score1 += best_score 74 | count1 += 1 75 | if count1: 76 | score1 /= count1 77 | 78 | score2 = 0.0 79 | count2 = 0 80 | for ss2 in all_ss2: 81 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss1 in all_ss1] 82 | if simi_list: 83 | best_score = max(simi_list) 84 | score2 += best_score 85 | count2 += 1 86 | if count2: 87 | score2 /= count2 88 | prob = (score1 + score2) / 2 89 | if prob < 1: 90 | odds = prob / (1 - prob) 91 | else: 92 | odds = 1000 93 | if do_time: 94 | print("similarity ends", time()) 95 | return round(odds, 3) 96 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file holds some general purpose functions (utilities). 4 | 5 | """ 6 | import os 7 | from cfitbit_globals import * 8 | import shutil 9 | 10 | 11 | def zero_based_position_from_m_title(dir_, title): 12 | """ 13 | This method returns the position (zero based, starting from zero) of 14 | title `title` in directory `dir_`. 15 | 16 | Parameters 17 | ---------- 18 | dir_: str 19 | title: str 20 | 21 | Returns 22 | ------- 23 | int 24 | 25 | """ 26 | return list(my_listdir(dir_)).index(title + ".txt") 27 | 28 | 29 | def m_title_from_zero_based_position(dir_, pos): 30 | """ 31 | This method returns the title in directory `dir_` of the movie at 32 | position `pos` (zero based, starting from zero). 
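    For example (hypothetical directory contents): if my_listdir(dir_)
    returns ['alien.txt', 'up.txt', 'wall-e.txt'], then

        m_title_from_zero_based_position(dir_, 1)        # -> 'up'
        zero_based_position_from_m_title(dir_, 'up')     # -> 1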
33 | 34 | Parameters 35 | ---------- 36 | dir_: str 37 | pos: int 38 | 39 | Returns 40 | ------- 41 | str 42 | 43 | """ 44 | return list(my_listdir(dir_))[pos][:-len(".txt")] 45 | 46 | 47 | def argmax_of_list(lista): 48 | """ 49 | This method returns the argmax of list `lista`. 50 | 51 | Parameters 52 | ---------- 53 | lista: list[X] 54 | 55 | Returns 56 | ------- 57 | type(X) 58 | 59 | 60 | """ 61 | return max(range(len(lista)), key=(lambda i: lista[i])) 62 | 63 | 64 | def print_welcome_message(): 65 | """ 66 | This method prints a welcome message. 67 | 68 | Returns 69 | ------- 70 | None 71 | 72 | """ 73 | print("Welcome Causal AI Navigator. We have been waiting for you for " 74 | "millennia. Where would you like us to go next?") 75 | 76 | 77 | def my_listdir(dir_): 78 | """ 79 | Whenever one opens a text file within directory `dir_` using jupyter lab 80 | ( JL), JL writes an annoying `.ipynb.checkpoints` folder inside `dir_`. 81 | This method deletes that checkpoints folder and then returns the usual 82 | `os.listdir( dir_)` 83 | 84 | Parameters 85 | ---------- 86 | dir_: str 87 | 88 | Returns 89 | ------- 90 | iterable 91 | 92 | """ 93 | # listdir includes hidden files like .ipynb_checkpoints 94 | checkpoints = dir_ + "/" + ".ipynb_checkpoints" 95 | shutil.rmtree(checkpoints, ignore_errors=True) 96 | # os.listdir list in arbitrary order! 97 | return sorted(os.listdir(dir_)) 98 | 99 | 100 | def get_prob_acc_and_nsam(num_acc, num_rej, round_digits=2): 101 | """ 102 | This method returns the probability of acceptance `prob_acc` and the 103 | number of samples `nsam` used to calculate that probability. 104 | 105 | Parameters 106 | ---------- 107 | num_acc: int 108 | number of times an arrow has been accepted 109 | num_rej: int 110 | number of times an arrow has been rejected. 111 | round_digits: int 112 | 113 | Returns 114 | ------- 115 | float, int 116 | 117 | """ 118 | nsam = num_acc + num_rej 119 | return round(num_acc / nsam, round_digits), nsam 120 | -------------------------------------------------------------------------------- /WordGuesser.py: -------------------------------------------------------------------------------- 1 | class WordGuesser: 2 | """ 3 | This class is used by `spell_checking.py` to store and update the word 4 | `best_guess` which is a guess for the word `word`. Also stored in this 5 | class: the probabilities for `best_guess` and `word`. 6 | 7 | 8 | Attributes 9 | ---------- 10 | best_guess: str 11 | a word which is the best guess so far for the word `word` 12 | global_checker: SpellChecker 13 | a class of pyspellchecker that can give global probabilities of words 14 | local_word_count: int 15 | number of different words in the single local document being considered 16 | prob_for_best_guess: float 17 | probability for `best_guess` (average of local and global probs) 18 | prob_for_word: float 19 | probability for `word` (average of local and global probs) 20 | word: str 21 | low probability word, likely a misspelled word. `best_guess` is a 22 | replacement for it. 23 | word_to_reps: dict[str, int] 24 | a dictionary mapping each word in the local document being considered, 25 | to its number of repetitions in that document. 
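    When a local document is supplied, the probabilities above are a 50/50
    blend of global and local usage, as in the constructor below
    (illustrative numbers):

        global_prob = global_checker.word_usage_frequency(word)
        local_prob = word_to_reps[word] / local_word_count
        prob_for_word = (global_prob + local_prob) / 2
        # e.g. global_prob = 1e-6, local_prob = 3/1500 -> prob_for_word ~ 0.001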
26 | 27 | """ 28 | 29 | def __init__(self, word, global_checker, 30 | word_to_reps=None, local_word_count=None): 31 | """ 32 | Constructor 33 | 34 | Parameters 35 | ---------- 36 | word: str 37 | global_checker: SpellChecker 38 | word_to_reps: dict[str, int] 39 | local_word_count: int 40 | 41 | """ 42 | assert word[0].islower() 43 | self.word = word 44 | self.global_checker = global_checker 45 | self.word_to_reps = word_to_reps 46 | self.local_word_count = local_word_count 47 | if word_to_reps: 48 | assert local_word_count 49 | 50 | self.prob_for_word = \ 51 | global_checker.word_usage_frequency(word) 52 | if word_to_reps: 53 | local_prob = word_to_reps[word] / local_word_count 54 | self.prob_for_word = (self.prob_for_word + local_prob) / 2 55 | 56 | self.best_guess = word 57 | self.prob_for_best_guess = 0 58 | self.do_update(word) 59 | 60 | def do_update(self, guess): 61 | """ 62 | This method finds the probability of the word `guess` in the local 63 | dictionary, and if that probability is greater that 64 | `prob_best_guess`, it replaces `best_guess` by `guess`. It also 65 | updates `prob_for_best_guess`. 66 | 67 | Parameters 68 | ---------- 69 | guess: str 70 | 71 | Returns 72 | ------- 73 | None 74 | 75 | """ 76 | prob_for_guess = \ 77 | self.global_checker.word_usage_frequency(guess) 78 | if self.word_to_reps: 79 | local_prob = self.word_to_reps[guess] / self.local_word_count 80 | prob_for_guess = (prob_for_guess + local_prob) / 2 81 | if prob_for_guess > self.prob_for_best_guess: 82 | self.best_guess = guess 83 | self.prob_for_best_guess = prob_for_guess 84 | -------------------------------------------------------------------------------- /BatchSimilarity.py: -------------------------------------------------------------------------------- 1 | import importlib as imp 2 | from globals import * 3 | from Dag import * 4 | 5 | simi_def = imp.import_module(SIMI_DEF) 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | 9 | class BatchSimilarity: 10 | """ 11 | With sentence transformers, one can speed up the evaluation of sentence 12 | similarity by embedding large batches of sentences all at once, rather 13 | than one at a time. Given two DAGs, dag1 and dag2, this class uses a 14 | sentence transformer to evaluate the similarity between all sentences 15 | `all_ztz1` in dag1 and all the sentences `all_ztz2` in dag2. (ztz = 16 | sentence). `all_ztz1 + all_ztz2` are embedded as a batch, in a single 17 | shot. 
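    Schematically, the constructor below boils down to (a sketch of the code
    in this file, not additional API):

        sent_embeddings = model.encode(all_ztz1 + all_ztz2)   # one batched call
        len1 = len(all_ztz1)
        cos_mat = cosine_similarity(sent_embeddings[:len1],
                                    sent_embeddings[len1:])
        # cos_mat[i, j] is the cosine between sentence i of dag1 and
        # sentence j of dag2; simi() converts that cosine to odds.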
18 | 19 | Attributes 20 | ---------- 21 | all_ztz1: list[str] 22 | all_ztz2: list[str] 23 | cos_mat: np.array[float] 24 | a matrix of cosines corresponding to all_ztz1 X all_ztz2 25 | model: SentenceTransformer 26 | node_to_simple_ztz1: dict[Node, str] 27 | node_to_simple_ztz2: dict[Node, str] 28 | 29 | """ 30 | 31 | def __init__(self, 32 | dag1, 33 | dag2, 34 | node_to_simple_ztz1, 35 | node_to_simple_ztz2, 36 | model=None): 37 | """ 38 | Constructor 39 | 40 | Parameters 41 | ---------- 42 | dag1: Dag 43 | dag2: Dag 44 | node_to_simple_ztz1: dict[Node, str] 45 | node_to_simple_ztz2: dict[Node, str] 46 | model: SentenceTransformer 47 | """ 48 | self.node_to_simple_ztz1 = node_to_simple_ztz1 49 | self.node_to_simple_ztz2 = node_to_simple_ztz2 50 | self.all_ztz1 = [node_to_simple_ztz1[nd] for nd in dag1.nodes] 51 | self.all_ztz2 = [node_to_simple_ztz2[nd] for nd in dag2.nodes] 52 | self.model = model 53 | if model: 54 | sent_embeddings = model.encode(self.all_ztz1 + self.all_ztz2) 55 | len1 = len(self.all_ztz1) 56 | self.cos_mat = cosine_similarity(sent_embeddings[:len1], 57 | sent_embeddings[len1:]) 58 | 59 | def simi(self, nd1, nd2): 60 | """ 61 | This method returns the similarity of the sentences corresponding to 62 | nodes `nd1` and `nd2`. 63 | 64 | Parameters 65 | ---------- 66 | nd1: Node 67 | nd2: Node 68 | 69 | Returns 70 | ------- 71 | float 72 | 73 | """ 74 | ztz1 = self.node_to_simple_ztz1[nd1] 75 | ztz2 = self.node_to_simple_ztz2[nd2] 76 | if not self.model: 77 | return simi_def.ztz_similarity(ztz1, ztz2) 78 | else: 79 | k1 = self.all_ztz1.index(ztz1) 80 | k2 = self.all_ztz2.index(ztz2) 81 | prob = self.cos_mat[k1, k2] 82 | if prob < 0: 83 | # print("neg. prob.=", prob) 84 | # print(ztz1) 85 | # print(ztz2) 86 | prob = 0 87 | odds = prob / (1 - prob) if prob < 1 else 1e5 88 | return round(odds, 3) 89 | -------------------------------------------------------------------------------- /white_paper/references.bib: -------------------------------------------------------------------------------- 1 | @book{book-of-why , 2 | title={The book of why: the new science of cause and effect}, 3 | author={Pearl, Judea and Mackenzie, Dana}, 4 | year={2018}, 5 | publisher={Basic books} 6 | } 7 | 8 | @misc{bayesuvius, 9 | title="Bayesuvius (book)", 10 | author="Robert R. Tucci", 11 | howpublished="\url{https://github.com/rrtucci/Bayesuvius/raw/master/main.pdf}" 12 | } 13 | 14 | @inproceedings{2022opberg, 15 | title={OpBerg: Discovering causal sentences using optimal alignments}, 16 | author={Wood, Justin and Matiasz, Nicholas and Silva, Alcino and Hsu, William and Abyzov, Alexej and Wang, Wei}, 17 | booktitle={International Conference on Big Data Analytics and Knowledge Discovery}, 18 | pages={17--30}, 19 | year={2022}, 20 | organization={Springer} 21 | } 22 | 23 | @misc{yann-religion, 24 | title="Twitter, Absurd statement about causal inference 25 | and religion", 26 | author="Yann LeCun", 27 | howpublished="\url{https://twitter.com/ylecun/status/1577128801620070400}" 28 | } 29 | 30 | @misc{yann-text, 31 | title="Twitter, Absurd statement 32 | about all 33 | the text ever written", 34 | author="Yann LeCun", 35 | howpublished="\url{https://twitter.com/ylecun/status/1562137291845521408}" 36 | } 37 | 38 | @misc{deft1, 39 | title="Causal DAG extraction from a library of books or videos/movies", 40 | author="Robert R. Tucci", 41 | howpublished="\url{https://arxiv.org/abs/2211.00486}" 42 | } 43 | 44 | 45 | @misc{tic-tac-toe, 46 | title="deft-tic-tac-toe at github", 47 | author="Robert R. 
Tucci", 48 | howpublished="\url{https://github.com/rrtucci/deft-tic-tac-toe}" 49 | } 50 | 51 | @misc{project-gutenberg, 52 | title="Project {G}utenberg website", 53 | howpublished="\url{https://www.gutenberg.org}" 54 | } 55 | 56 | @misc{imsdb, 57 | title="Internet {M}ovie {S}cript {D}atabase ({IMSDb})", 58 | howpublished="\url{https://imsdb.com/}" 59 | } 60 | 61 | @misc{github-mappa-mundi, 62 | title="Mappa {M}undi at github", 63 | author="Robert R. Tucci", 64 | howpublished="\url{https://github.com/rrtucci/mappa_mundi}" 65 | } 66 | 67 | @misc{audio-description, 68 | title="Audio description", 69 | author="Wikipedia", 70 | howpublished="\url{https://en.wikipedia.org/wiki/Audio_description}" 71 | } 72 | 73 | @misc{scumpy, 74 | title="{SCuMpy} at github", 75 | author="Robert R. Tucci", 76 | howpublished="\url{https://github.com/rrtucci/scumpy}" 77 | } 78 | 79 | @misc{sentence-ax, 80 | title="Sentence{A}x at github", 81 | author="Robert R. Tucci", 82 | howpublished="\url{https://github.com/rrtucci/SentenceAx}" 83 | } 84 | 85 | @misc{fitbit-dataset, 86 | title="FitBit Fitness Tracker Data", 87 | author="Kaggle.com", 88 | howpublished="\url{https://www.kaggle.com/datasets/arashnic/fitbit}" 89 | } 90 | 91 | @misc{causal-fitbit, 92 | title="Causal{F}itbit at github", 93 | author="Robert R. Tucci", 94 | howpublished="\url{https://github.com/rrtucci/CausalFitbit}" 95 | } 96 | 97 | @misc{sbert, 98 | title="{sBERT}", 99 | author="sbert.net", 100 | howpublished="\url{https://www.sbert.net/}" 101 | } 102 | 103 | @misc{openie6, 104 | title="{Openie6}", 105 | author="dair-iitd", 106 | howpublished="\url{https://github.com/dair-iitd/openie6}" 107 | } 108 | 109 | 110 | -------------------------------------------------------------------------------- /simp_deprecated/simp_spacy1.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | Refs: 7 | https://spacy.io/usage/spacy-101/ 8 | 9 | https://subscription.packtpub.com/book/data/9781838987312/2/ch02lvl1sec13/splitting-sentences-into-clauses 10 | """ 11 | from globals import * 12 | import spacy 13 | 14 | nlp = spacy.load('en_core_web_sm') 15 | 16 | 17 | # sentence = "He eats cheese, but he won't eat ice cream." 18 | 19 | def simplify_ztz(sentence, verbose=False): 20 | """ 21 | This method simplifies the sentence `sentence`. It returns a list of 22 | simple sentences extracted from the input sentence. 
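    For the sentence commented out above, the intended behaviour is roughly
    (illustrative only; the exact clause boundaries depend on the spaCy
    model):

        simplify_ztz("He eats cheese, but he won't eat ice cream.")
        # -> something like ['He eats cheese', "he won't eat ice cream"]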
23 | 24 | Parameters 25 | ---------- 26 | sentence: str 27 | verbose: bool 28 | kwargs: dict[] 29 | 30 | Returns 31 | ------- 32 | list[str] 33 | 34 | """ 35 | 36 | 37 | doc = nlp(sentence) 38 | 39 | for token in doc: 40 | ancestors = [t.text for t in token.ancestors] 41 | children = [t.text for t in token.children] 42 | # if verbose: 43 | # print(token.text, "\t", token.i, "\t", 44 | # token.pos_, "\t", token.dep_, "\t", 45 | # ancestors, "\t", children) 46 | 47 | def find_root_of_sentence(doc): 48 | root_token = None 49 | for token in doc: 50 | if (token.dep_ == "ROOT"): 51 | root_token = token 52 | return root_token 53 | 54 | root_token = find_root_of_sentence(doc) 55 | 56 | def find_other_verbs(doc, root_token): 57 | other_verbs = [] 58 | for token in doc: 59 | ancestors = list(token.ancestors) 60 | if (token.pos_ == "VERB" and len(ancestors) == 1 \ 61 | and ancestors[0] == root_token): 62 | other_verbs.append(token) 63 | return other_verbs 64 | 65 | other_verbs = find_other_verbs(doc, root_token) 66 | 67 | def get_clause_token_span_for_verb(verb, doc, all_verbs): 68 | first_token_index = len(doc) 69 | last_token_index = 0 70 | this_verb_children = list(verb.children) 71 | for child in this_verb_children: 72 | if (child not in all_verbs): 73 | if (child.i < first_token_index): 74 | first_token_index = child.i 75 | if (child.i > last_token_index): 76 | last_token_index = child.i 77 | return (first_token_index, last_token_index) 78 | 79 | token_spans = [] 80 | all_verbs = [root_token] + other_verbs 81 | for other_verb in all_verbs: 82 | (first_token_index, last_token_index) = \ 83 | get_clause_token_span_for_verb(other_verb, 84 | doc, all_verbs) 85 | token_spans.append((first_token_index, 86 | last_token_index)) 87 | 88 | sentence_clauses = [] 89 | for token_span in token_spans: 90 | start = token_span[0] 91 | end = token_span[1] 92 | if (start < end): 93 | clause = doc[start:end] 94 | sentence_clauses.append(clause) 95 | sentence_clauses = sorted(sentence_clauses, 96 | key=lambda tup: tup[0]) 97 | 98 | clauses_text = [clause.text for clause in sentence_clauses] 99 | if verbose: 100 | print(sentence) 101 | print(clauses_text) 102 | 103 | return clauses_text 104 | -------------------------------------------------------------------------------- /simp_deprecated/simp_spacy4.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | References: 7 | 8 | 1. 
"Knowledge graphs from complex text" by Karthika Vijayan (Solution 9 | Consultant @Sahaj ) 10 | https://medium.com/inspiredbrilliance/knowledge-graphs-from-complex-text-eb009aeed48e 11 | 12 | """ 13 | import spacy 14 | nlp = spacy.load('en_core_web_lg') 15 | import coreferee 16 | import spacy_transformers 17 | 18 | def coref_resolve(text): 19 | nlp1 = spacy.load('en_core_web_trf') 20 | nlp1.add_pipe('coreferee') 21 | doc1 = nlp1(text) 22 | tok_list = list(token.text for token in doc1) 23 | c = 0 24 | for chain in doc1._.coref_chains: 25 | for mention in chain: 26 | res1 = [doc1._.coref_chains.resolve(doc1[i]) for i in mention] 27 | res = list(filter((None).__ne__, res1)) 28 | if len(res) != 0: 29 | if len(res[0]) == 1: 30 | tok_list[mention[0] + c] = str(res[0][0]) 31 | elif len(res[0]) > 1: 32 | tok_list[mention[0] + c] = str(res[0][0]) 33 | for j in range(1, len(res[0])): 34 | tok_list.insert(mention[0] + c + j, str(res[0][j])) 35 | c = c + 1 36 | textres = " ".join(tok_list) 37 | return textres 38 | 39 | 40 | def compound_to_simple(sentence): 41 | doc = nlp(sentence) 42 | 43 | root_token = None 44 | for token in doc: 45 | if (token.dep_ == "ROOT"): 46 | root_token = token 47 | 48 | other_verbs = [] 49 | for token in doc: 50 | ancestors = list(token.ancestors) 51 | if (token.pos_ == "VERB" and len( 52 | ancestors) < 3 and token != root_token): 53 | other_verbs.append(token) 54 | 55 | token_spans = [] 56 | all_verbs = [root_token] + other_verbs 57 | for other_verb in all_verbs: 58 | first_token_index = len(doc) 59 | last_token_index = 0 60 | this_verb_children = list(other_verb.children) 61 | for child in this_verb_children: 62 | if (child not in all_verbs): 63 | if (child.i < first_token_index): 64 | first_token_index = child.i 65 | if (child.i > last_token_index): 66 | last_token_index = child.i 67 | token_spans.append((first_token_index, last_token_index)) 68 | 69 | sentence_clauses = [] 70 | for token_span in token_spans: 71 | start = token_span[0] 72 | end = token_span[1] 73 | if (start < end): 74 | clause = doc[start:end] 75 | sentence_clauses.append(clause) 76 | sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0]) 77 | clauses_text = [clause.text for clause in sentence_clauses] 78 | return clauses_text 79 | 80 | def simplify_ztz(sentence, verbose=False): 81 | """ 82 | This method simplifies the sentence `sentence`. It returns a list of 83 | simple sentences extracted from the input sentence. 84 | 85 | Parameters 86 | ---------- 87 | sentence: str 88 | verbose: bool 89 | kwargs: dict[] 90 | 91 | Returns 92 | ------- 93 | list[str] 94 | 95 | """ 96 | 97 | textres = coref_resolve(sentence) 98 | ztz_list = compound_to_simple(textres) 99 | if verbose: 100 | print(sentence.strip()) 101 | print(ztz_list) 102 | return ztz_list -------------------------------------------------------------------------------- /simp_spacy3.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | Refs: 7 | https://spacy.io/usage/spacy-101/ 8 | 9 | For spacy, here are some values of token.dep_ 10 | 11 | cc: coordinating conjunction. 12 | i.e., FANBOYS = for, and, nor, but, or, yet, so 13 | 14 | mark: marker that introduces a subordinate clause 15 | 16 | ADP: adposition, e.g. 
in, to, during 17 | 18 | """ 19 | 20 | import spacy 21 | import re 22 | from globals import * 23 | 24 | nlp = spacy.load("en_core_web_sm") 25 | nlp.add_pipe("merge_entities") 26 | 27 | 28 | def simplify_ztz(sentence, verbose=False): 29 | """ 30 | This method simplifies the sentence `sentence`. It returns a list of 31 | simple sentences extracted from the input sentence. 32 | 33 | Parameters 34 | ---------- 35 | sentence: str 36 | verbose: bool 37 | 38 | Returns 39 | ------- 40 | list[str] 41 | 42 | """ 43 | doc = nlp(sentence) 44 | tokenized_clauses_list = [] 45 | tokenized_clause = [] 46 | for token in doc: 47 | cond = (token.dep_ == "mark") or \ 48 | (token.dep_ == "cc") or \ 49 | (token.text == ";") 50 | if not cond: 51 | tokenized_clause.append(token) 52 | else: 53 | tokenized_clauses_list.append(tokenized_clause) 54 | tokenized_clause = [] 55 | # last clause 56 | tokenized_clauses_list.append(tokenized_clause) 57 | 58 | ztz_list = [] 59 | for tokenized_clause in tokenized_clauses_list: 60 | 61 | # replace by empty list any tokenized clause 62 | # that doesn't have a noun/pronoun and a verb 63 | clause_has_noun_or_pronoun = False 64 | clause_has_verb = False 65 | token_str_list = [] 66 | for token in tokenized_clause: 67 | x = get_simplified_token_txt(token) 68 | if x: 69 | token_str_list.append(x) 70 | if token.pos_ in ["NOUN", "PRON", "PROPN"] and x: 71 | clause_has_noun_or_pronoun = True 72 | # print("NOUN or PRONOUN", token.text) 73 | if token.pos_ in ["VERB", "AUX"] and x: 74 | clause_has_verb = True 75 | # print("VERB", token.text) 76 | if not (clause_has_noun_or_pronoun and clause_has_verb): 77 | clause_str = [] 78 | else: 79 | clause_str = " ".join(token_str_list) 80 | 81 | if clause_str: 82 | ztz_list.append(clause_str) 83 | 84 | if verbose: 85 | print(sentence.strip()) 86 | print(ztz_list) 87 | return ztz_list 88 | 89 | 90 | def get_simplified_token_txt(token): 91 | """ 92 | This auxiliary method takes as input a SpaCy Token `token` and returns a 93 | simplified version of the token's text. 94 | 95 | Parameters 96 | ---------- 97 | token: Token 98 | 99 | Returns 100 | ------- 101 | str 102 | 103 | """ 104 | x = token.text 105 | # remove all punctuation marks 106 | x = re.sub(r'[^\w\s]', '', x) 107 | if token.ent_type_: 108 | # replace named entities by their labels 109 | # x = token.ent_type_ 110 | 111 | # remove named entities 112 | x = "" 113 | if token.is_stop and (token.pos_ not in RETAINED_STOPWORD_POS): 114 | x = "" 115 | if token.pos_ not in RETAINED_POS: 116 | x = "" 117 | # remove single character tokens 118 | if len(x.strip()) == 1: 119 | x = "" 120 | x = x.strip() 121 | return x 122 | -------------------------------------------------------------------------------- /similarity_deprecated/similarity_spacy.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains a function `ztz_similarity(ztz1, ztz2)` 4 | that returns the similarity of sentences `ztz1` and `ztz2`. 5 | ztz = sentence 6 | 7 | It uses SpaCy + WordVec 8 | 9 | Ref: 10 | 11 | 12 | """ 13 | 14 | from itertools import product 15 | import numpy as np 16 | import spacy 17 | from globals import * 18 | 19 | nlp = spacy.load('en_core_web_lg') 20 | 21 | 22 | def ztz_similarity(ztz1, ztz2, **kwargs): 23 | """ 24 | This method returns the similarity between sentences `ztz1` and `ztz2`. 25 | The similarity is measured as odds of a probability, so it ranges from 0 26 | to infinity. 
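    As a purely arithmetic illustration of the odds mapping used below (the
    probability value is invented, not produced by the code):

        prob = 0.75               # hypothetical mean best-match similarity
        odds = prob / (1 - prob)  # = 3.0

    Identical sentences reach prob == 1 and are capped at the fixed odds
    value of 1000.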
27 | 28 | Parameters 29 | ---------- 30 | ztz1: str 31 | ztz2: str 32 | 33 | Returns 34 | ------- 35 | float 36 | 37 | """ 38 | 39 | def same_pos(token1, token2): 40 | # this gives same simi but elapsed time is less 41 | return token1.pos_ == token2.pos_ 42 | 43 | doc1 = nlp(ztz1) 44 | doc2 = nlp(ztz2) 45 | sp_tokens1 = [token1 for token1 in doc1 \ 46 | if token1.pos_ in RETAINED_POS] 47 | sp_tokens2 = [token2 for token2 in doc2 \ 48 | if token2.pos_ in RETAINED_POS] 49 | token_pair_to_simi = {} 50 | for token1, token2 in product(sp_tokens1, sp_tokens2): 51 | if same_pos(token1, token2): 52 | simi = nlp(token1.text.lower()). \ 53 | similarity(nlp(token2.text.lower())) 54 | # print("llkj", token1.text, token2.text, token1.pos_, simi) 55 | if simi is not None: 56 | token_pair_to_simi[(token1, token2)] = simi 57 | # print("ffgh", "****************") 58 | # ("mmnk", token_pair_to_simi) 59 | score1 = 0.0 60 | count1 = 0 61 | for token1 in sp_tokens1: 62 | simi_list = [token_pair_to_simi[(token1, token2)] 63 | for token2 in sp_tokens2 64 | if same_pos(token1, token2)] 65 | if simi_list: 66 | best_score = max(simi_list) 67 | score1 += best_score 68 | count1 += 1 69 | if count1: 70 | score1 /= count1 71 | 72 | score2 = 0.0 73 | count2 = 0 74 | for token2 in sp_tokens2: 75 | simi_list = [token_pair_to_simi[(token1, token2)] 76 | for token1 in sp_tokens1 77 | if same_pos(token1, token2)] 78 | if simi_list: 79 | best_score = max(simi_list) 80 | score2 += best_score 81 | count2 += 1 82 | if count2: 83 | score2 /= count2 84 | prob = (score1 + score2) / 2 85 | if prob < 1: 86 | odds = prob / (1 - prob) 87 | else: 88 | odds = 1000 89 | return round(odds, 3) 90 | 91 | 92 | """ 93 | ************ simi definition from: similarity_spacy 94 | 1. Cats are beautiful animals. 95 | 2. Dogs are awesome. 96 | simi(1, 2)= 2.578 97 | simi(2, 1)= 2.578 98 | 99 | 1. Cats are beautiful animals. 100 | 2. Some gorgeous creatures are felines. 101 | simi(1, 2)= 2.697 102 | simi(2, 1)= 2.697 103 | 104 | 1. Cats are beautiful animals. 105 | 2. Dolphins are swimming mammals. 106 | simi(1, 2)= 2.535 107 | simi(2, 1)= 2.535 108 | 109 | 1. Cats are beautiful animals. 110 | 2. Cats are beautiful animals. 111 | simi(1, 2)= 1000 112 | simi(2, 1)= 1000 113 | 114 | 1. Cats are beautiful animals. 115 | 2. Cats are beauti animals. 116 | simi(1, 2)= 7.986 117 | simi(2, 1)= 7.986 118 | 119 | ************ simi definition from: similarity_spacy 120 | 1. apple 121 | 2. horse 122 | simi(1, 2)= 0.247 123 | simi(2, 1)= 0.247 124 | 125 | 1. Paul 126 | 2. John 127 | simi(1, 2)= 0.0 128 | simi(2, 1)= 0.0 129 | 130 | 1. The cat sat on the mat. 131 | 2. The dog lay on the rug. 132 | simi(1, 2)= 1.678 133 | simi(2, 1)= 1.678 134 | elapsed time= 0.14391398429870605 135 | 136 | """ 137 | -------------------------------------------------------------------------------- /similarity_nltk.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains a function `ztz_similarity(ztz1, ztz2)` 4 | that returns the similarity of sentences `ztz1` and `ztz2`. 
5 | ztz = sentence 6 | 7 | It uses NLTK + WordNet 8 | 9 | Ref: 10 | https://nlpforhackers.io/wordnet-sentence-similarity/ 11 | 12 | """ 13 | 14 | from nltk import word_tokenize, pos_tag 15 | from nltk.corpus import wordnet as wn 16 | from itertools import product 17 | from collections import defaultdict 18 | from time import time 19 | 20 | 21 | def penn_to_wn(tag): 22 | """ 23 | Convert a Penn Treebank tag to a simplified Wordnet tag 24 | 25 | Parameters 26 | ---------- 27 | tag: str 28 | 29 | Returns 30 | ------- 31 | str 32 | 33 | """ 34 | if tag.startswith('N'): 35 | return 'n' # noun 36 | 37 | if tag.startswith('V'): 38 | return 'v' # verb 39 | 40 | if tag.startswith('J'): 41 | return 'a' # adjective 42 | 43 | if tag.startswith('R'): 44 | return 'r' # adverb 45 | 46 | return None 47 | 48 | 49 | def synset_for_tgd_word(tgd_word): 50 | """ 51 | This private method returns the most likely synset for a tagged word 52 | `tgd_word`. A synset (synonym set) is a sort of equivalence class of 53 | words with very similar meanings. 54 | 55 | Parameters 56 | ---------- 57 | tgd_word: tuple(str, str) 58 | 59 | Returns 60 | ------- 61 | wn.synset or None 62 | 63 | """ 64 | word, tag = tgd_word 65 | wn_tag = penn_to_wn(tag) 66 | if wn_tag is None: 67 | return None 68 | 69 | try: 70 | return wn.synsets(word, wn_tag)[0] 71 | except: 72 | return None 73 | 74 | 75 | def ztz_similarity(ztz1, ztz2, **kwargs): 76 | """ 77 | This method returns the similarity between sentences `ztz1` and `ztz2`. 78 | The similarity is measured as odds of a probability, so it ranges from 0 79 | to infinity. 80 | 81 | Parameters 82 | ---------- 83 | ztz1: str 84 | ztz2: str 85 | 86 | Returns 87 | ------- 88 | float 89 | 90 | """ 91 | 92 | do_time = False 93 | if do_time: 94 | print("similarity start", time()) 95 | # Tokenize and tag 96 | tgd_ztz1 = pos_tag(word_tokenize(ztz1.lower())) 97 | tgd_ztz2 = pos_tag(word_tokenize(ztz2.lower())) 98 | 99 | # Get the synsets for the tagged words (tgd_word) 100 | all_ss1 = [] 101 | for tgd_word in tgd_ztz1: 102 | ss1 = synset_for_tgd_word(tgd_word) 103 | if ss1: 104 | all_ss1.append(ss1) 105 | all_ss2 = [] 106 | for tgd_word in tgd_ztz2: 107 | ss2 = synset_for_tgd_word(tgd_word) 108 | if ss2: 109 | all_ss2.append(ss2) 110 | 111 | ss_pair_to_simi = defaultdict(lambda: 0) 112 | if do_time: 113 | print("similarity begin path_similarity()", time()) 114 | for ss1, ss2 in product(all_ss1, all_ss2): 115 | simi = ss1.path_similarity(ss2) 116 | if simi is not None: 117 | ss_pair_to_simi[(ss1, ss2)] = simi 118 | 119 | score1 = 0.0 120 | count1 = 0 121 | for ss1 in all_ss1: 122 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss2 in all_ss2] 123 | if simi_list: 124 | best_score = max(simi_list) 125 | score1 += best_score 126 | count1 += 1 127 | if count1: 128 | score1 /= count1 129 | 130 | score2 = 0.0 131 | count2 = 0 132 | for ss2 in all_ss2: 133 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss1 in all_ss1] 134 | if simi_list: 135 | best_score = max(simi_list) 136 | score2 += best_score 137 | count2 += 1 138 | if count2: 139 | score2 /= count2 140 | prob = (score1 + score2) / 2 141 | if prob < 1: 142 | odds = prob / (1 - prob) 143 | else: 144 | odds = 1000 145 | if do_time: 146 | print("similarity ends", time()) 147 | return round(odds, 3) 148 | 149 | 150 | """ 151 | ************ simi definition from: similarity_nltk 152 | 1. Cats are beautiful animals. 153 | 2. Dogs are awesome. 154 | simi(1, 2)= 1.045 155 | simi(2, 1)= 1.045 156 | 157 | 1. Cats are beautiful animals. 158 | 2. 
Some gorgeous creatures are felines. 159 | simi(1, 2)= 2.429 160 | simi(2, 1)= 2.429 161 | 162 | 1. Cats are beautiful animals. 163 | 2. Dolphins are swimming mammals. 164 | simi(1, 2)= 0.733 165 | simi(2, 1)= 0.733 166 | 167 | 1. Cats are beautiful animals. 168 | 2. Cats are beautiful animals. 169 | simi(1, 2)= 1000 170 | simi(2, 1)= 1000 171 | 172 | ************ simi definition from: similarity_nltk 173 | 1. apple 174 | 2. horse 175 | simi(1, 2)= 0.056 176 | simi(2, 1)= 0.056 177 | 178 | 1. Paul 179 | 2. John 180 | simi(1, 2)= 0.083 181 | simi(2, 1)= 0.083 182 | 183 | 1. The cat sat on the mat. 184 | 2. The dog lay on the rug. 185 | simi(1, 2)= 0.353 186 | simi(2, 1)= 0.353 187 | elapsed time= 0.006499767303466797 188 | 189 | """ 190 | -------------------------------------------------------------------------------- /simplifying.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains functions for simplifying movie scripts (or short stories). 4 | 5 | input directory: m_scripts_spell or short_stories_spell 6 | output directory: m_scripts_simp or short_stories_simp 7 | 8 | Simplification is done by the function `simplify_ztz()`. This function was 9 | implemented in several ways before we decided to stick with the version in 10 | file `simp_spacy3`. 11 | 12 | simp_spacy1.py 13 | simp_spacy2.py 14 | simp_spacy3.py (recommended) 15 | simp_spacy-claucy.py 16 | simp_stanford.py 17 | 18 | The input files have only one sentence per line. For each file, we use SpaCy 19 | to break each sentence into clauses. Then we simplify the clauses by 20 | removing stop-words, punctuation marks, proper nouns (a.k.a. named entities) 21 | and other excess baggage. Then we replace each clause by its simplified 22 | version. Different simplified clauses from the same sentence are put in the 23 | same line, separated by a separator-token. Some sentences are diminished to 24 | nothing after the simplification. Those sentences are replaced by a single 25 | asterisk. 26 | 27 | """ 28 | from globals import * 29 | import os 30 | import re 31 | import importlib as imp 32 | 33 | zsimp = imp.import_module(ZTZ_SIMPLIFIER) 34 | from utils import * 35 | 36 | 37 | def simplify_one_m_script( 38 | in_dir, out_dir, 39 | file_name, 40 | verbose=False, 41 | use_gpu=False): 42 | """ 43 | in_dir and out_dir can be the same, but this will overwrite the files. 44 | 45 | This method reads a file called `file_name` in the `in_dir` directory 46 | and creates a simplified version in the `out_dir` directory. 
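    A typical call, using the directory layout described in the module
    docstring above (the file name is only an illustration and need not be
    present in every checkout):

        simplify_one_m_script(in_dir="m_scripts_spell",
                              out_dir="m_scripts_simp",
                              file_name="toy-story.txt",
                              verbose=True)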
47 | 48 | 49 | Parameters 50 | ---------- 51 | in_dir: str 52 | out_dir: str 53 | file_name: str 54 | verbose: bool 55 | use_gpu: bool 56 | 57 | Returns 58 | ------- 59 | None 60 | 61 | """ 62 | inpath = in_dir + "/" + file_name 63 | outpath = out_dir + "/" + file_name 64 | new_lines = [] 65 | with open(inpath, "r") as f: 66 | count = 1 67 | for line in f: 68 | if verbose: 69 | print(str(count) + ".") 70 | simple_ztz_list = zsimp.simplify_ztz(line, 71 | verbose=verbose, 72 | use_gpu=use_gpu) 73 | 74 | # remove empty clauses 75 | simple_ztz_list = [ztz for ztz in simple_ztz_list if ztz] 76 | 77 | if simple_ztz_list == []: 78 | simple_ztz_list = [ZTZ_SEPARATOR] 79 | 80 | # replace multiple white spaces by single white space 81 | simple_ztz_list = [re.sub(r'\s+', ' ', ztz) for ztz in 82 | simple_ztz_list] 83 | 84 | if len(simple_ztz_list) > 1: 85 | xx = " " + ZTZ_SEPARATOR + " " 86 | new_lines.append(xx.join(simple_ztz_list)) 87 | elif len(simple_ztz_list) == 1: 88 | new_lines.append(simple_ztz_list[0]) 89 | else: 90 | assert False 91 | 92 | count += 1 93 | with open(outpath, "w") as f: 94 | for line in new_lines: 95 | f.write(line + "\n") 96 | 97 | 98 | def simplify_batch_of_m_scripts( 99 | in_dir, out_dir, 100 | batch_file_names, 101 | verbose=False): 102 | """ 103 | This method calls the method `simplify_one_m_script` for all the file 104 | names in the list of file names `batch_file_names`. 105 | 106 | 107 | Parameters 108 | ---------- 109 | in_dir: str 110 | out_dir: str 111 | batch_file_names: list[str] 112 | verbose: bool 113 | 114 | Returns 115 | ------- 116 | None 117 | 118 | """ 119 | all_file_names = my_listdir(in_dir) 120 | assert set(batch_file_names).issubset(set(all_file_names)) 121 | for file_name in batch_file_names: 122 | i = all_file_names.index(file_name) 123 | print('%i.' % (i + 1), file_name) 124 | simplify_one_m_script(in_dir, out_dir, file_name, verbose) 125 | 126 | 127 | if __name__ == "__main__": 128 | 129 | def main1(): 130 | print("************ simplifier:", ZTZ_SIMPLIFIER) 131 | ztz = \ 132 | 'The man, who had never liked the words' \ 133 | ' "booby" and "boobyhatch,"' \ 134 | ' and who liked them even less on a shining morning when there' \ 135 | ' was a unicorn in the garden, thought for a moment.' 
136 | zsimp.simplify_ztz(ztz, verbose=True) 137 | 138 | 139 | def main2(): 140 | print("************ simplifier:", ZTZ_SIMPLIFIER) 141 | path = "simplifying_test.txt" 142 | with open(path, "r") as f: 143 | count = 1 144 | for line in f: 145 | print(str(count) + ".") 146 | zsimp.simplify_ztz(line, verbose=True) 147 | count += 1 148 | 149 | 150 | def main3(): 151 | print("************ simplifier:", ZTZ_SIMPLIFIER) 152 | in_dir = "short_stories_spell" 153 | out_dir = "short_stories_simp" 154 | batch_file_names = my_listdir(in_dir)[0:3] 155 | simplify_batch_of_m_scripts( 156 | in_dir, out_dir, 157 | batch_file_names, 158 | verbose=False) 159 | 160 | 161 | def main4(): 162 | print("************ simplifier:", ZTZ_SIMPLIFIER) 163 | remove_dialogs = False 164 | in_dir = SPELL_DIR if not remove_dialogs else SPELL_RD_DIR 165 | out_dir = SIMP_DIR if not remove_dialogs else SIMP_RD_DIR 166 | batch_file_names = my_listdir(in_dir)[0:3] 167 | simplify_batch_of_m_scripts( 168 | in_dir, out_dir, 169 | batch_file_names) 170 | 171 | 172 | main1() 173 | main2() 174 | # main3() 175 | # main4() 176 | -------------------------------------------------------------------------------- /downloading_imsdb.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | The goal of this file is to scrape the 1,100+ movie scripts from the IMSDb 4 | website. 5 | 6 | References: 7 | https://github.com/j2kun/imsdb_download_all_scripts 8 | https://github.com/AdeboyeML/Film_Script_Analysis 9 | https://www.datacamp.com/tutorial/scraping-reddit-python-scrapy 10 | 11 | In Chrome and most web browsers, pressing Ctrl+U opens the current page's 12 | source code in a new window. 13 | 14 | 3 depths, d0, d1, d2 15 | 16 | d0_url 17 | https://imsdb.com/all-scripts.html 18 | 19 | d1_url (depends on movie) 20 | https://imsdb.com/Movie%20Scripts/10%20Things%20I%20Hate%20About%20You%20Script.html 21 | 22 | d2_url (depends on movie) 23 | https://imsdb.com/scripts/10-Things-I-Hate-About-You.html 24 | 25 | find_all() takes you from X->Y 26 | (d0_html, d0_soup)->d1_url 27 | (d1_html, d1_soup)->d2_url 28 | """ 29 | from bs4 import BeautifulSoup 30 | import requests 31 | from slugify import slugify # python-slugify 32 | from globals import * 33 | 34 | 35 | def get_d1_urls_and_titles(): 36 | """ 37 | This auxiliary method returns lists `d1_urls` and `titles`. 38 | 39 | Returns 40 | ------- 41 | list[str], list[str] 42 | 43 | """ 44 | d1_urls = [] 45 | titles = [] 46 | d0_url = BASE_URL + "/all-scripts.html" 47 | d0_html = requests.get(d0_url).text 48 | d0_soup = BeautifulSoup(d0_html, "html.parser") 49 | for p_tag in d0_soup.find_all('p'): 50 | d1_url = p_tag.a['href'] 51 | cond1 = "/Movie Scripts/" in d1_url 52 | cond2 = ".html" in d1_url 53 | if cond1 and cond2: 54 | title = d1_url.replace("/Movie Scripts/", ""). \ 55 | replace(" Script.html", ""). \ 56 | replace(".html", "") 57 | d1_urls.append(BASE_URL + d1_url) 58 | titles.append(title) 59 | return d1_urls, titles 60 | 61 | 62 | def get_one_m_script(d1_url, stub_only=False): 63 | """ 64 | This method scrapes one movie script with d1-level URL `d1_url`. 65 | 66 | Parameters 67 | ---------- 68 | d1_url: str 69 | stub_only: bool 70 | True iff don't want to scrape the movie script text at all. Instead 71 | of the movie script text, it leaves a message "coming soon to a 72 | theater near you" 73 | 74 | Returns 75 | ------- 76 | str, bool 77 | the movie script, and a boolean indicating if it's missing. 
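    For instance, a hypothetical call such as

        m_script, missing = get_one_m_script(
            BASE_URL + "/Movie Scripts/Toy Story Script.html")

    would return the script text together with missing == False when the
    scrape succeeds; the URL above merely mimics the d1_url pattern shown
    in the module docstring and is not guaranteed to exist.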
78 | 79 | """ 80 | missing = False 81 | tail = d1_url.split('/')[-1].replace(".html", "") 82 | if stub_only: 83 | m_script = "coming soon to a theater near you" 84 | else: 85 | # print("nabf", d1_url) 86 | d1_html = requests.get(d1_url).text 87 | d1_soup = BeautifulSoup(d1_html, "html.parser") 88 | p_tags = d1_soup.find_all('p', align="center") 89 | if not p_tags: 90 | print('**************** Missing: %s' % tail) 91 | missing = True 92 | return "coming soon to a theater near you", missing 93 | assert len(p_tags) == 1 94 | d2_url = p_tags[0].a['href'] 95 | d2_url = BASE_URL + d2_url 96 | # print("nnfx", d2_url) 97 | d2_html = requests.get(d2_url).text 98 | d2_soup = BeautifulSoup(d2_html, "html.parser") 99 | # tried this. Doesn't always work 100 | # pre_tags = d2_soup.find_all('pre') 101 | pre_tags = d2_soup.find_all('td', {'class': "scrtext"}) 102 | if not pre_tags: 103 | print('**************** Missing: %s' % tail) 104 | missing = True 105 | return "coming soon to a theater near you", missing 106 | m_script = pre_tags[0].get_text() 107 | # m_script = clean_m_script(m_script) 108 | return m_script, missing 109 | 110 | 111 | def get_batch_of_m_scripts(first=1, last=5000, stub_only=False): 112 | """ 113 | This method scrapes the movie scripts starting at position `first` and 114 | ending at position `last`. If `last` is larger than the number of movie 115 | scripts at IMSDb, then the method ends when it has scraped all movie 116 | scripts. 117 | 118 | Parameters 119 | ---------- 120 | first: int 121 | last: int 122 | stub_only: bool 123 | 124 | Returns 125 | ------- 126 | None 127 | 128 | """ 129 | d1_urls, titles = get_d1_urls_and_titles() 130 | num_titles = len(titles) 131 | missing_m_scripts = [] 132 | assert first <= last 133 | if last > num_titles: 134 | last = num_titles 135 | if first < 1: 136 | first = 1 137 | for i in range(first - 1, last): 138 | d1_url = d1_urls[i] 139 | dashed_title = slugify(titles[i]) 140 | print('%i. 
fetching %s' % (i + 1, dashed_title)) 141 | m_script, missing = get_one_m_script(d1_url, stub_only=stub_only) 142 | outpath = M_SCRIPTS_DIR + '/' + dashed_title + '.txt' 143 | if missing: 144 | missing_m_scripts.append(dashed_title + '.txt') 145 | else: 146 | written = False 147 | len_script = len(m_script) 148 | print("m_script num of characters=", len_script) 149 | if len_script > 500: 150 | with open(outpath, "w", newline="\n") as f: 151 | f.write(m_script) 152 | written = True 153 | if not written: 154 | # m-scripts with less than 500 char are just stubs 155 | print("------------------ Found just a stub: ", dashed_title) 156 | missing_m_scripts.append(dashed_title + '.txt') 157 | print("missing m_scripts:") 158 | print(missing_m_scripts) 159 | print("number of missing m_scripts=", len(missing_m_scripts)) 160 | 161 | 162 | if __name__ == "__main__": 163 | def main1(): 164 | urls, titles = get_d1_urls_and_titles() 165 | print(urls) 166 | print(titles) 167 | assert len(urls) == len(titles) 168 | print("number of films=", len(urls)) # 1211 169 | # 75 missing 170 | # 1211-75=1136 expected 238 MB 171 | 172 | 173 | def main2(): 174 | get_batch_of_m_scripts(first=1, last=100, stub_only=False) 175 | 176 | 177 | # main1() 178 | main2() 179 | -------------------------------------------------------------------------------- /simp_openie6.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | The functions in this file are used inside the following jupyter notebook at 4 | Google Colab 5 | 6 | https://colab.research.google.com/drive/1S2EWOGkoCgjfOJzTRJ7PLeu4T8SBwhlF?usp=sharing 7 | 8 | Refs: 9 | 10 | 1. https://github.com/dair-iitd/CaRB 11 | 12 | 2. https://github.com/dair-iitd/imojie 13 | 14 | 3. https://github.com/dair-iitd/openie6 15 | 16 | """ 17 | import subprocess 18 | import os 19 | from globals import * 20 | from utils import my_listdir 21 | 22 | 23 | def openie6_simplify_batch_of_m_scripts( 24 | in_dir, out_dir, 25 | batch_file_names, 26 | verbose=False): 27 | """ 28 | This method does the same thing as the method 29 | `simplifying.simplify_batch_of_m_scripts()` but for the case 30 | `ZTZ_SIMPLIFIER = "simp_openie6"` 31 | 32 | Parameters 33 | ---------- 34 | in_dir: str 35 | out_dir: str 36 | batch_file_names: list[str] 37 | verbose: bool 38 | 39 | Returns 40 | ------- 41 | None 42 | 43 | """ 44 | # assume directories `openie6` and `mappa_mundi` 45 | # live side by side inside a bigger folder X 46 | # and that the cwd is X 47 | 48 | m_script_starting_line_nums = \ 49 | make_all_sentences_file(in_dir=in_dir, 50 | batch_file_names=batch_file_names) 51 | gpu_command = \ 52 | r"cd openie6 && CUDA_DEVICE_ORDER=PCI_BUS_ID " \ 53 | r"CUDA_VISIBLE_DEVICES=0 " \ 54 | r"PYTHONPATH=imojie:imojie/allennlp:imojie" \ 55 | r"/pytorch_transformers:$PYTHONPATH python run.py " \ 56 | r"--save models/conj_model --mode predict " \ 57 | r"--inp ../all_sentences.txt --batch_size 1 " \ 58 | r"--model_str bert-large-cased --task conj " \ 59 | r"--gpus 1 --out ../all_predictions.txt" 60 | 61 | cpu_command = gpu_command.replace("--gpus 1", "--gpus 0") 62 | 63 | if USE_GPU: 64 | os.system(gpu_command) 65 | else: 66 | os.system(cpu_command) 67 | 68 | translate_predictions_file_from_openie6_to_mm( 69 | in_fname="all_predictions.txt.conj", 70 | out_fname="all_predictions_in_mm.txt") 71 | 72 | make_m_scripts_simp_dir(out_dir, 73 | batch_file_names, 74 | m_script_starting_line_nums) 75 | 76 | os.remove("all_sentences.txt") 77 | os.remove("all_predictions.txt.conj") 78 | 
os.remove("all_predictions_in_mm.txt") 79 | 80 | 81 | def make_all_sentences_file(in_dir, batch_file_names): 82 | """ 83 | This internal method creates the file `all_sentences.txt`. 84 | `all_sentences.txt` is a concatenation of all the files in 85 | `batch_file_names`. 86 | 87 | Parameters 88 | ---------- 89 | in_dir: str 90 | batch_file_names: list[str] 91 | 92 | Returns 93 | ------- 94 | m_script_starting_line_nums: list[int] 95 | list of the starting line numbers within the file 96 | `all_sentences.txt` for the file names in the list `batch_file_names`. 97 | 98 | """ 99 | m_script_starting_line_nums = [] 100 | cum_line_num = 0 101 | with open("all_sentences.txt", "w") as big_f: 102 | for fname in batch_file_names: 103 | in_path = in_dir + '/' + fname 104 | # print("bbng", in_path) 105 | with open(in_path, "r") as f: 106 | # print("hhji", cum_line_num) 107 | m_script_starting_line_nums.append(cum_line_num) 108 | f_len = 0 109 | for line in f: 110 | f_len += 1 111 | # print("llmk", line) 112 | big_f.write(line) 113 | cum_line_num += f_len 114 | # print("nnmj", f_len) 115 | return m_script_starting_line_nums 116 | 117 | 118 | def translate_predictions_file_from_openie6_to_mm(in_fname, out_fname): 119 | """ 120 | This internal method reads the file `all_predictions.txt.conj` and 121 | translates it into a new file called `all_predictions_in_mm.txt`. The 122 | input file is in the format of openie6 extractions output and the output 123 | file is in the mappa mundi (mm) simp format. 124 | 125 | openie6 extractions output format: one sentence or empty line ("row 126 | gap") per line. Groups separated by empty lines. Each group consists of 127 | the original sentence followed by the extraction sentences. 128 | 129 | mm simp format: one sentence per line. No row gaps. Each line has all 130 | the extractions from the original sentence, separated by ZTZ_SEPARATOR. 131 | 132 | Parameters 133 | ---------- 134 | in_fname: str 135 | out_fname: str 136 | 137 | Returns 138 | ------- 139 | None 140 | 141 | """ 142 | with open(in_fname, "r") as in_file: 143 | with open(out_fname, "w") as out_file: 144 | in_parts = [] 145 | prev_line_is_empty = True 146 | for line in in_file: 147 | if line.strip(): 148 | in_parts.append(line.strip()) 149 | prev_line_is_empty = False 150 | else: 151 | if not prev_line_is_empty: 152 | if len(in_parts) > 1: 153 | in_parts = in_parts[1:] 154 | if len(in_parts) > 0: 155 | xx = " " + ZTZ_SEPARATOR + " " 156 | out_file.write(xx.join(in_parts) + "\n") 157 | in_parts = [] 158 | 159 | 160 | def make_m_scripts_simp_dir(out_dir, 161 | batch_file_names, 162 | m_script_starting_line_nums): 163 | """ 164 | This internal method reads the file `all_predictions_in_mm.txt` and it 165 | uses that to create a new directory called `out_dir` populated by files 166 | with the names in list `batch_file_names`. 
167 | 168 | Parameters 169 | ---------- 170 | out_dir: str 171 | batch_file_names: list[str] 172 | m_script_starting_line_nums: list[int] 173 | 174 | Returns 175 | ------- 176 | None 177 | 178 | """ 179 | if not os.path.exists(out_dir): 180 | os.makedirs(out_dir) 181 | with open("all_predictions_in_mm.txt", "r") as big_f: 182 | m_script_num = -1 183 | f = None 184 | for line_num, line in enumerate(big_f): 185 | if line_num in m_script_starting_line_nums: 186 | if f: 187 | f.close() 188 | m_script_num += 1 189 | fname = batch_file_names[m_script_num] 190 | out_path = out_dir + "/" + fname 191 | f = open(out_path, "w") 192 | f.write(line) 193 | if f: 194 | f.close() 195 | 196 | 197 | if __name__ == "__main__": 198 | def main(): 199 | in_dir = "short_stories_spell" 200 | batch_file_names = my_listdir(in_dir) 201 | make_all_sentences_file(in_dir=in_dir, 202 | batch_file_names=batch_file_names) 203 | translate_predictions_file_from_openie6_to_mm( 204 | "openie6_translation_test.txt", 205 | "openie6_test_answer.txt") 206 | 207 | 208 | main() 209 | -------------------------------------------------------------------------------- /jupyter_notebooks/SUMMARY.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SUMMARY notebook\n", 8 | "\n", 9 | "This notebook scans the directory in which it lives to find all jupyter notebooks (other than itself) in that directory. It then prints for every notebook it finds (1) a hyperlink to the notebook, and (2) the first cell (which is always markdown) of the notebook. This way you can read a nice, automatically generated summary of all the notebooks without having to open all of them. If you find a notebook that you want to explore further, you can simply click on its link to open it." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/markdown": [ 20 | "\n", 21 | "\n", 22 | "
\n", 23 | "\n", 24 | "Coordination_Analysis_with_IGL_CA_(OpenIE6_paper).ipynb [local link] [github link] 1/4\n", 25 | "\n", 26 | "# IGL-CA: inference pipeline\n", 27 | "Coordination analysis inference using the OpenIE6 model.\n", 28 | "\n", 29 | "* Anton's [OpenIE6 fork](https://github.com/alexeyev/openie6)\n", 30 | "* [OpenIE6 original repo](https://github.com/dair-iitd/openie6)\n", 31 | "* [OpenIE6 original paper](https://aclanthology.org/2020.emnlp-main.306/)\n", 32 | "\n", 33 | "Prepared by [Anton Alekseev](https://github.com/alexeyev) and [Anastasia Predelina](https://github.com/PredelinaAsya). Quite a bit of effort, tbh.\n", 34 | "\n", 35 | "**NOTA BENE**: GPU environment should be enabled before running the code! If not possible, another code cell for CPU-only environment is available at the very end of the notebook.\n", 36 | "\n", 37 | "
\n", 38 | "\n", 39 | "navigating_m_scripts.ipynb [local link] [github link] 2/4\n", 40 | "\n", 41 | "# Navigating Movie Scripts\n", 42 | "\n", 43 | "In this notebook, we explain\n", 44 | "how to use Mappa Mundi (MM) to do causal DEFT (DAG extraction from text)\n", 45 | "using as a test case, the following 3 movie scripts by Pixar/Disney.\n", 46 | "\n", 47 | "* [Toy Story](../m_scripts/toy-story.txt)\n", 48 | "* [Up](../m_scripts/up.txt)\n", 49 | "* [WALL-E](../m_scripts/wall-e.txt)\n", 50 | "\n", 51 | "
\n", 52 | "\n", 53 | "navigating_short_stories.ipynb [local link] [github link] 3/4\n", 54 | "\n", 55 | "# Navigating Short Stories\n", 56 | "\n", 57 | "In this notebook, we explain how to use Mappa Mundi (MM) to do causal DEFT (DAG extraction from text)\n", 58 | "using as a test case, the following 3 short stories by P.G. Wodehouse.\n", 59 | "\n", 60 | "* [Bill the Bloodhound](../short_stories/bill-the-bloodhound.txt)\n", 61 | "* [Extricating Young Gussie](../short_stories/extricating-young-gussie.txt)\n", 62 | "* [Wilton's Holiday](../short_stories/wiltons-holiday.txt)\n", 63 | "\n", 64 | "\n", 65 | "
\n", 66 | "\n", 67 | "simplifying_with_Openie6.ipynb [local link] [github link] 4/4\n", 68 | "\n", 69 | "# Simplifying with Openie6\n", 70 | "\n", 71 | "The Openie6 software takes as input a possibly complex or compound sentence X,\n", 72 | "and it returns a list of simple sentences that contain all the\n", 73 | "information in the original sentence X.\n", 74 | "\n", 75 | "Anton Alekseev (AA) and Anastasia Predelina (AP) wrote a jupyter notebook\n", 76 | "that installs and runs the code in the Openie6 repo https://github.com/alexeyev/openie6\n", 77 | "An exact copy of notebook by AA/AP is included in this folder. It is also publicly available at AA's google drive\n", 78 | "under the URL\n", 79 | "\n", 80 | " https://colab.research.google.com/drive/1samvO-SH6Xgjf9ItlhAF1EmBZo5grBQb?usp=sharing\n", 81 | "\n", 82 | "\n", 83 | "\n", 84 | "This notebook adds new code to the end of the AA/AP notebook. The purpose of the\n", 85 | "new code is\n", 86 | "to simplify short stories and movie scripts." 87 | ], 88 | "text/plain": [ 89 | "" 90 | ] 91 | }, 92 | "metadata": {}, 93 | "output_type": "display_data" 94 | } 95 | ], 96 | "source": [ 97 | "# Version: 2\n", 98 | "import os\n", 99 | "import json\n", 100 | "from IPython.display import display, Markdown\n", 101 | "\n", 102 | "# the name of this file\n", 103 | "this_fname = 'SUMMARY.ipynb'\n", 104 | "fname_to_md = {}\n", 105 | "for fname in sorted([x for x in os.listdir('./')]):\n", 106 | " if fname[-6:] == '.ipynb' and fname != this_fname:\n", 107 | " # print('------------', fname)\n", 108 | " with open(fname, 'r', encoding=\"utf-8\") as f:\n", 109 | " fdata = json.load(f)\n", 110 | " fname_to_md[fname] = ''.join(fdata['cells'][0]['source'])\n", 111 | "# print(fname_to_md)\n", 112 | "pre_sep = '\\n\\n
\\n\\n'\n", 113 | "full_md = ''\n", 114 | "k = 1\n", 115 | "num_nb = len(fname_to_md)\n", 116 | "project_name =\"mappa_mundi\"\n", 117 | "who =\"rrtucci\"\n", 118 | "where = \"jupyter_notebooks\"\n", 119 | "for fname, md in fname_to_md.items():\n", 120 | " sep = pre_sep\n", 121 | " local_link = f' [local link] '\n", 122 | " github_link = f' [github link] '\n", 124 | " sep += fname + local_link + github_link + str(k) + '/' + str(num_nb) + '\\n\\n'\n", 125 | " full_md += sep + md\n", 126 | " k += 1\n", 127 | "display(Markdown(full_md))" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3 (ipykernel)", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.10.9" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 4 152 | } 153 | -------------------------------------------------------------------------------- /post_cleaning.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains functions for post-cleaning movie scripts (or short 4 | stories). 5 | 6 | input directory: m_scripts_simp or short_stories_simp 7 | output directory: m_scripts_post_clean or short_stories_post_clean 8 | 9 | The input files have one or more sublines per line. For each file, we use 10 | post clean the sublines by removing stop-words, punctuation marks, proper 11 | nouns (a.k.a. named entities) and other excess baggage. Then we replace each 12 | subline by its post-clean version. Different clean sublines from the 13 | same sentence are put in the same line, separated by ZTZ_SEPARATOR . Some 14 | sentences are diminished to nothing after the post-cleaning. Those 15 | sentences are replaced by a single ZTZ_SEPARATOR. 16 | 17 | 18 | Refs: 19 | https://spacy.io/usage/spacy-101/ 20 | 21 | For spacy, here are some values of token.dep_ 22 | 23 | cc: coordinating conjunction. 24 | i.e., FANBOYS = for, and, nor, but, or, yet, so 25 | 26 | mark: marker that introduces a subordinate subline 27 | 28 | ADP: adposition, e.g. in, to, during 29 | 30 | """ 31 | from globals import * 32 | import importlib as imp 33 | 34 | zsimp = imp.import_module(ZTZ_SIMPLIFIER) 35 | from utils import * 36 | 37 | import spacy 38 | import re 39 | from globals import * 40 | 41 | nlp = spacy.load("en_core_web_sm") 42 | 43 | 44 | # nlp.add_pipe("merge_entities") 45 | 46 | 47 | def post_clean_line(line, verbose=False): 48 | """ 49 | This method cleans the line string `line`. It returns a list of simple 50 | sentences (sublines) extracted from the input sentence (line). 
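    A schematic example (ZTZ_SEPARATOR is the separator token imported via
    globals; the sentences are invented and the exact output depends on the
    spaCy model):

        line = "the dog barked " + ZTZ_SEPARATOR + " the cat ran away"
        post_clean_line(line)  # roughly ['the dog barked', 'the cat ran away']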
51 | 52 | Parameters 53 | ---------- 54 | line: str 55 | verbose: bool 56 | 57 | Returns 58 | ------- 59 | list[str] 60 | 61 | """ 62 | tokenized_sublines = \ 63 | [nlp(subline) for subline in line.split(ZTZ_SEPARATOR)] 64 | 65 | ztz_list = [] 66 | for tokenized_subline in tokenized_sublines: 67 | 68 | # replace by empty list any tokenized subline 69 | # that doesn't have a noun/pronoun and a verb 70 | subline_has_noun_or_pronoun = False 71 | subline_has_verb = False 72 | token_str_list = [] 73 | for token in tokenized_subline: 74 | x = get_post_cleaned_token_txt(token) 75 | if x: 76 | token_str_list.append(x) 77 | if token.pos_ in ["NOUN", "PRON", "PROPN"] and x: 78 | subline_has_noun_or_pronoun = True 79 | # print("NOUN or PRONOUN", token.text) 80 | if token.pos_ in ["VERB", "AUX"] and x: 81 | subline_has_verb = True 82 | # print("VERB", token.text) 83 | if not (subline_has_noun_or_pronoun and subline_has_verb): 84 | subline_str = [] 85 | else: 86 | subline_str = " ".join(token_str_list) 87 | 88 | if subline_str: 89 | ztz_list.append(subline_str) 90 | 91 | if verbose: 92 | print(line.strip()) 93 | print(ztz_list) 94 | return ztz_list 95 | 96 | 97 | def get_post_cleaned_token_txt(token): 98 | """ 99 | This auxiliary method takes as input a SpaCy Token `token` and returns a 100 | simplified version of the token's text. 101 | 102 | Parameters 103 | ---------- 104 | token: Token 105 | 106 | Returns 107 | ------- 108 | str 109 | 110 | """ 111 | x = token.text 112 | # remove all punctuation marks 113 | x = re.sub(r'[^\w\s]', '', x) 114 | 115 | # if token.ent_type_: 116 | # # replace named entities by their labels 117 | # # x = token.ent_type_ 118 | # 119 | # # remove named entities 120 | # x = "" 121 | # if token.is_stop and (token.pos_ not in RETAINED_STOPWORD_POS): 122 | # x = "" 123 | # if token.pos_ not in RETAINED_POS: 124 | # x = "" 125 | 126 | # remove single character tokens 127 | if len(x.strip()) == 1: 128 | x = "" 129 | x = x.strip() 130 | return x 131 | 132 | 133 | def post_clean_one_m_script( 134 | in_dir, out_dir, 135 | file_name, 136 | verbose=False): 137 | """ 138 | in_dir and out_dir can be the same, but this will overwrite the files. 139 | 140 | This method reads a file called `file_name` in the `in_dir` directory 141 | and creates a post-cleaned version in the `out_dir` directory. 
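    A typical call, using the directory names from the module docstring
    (the file name is only an illustration):

        post_clean_one_m_script(in_dir="short_stories_simp",
                                out_dir="short_stories_post_clean",
                                file_name="wiltons-holiday.txt",
                                verbose=True)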
142 | 143 | 144 | Parameters 145 | ---------- 146 | in_dir: str 147 | out_dir: str 148 | file_name: str 149 | verbose: bool 150 | 151 | Returns 152 | ------- 153 | None 154 | 155 | """ 156 | inpath = in_dir + "/" + file_name 157 | outpath = out_dir + "/" + file_name 158 | new_lines = [] 159 | with open(inpath, "r") as f: 160 | count = 1 161 | for line in f: 162 | if verbose: 163 | print(str(count) + ".") 164 | simple_ztz_list = post_clean_line(line, 165 | verbose=verbose) 166 | 167 | # remove empty simple ztz 168 | simple_ztz_list = [ztz for ztz in simple_ztz_list if ztz] 169 | 170 | if not simple_ztz_list: 171 | simple_ztz_list = [ZTZ_SEPARATOR] 172 | 173 | # replace multiple white spaces by single white space 174 | simple_ztz_list = [re.sub(r'\s+', ' ', ztz) for ztz in 175 | simple_ztz_list] 176 | 177 | if len(simple_ztz_list) > 1: 178 | xx = " " + ZTZ_SEPARATOR + " " 179 | new_lines.append(xx.join(simple_ztz_list)) 180 | elif len(simple_ztz_list) == 1: 181 | new_lines.append(simple_ztz_list[0]) 182 | else: 183 | assert False 184 | 185 | count += 1 186 | with open(outpath, "w") as f: 187 | for line in new_lines: 188 | f.write(line + "\n") 189 | 190 | 191 | def post_clean_batch_of_m_scripts( 192 | in_dir, out_dir, 193 | batch_file_names, 194 | verbose=False): 195 | """ 196 | This method calls the method `post_clean_one_m_script` for all the file 197 | names in the list of file names `batch_file_names`. 198 | 199 | 200 | Parameters 201 | ---------- 202 | in_dir: str 203 | out_dir: str 204 | batch_file_names: list[str] 205 | verbose: bool 206 | 207 | Returns 208 | ------- 209 | None 210 | 211 | """ 212 | all_file_names = my_listdir(in_dir) 213 | assert set(batch_file_names).issubset(set(all_file_names)) 214 | for file_name in batch_file_names: 215 | i = all_file_names.index(file_name) 216 | print('%i.' % (i + 1), file_name) 217 | post_clean_one_m_script(in_dir, out_dir, file_name, verbose) 218 | 219 | 220 | if __name__ == "__main__": 221 | def main1(): 222 | in_dir = "short_stories_simp" 223 | out_dir = "short_stories_post_clean" 224 | batch_file_names = my_listdir(in_dir)[0:3] 225 | post_clean_batch_of_m_scripts( 226 | in_dir, out_dir, 227 | batch_file_names, 228 | verbose=False) 229 | 230 | 231 | def main2(): 232 | remove_dialogs = False 233 | in_dir = SIMP_DIR if not remove_dialogs else SIMP_RD_DIR 234 | out_dir = POST_CLEAN_DIR if not remove_dialogs else POST_CLEAN_RD_DIR 235 | batch_file_names = my_listdir(in_dir)[0:3] 236 | post_clean_batch_of_m_scripts( 237 | in_dir, out_dir, 238 | batch_file_names) 239 | 240 | main1() 241 | main2() 242 | -------------------------------------------------------------------------------- /stopwords.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file has a function that lists all SpaCy stopwords, classified by POS ( 4 | part of speech). 5 | 6 | """ 7 | 8 | import spacy 9 | from pprint import pprint 10 | 11 | nlp = spacy.load('en_core_web_sm') 12 | 13 | 14 | def get_stopwords_dict(): 15 | """ 16 | This method returns a dictionary that maps the parts of speech (POS) to 17 | a list of the stopwords that have that POS. 
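    For example, indexing the returned dictionary by a POS tag gives the
    stopwords carrying that tag; per the sample output reproduced at the
    bottom of this file:

        get_stopwords_dict()["CCONJ"]  # ['but', 'or', 'and', 'nor', 'neither']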
18 | 19 | Returns 20 | ------- 21 | dict[str, list[str]] 22 | 23 | """ 24 | 25 | stopwords = nlp.Defaults.stopwords 26 | 27 | pos_to_stopwords = {} 28 | 29 | for word in stopwords: 30 | pos = nlp(word)[0].pos_ 31 | if pos in pos_to_stopwords: 32 | pos_to_stopwords[pos].append(word) 33 | else: 34 | pos_to_stopwords[pos] = [word] 35 | 36 | return pos_to_stopwords 37 | 38 | 39 | if __name__ == "__main__": 40 | def main(): 41 | d = get_stopwords_dict() 42 | print(sorted(d.keys())) 43 | pprint(d) 44 | 45 | 46 | main() 47 | 48 | """ 49 | ['ADJ', 50 | 'ADP', 51 | 'ADV', 52 | 'AUX', 53 | 'CCONJ', 54 | 'DET', 55 | 'INTJ', 56 | 'NOUN', 57 | 'NUM', 58 | 'PART', 59 | 'PRON', 60 | 'PUNCT', 61 | 'SCONJ', 62 | 'VERB'] 63 | {'ADJ': ['same', 64 | 'few', 65 | 'former', 66 | 'full', 67 | 'serious', 68 | 'own', 69 | 'empty', 70 | 'such', 71 | 'several', 72 | 'latter', 73 | 'various', 74 | 'other', 75 | 'least', 76 | 'many', 77 | 'whole', 78 | 'top', 79 | 'due', 80 | 'whereafter', 81 | 'last', 82 | 'third'], 83 | 'ADP': ['behind', 84 | 'with', 85 | 'below', 86 | 'of', 87 | 'through', 88 | 'amongst', 89 | 'at', 90 | 'onto', 91 | 'thru', 92 | 'among', 93 | 'throughout', 94 | 'under', 95 | 'into', 96 | 'before', 97 | 'beside', 98 | 'against', 99 | 'within', 100 | 'per', 101 | 'towards', 102 | 'after', 103 | 'without', 104 | 'beyond', 105 | 'from', 106 | 'via', 107 | 'in', 108 | 'on', 109 | 'as', 110 | 'than', 111 | 'during', 112 | 'for', 113 | 'toward', 114 | 'until', 115 | 'above', 116 | 'across', 117 | 'along', 118 | 'between', 119 | 'over', 120 | 'by'], 121 | 'ADV': ['together', 122 | 'so', 123 | 'therein', 124 | 'next', 125 | 'off', 126 | 'meanwhile', 127 | 'whereupon', 128 | 'sometimes', 129 | 'again', 130 | 'rather', 131 | 'enough', 132 | 'thereby', 133 | 'first', 134 | 'too', 135 | 'always', 136 | 'either', 137 | 'somehow', 138 | 'very', 139 | 'perhaps', 140 | 'back', 141 | 'down', 142 | 'elsewhere', 143 | 'latterly', 144 | 'moreover', 145 | 'formerly', 146 | 'about', 147 | 'sometime', 148 | 'really', 149 | 'once', 150 | 'else', 151 | 'anyhow', 152 | 'also', 153 | 'there', 154 | 'most', 155 | 'nowhere', 156 | 'then', 157 | 'up', 158 | 'out', 159 | 'further', 160 | 'however', 161 | 'yet', 162 | 'namely', 163 | 'afterwards', 164 | 'already', 165 | 'hereby', 166 | 'thereupon', 167 | 'still', 168 | 'hence', 169 | 'anyway', 170 | 'even', 171 | 'much', 172 | 'thus', 173 | 'never', 174 | 'almost', 175 | 'alone', 176 | 'somewhere', 177 | 'here', 178 | 'more', 179 | 'hereupon', 180 | 'indeed', 181 | 'now', 182 | 'beforehand', 183 | 'everywhere', 184 | 'just', 185 | 'anywhere', 186 | 'often', 187 | 'thereafter', 188 | 'therefore', 189 | 'nevertheless', 190 | 'ever', 191 | 'quite', 192 | 'mostly', 193 | 'around', 194 | 'only', 195 | 'otherwise', 196 | 'less'], 197 | 'AUX': ['might', 198 | 'will', 199 | 'being', 200 | '’d', 201 | 'can', 202 | '’ll', 203 | 'are', 204 | 'was', 205 | "'ll", 206 | 'ca', 207 | 'could', 208 | 'must', 209 | 'would', 210 | "'re", 211 | 'may', 212 | 'were', 213 | "'s", 214 | 'should', 215 | 'be', 216 | 'cannot', 217 | 'am', 218 | 'is', 219 | 'been'], 220 | 'CCONJ': ['but', 'or', 'and', 'nor', 'neither'], 221 | 'DET': ['whose'], 222 | 'INTJ': ['please', 'well', 'no'], 223 | 'NOUN': ['part', 224 | 'bottom', 225 | 'whither', 226 | 'side', 227 | 'name', 228 | 'thence', 229 | 'amount', 230 | 'whence', 231 | 'yourselves', 232 | 'noone', 233 | 'front', 234 | 'yours', 235 | 'others', 236 | 'none', 237 | 'hers', 238 | '‘s', 239 | 'ours', 240 | 'herein'], 241 | 'NUM': ['one', 242 | 'twenty', 243 | 
'nine', 244 | 'sixty', 245 | 'ten', 246 | 'five', 247 | 'fifty', 248 | 'forty', 249 | 'n’t', 250 | 'six', 251 | 'three', 252 | 'hundred', 253 | 'eleven', 254 | 'twelve', 255 | 'fifteen', 256 | 'four', 257 | 'two', 258 | 'eight'], 259 | 'PART': ["n't", 'to', 'not'], 260 | 'PRON': ['nothing', 261 | 'the', 262 | 'my', 263 | 'this', 264 | 'something', 265 | 'they', 266 | 'whom', 267 | 'nobody', 268 | 'her', 269 | 'those', 270 | 'me', 271 | 'he', 272 | 'themselves', 273 | 'us', 274 | 'an', 275 | 'anything', 276 | 'his', 277 | 'i', 278 | 'you', 279 | 'which', 280 | 'him', 281 | 'all', 282 | 'we', 283 | 'them', 284 | 'any', 285 | 'who', 286 | 'everyone', 287 | 'these', 288 | 'someone', 289 | 'some', 290 | 'himself', 291 | 'whoever', 292 | 'what', 293 | 'each', 294 | 'yourself', 295 | 'mine', 296 | 'everything', 297 | 'our', 298 | 'itself', 299 | 'anyone', 300 | 'herself', 301 | 'your', 302 | 'its', 303 | 'every', 304 | 'it', 305 | 'their', 306 | 'both', 307 | 'ourselves', 308 | 'that', 309 | 'another', 310 | 'whatever', 311 | 'she', 312 | 'myself', 313 | 'a'], 314 | 'PUNCT': ['‘ll', '’ve', '‘ve', '‘d', '’m', '‘m', '‘re'], 315 | 'SCONJ': ['where', 316 | 'wherever', 317 | 'unless', 318 | 'wherein', 319 | 'if', 320 | 'how', 321 | 'though', 322 | 'why', 323 | 'except', 324 | 'whether', 325 | 'while', 326 | 'upon', 327 | 'whereas', 328 | 'besides', 329 | 'when', 330 | 'because', 331 | 'whereby', 332 | 'whenever', 333 | 'since', 334 | 'although'], 335 | 'VERB': ['became', 336 | 'made', 337 | 'hereafter', 338 | 'used', 339 | 'using', 340 | 'did', 341 | 'becomes', 342 | 'seem', 343 | 'do', 344 | 'seems', 345 | "'d", 346 | 'done', 347 | 'give', 348 | 'keep', 349 | 'say', 350 | 'has', 351 | 'get', 352 | 'become', 353 | 'have', 354 | 'doing', 355 | 'seemed', 356 | 'make', 357 | 'n‘t', 358 | 'put', 359 | 'take', 360 | 'becoming', 361 | 'show', 362 | "'m", 363 | "'ve", 364 | '’s', 365 | 'see', 366 | 'regarding', 367 | 'move', 368 | 'had', 369 | 'seeming', 370 | 'call', 371 | 're', 372 | 'go', 373 | '’re', 374 | 'does']} 375 | """ 376 | -------------------------------------------------------------------------------- /DagAtlas.py: -------------------------------------------------------------------------------- 1 | from Dag import * 2 | from BatchSimilarity import * 3 | from utils import * 4 | import sys 5 | from itertools import product 6 | from globals import * 7 | 8 | import pickle as pik 9 | from time import time 10 | from sentence_transformers import SentenceTransformer 11 | 12 | 13 | class DagAtlas: 14 | """ 15 | This class reads movie script txt files from the `out_dir` directory ( 16 | simplified movie scripts) and creates a pickled file for each movie 17 | script. Each pickled file contains a Dag object for one movie. `dag_dir` 18 | (called the DAG atlas) is the directory containing the pickled files. 19 | This class is also called `DagAtlas`. 20 | 21 | Attributes 22 | ---------- 23 | dag_dir: str 24 | directory where this class writes pickled files. One pickled file ( 25 | i.e., DAG) per movie. 26 | model: SentenceTransformer 27 | Model returned by SentenceTransformer constructor 28 | simp_dir: str 29 | directory where this class reads txt files. 30 | start_time: float 31 | time in minutes when self is created. 32 | title_to_permission_to_write_new_pickle: dict[str, bool] 33 | A dictionary that maps each movie title to a boolean that grants 34 | permission to overwrite an existing pickled file. 
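    A minimal construction, mirroring main1() at the bottom of this file
    (the directory names are taken from that example):

        atlas = DagAtlas(simp_dir="short_stories_post_clean",
                         dag_dir="short_stories_dag_atlas")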
35 | 36 | """ 37 | 38 | def __init__(self, simp_dir, dag_dir, 39 | recycled_pickles=None): 40 | """ 41 | Constructor 42 | 43 | Parameters 44 | ---------- 45 | simp_dir: str 46 | directory with a simplified txt file for each movie script 47 | dag_dir: str 48 | directory with a pickled file containing a Dag object for each 49 | movie script 50 | recycled_pickles: list[str] 51 | titles for which overwriting of pickled files is forbidden, at the 52 | beginning, when self is first constructed. 53 | 54 | """ 55 | self.start_time = time() 56 | time_now = (time() - self.start_time) / 60 57 | print(f"Initiating DagAtlas object: {time_now:.2f} minutes\n") 58 | 59 | self.simp_dir = simp_dir 60 | self.dag_dir = dag_dir 61 | all_simp_titles = [file_name[:-len(".txt")] for \ 62 | file_name in my_listdir(self.simp_dir)] 63 | all_dag_titles = [file_name[:-len(".pkl")] for \ 64 | file_name in my_listdir(self.dag_dir)] 65 | assert set(all_dag_titles).issubset(set(all_simp_titles)) 66 | 67 | self.title_to_permission_to_write_new_pickle = {} 68 | for title in all_simp_titles: 69 | self.title_to_permission_to_write_new_pickle[title] = True 70 | if recycled_pickles is None: 71 | recycled_pickles = [] 72 | for title in recycled_pickles: 73 | assert title in all_dag_titles 74 | self.title_to_permission_to_write_new_pickle[title] = False 75 | 76 | if SIMI_DEF == "similarity_bert": 77 | self.model = SentenceTransformer('all-MiniLM-L6-v2') 78 | else: 79 | self.model = None 80 | 81 | def update_arrows_for_two_m_scripts(self, title1, title2): 82 | """ 83 | This method updates the arrows for 2 movie titles. 84 | 85 | Parameters 86 | ---------- 87 | title1: str 88 | title2: str 89 | 90 | Returns 91 | ------- 92 | None 93 | 94 | """ 95 | time_now = (time() - self.start_time) / 60 96 | print(f"Starting comparison of 2 titles: {time_now:.2f} minutes") 97 | 98 | if self.title_to_permission_to_write_new_pickle[title1]: 99 | dag1 = Dag(title1, simp_dir=self.simp_dir) 100 | else: 101 | path1 = self.dag_dir + "/" + title1 + ".pkl" 102 | try: 103 | with open(path1, "rb") as f: 104 | dag1 = pik.load(f) 105 | except OSError: 106 | print("This file is probably missing:", path1) 107 | sys.exit() 108 | 109 | if self.title_to_permission_to_write_new_pickle[title2]: 110 | dag2 = Dag(title2, simp_dir=self.simp_dir) 111 | else: 112 | path2 = self.dag_dir + "/" + title2 + ".pkl" 113 | try: 114 | with open(path2, "rb") as f: 115 | dag2 = pik.load(f) 116 | except OSError: 117 | print("This file is probably missing:", path2) 118 | sys.exit() 119 | node_to_simple_ztz1 = \ 120 | dag1.build_node_to_simple_ztz_dict(self.simp_dir) 121 | node_to_simple_ztz2 = \ 122 | dag2.build_node_to_simple_ztz_dict(self.simp_dir) 123 | 124 | print("title1 and its num of nodes:", title1, len(dag1.nodes)) 125 | print("title2 and its num of nodes:", title2, len(dag2.nodes)) 126 | print("product of numbers of nodes=", 127 | len(dag1.nodes) * len(dag2.nodes)) 128 | 129 | time_now = (time() - self.start_time) / 60 130 | print(f"Starting bridges: {time_now:.2f} minutes") 131 | 132 | nd1_nd2_bridges = [] 133 | bridge_count = 0 134 | batch_simi = BatchSimilarity(dag1, dag2, 135 | node_to_simple_ztz1, 136 | node_to_simple_ztz2, 137 | model=self.model) 138 | for nd1, nd2 in product(dag1.nodes, dag2.nodes): 139 | if batch_simi.simi(nd1, nd2) > SIMI_THRESHOLD: 140 | nd1_nd2_bridges.append((nd1, nd2)) 141 | bridge_count += 1 142 | print(bridge_count, "bridges") 143 | range0 = range(len(nd1_nd2_bridges)) 144 | for i, j in product(range0, range0): 145 | if i < j: 146 | 
bridge_a = nd1_nd2_bridges[i] 147 | bridge_b = nd1_nd2_bridges[j] 148 | arrows = [None, None] 149 | time_gaps = [0, 0] 150 | for movie in range(2): 151 | time_gaps[movie] = \ 152 | bridge_a[movie].time - bridge_b[movie].time 153 | if time_gaps[movie] > 0: 154 | arrows[movie] = (bridge_b[movie], bridge_a[movie]) 155 | else: 156 | arrows[movie] = (bridge_a[movie], bridge_b[movie]) 157 | bridges_do_not_cross = (time_gaps[0] * time_gaps[1] > 0) 158 | if bridges_do_not_cross: 159 | accepted = True 160 | else: 161 | accepted = False 162 | dag1.update_arrow(arrows[0], accepted) 163 | dag2.update_arrow(arrows[1], accepted) 164 | 165 | time_now = (time() - self.start_time) / 60 166 | print(f"Before saving 2 dags: {time_now:.2f} minutes") 167 | dag1.save_self(self.dag_dir) 168 | self.title_to_permission_to_write_new_pickle[title1] = False 169 | dag2.save_self(self.dag_dir) 170 | self.title_to_permission_to_write_new_pickle[title2] = False 171 | 172 | time_now = (time() - self.start_time) / 60 173 | print(f"Exiting 2 titles comparison: {time_now:.2f} minutes\n") 174 | 175 | def update_arrows_in_batch_of_m_scripts(self, batch_titles=None): 176 | """ 177 | This method calls the method `update_arrows_for_two_m_scripts` for 178 | every pair '{ title1, title2}' of movie scripts in the list 179 | `batch_titles`. 180 | 181 | Parameters 182 | ---------- 183 | batch_titles: list[str] or None 184 | 185 | Returns 186 | ------- 187 | None 188 | 189 | """ 190 | all_simp_titles = [file_name[:-len(".txt")] for \ 191 | file_name in my_listdir(self.simp_dir)] 192 | 193 | if batch_titles is None: 194 | batch_titles = all_simp_titles 195 | assert set(batch_titles).issubset(set(all_simp_titles)) 196 | assert len(batch_titles) >= 2 197 | num = len(batch_titles) 198 | 199 | for i, j in product(range(num), range(num)): 200 | if i < j: 201 | self.update_arrows_for_two_m_scripts(batch_titles[i], 202 | batch_titles[j]) 203 | 204 | def update_arrows_for_one_m_script_and_others(self, 205 | title, 206 | other_titles): 207 | """ 208 | This method calls the method `update_arrows_for_two_m_scripts` for 209 | every pair '{ title, other_title}' of movie scripts, 210 | where `other_title` is in the list `other_titles`. 
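    For instance (the titles are illustrative; each must match a .txt file
    name, minus the extension, in `simp_dir`):

        atlas.update_arrows_for_one_m_script_and_others(
            "up", other_titles=["toy-story", "wall-e"])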
211 | 212 | Parameters 213 | ---------- 214 | title: str 215 | other_titles: list[str] 216 | 217 | Returns 218 | ------- 219 | None 220 | 221 | """ 222 | all_simp_titles = [file_name[:-len(".txt")] for \ 223 | file_name in my_listdir(self.simp_dir)] 224 | assert set(other_titles).issubset(set(all_simp_titles)) 225 | assert title not in other_titles 226 | 227 | for j in range(len(other_titles)): 228 | self.update_arrows_for_two_m_scripts(title, 229 | other_titles[j]) 230 | 231 | 232 | if __name__ == "__main__": 233 | def main1(): 234 | simp_dir = "short_stories_post_clean" 235 | dag_dir = "short_stories_dag_atlas" 236 | atlas = DagAtlas(simp_dir, dag_dir) 237 | all_titles = [file_name[:-len(".txt")] \ 238 | for file_name in my_listdir(simp_dir)] 239 | atlas.update_arrows_in_batch_of_m_scripts( 240 | batch_titles=all_titles[0:3]) 241 | 242 | 243 | def main2(): 244 | remove_dialog = False 245 | atlas = DagAtlas( 246 | simp_dir=POST_CLEAN_DIR if not remove_dialog else 247 | POST_CLEAN_RD_DIR, 248 | dag_dir=DAG_DIR) 249 | all_titles = [file_name[:-len(".txt")] \ 250 | for file_name in my_listdir(SIMP_DIR)] 251 | atlas.update_arrows_in_batch_of_m_scripts( 252 | batch_titles=all_titles[0:3]) 253 | 254 | 255 | main1() 256 | # main2() 257 | -------------------------------------------------------------------------------- /simp_deprecated/simp_spacy2.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | Refs: 7 | 8 | https://spacy.io/usage/spacy-101/ 9 | 10 | https://github.com/ac491/Sentence-simplifier/blob/master/simplifcation.ipynb 11 | """ 12 | import spacy 13 | import nltk 14 | 15 | nltk.download('averaged_perceptron_tagger') 16 | 17 | nlp = spacy.load("en_core_web_sm") 18 | 19 | # set of relative pronouns 20 | RELPRON = ['whom', 'whose', 'which', 'who'] 21 | 22 | 23 | def transform(parsed): 24 | d = {} 25 | # print(parsed) 26 | # print() 27 | for x in parsed: 28 | rel = x.dep_ 29 | parent = x.head.i + 1 30 | dependent = x.i + 1 31 | if parent == dependent and rel == 'ROOT': 32 | parent = 0 33 | if parent not in d.keys(): 34 | d[parent] = {} 35 | if rel not in d[parent].keys(): 36 | d[parent][rel] = [] 37 | 38 | d[parent][rel].append(dependent) 39 | 40 | return d 41 | 42 | 43 | def analyse_rc(sentence): 44 | # check for markers indicating rel_clause 45 | if any([s.lower() in RELPRON for s in sentence]): 46 | mark = [] 47 | for s in sentence: 48 | if s.lower() in RELPRON: 49 | mark.append(s.lower()) 50 | return True, mark 51 | else: 52 | return False, None 53 | 54 | 55 | def remove_all(aux, item): 56 | for a in aux.keys(): 57 | for d in aux[a].keys(): 58 | if item in aux[a][d]: 59 | aux[a][d].remove(item) 60 | 61 | 62 | def build(root, dep, aux, words, final, yes_root=True, previous=None): 63 | if previous is None: 64 | previous = [] 65 | 66 | if root in previous: 67 | return 68 | 69 | previous.append(root) 70 | 71 | if yes_root: 72 | final[root] = words[root - 1] 73 | previous.append(root) 74 | 75 | for k in dep.keys(): 76 | for i in dep[k]: 77 | if i in aux.keys(): 78 | deps = aux[i] 79 | build(i, deps, aux, words, final, previous=previous) 80 | 81 | final[i] = words[i - 1] 82 | 83 | 84 | def appositive_phrases(dep_dict, words, root, dep_root, ant): 85 | if 'nsubj' in dep_root: 86 | subj = dep_root['nsubj'][0] 87 | subj_word = words[subj - 1] 88 | 89 | # print(dep_dict) 90 | if subj not in dep_dict: 91 | return 
False, ant 92 | 93 | deps_subj = dep_dict[subj] 94 | v_tense = words[root - 1][1] 95 | n_num = words[subj - 1][1] 96 | 97 | if 'amod' in deps_subj: 98 | mod = deps_subj['amod'][0] 99 | if mod in dep_dict: 100 | deps_mod = dep_dict[mod] 101 | else: 102 | deps_mod = {} 103 | del dep_dict[subj]['amod'] 104 | deps_subj = dep_dict[subj] 105 | 106 | # Treat simple cases such as 'general rule' 107 | if 'JJ' in words[mod - 1][1] and 'punct' not in deps_subj: 108 | return False, ant 109 | 110 | elif 'appos' in deps_subj: 111 | mod = deps_subj['appos'][0] 112 | if mod in dep_dict: 113 | deps_mod = dep_dict[mod] 114 | else: 115 | deps_mod = {} 116 | del dep_dict[subj]['appos'] 117 | deps_subj = dep_dict[subj] 118 | else: 119 | return False, ant 120 | 121 | if 'punct' in deps_subj.keys(): 122 | del deps_subj['punct'] 123 | 124 | final_root = {} 125 | build(root, dep_root, dep_dict, [s[0].lower() for s in words], 126 | final_root) 127 | final_appos = {} 128 | build(mod, deps_mod, dep_dict, [s[0].lower() for s in words], 129 | final_appos) 130 | final_subj = {} 131 | build(subj, deps_subj, dep_dict, [s[0].lower() for s in words], 132 | final_subj) 133 | 134 | # print(final_root) 135 | s1 = [] 136 | for i in sorted(final_root): 137 | s1.append(final_root[i]) 138 | s1 = ' '.join(s1) 139 | # print(s1) 140 | 141 | # print(final_appos) 142 | s2 = [] 143 | for i in sorted(final_appos): 144 | s2.append(final_appos[i]) 145 | s2 = ' '.join(s2) 146 | # print(s2) 147 | 148 | # print(final_subj) 149 | s3 = [] 150 | for i in sorted(final_subj): 151 | s3.append(final_subj[i]) 152 | s3 = ' '.join(s3) 153 | # print(s3) 154 | 155 | if len(final_appos.keys()) < 2: 156 | return False, ant 157 | 158 | if n_num in ["NN", "NNP"]: 159 | if v_tense in ["VBP", "VBZ", "VB"]: 160 | s3 += " is " 161 | elif v_tense in ["VBD", "VBG", "VBN"]: 162 | s3 += " was " 163 | 164 | elif n_num in ["NNS", "NNPS"]: 165 | if v_tense in ["VBP", "VBZ", "VB"]: 166 | s3 += " are " 167 | elif v_tense in ("VBD", "VBG", "VBN"): 168 | s3 += " were " 169 | 170 | elif n_num in ["PRP"] and subj_word.lower() == "they": 171 | 172 | if v_tense in ["VBP", "VBZ", "VB"]: 173 | s3 += " are " 174 | elif v_tense in ["VBD", "VBG", "VBN"]: 175 | s3 += " were " 176 | 177 | elif n_num in ["PRP"]: 178 | if v_tense in ["VBP", "VBZ", "VB"]: 179 | s3 += " is " 180 | elif v_tense in ["VBD", "VBG", "VBN"]: 181 | s3 += " was " 182 | 183 | s2 = s3 + s2 184 | 185 | return True, [s1, s2] 186 | 187 | return False, ant 188 | 189 | 190 | def relative_clauses(dep_dict, words, root, dep_root, rel, ant): 191 | subj = dep_root[rel][0] 192 | if subj in dep_dict: 193 | 194 | dep_subj = dep_dict[subj] 195 | 196 | if 'relcl' in dep_subj or 'rcmod' in dep_subj: 197 | if 'relcl' in dep_subj: 198 | relc = dep_subj['relcl'][0] 199 | type_rc = 'relcl' 200 | else: 201 | relc = dep_subj['rcmod'][0] 202 | type_rc = 'rcmod' 203 | deps_relc = dep_dict[relc] 204 | 205 | if 'nsubj' in deps_relc: 206 | subj_rel = 'nsubj' 207 | elif 'nsubjpass' in deps_relc: 208 | subj_rel = 'nsubjpass' 209 | 210 | if 'ref' in dep_subj: 211 | to_remove = dep_subj['ref'][0] 212 | mark = words[dep_subj['ref'][0] - 1].lower() 213 | else: 214 | to_remove = deps_relc[subj_rel][0] 215 | mark = words[deps_relc[subj_rel][0] - 1].lower() 216 | 217 | # print(mark) 218 | 219 | if mark in RELPRON: 220 | deps_relc[subj_rel][0] = subj 221 | remove_all(dep_dict, to_remove) 222 | # needed for cases where the subject of 223 | # the relative clause is the object 224 | elif 'dobj' in deps_relc: 225 | obj = deps_relc['dobj'][0] 226 | 227 | 
if 'poss' in dep_dict[obj]: 228 | mod = dep_dict[obj]['poss'][0] 229 | aux_words = words[mod - 1] 230 | aux_words = words[subj - 1] + '\'s' 231 | words[mod - 1] = aux_words 232 | dep_dict[mod] = dep_dict[subj] 233 | else: 234 | return False, ant 235 | else: 236 | return False, ant # for broken cases - 237 | # " There are some 238 | # situations where it is particularly important 239 | # that you get financial information and 240 | # advice that is independent of us." 241 | 242 | del dep_dict[subj][type_rc] 243 | 244 | if 'punct' in dep_subj: 245 | del dep_dict[subj]['punct'] 246 | 247 | final_root = {} 248 | build(root, dep_root, dep_dict, words, final_root) 249 | final_relc = {} 250 | build(relc, deps_relc, dep_dict, words, final_relc) 251 | 252 | # print(final_root) 253 | # print(final_relc) 254 | 255 | s1 = [] 256 | for i in sorted(final_root): 257 | s1.append(final_root[i]) 258 | 259 | s2 = [] 260 | for i in sorted(final_relc): 261 | s2.append(final_relc[i]) 262 | 263 | return True, [' '.join(s1), ' '.join(s2)] 264 | return False, ant 265 | 266 | 267 | sentence0 = 'Robert, who lives nearby, was walking his dog' 268 | sentence1 = 'Marcus, my sister\'s hamster, likes to run in a wheel.' 269 | 270 | 271 | def simplify_ztz(ztz, verbose=False): 272 | """ 273 | This method simplifies the sentence `sentence`. It returns a list of 274 | simple sentences extracted from the input sentence. 275 | 276 | Parameters 277 | ---------- 278 | ztz: str 279 | verbose: bool 280 | kwargs: dict[] 281 | 282 | Returns 283 | ------- 284 | list[str] 285 | 286 | """ 287 | 288 | ztz.strip() 289 | sentences = [ztz] 290 | result = [] 291 | 292 | for s in sentences: 293 | 294 | output = nlp(s) 295 | 296 | dep_dict = transform(output) 297 | # print(dep_dict) 298 | 299 | # words = [(token.text.lower(), token.pos_) for token in output] 300 | 301 | tokens = [token.text.lower() for token in output] 302 | 303 | words = nltk.pos_tag(tokens) 304 | 305 | # print(words) 306 | 307 | if 0 in dep_dict: 308 | 309 | root = dep_dict[0]['ROOT'][0] 310 | 311 | if root in dep_dict: 312 | 313 | dep_root = dep_dict[root] 314 | 315 | # handle appositive_phrases 316 | flag_appos, res = appositive_phrases(dep_dict, words, root, 317 | dep_root, sentences) 318 | if flag_appos: 319 | result += res 320 | continue 321 | 322 | # check for relative clauses 323 | flag_rc, type_rc = analyse_rc(s.split()) 324 | 325 | if flag_rc: 326 | 327 | if 'nsubj' in dep_root: 328 | flag, res = relative_clauses(dep_dict, tokens, root, 329 | dep_root, 'nsubj', 330 | sentences) 331 | if flag: 332 | result += res 333 | continue 334 | elif 'dobj' in dep_root: 335 | flag, res = relative_clauses(dep_dict, tokens, root, 336 | dep_root, 'dobj', 337 | sentences) 338 | if flag: 339 | result += res 340 | continue 341 | result.append(s) 342 | 343 | if verbose: 344 | print(ztz.strip()) 345 | print(result) 346 | -------------------------------------------------------------------------------- /simp_deprecated/simp_stanford2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 19, 2015 3 | 4 | @author: TPetrou 5 | ''' 6 | 7 | from nltk.parse import stanford 8 | import os, sys 9 | import operator 10 | 11 | java_path = r"C:\Program Files\Java\jdk1.8.0_31\bin\java.exe" 12 | os.environ['JAVAHOME'] = java_path 13 | os.environ[ 14 | 'STANFORD_PARSER'] = r'/users/ted/stanford nlp/stanford-parser-full-2015-01-30' 15 | os.environ[ 16 | 'STANFORD_MODELS'] = r'/users/ted/stanford nlp/stanford-parser-full-2015-01-30' 17 | 18 | 19 | 
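# The hard-coded java_path, STANFORD_PARSER and STANFORD_MODELS values above
# are machine-specific leftovers from the original 2015 script and will not
# exist elsewhere. A minimal sketch of pointing them at a local install,
# assuming the parser archive was unpacked under ~/stanford-parser (a
# hypothetical location):
#
#     import os
#     stanford_home = os.path.expanduser("~/stanford-parser")
#     os.environ["STANFORD_PARSER"] = stanford_home
#     os.environ["STANFORD_MODELS"] = stanford_home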
class RDF_Triple(): 20 | class RDF_SOP(): 21 | 22 | def __init__(self, name, pos=''): 23 | self.name = name 24 | self.word = '' 25 | self.parent = '' 26 | self.grandparent = '' 27 | self.depth = '' 28 | self.predicate_list = [] 29 | self.predicate_sibings = [] 30 | self.pos = pos 31 | self.attr = [] 32 | self.attr_trees = [] 33 | 34 | def __init__(self, sentence): 35 | self.sentence = sentence 36 | self.clear_data() 37 | 38 | def clear_data(self): 39 | self.parser = stanford.StanfordParser( 40 | model_path=r"/users/ted/stanford nlp/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") 41 | self.first_NP = '' 42 | self.first_VP = '' 43 | self.parse_tree = None 44 | self.subject = RDF_Triple.RDF_SOP('subject') 45 | self.predicate = RDF_Triple.RDF_SOP('predicate', 'VB') 46 | self.Object = RDF_Triple.RDF_SOP('object') 47 | 48 | def find_NP(self, t): 49 | try: 50 | t.label() 51 | except AttributeError: 52 | pass 53 | else: 54 | # Now we know that t.node is defined 55 | if t.label() == 'NP': 56 | if self.first_NP == '': 57 | self.first_NP = t 58 | elif t.label() == 'VP': 59 | if self.first_VP == '': 60 | self.first_VP = t 61 | for child in t: 62 | self.find_NP(child) 63 | 64 | def find_subject(self, t, parent=None, grandparent=None): 65 | if self.subject.word != '': 66 | return 67 | try: 68 | t.label() 69 | except AttributeError: 70 | pass 71 | else: 72 | # Now we know that t.node is defined 73 | if t.label()[:2] == 'NN': 74 | if self.subject.word == '': 75 | self.subject.word = t.leaves()[0] 76 | self.subject.pos = t.label() 77 | self.subject.parent = parent 78 | self.subject.grandparent = grandparent 79 | else: 80 | for child in t: 81 | self.find_subject(child, parent=t, grandparent=parent) 82 | 83 | def find_predicate(self, t, parent=None, grandparent=None, depth=0): 84 | try: 85 | t.label() 86 | except AttributeError: 87 | pass 88 | else: 89 | if t.label()[:2] == 'VB': 90 | self.predicate.predicate_list.append( 91 | (t.leaves()[0], depth, parent, grandparent)) 92 | 93 | for child in t: 94 | self.find_predicate(child, parent=t, grandparent=parent, 95 | depth=depth + 1) 96 | 97 | def find_deepest_predicate(self): 98 | if not self.predicate.predicate_list: 99 | return '', '', '', '' 100 | return max(self.predicate.predicate_list, key=operator.itemgetter(1)) 101 | 102 | def extract_word_and_pos(self, t, depth=0, words=[]): 103 | try: 104 | t.label() 105 | except AttributeError: 106 | # print t 107 | # print 'error', t 108 | pass 109 | else: 110 | # Now we know that t.node is defined 111 | if t.height() == 2: 112 | # self.word_pos_holder.append((t.label(), t.leaves()[0])) 113 | words.append((t.leaves()[0], t.label())) 114 | for child in t: 115 | self.extract_word_and_pos(child, depth + 1, words) 116 | return words 117 | 118 | def print_tree(self, t, depth=0): 119 | try: 120 | t.label() 121 | except AttributeError: 122 | print(t) 123 | # print 'error', t 124 | pass 125 | else: 126 | # Now we know that t.node is defined 127 | print('(') # , t.label(), t.leaves()[0] 128 | for child in t: 129 | self.print_tree(child, depth + 1) 130 | print(') ') 131 | 132 | def find_object(self): 133 | for t in self.predicate.parent: 134 | if self.Object.word == '': 135 | self.find_object_NP_PP(t, t.label(), self.predicate.parent, 136 | self.predicate.grandparent) 137 | 138 | def find_object_NP_PP(self, t, phrase_type, parent=None, grandparent=None): 139 | ''' 140 | finds the object given its a NP or PP or ADJP 141 | ''' 142 | if self.Object.word != '': 
143 | return 144 | try: 145 | t.label() 146 | except AttributeError: 147 | pass 148 | else: 149 | # Now we know that t.node is defined 150 | if t.label()[:2] == 'NN' and phrase_type in ['NP', 'PP']: 151 | if self.Object.word == '': 152 | self.Object.word = t.leaves()[0] 153 | self.Object.pos = t.label() 154 | self.Object.parent = parent 155 | self.Object.grandparent = grandparent 156 | elif t.label()[:2] == 'JJ' and phrase_type == 'ADJP': 157 | if self.Object.word == '': 158 | self.Object.word = t.leaves()[0] 159 | self.Object.pos = t.label() 160 | self.Object.parent = parent 161 | self.Object.grandparent = grandparent 162 | else: 163 | for child in t: 164 | self.find_object_NP_PP(child, phrase_type, parent=t, 165 | grandparent=parent) 166 | 167 | def get_attributes(self, pos, sibling_tree, grandparent): 168 | rdf_type_attr = [] 169 | if pos[:2] == 'JJ': 170 | for item in sibling_tree: 171 | if item.label()[:2] == 'RB': 172 | rdf_type_attr.append((item.leaves()[0], item.label())) 173 | else: 174 | if pos[:2] == 'NN': 175 | for item in sibling_tree: 176 | if item.label()[:2] in ['DT', 'PR', 'PO', 'JJ', 'CD']: 177 | rdf_type_attr.append((item.leaves()[0], item.label())) 178 | if item.label() in ['QP', 'NP']: 179 | # append a tree 180 | rdf_type_attr.append(item, item.label()) 181 | elif pos[:2] == 'VB': 182 | for item in sibling_tree: 183 | if item.label()[:2] == 'AD': 184 | rdf_type_attr.append((item, item.label())) 185 | 186 | if grandparent: 187 | if pos[:2] in ['NN', 'JJ']: 188 | for uncle in grandparent: 189 | if uncle.label() == 'PP': 190 | rdf_type_attr.append((uncle, uncle.label())) 191 | elif pos[:2] == 'VB': 192 | for uncle in grandparent: 193 | if uncle.label()[:2] == 'VB': 194 | rdf_type_attr.append((uncle, uncle.label())) 195 | 196 | return self.attr_to_words(rdf_type_attr) 197 | 198 | def attr_to_words(self, attr): 199 | new_attr_words = [] 200 | new_attr_trees = [] 201 | for tup in attr: 202 | # if type(tup[0]) != unicode: 203 | # if tup[0].height() == 2: 204 | # new_attr_words.append((tup[0].leaves()[0], tup[0].label())) 205 | # else: 206 | # # new_attr_words.extend(self.extract_word_and_pos(tup[0])) 207 | # new_attr_trees.append(tup[0].unicode_repr()) 208 | # else: 209 | new_attr_words.append(tup) 210 | return new_attr_words, new_attr_trees 211 | 212 | def jsonify_rdf(self): 213 | return {'sentence': self.sentence, 214 | 'parse_tree': self.parse_tree.unicode_repr(), 215 | 'predicate': {'word': self.predicate.word, 216 | 'POS': self.predicate.pos, 217 | 'Word Attributes': self.predicate.attr, 218 | 'Tree Attributes': self.predicate.attr_trees}, 219 | 'subject': {'word': self.subject.word, 'POS': self.subject.pos, 220 | 'Word Attributes': self.subject.attr, 221 | 'Tree Attributes': self.subject.attr_trees}, 222 | 'object': {'word': self.Object.word, 'POS': self.Object.pos, 223 | 'Word Attributes': self.Object.attr, 224 | 'Tree Attributes': self.Object.attr_trees}, 225 | 'rdf': [self.subject.word, self.predicate.word, 226 | self.Object.word] 227 | } 228 | 229 | def main(self): 230 | self.clear_data() 231 | self.parse_tree = self.parser.raw_parse(self.sentence)[0] 232 | self.find_NP(self.parse_tree) 233 | self.find_subject(self.first_NP) 234 | self.find_predicate(self.first_VP) 235 | if self.subject.word == '' and self.first_NP != '': 236 | self.subject.word = self.first_NP.leaves()[0] 237 | self.predicate.word, self.predicate.depth, self.predicate.parent, self.predicate.grandparent = self.find_deepest_predicate() 238 | self.find_object() 239 | self.subject.attr, 
self.subject.attr_trees = self.get_attributes( 240 | self.subject.pos, self.subject.parent, self.subject.grandparent) 241 | self.predicate.attr, self.predicate.attr_trees = self.get_attributes( 242 | self.predicate.pos, self.predicate.parent, 243 | self.predicate.grandparent) 244 | self.Object.attr, self.Object.attr_trees = self.get_attributes( 245 | self.Object.pos, self.Object.parent, self.Object.grandparent) 246 | self.answer = self.jsonify_rdf() 247 | 248 | 249 | if __name__ == '__main__': 250 | # try: 251 | # sentence = sys.argv[1] 252 | # sentence = 'A rare black squirrel has become a regular visitor to a suburban garden' 253 | # except IndexError: 254 | # print("Enter in your sentence") 255 | # sentence = 'A rare black squirrel has become a regular visitor to a suburban garden' 256 | # print("Heres an example") 257 | # print(sentence) 258 | 259 | # sentence = 'The boy dunked the basketball' 260 | sentence = 'They also made the substance able to last longer in the bloodstream, which led to more stable blood sugar levels and less frequent injections.' 261 | #sentence = 'A rare black squirrel has become a regular visitor to a 262 | # suburban garden' 263 | rdf = RDF_Triple(sentence) 264 | rdf.main() 265 | 266 | ans = rdf.answer 267 | print(ans) -------------------------------------------------------------------------------- /cleaning.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains functions for cleaning movie scripts (or short stories) 4 | 5 | input directory: m_scripts or short_stories 6 | output directory: m_scripts_clean or short_stories_clean 7 | 8 | The code in this file cleans one movie script file at a time. It takes each 9 | input movie script file from the folder `m_scripts` and outputs a new file 10 | to the folder `m_scripts_clean`. 11 | 12 | It removes contractions like "didn't", and replaces exclusively unicode 13 | symbols by their closest ANSII analogues (e.g., curly quotes are replaced by 14 | straight quotes). 15 | 16 | It uses the software SpaCy to break up the movie script into separate 17 | sentences, and returns a file with only one sentence per line. 18 | 19 | For the case of movie scripts (but not for short stories), it also tries to 20 | distinguish between dialog lines and narration lines. In many but not all 21 | movie scripts, the dialog lines are indented with respect to the narration 22 | lines. In the case of Pixar/Disney, they don't indent dialog. In cases where 23 | the movie script indents, the MM software gives the option of throwing away 24 | all the dialog lines and keeping only the narration ones. Folders ending in 25 | `_rd` are for remove dialog files. 26 | 27 | Occasionally in this file, we use regex (via the Python module `re`). 28 | Here is a nice reference on `re`. 29 | 30 | https://www.datacamp.com/tutorial/python-regular-expression-tutorial 31 | 32 | ChatGPT is also very good at answering regex questions. 33 | 34 | """ 35 | import re 36 | import os 37 | # sentence splitting with NLTK 38 | # from nltk.tokenize import sent_tokenize 39 | import collections as co 40 | from globals import * 41 | from unidecode import unidecode 42 | import contractions 43 | from utils import * 44 | 45 | # sentence splitting with spacy 46 | import spacy 47 | 48 | nlp = spacy.load('en_core_web_sm') 49 | 50 | 51 | def expand_contractions(line): 52 | """ 53 | This auxiliary method replaces all contractions in the string `line` by 54 | expansions thereof (e.g., replaces "didn't" by "did not".) 
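    For example (illustrative; the replacement table comes from the
    `contractions` package):

        expand_contractions("She didn't know")  # -> "She did not know"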
55 | 56 | Parameters 57 | ---------- 58 | line: str 59 | 60 | Returns 61 | ------- 62 | str 63 | 64 | """ 65 | str_list = [] 66 | for word in line.split(): 67 | str_list.append(contractions.fix(word)) 68 | return ' '.join(str_list) 69 | 70 | 71 | def clean_one_m_script(in_dir, 72 | out_dir, 73 | file_name, 74 | remove_dialog=False): 75 | """ 76 | in_dir and out_dir can be the same, but this will overwrite the files. 77 | 78 | This method reads a file called `file_name` in the `in_dir` directory 79 | and creates a clean version in the `out_dir` directory. 80 | 81 | Parameters 82 | ---------- 83 | in_dir: str 84 | out_dir: str 85 | file_name: str 86 | remove_dialog: bool 87 | True iff dialog part of the movie script is removed, leaving only 88 | the narration part. 89 | 90 | Returns 91 | ------- 92 | None 93 | 94 | """ 95 | 96 | print('fetching %s' % file_name) 97 | 98 | def count_leading_wh_sp(str0): 99 | # wh_sp = white space 100 | count = 0 101 | if str0: 102 | for char in str0: 103 | if char.isspace(): 104 | count += 1 105 | else: 106 | break 107 | return count 108 | 109 | inpath = in_dir + "/" + file_name 110 | outpath = out_dir + "/" + file_name 111 | 112 | with open(inpath, "r", encoding='utf-8') as f: 113 | lines = [line for line in f] 114 | 115 | # Replace exclusively unicode characters by ascii analogues (e.g., 116 | # replace curly quotes by straight ones) so don't have to use 117 | # encoding="utf-8" as a parameter in open() from here on. 118 | lines = [unidecode(line) for line in lines] 119 | 120 | # expand contractions 121 | lines = [expand_contractions(line) for line in lines] 122 | 123 | # strip trailing (i.e., right) white space and newline. 124 | # If this results in an empty line, remove it. 125 | new_lines = [] 126 | for line in lines: 127 | line = line.rstrip() 128 | if line: 129 | new_lines.append(line) 130 | lines = new_lines 131 | 132 | # remove everything after and including THE END 133 | new_lines = [] 134 | for line in lines: 135 | if line.strip() in ["THE END", "END"]: 136 | break 137 | else: 138 | new_lines.append(line) 139 | lines = new_lines 140 | 141 | # regex for parenthetical remarks 142 | pattern_paren = re.compile(r'\[(.*?)\]|\((.*?)\)|\{(.*?)\}') 143 | # regex for period followed by white spaces + number 144 | pattern_period = r"\.(?=\s*\d)" 145 | 146 | # Substitutions. If subs results in empty line, remove it. 147 | new_lines = [] 148 | for line in lines: 149 | # print("ssdf", line) 150 | # remove parenthetical remarks 151 | line = re.sub(pattern_paren, "", line) 152 | # remove the underscore, which is not 153 | # considered a punctuation mark. 154 | line = re.sub(r'[_]', '', line) 155 | # Replace tabs by 12 blank spaces 156 | line = re.sub(r"\t", " " * 12, line) 157 | # replace period by dash if period followed by number 158 | line = re.sub(pattern_period, "-", line) 159 | # print("\tssdf", line) 160 | if len(line) >= 1: 161 | new_lines.append(line) 162 | lines = new_lines 163 | 164 | # Add missing periods for transitions from dialog to narration or vice 165 | # versa 166 | indent = count_leading_wh_sp(lines[0]) 167 | for i in range(len(lines)): 168 | if i != len(lines) - 1: 169 | next_indent = count_leading_wh_sp(lines[i + 1]) 170 | if indent != next_indent and \ 171 | not lines[i][-1] in [".", "!", "?"]: 172 | lines[i] = lines[i] + "." 173 | else: 174 | next_indent = None 175 | if not lines[i][-1] in [".", "!", "?"]: 176 | lines[i] = lines[i] + "." 
177 | indent = next_indent 178 | 179 | # Regex for string that contains at least 2 lower case letters 180 | # Found cases where line was just "is." 181 | pattern_lc = re.compile(r'^(.*[a-z]){2,}.*$') 182 | 183 | # Reject lines that don't contain at least 2 lower case letters string. 184 | # This gets rid of scene directions and character invocations. 185 | lines = [line for line in lines if re.search(pattern_lc, line)] 186 | 187 | white_spaces = [count_leading_wh_sp(line) for line in lines] 188 | # Counter returns dictionary mapping item to its number of repetitions 189 | wh_sp_counter = co.Counter(white_spaces) 190 | # print("llkh", wh_sp_counter) 191 | sum_reps = sum(wh_sp_counter.values()) 192 | indent_prob_dist = co.OrderedDict() 193 | indents = [] 194 | for indent in sorted(wh_sp_counter, 195 | key=wh_sp_counter.get, 196 | reverse=True): 197 | prob = round(wh_sp_counter[indent] / sum_reps, 3) 198 | indent_prob_dist[indent] = prob 199 | indents.append(indent) 200 | # print("ddfg", indents) 201 | # print("ddfg", indent_prob_dist) 202 | print("indent prob dist =", [(indent, indent_prob_dist[indent]) \ 203 | for indent in indents[0:4]]) 204 | 205 | # likely dialog indents 206 | # most probable indent = indents[0] 207 | dial_indents = [indent for indent in indents if \ 208 | abs(indent - indents[0]) <= 3 and \ 209 | indent_prob_dist[indent] >= .01] 210 | 211 | ndial_indents = [indent for indent in indents \ 212 | if indent not in dial_indents] 213 | # likely narration indents 214 | narr_indents = [indent for indent in ndial_indents if \ 215 | abs(indent - ndial_indents[0]) <= 3 and \ 216 | indent_prob_dist[indent] >= .01] 217 | 218 | print("dialog indents=", dial_indents) 219 | print("narration indents=", narr_indents) 220 | 221 | # keep only narration (less likely than dialog) indentations. Also 222 | # remove smallest indentation. 223 | new_lines = [] 224 | for line in lines: 225 | indent = count_leading_wh_sp(line) 226 | if indent in dial_indents + narr_indents: 227 | if not narr_indents or not dial_indents: 228 | # there is no difference in indentation between narr and dial 229 | new_lines.append(line) 230 | else: 231 | if remove_dialog: 232 | if indent in narr_indents: 233 | new_lines.append(line[min(narr_indents):]) 234 | else: 235 | new_lines.append(line[min(narr_indents):]) 236 | lines = new_lines 237 | 238 | # print("nnuu", lines[0:15]) 239 | # print("nnuu", lines[-15:]) 240 | 241 | # remove enumeration markers. 242 | # pattern = re.compile(r"^[^a-zA-Z]*") 243 | # lines = [re.sub(pattern, "", line) for line in lines] 244 | 245 | # join lines to create new script 246 | lines = [line.strip() for line in lines if line] 247 | script = ' '.join(lines) 248 | 249 | # split script into sentences with NLTK 250 | # lines = sent_tokenize(script) 251 | 252 | # split script into sentences with spacy 253 | lines = nlp(script).sents 254 | # for line in lines: 255 | # print("zzzxc", line) 256 | 257 | # remove single character sentences 258 | lines = [line.text for line in lines if len(line.text) > 1] 259 | 260 | with open(outpath, "w") as f: 261 | for line in lines: 262 | f.write(line + "\n") 263 | 264 | 265 | def clean_batch_of_m_scripts( 266 | in_dir, out_dir, 267 | batch_file_names, 268 | remove_dialog=False): 269 | """ 270 | This method calls the method `clean_one_m_script` for all the file names 271 | in the list of file names `batch_file_names`. 
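    For example (a hypothetical call, assuming "up.txt" is present in the
    input directory):

        clean_batch_of_m_scripts("m_scripts", "m_scripts_clean",
                                 ["up.txt"], remove_dialog=True)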
272 | 273 | Parameters 274 | ---------- 275 | in_dir: str 276 | out_dir: str 277 | batch_file_names: list[str] 278 | remove_dialog: bool 279 | 280 | Returns 281 | ------- 282 | None 283 | 284 | """ 285 | 286 | all_file_names = my_listdir(in_dir) 287 | assert set(batch_file_names).issubset(set(all_file_names)) 288 | for file_name in batch_file_names: 289 | i = all_file_names.index(file_name) 290 | print('%i.' % (i + 1)) 291 | clean_one_m_script(in_dir, 292 | out_dir, 293 | file_name, 294 | remove_dialog=remove_dialog) 295 | 296 | 297 | if __name__ == "__main__": 298 | from globals import * 299 | 300 | 301 | def main1(): 302 | in_dir = "short_stories" 303 | out_dir = "short_stories_clean" 304 | batch_file_names = my_listdir(in_dir)[0:3] 305 | clean_batch_of_m_scripts( 306 | in_dir, out_dir, 307 | batch_file_names, 308 | remove_dialog=False) 309 | 310 | 311 | def main2(): 312 | remove_dialog = True 313 | clean_one_m_script( 314 | in_dir=M_SCRIPTS_DIR, 315 | out_dir=CLEAN_DIR if not remove_dialog else CLEAN_RD_DIR, 316 | file_name="up.txt", 317 | remove_dialog=remove_dialog) 318 | 319 | 320 | def main3(): 321 | remove_dialog = False 322 | # batch_file_names=my_listdir(M_SCRIPTS_DIR) 323 | batch_file_names = ["toy-story.txt", "up.txt", "wall-e.txt"] 324 | clean_batch_of_m_scripts( 325 | in_dir=M_SCRIPTS_DIR, 326 | out_dir=CLEAN_DIR if not remove_dialog else CLEAN_RD_DIR, 327 | batch_file_names=batch_file_names, 328 | remove_dialog=remove_dialog) 329 | 330 | 331 | # main1() 332 | # main2() 333 | main3() 334 | -------------------------------------------------------------------------------- /Dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from Node import * 3 | import pickle as pik 4 | from globals import * 5 | from utils import * 6 | 7 | import graphviz as gv 8 | from IPython.display import display, Image 9 | from PIL.Image import open as open_image 10 | 11 | 12 | class Dag: 13 | """ 14 | This class creates a DAG (directed acyclic graph) for the movie entitled 15 | `m_title`. The DAG has nodes `nodes` and arrows `arrows`. Each arrow has 16 | a two weights `num_acc` and `num_rej`. Those weights are the number of 17 | times the arrow has been accepted and rejected. They are stored in the 18 | dictionary `arrow_to_acc_rej_nums`. 19 | 20 | Attributes 21 | ---------- 22 | arrow_to_acc_rej_nums: dict[tuple(Node), tuple(int)] 23 | arrows: list[tuple[Node, Node]] 24 | arrows of self. Arrows are defined as a pair of Node objects. 25 | The first element of the pair is the origin of the arrow and the 26 | second is the target of the arrow. 27 | m_title: str 28 | nodes: list[Node] 29 | 30 | """ 31 | 32 | def __init__(self, m_title, simp_dir): 33 | """ 34 | Constructor 35 | 36 | Parameters 37 | ---------- 38 | m_title: str 39 | title of movie to which this DAG refers to. 40 | simp_dir: str 41 | the directory in which simplified files are stored, and from 42 | which objects of this class are constructed. 43 | """ 44 | self.m_title = m_title 45 | path = simp_dir + "/" + m_title + ".txt" 46 | with open(path, "r") as f: 47 | lines = [line for line in f] 48 | self.nodes = [] 49 | for time, line in enumerate(lines): 50 | if line.strip() not in [ZTZ_SEPARATOR, ""]: 51 | ztz_list = line.split(ZTZ_SEPARATOR) 52 | for place in range(len(ztz_list)): 53 | self.nodes.append(Node(time, place)) 54 | self.arrows = [] 55 | self.arrow_to_acc_rej_nums = {} 56 | 57 | def save_self(self, dag_dir): 58 | """ 59 | This method stores self as a pickled file. 
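        For example, if `self.m_title` is "up" and `dag_dir` is
        "m_scripts_dag_atlas", this writes the file
        "m_scripts_dag_atlas/up.pkl".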
60 | 61 | Parameters 62 | ---------- 63 | dag_dir: str 64 | Directory in which pickled file is stored. 65 | 66 | Returns 67 | ------- 68 | None 69 | 70 | """ 71 | path = dag_dir + "/" + self.m_title + ".pkl" 72 | with open(path, "wb") as f: 73 | pik.dump(self, f, protocol=pik.HIGHEST_PROTOCOL) 74 | 75 | def update_arrow(self, arrow, accepted): 76 | """ 77 | This method changes the tuple (num_accepted, num_rejected) of 78 | `arrow`. If accepted=True, num_accepted is increased by one. If 79 | accepted=False, num_rejected is increased by one. 80 | 81 | Parameters 82 | ---------- 83 | arrow: tuple[Node, Node] 84 | accepted: bool 85 | 86 | Returns 87 | ------- 88 | None 89 | 90 | """ 91 | if arrow not in self.arrows: 92 | self.arrows.append(arrow) 93 | self.arrow_to_acc_rej_nums[arrow] = [0, 0] 94 | if accepted: 95 | self.arrow_to_acc_rej_nums[arrow][0] += 1 96 | else: 97 | self.arrow_to_acc_rej_nums[arrow][1] += 1 98 | 99 | def build_node_to_clean_ztz_dict(self, 100 | clean_dir, 101 | skip_1st_line=False): 102 | """ 103 | This method builds from scratch and returns a dictionary called 104 | `nd_to_clean_ztz` that maps each node to a clean sentence. ztz 105 | stands for sentence. 106 | 107 | Parameters 108 | ---------- 109 | clean_dir: str 110 | directory of movie scripts after cleaning. 111 | 112 | Returns 113 | ------- 114 | dict(Node, str) 115 | 116 | """ 117 | path = clean_dir + "/" + self.m_title + ".txt" 118 | is_csv = False 119 | if not os.path.isfile(path): 120 | path = path.replace(".txt", ".csv") 121 | is_csv = True 122 | assert os.path.isfile(path) 123 | 124 | time_to_clean_ztz = {} 125 | with open(path, "r") as f: 126 | time = -1 127 | for line in f: 128 | time += 1 129 | if is_csv: 130 | if time == 0: 131 | continue 132 | else: 133 | time_to_clean_ztz[time - 1] = line.strip() 134 | else: 135 | time_to_clean_ztz[time] = line.strip() 136 | 137 | nd_to_clean_ztz = {} 138 | for nd in self.nodes: 139 | nd_to_clean_ztz[nd] = time_to_clean_ztz[nd.time] 140 | 141 | return nd_to_clean_ztz 142 | 143 | def build_node_to_simple_ztz_dict(self, simp_dir): 144 | """ 145 | This method builds from scratch and returns a dictionary called 146 | `nd_to_simple_ztz` that maps each node to a simplified sentence. ztz 147 | stands for sentence. 148 | 149 | 150 | Parameters 151 | ---------- 152 | simp_dir: str 153 | directory of movie scripts after simplifying. 154 | 155 | Returns 156 | ------- 157 | dict(Node, str) 158 | 159 | """ 160 | path = simp_dir + "/" + self.m_title + ".txt" 161 | 162 | time_to_simp_ztz_list = {} 163 | with open(path, "r") as f: 164 | time = -1 165 | for line in f: 166 | time += 1 167 | if line.strip() != ZTZ_SEPARATOR: 168 | time_to_simp_ztz_list[time] = \ 169 | line.split(ZTZ_SEPARATOR) 170 | 171 | nd_to_simp_ztz = {} 172 | for nd in self.nodes: 173 | nd_to_simp_ztz[nd] = \ 174 | time_to_simp_ztz_list[nd.time][nd.place].strip() 175 | 176 | return nd_to_simp_ztz 177 | 178 | def build_high_prob_acc_arrows(self, 179 | prob_acc_thold, 180 | nsam_thold): 181 | """ 182 | This method builds from scratch and returns a list of all arrows 183 | whose `prob_acc` (i.e., probability of acceptance) is >= 184 | `prob_acc_thold` with `nsam` (i.e., number of samples used to 185 | calculate that probability) >= `nsam_thold`. 
thold = threshold 186 | 187 | Parameters 188 | ---------- 189 | prob_acc_thold: float 190 | nsam_thold: int 191 | 192 | Returns 193 | ------- 194 | list[tuple[Node, Node]] 195 | 196 | """ 197 | high_prob_arrows = [] 198 | for arrow in self.arrows: 199 | prob_acc, nsam = get_prob_acc_and_nsam( 200 | *self.arrow_to_acc_rej_nums[arrow]) 201 | if prob_acc >= prob_acc_thold and \ 202 | nsam >= nsam_thold: 203 | high_prob_arrows.append(arrow) 204 | return high_prob_arrows 205 | 206 | def print_map_legend(self, 207 | clean_dir, 208 | simp_dir, 209 | prob_acc_thold, 210 | nsam_thold): 211 | """ 212 | This method prints the DAG Rosetta stone (map legend). 213 | 214 | For each node labeled `( time, place)`, this method prints the 215 | simplified clause ( i.e., simplified sentence) in line `time` of the 216 | simplified file, after a number `place` of separator-tokens. It also 217 | prints the original sentence from which that simplified clause came 218 | from. The full sentence is preceded by the label `(full)` and the 219 | simplified sentence by the label `(part)`. 220 | 221 | It only prints the `(full)` and `(part)` for those nodes that appear 222 | in the DAG, after removing all arrows with probability of acceptance 223 | < `prob_acc_thold` or number of sample used to calculate that 224 | probability < `nsam_thold`. 225 | 226 | Parameters 227 | ---------- 228 | clean_dir: str 229 | directory of movie scripts after cleaning 230 | simp_dir: str 231 | directory of movie scripts after simplification 232 | prob_acc_thold: float 233 | nsam_thold: int 234 | 235 | Returns 236 | ------- 237 | None 238 | 239 | """ 240 | hprob_arrows = self.build_high_prob_acc_arrows( 241 | prob_acc_thold, nsam_thold) 242 | print("MAP LEGEND") 243 | print("title:", self.m_title) 244 | print("prob of acceptance threshold:", prob_acc_thold) 245 | print("number of samples threshold:", nsam_thold) 246 | print("number of arrows shown:", len(hprob_arrows)) 247 | print("number of arrows dropped:", 248 | len(self.arrows) - len(hprob_arrows)) 249 | 250 | hprob_nodes = [] 251 | for arrow in hprob_arrows: 252 | if arrow[0] not in hprob_nodes: 253 | hprob_nodes.append(arrow[0]) 254 | if arrow[1] not in hprob_nodes: 255 | hprob_nodes.append(arrow[1]) 256 | 257 | hprob_nodes = sorted(hprob_nodes, key=lambda x: x.time) 258 | if clean_dir: 259 | nd_to_clean_ztz = self.build_node_to_clean_ztz_dict(clean_dir) 260 | else: 261 | nd_to_clean_ztz = None 262 | nd_to_simple_ztz = self.build_node_to_simple_ztz_dict(simp_dir) 263 | 264 | for nd in hprob_nodes: 265 | print(color.GREEN + color.BOLD + node_str(nd) + ":" + color.END) 266 | ztz0 = "" 267 | if nd_to_clean_ztz: 268 | ztz0 = nd_to_clean_ztz[nd] 269 | print("(FULL)", ztz0) 270 | print("(PART)", nd_to_simple_ztz[nd]) 271 | 272 | @staticmethod 273 | def draw_dot(s, j_embed): 274 | """ 275 | This method draws a dot string. 276 | 277 | Using display(s) will draw the graph but will not embed it permanently 278 | in the notebook. To embed it permanently, must generate temporary image 279 | file and use Image().display(s) 280 | 281 | Parameters 282 | ---------- 283 | s: output of graphviz Source(dot_str) 284 | j_embed: bool 285 | True iff want to embed image in jupyter notebook. If you are 286 | using a python terminal instead of a jupyter notebook, 287 | only j_embed=False will draw image. 
288 | 289 | Returns 290 | ------- 291 | None 292 | """ 293 | x = s.render("tempo", format='png', view=False) 294 | if j_embed: 295 | display(Image(x)) 296 | else: 297 | open_image("tempo.png").show() 298 | 299 | def draw(self, prob_acc_thold, nsam_thold, jupyter=False): 300 | """ 301 | This method draws the graph for self. Only arrows with 302 | `prob_acceptance` >= `prob_acc_thold` are drawn. 303 | 304 | Parameters 305 | ---------- 306 | prob_acc_thold: float 307 | nsam_thold: int 308 | jupyter: bool 309 | 310 | Returns 311 | ------- 312 | None 313 | 314 | """ 315 | hprob_arrows = self.build_high_prob_acc_arrows( 316 | prob_acc_thold, nsam_thold) 317 | 318 | dot = "digraph {\n" 319 | for arrow in hprob_arrows: 320 | prob_acc, nsam = get_prob_acc_and_nsam( 321 | *self.arrow_to_acc_rej_nums[arrow]) 322 | X = '"' + str(prob_acc) + " (" + str(nsam) + ")" + '"' 323 | dot += '"' + node_str(arrow[0]) + '"' + "->" + \ 324 | '"' + node_str(arrow[1]) + '"' + \ 325 | ' [label=' + X + "];\n" 326 | dot += 'labelloc="b";\n' 327 | dot += 'label="' + self.m_title + '";\n' 328 | dot += "}\n" 329 | # print("vvbn", dot) 330 | Dag.draw_dot(gv.Source(dot), j_embed=jupyter) 331 | 332 | 333 | if __name__ == "__main__": 334 | def main1(prob_acc_thold, nsam_thold, draw): 335 | dag_dir = "short_stories_dag_atlas" 336 | simp_dir = "short_stories_simp" 337 | clean_dir = "short_stories_clean" 338 | file_names = [file_name for 339 | file_name in my_listdir(dag_dir)[0:3]] 340 | dags = [] 341 | for fname in file_names: 342 | path = dag_dir + "/" + fname 343 | # print("ghty", path) 344 | with open(path, "rb") as f: 345 | dag = pik.load(f) 346 | dags.append(dag) 347 | for dag in dags: 348 | print("==================================") 349 | print(dag.m_title) 350 | hprob_arrows = dag.build_high_prob_acc_arrows( 351 | prob_acc_thold, nsam_thold) 352 | print({arrow_str(arrow): 353 | dag.arrow_to_acc_rej_nums[arrow] \ 354 | for arrow in hprob_arrows}) 355 | print() 356 | if draw: 357 | dag.draw(prob_acc_thold, nsam_thold) 358 | dag.print_map_legend(clean_dir, simp_dir, prob_acc_thold) 359 | 360 | 361 | main1(prob_acc_thold=.90, nsam_thold=2, draw=True) 362 | -------------------------------------------------------------------------------- /spell_checking.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains functions for spell-checking movie scripts (or short 4 | stories) 5 | 6 | input directory: m_scripts_clean or short_stories_clean 7 | output directory: m_scripts_spell or short_stories_spell 8 | 9 | Refs: 10 | https://pyspellchecker.readthedocs.io/en/latest/code.html 11 | 12 | Spell checkers that don't take context into consideration don't work too 13 | well. They replace very infrequent words (probably a misspelled word) by 14 | more frequent ones---a very error prone stategy. Spell checkers that do take 15 | context into consideration are better, but we didn't have access to them. 16 | 17 | This is a very conservative spell checker that doesn't know about context. 18 | We make its corrections conservative by constraining it to follow the 19 | following rules. 20 | 21 | 1. It doesn't change the spelling if a word starts with a capital letter. 22 | 23 | 2. if the word ends in "s" or "ed", it only considers replacements that 24 | also end in "s" or "ed" 25 | 26 | 3. it assumes that the first 2 letters of all words are always correct. 27 | 28 | 4. It retains the capitalization of the first letter of a word. 29 | 30 | 5. It retains punctuation 31 | 32 | 6. 
When looking for a double letter that should be a single letter or 33 | vice versa, it only considers guesses that have the same set of characters 34 | (e.g., "pool" and "pol") 35 | 36 | This spell checker also uses two word-to-reps dictionaries: a global one 37 | compiled from global usage, and a local one compiled from the local document 38 | (i.e., movie script or short story) being corrected. 39 | 40 | This spell checker also uses two agents (WordGuess objects) working 41 | separately to produce their best possible guess. 42 | 43 | """ 44 | from globals import * 45 | from spellchecker import SpellChecker # package is called pyspellchecker 46 | import os 47 | import re 48 | from WordGuesser import * 49 | from collections import defaultdict 50 | from utils import * 51 | 52 | 53 | def has_double_letter(word): 54 | """ 55 | This method returns True iff `word` has two consecutive letters that are 56 | the same. 57 | 58 | Parameters 59 | ---------- 60 | word: str 61 | 62 | Returns 63 | ------- 64 | bool 65 | 66 | """ 67 | pattern = r'(\w)\1' 68 | match = re.search(pattern, word) 69 | if match: 70 | return True 71 | else: 72 | return False 73 | 74 | 75 | def fancy_split(in_ztz): 76 | """ 77 | This method first adds a white space before and after punctuation marks 78 | in `in_ztz`, then it applies a `split()` on the new sentence and returns 79 | the list generated by the split(). 80 | 81 | Parameters 82 | ---------- 83 | in_ztz: str 84 | 85 | Returns 86 | ------- 87 | list[str] 88 | 89 | """ 90 | # Match any pattern that is not a word character 91 | # or a white space. 92 | # This is the same as a punctuation mark. 93 | punctuation_pattern = re.compile(r'([^\w\s])+') 94 | # add a whitespace before and after each punctuation mark 95 | in_ztz0 = punctuation_pattern.sub(r' \1 ', in_ztz) 96 | return in_ztz0.split() 97 | 98 | 99 | def get_word_to_reps(in_file_path): 100 | """ 101 | This method returns a dictionary `word_to_reps` and an int 102 | `local_word_count`. `word_to_reps` is a dictionary mapping each word to 103 | its number of repetitions, for the file located at `in_file_path`. 104 | The method also returns an int `local_word_count` which equals the number 105 | of distinct words in the file located at `in_file_path`. 106 | 107 | 108 | Parameters 109 | ---------- 110 | in_file_path: str 111 | 112 | Returns 113 | ------- 114 | dict[str, int], int 115 | 116 | """ 117 | # tempo dictionary words are lower case 118 | word_to_reps = defaultdict(lambda: 0) 119 | with open(in_file_path, "r") as f: 120 | local_word_count = 0 121 | for line in f: 122 | words = fancy_split(line) 123 | for word in words: 124 | word = word.lower() 125 | if word.isalpha() and len(word) >= 2: 126 | local_word_count += 1 127 | if word in word_to_reps: 128 | word_to_reps[word] += 1 129 | else: 130 | word_to_reps[word] = 1 131 | 132 | return word_to_reps, local_word_count 133 | 134 | 135 | def get_corrected_sentence(in_ztz, 136 | global_checker, 137 | error_type, 138 | word_to_reps=None, 139 | local_word_count=None): 140 | """ 141 | This method takes a sentence `in_zstz` as input and returns a corrected 142 | sentence. It uses two dictionaries to guess an answer: global_checker, 143 | word_to_reps. 144 | 145 | If `word_to_reps` is kept at None, no local dictionary is used. The 146 | function `get_word_to_reps()` returns both `word_to_reps` and 147 | `local_word_count`. 
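    For example (a hypothetical call; which words actually get corrected
    depends on the global dictionary and on SPELLING_CORRECTION_RISK):

        checker = SpellChecker(distance=1)
        corr_ztz, changes = get_corrected_sentence(
            "It was a beautifull day.", checker, error_type="all")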
148 | 149 | `error_type` must be in the list ["tt", "random", "all"] 150 | 151 | Parameters 152 | ---------- 153 | in_ztz: str 154 | global_checker: SpellChecker 155 | error_type: str 156 | must be in ["tt", "random", "all"] 157 | word_to_reps: dict[str, int] 158 | local_word_count: int 159 | 160 | Returns 161 | ------- 162 | str, list[tuple(str, str)] 163 | 164 | """ 165 | if word_to_reps: 166 | assert local_word_count 167 | 168 | def implies(x, y): 169 | return (not x) or y 170 | 171 | words = fancy_split(in_ztz) 172 | # print("dfgh", words) 173 | best_guesses = [] 174 | changes = [] 175 | for word in words: 176 | capitalized = word[0].isupper() 177 | word = word.lower() 178 | best_guess = word 179 | prob_global_for_word = global_checker.word_usage_frequency(word) 180 | if word.isalpha() and len(word) >= 2 and \ 181 | prob_global_for_word < SPELLING_CORRECTION_RISK \ 182 | and not capitalized: 183 | word_guessers = {} 184 | simple_error_types = ["tt", "random"] 185 | if error_type in simple_error_types: 186 | word_guessers[error_type] = \ 187 | WordGuesser(word, global_checker, 188 | word_to_reps, local_word_count) 189 | if error_type == "all": 190 | for err in simple_error_types: 191 | word_guessers[err] = \ 192 | WordGuesser(word, global_checker, 193 | word_to_reps, local_word_count) 194 | assert word_guessers 195 | 196 | for guess in global_checker.edit_distance_1(word): 197 | cond1 = (guess[0:2] == word[0:2]) 198 | cond2a = implies(word[-1] == "s", guess[-1] == "s") 199 | cond2b = implies(word[-2:] == "ed", guess[-2:] == "ed") 200 | 201 | if cond1 and cond2a and cond2b: 202 | # this fixes tt, ss, dd, ll, errors 203 | if error_type in ["tt", "all"]: 204 | cond4 = (has_double_letter(guess) or has_double_letter( 205 | word)) and (len(guess) != len(word)) and set( 206 | guess) == set(word) 207 | if cond4: 208 | word_guessers['tt'].do_update(guess) 209 | if error_type in ["random", "all"]: 210 | word_guessers["random"].do_update(guess) 211 | guesser0 = None 212 | prob0 = -1 213 | for guesser in word_guessers.values(): 214 | # print("fgyt", guesser) 215 | if guesser.prob_for_best_guess > prob0: 216 | guesser0 = guesser 217 | prob0 = guesser.prob_for_best_guess 218 | best_guess = guesser0.best_guess 219 | if capitalized: 220 | word = word[0].upper() + word[1:] 221 | best_guess = best_guess[0].upper() + best_guess[1:] 222 | best_guesses.append(best_guess) 223 | if word != best_guess: 224 | changes.append((word, best_guess)) 225 | 226 | return " ".join(best_guesses), changes 227 | 228 | 229 | def correct_this_file(in_dir, 230 | out_dir, 231 | file_name, 232 | error_type, 233 | verbose=True, 234 | use_local_dict=False): 235 | """ 236 | This method reads a file called `file_name` in the `in_dir` directory 237 | and creates a spelling corrected version in the `out_dir` directory. 
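    For example (a hypothetical call, assuming "wiltons-holiday.txt" exists
    in "short_stories_clean"):

        correct_this_file("short_stories_clean", "short_stories_spell",
                          "wiltons-holiday.txt", error_type="all",
                          verbose=False, use_local_dict=True)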
238 | 239 | in_dir and out_dir can be the same, but this will overwrite the files 240 | 241 | Parameters 242 | ---------- 243 | in_dir: str 244 | out_dir: str 245 | file_name: str 246 | error_type: str 247 | verbose: bool 248 | use_local_dict: bool 249 | 250 | Returns 251 | ------- 252 | None 253 | 254 | """ 255 | inpath = in_dir + "/" + file_name 256 | if out_dir: 257 | outpath = out_dir + "/" + file_name 258 | else: 259 | outpath = None 260 | 261 | global_checker = SpellChecker(distance=1) 262 | if use_local_dict: 263 | word_to_reps, local_word_count = get_word_to_reps(inpath) 264 | else: 265 | word_to_reps, local_word_count = None, None 266 | # print("nmjk", local_word_count, word_to_reps) 267 | 268 | # this didn't work. It merges TEMPO_DICT_FILE with global dict 269 | # instead of producing a dict solely from TEMP0_DICT_FILE 270 | # checker_local.word_frequency.load_dictionary("./" + TEMPO_DICT_FILE) 271 | 272 | if verbose: 273 | def print_probs(word1, word2): 274 | print() 275 | print("global probs:") 276 | print(word1, global_checker.word_usage_frequency(word1)) 277 | print(word2, global_checker.word_usage_frequency(word2)) 278 | print("local_probs:") 279 | if word_to_reps: 280 | print(word1, word_to_reps[word1]) 281 | print(word2, word_to_reps[word2]) 282 | else: 283 | print("N/A") 284 | print() 285 | 286 | print_probs("beautifull", "beautiful") 287 | print_probs("tomatos", "tomatoes") 288 | print_probs("mitty", "misty") 289 | 290 | corrected_lines = [] 291 | all_changes = [] 292 | with open(inpath, "r") as f: 293 | for line in f: 294 | corr_line, changes = get_corrected_sentence( 295 | line, global_checker, error_type, 296 | word_to_reps, local_word_count) 297 | corrected_lines.append(corr_line) 298 | all_changes += changes 299 | if verbose: 300 | print(line.strip()) 301 | print(corr_line) 302 | print() 303 | print("all changes:", all_changes) 304 | 305 | if outpath: 306 | with open(outpath, "w") as f: 307 | for corr_line in corrected_lines: 308 | f.write(corr_line + "\n") 309 | 310 | 311 | def correct_this_batch_of_files(in_dir, 312 | out_dir, 313 | batch_file_names, 314 | error_type, 315 | verbose=True, 316 | use_local_dict=False): 317 | """ 318 | This method calls the method `correct_this_file` for all the file names 319 | in the list of file names `batch_file_names`. 320 | 321 | Parameters 322 | ---------- 323 | in_dir: str 324 | out_dir: str 325 | batch_file_names: list[str] 326 | error_type: str 327 | verbose: bool 328 | use_local_dict: bool 329 | 330 | Returns 331 | ------- 332 | None 333 | 334 | """ 335 | all_file_names = my_listdir(in_dir) 336 | assert set(batch_file_names).issubset(set(all_file_names)) 337 | for file_name in batch_file_names: 338 | i = all_file_names.index(file_name) 339 | print(str(i + 1) + ".") 340 | print(file_name) 341 | correct_this_file(in_dir, out_dir, file_name, 342 | error_type, 343 | verbose, 344 | use_local_dict) 345 | 346 | 347 | if __name__ == "__main__": 348 | def main1(use_local_dict, error_type): 349 | print("**************************") 350 | print("use_local_dict=", use_local_dict) 351 | print("error_type=", error_type) 352 | print("SPELLING_CORRECTION_RISK=", SPELLING_CORRECTION_RISK) 353 | print() 354 | 355 | in_dir = "." 
356 | out_dir = "" # if empty out_dir, won't write to a file 357 | file_name = "spell_checking_test.txt" 358 | 359 | correct_this_file(in_dir, 360 | out_dir, 361 | file_name, 362 | error_type, 363 | verbose=True, 364 | use_local_dict=use_local_dict) 365 | 366 | 367 | def main2(use_local_dict, error_type): 368 | print("**************************") 369 | print("use_local_dict=", use_local_dict) 370 | print("error_type=", error_type) 371 | print("SPELLING_CORRECTION_RISK=", SPELLING_CORRECTION_RISK) 372 | print() 373 | 374 | in_dir = "short_stories_clean" 375 | out_dir = "short_stories_spell" 376 | batch_file_names = my_listdir(in_dir) 377 | correct_this_batch_of_files(in_dir, 378 | out_dir, 379 | batch_file_names, 380 | error_type=error_type, 381 | verbose=False, 382 | use_local_dict=use_local_dict) 383 | 384 | 385 | def main3(use_local_dict, error_type): 386 | print("**************************") 387 | print("use_local_dict=", use_local_dict) 388 | print("error_type=", error_type) 389 | print("SPELLING_CORRECTION_RISK=", SPELLING_CORRECTION_RISK) 390 | print() 391 | 392 | remove_dialogs = False 393 | in_dir = CLEAN_DIR if not remove_dialogs else CLEAN_RD_DIR 394 | out_dir = SPELL_DIR if not remove_dialogs else SPELL_RD_DIR 395 | batch_file_names = my_listdir(in_dir)[0:3] 396 | correct_this_batch_of_files(in_dir, 397 | out_dir, 398 | batch_file_names, 399 | error_type=error_type, 400 | verbose=False, 401 | use_local_dict=use_local_dict) 402 | 403 | 404 | # main1(use_local_dict=True, error_type="all") 405 | # main2(use_local_dict=True, error_type="all") 406 | main3(use_local_dict=True, error_type="all") 407 | -------------------------------------------------------------------------------- /white_paper/bayesuvius.sty: -------------------------------------------------------------------------------- 1 | \usepackage{graphicx} %standard package. Note graphics0$, then 554 | 555 | \beqa 556 | P(y. | \cald \rvx.=x.)&=&\sum_{m.} 557 | \underbrace{\left[\sum_{x'.} 558 | P(y.|x'., m.)P(x'.)\right]}_ 559 | {P(y.|\cald \rvm.=m.)} 560 | \underbrace{P(m.|x.)}_ 561 | {P(m.|\cald \rvx.=x.)} 562 | \\ 563 | &=& 564 | \xymatrix{ 565 | &\sum x'.\ar[dr] 566 | \\ 567 | x.´\ar[r] 568 | &\sum m.\ar[r]&y. 569 | } 570 | \eeqa 571 | where $\sum x'.$ and 572 | $\sum m.$ 573 | means nodes 574 | $\rvx'.$ and $\rvm.$ 575 | are summed over. 
576 | } 577 | 578 | 579 | %Symmetry 580 | \newcommand{\symrule}[0]{ 581 | $\rva\perp_P\rvb\implies \rvb\perp_P\rva$} 582 | 583 | \newcommand{\symruleH}[0]{ 584 | $H(\rva:\rvb)=0\implies H(\rvb:\rva)=0$} 585 | 586 | %Decomposition 587 | \newcommand{\decrule}[0]{ 588 | $\rva\perp_P\rvb, \rvc\implies 589 | \rva\perp_P\rvb \text{ and } \rva\perp_P\rvc$} 590 | 591 | \newcommand{\decruleH}[0]{ 592 | $H(\rva:\rvb, \rvc)=0\implies 593 | H(\rva:\rvb)=0 \text{ and } H(\rva:\rvc)=0$} 594 | 595 | %Weak Union 596 | \newcommand{\wearule}[0]{ 597 | $\rva\perp_P \rvb, \rvc \implies 598 | \rva\perp_P\rvb|\rvc\text{ and }\rva\perp_P\rvc|\rvb$} 599 | 600 | \newcommand{\wearuleH}[0]{ 601 | $H(\rva:\rvb, \rvc)=0 \implies 602 | H(\rva:\rvb|\rvc)=0\text{ and }H(\rva:\rvc|\rvb)=0$} 603 | 604 | %Contraction 605 | \newcommand{\conrule}[0]{ 606 | $\rva\perp_P\rvb|\rvc\text{ and }\rva\perp_P \rvc 607 | \implies \rva\perp_P \rvb, \rvc$} 608 | 609 | \newcommand{\conruleH}[0]{ 610 | $H(\rva:\rvb|\rvc)=0\text{ and }H(\rva:\rvc)=0 611 | \implies H(\rva:\rvb, \rvc)=0$} 612 | 613 | %Intersection 614 | \newcommand{\intrule}[0]{ 615 | $\rva\perp_P\rvb|\rvc, \rvd\text{ and } 616 | \rva\perp_P \rvd|\rvc, \rvb\implies 617 | \rva\perp_P \rvb,\rvd|\rvc$} 618 | 619 | \newcommand{\intruleH}[0]{ 620 | $H(\rva:\rvb|\rvc, \rvd)=0\text{ and } 621 | H(\rva:\rvd|\rvc, \rvb)=0\implies 622 | H(\rva:\rvb,\rvd|\rvc)=0$} 623 | 624 | \newcommand{\dotbarmu}[0]{{\cdot|\mu}} 625 | \newcommand{\dotmu}[0]{{\cdot, \mu}} 626 | \newcommand{\kbarmu}[0]{{k|\mu}} 627 | \newcommand{\kmu}[0]{{k,\mu}} 628 | \newcommand{\plusbarmu}[0]{{+|\mu}} 629 | \newcommand{\plusmu}[0]{{+,\mu}} 630 | 631 | \newcommand{\bnlearn}[0]{{\tt bnlearn\;}} 632 | 633 | \newcommand{\sqsig}[0]{{[\sigma]}} 634 | 635 | \newcommand{\misscellone}[0]{ 636 | \begin{array}{c} 637 | \frac{1}{nsam} 638 | P(x_0=0, x_2=0\cond x_1=1, \theta) 639 | \\ 640 | \frac{1}{nsam} 641 | P(x_0=0, x_2=1\cond x_1=1, \theta) 642 | \\ 643 | \frac{1}{nsam} 644 | P(x_0=1, x_2=0\cond x_1=1, \theta) 645 | \\ 646 | \frac{1}{nsam} 647 | P(x_0=1, x_2=1\cond x_1=1, \theta) 648 | \end{array} 649 | } 650 | 651 | \newcommand{\misscelltwo}[0]{ 652 | \begin{array}{c} 653 | \frac{1}{nsam} 654 | P(x_1=0\cond x_0=0,x_2=1, \theta) 655 | \\ 656 | \frac{1}{nsam} 657 | P(x_1=1\cond x_0=0,x_2=1, \theta) 658 | \end{array} 659 | } 660 | 661 | 662 | \newcommand{\td}[0]{{\TIL{d}}} 663 | \newcommand{\rvtd}[0]{{\ul{\TIL{d}}}} 664 | \newcommand{\tx}[0]{{\TIL{x}}} 665 | \newcommand{\tmu}[0]{{\TIL{\mu}}} 666 | \newcommand{\rvtx}[0]{{\ul{\TIL{x}}}} 667 | 668 | \newcommand{\mlarr}[0]{\xrightarrow{\rm ML-fit}} 669 | \newcommand{\lrarr}[0]{\xrightarrow{\rm LR-fit}} 670 | 671 | \newcommand{\setprob}[3] 672 | {{\begin{array}{c}S=\{#1\} 673 | \\P(S)=#2\\ \haty(x^\s_S)=\$#3 K 674 | \end{array}}} 675 | 676 | \newcommand{\Gno}[0]{\xymatrix{\;\ar[r]|\parallel_G&}} 677 | \newcommand{\Gyes}[0]{\xymatrix{\;\ar[r]_G&}} 678 | 679 | \newcommand{\calypso}[0]{\ol{\caly}} 680 | 681 | \newcommand{\SeqBdoorDef}[0]{ 682 | Suppose that we have access to data 683 | that allows us to 684 | estimate a probability 685 | distribution 686 | $P(x^n, y, z^n)$. 687 | Hence, the variables 688 | $\rvx^n, \rvy, \rvz^n$ are 689 | ALL observed (i.e, not hidden). 
690 | Then we say that the 691 | the multinode 692 | of ``covariates" $\rvz^n$ 693 | satisfies the 694 | {\bf sequential backdoor (SBD) adjustment criterion} 695 | relative to $(\rvx^n, \rvy)$ 696 | if for all $t\in\{0,1, \ldots, n-1\}$, 697 | 698 | \begin{enumerate} 699 | \item 700 | $\rvy\perp\rvx_t| 701 | \underbrace{(\rvx_0, \rvx_1, \ldots,\rvx_{t-1}, 702 | \rvz_0, \rvz_1, \ldots, \rvz_t)} 703 | _{\text{Past of $\rvx_t$}}$ 704 | in $\call_{\rvx_{t}} 705 | \cald_{\rvx_{t+1},\rvx_{t+2} 706 | ,\ldots, \rvx_{n-1}}G$. 707 | \item 708 | $\rvz_t \cap de(\rvx_t)=\emptyset$. 709 | \end{enumerate} 710 | } 711 | 712 | \newcommand{\SeqBdoorClaim}[0]{ 713 | If $\rvz^n$ satisfies the 714 | sequential backdoor criterion relative to 715 | $(\rvx^n, \rvy)$, then 716 | 717 | \beq 718 | P(y | \cald \rvx^n=x^n)= 719 | \calq(y|x^n) 720 | \;, 721 | \eeq 722 | where $\calq(y|x^n)$ 723 | is defined by 724 | Eq.(\ref{def-q-y-xn-seqbdoor}). 725 | } 726 | -------------------------------------------------------------------------------- /simp_deprecated/simp_stanford.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | Refs: 7 | https://github.com/garain/Sentence-Simplification 8 | ztz = sentence 9 | 10 | """ 11 | import nltk 12 | from nltk.tree import ParentedTree 13 | from anytree import NodeMixin, Node, AnyNode, RenderTree 14 | import re 15 | import os 16 | import subprocess 17 | from globals import * 18 | 19 | version = subprocess.check_output( 20 | ['java', '-version'], stderr=subprocess.STDOUT) 21 | print("java version=\t", version) 22 | print("CLASSPATH=\t", os.environ['CLASSPATH']) 23 | print("STANFORD_MODELS=\t", os.environ['STANFORD_MODELS']) 24 | print("JAVA_HOME=\t", os.environ['JAVA_HOME']) 25 | 26 | from nltk.parse.stanford import StanfordParser 27 | 28 | parser = StanfordParser() 29 | 30 | 31 | def simplify_ztz(sentence0, verbose=False): 32 | """ 33 | This method simplifies the sentence `sentence`. It returns a list of 34 | simple sentences extracted from the input sentence. 35 | 36 | Parameters 37 | ---------- 38 | sentence0: str 39 | verbose: bool 40 | kwargs: dict[] 41 | 42 | Returns 43 | ------- 44 | list[str] 45 | 46 | """ 47 | 48 | simple_ztz_list = [] 49 | success = False 50 | 51 | # split = [] 52 | # simple_sent = [] 53 | # index = [] 54 | # index1 = 0 55 | n = 0 56 | but = 0 57 | 58 | # scount = 0 59 | # parts = [] 60 | # ht_3_last_obj = [] 61 | 62 | def SBAR_simplify(sent): 63 | 64 | def make_tree(tree, t, sent_list): 65 | # this fn. 
converts nltk tree to anytree 66 | if tree not in sent_list: 67 | ttt = AnyNode(id=str(tree.label()), parent=t) 68 | for tt in tree: 69 | make_tree(tt, ttt, sent_list) 70 | else: 71 | AnyNode(id=str(tree), parent=t) 72 | 73 | # SBAR CASE 74 | def find_sbar(t): 75 | if t.id == 'SBAR': 76 | global sbar 77 | sbar = t 78 | for tt in t.children: 79 | find_sbar(tt) 80 | 81 | def find_vp_in_sbar(t): 82 | if t.id == 'VP': 83 | global vp_sbar 84 | vp_sbar.append(t) 85 | for tt in t.children: 86 | find_vp_in_sbar(tt) 87 | 88 | def find_np_in_sbar(t): 89 | global f 90 | global ff 91 | if t.id == 'VP': 92 | ff = False 93 | if (t.id == 'NP') and f == True and ff == True: 94 | global np_sbar 95 | np_sbar = t 96 | f = False 97 | for tt in t.children: 98 | find_np_in_sbar(tt) 99 | 100 | def find_vp(t): 101 | if t.id == 'SBAR': 102 | return 103 | global f 104 | if t.id == 'VP' and f == True: 105 | global vp 106 | vp = t 107 | f = False 108 | for tt in t.children: 109 | find_vp(tt) 110 | 111 | def find_np(t): 112 | if t.id == 'SBAR': 113 | return 114 | global f 115 | if t.id == 'NP' and f == True: 116 | global np 117 | np = t 118 | f = False 119 | for tt in t.children: 120 | find_np(tt) 121 | 122 | def find_vbz(t): 123 | if t.id == 'SBAR': 124 | return 125 | global f 126 | if t.id == 'VBZ' and f == True: 127 | global vbz 128 | vbz = t.children[0].id 129 | f = False 130 | for tt in t.children: 131 | find_vbz(tt) 132 | 133 | def make_sent(t): 134 | global simple_sentences 135 | if t.id in sent_list: 136 | simple_sentences[-1].append(t.id) 137 | for tt in t.children: 138 | make_sent(tt) 139 | 140 | # sent=sent8 141 | 142 | parse_trees = parser.raw_parse(sent) 143 | global sent_list 144 | sent_list = [s for s in sent.split()] 145 | tree = next(parse_trees)[0] 146 | # tree.draw() 147 | t = AnyNode(id='ROOT') 148 | make_tree(tree, t, sent_list) 149 | global sbar 150 | sbar = t 151 | global vp_sbar 152 | global f 153 | global ff 154 | global np_sbar 155 | global vp 156 | global np 157 | global vbz 158 | vp_sbar = [] 159 | vp = t 160 | np = t 161 | vbz = 'bn2' 162 | np_sbar = t 163 | find_sbar(t) 164 | find_vp_in_sbar(sbar) 165 | f = True 166 | ff = True 167 | find_np_in_sbar(sbar) 168 | f = True 169 | find_vp(t) 170 | f = True 171 | find_np(t) 172 | f = True 173 | find_vbz(t) 174 | global simple_sentences 175 | simple_sentences = [] 176 | simple_sentences.append([]) 177 | make_sent(np) 178 | make_sent(vp) 179 | for i in range(len(vp_sbar)): 180 | simple_sentences.append([]) 181 | if np_sbar == t: 182 | make_sent(np) 183 | else: 184 | make_sent(np_sbar) 185 | if vbz != 'bn2': 186 | simple_sentences[-1].append(vbz) 187 | make_sent(vp_sbar[i]) 188 | # print (simple_sentences) 189 | simple = [] 190 | for sentence in simple_sentences: 191 | string = '' 192 | for word in sentence: 193 | string += word + ' ' 194 | string += '.' 195 | simple.append(string) 196 | 197 | def is_any_sbar(t): 198 | if t.id == 'SBAR': 199 | global f 200 | f = True 201 | return 202 | for tt in t.children: 203 | is_any_sbar(tt) 204 | 205 | f = False 206 | is_any_sbar(t) 207 | if f == False: 208 | simple = [sent] 209 | return simple 210 | 211 | # print(pos_tagged) 212 | # SBAR functions start here 213 | def make_tree_sbar(tree, t, sent_list): 214 | # this fn. 
converts nltk tree to anytree 215 | if tree not in sent_list: 216 | ttt = AnyNode(id=str(tree.label()), parent=t) 217 | for tt in tree: 218 | make_tree_sbar(tt, ttt, sent_list) 219 | else: 220 | AnyNode(id=str(tree), parent=t) 221 | 222 | def find_sbar(t): 223 | if t.id == 'SBAR': 224 | global sbar 225 | sbar = t 226 | for tt in t.children: 227 | find_sbar(tt) 228 | 229 | def find_vp_in_sbar(t): 230 | if t.id == 'VP': 231 | global vp_sbar 232 | vp_sbar = t 233 | for tt in t.children: 234 | find_vp_in_sbar(tt) 235 | 236 | def find_vp(t): 237 | if t.id == 'SBAR': 238 | return 239 | global f 240 | if t.id == 'VP' and f == True: 241 | global vp 242 | vp = t 243 | f = False 244 | for tt in t.children: 245 | find_vp(tt) 246 | 247 | def find_np(t): 248 | if t.id == 'SBAR': 249 | return 250 | global f 251 | if t.id == 'NP' and f == True: 252 | global np 253 | np = t 254 | f = False 255 | for tt in t.children: 256 | find_np(tt) 257 | 258 | def find_vbz(t): 259 | if t.id == 'SBAR': 260 | return 261 | global f 262 | if t.id == 'VBZ' and f == True: 263 | global vbz 264 | vbz = t.children[0].id 265 | f = False 266 | for tt in t.children: 267 | find_vbz(tt) 268 | 269 | def make_sent(t): 270 | global simple_sentences 271 | if t.id in sent_list: 272 | simple_sentences[-1].append(t.id) 273 | for tt in t.children: 274 | make_sent(tt) 275 | 276 | # SBAR functions end here 277 | # Multiple CC functions start here 278 | def pos_tag(tokenized_sent): 279 | return nltk.pos_tag(tokenized_sent) 280 | 281 | def has_conj(tagged_sent): 282 | cc_list = [('and', 'CC'), ('but', 'CC')] 283 | for cc_pair in cc_list: 284 | if cc_pair in tagged_sent: 285 | return True 286 | return False 287 | 288 | def split_needed(sent_list): 289 | for sent in sent_list: 290 | if has_conj(pos_tag(tokenize(sent))): 291 | return True 292 | return False 293 | 294 | def do_split(sent, cc_tuple): 295 | pos_tagged = pos_tag(tokenize(sent)) 296 | tree = next(parser.tagged_parse(pos_tagged)) 297 | tree1 = ParentedTree.convert(tree) 298 | # tree.draw() 299 | count = 0 300 | m = 0 301 | for t in tree1.subtrees(): 302 | if t.label() == 'PP': 303 | count = count + 1 304 | 305 | index = [] 306 | index1 = 0 307 | if count > 0 and (('to') not in tokenized_sent and ( 308 | 'washed') not in tokenized_sent) and ( 309 | tokenized_sent.count(",") < 2): 310 | for i in range(len(pos_tagged) - 3): 311 | if (pos_tagged[i][1] == 'VBD' or pos_tagged[i][1] == 'VBZ') and \ 312 | pos_tagged[i + 1][1] != 'VBG' and pos_tagged[i + 3][ 313 | 1] != 'CC' and pos_tagged[i + 1][1] != 'NNP' and \ 314 | pos_tagged[i - 1][1] != 'CC': 315 | pos_tagged.insert(i + 1, (',', ',')) 316 | 317 | for j in range(len(pos_tagged)): 318 | if pos_tagged[j][1] == 'CC': 319 | index.append(j) 320 | 321 | for t in tree1.subtrees(): 322 | if t.label() == 'SBAR': 323 | m = m + 1 324 | if len(index) > 0 and count > 0 and m == 0: 325 | c = 0 326 | for i in range(len(index)): 327 | pos_tagged.insert(index[i] + c, (',', ',')) 328 | c = c + 1 329 | if m > 0: 330 | for j in range(len(pos_tagged)): 331 | if pos_tagged[j][1] == 'CC': 332 | index1 = j 333 | 334 | if (index1 > 0 and m > 0) and count == 0: 335 | pos_tagged.insert(index1, (' ,', ',')) # ', 'is used 336 | pos_tagged.insert(index1 + 2, (', ', ',')) # ' ,' is used 337 | # print(pos_tagged) 338 | tree = next(parser.tagged_parse(pos_tagged)) 339 | p_tree = ParentedTree.convert(tree) 340 | 341 | leaf_values = p_tree.leaves() 342 | parts = [] 343 | ht_3_last_obj = [] 344 | 345 | if cc_tuple in pos_tagged: 346 | leaf_index = 
leaf_values.index(cc_tuple[0]) 347 | tree_location = p_tree.leaf_treeposition(leaf_index) 348 | parent = p_tree[tree_location[:-2]] 349 | # print(parent.height()) 350 | 351 | if parent.height() == 3: 352 | # find the noun being referred to 353 | for subtree in reversed(list(parent.subtrees())): 354 | if subtree.parent() == parent: 355 | if subtree.label() == 'NN' or subtree.label() == 'NNS': 356 | ht_3_last_obj = subtree.leaves() + ht_3_last_obj 357 | del p_tree[subtree.treeposition()] 358 | # print("ht 3 last obj -> ", ht_3_last_obj) 359 | part = [] 360 | for subtree in reversed(list(parent.subtrees())): 361 | if subtree.parent() == parent: 362 | # print(subtree) 363 | if subtree.label() != ',' and subtree.label() != 'CC': 364 | part = subtree.leaves() + part 365 | else: 366 | parts.append(part + ht_3_last_obj) 367 | part = [] 368 | del p_tree[subtree.treeposition()] 369 | parts.append(part + ht_3_last_obj) 370 | # print('parent', parent) 371 | # print('treeloc', tree_location) 372 | parent.append(ParentedTree('INSRT', ['*'])) 373 | 374 | else: 375 | for subtree in reversed(list(parent.subtrees())): 376 | if subtree.parent() == parent: 377 | # print(subtree) 378 | if subtree.label() != ',' and subtree.label() != 'CC': 379 | parts.append(subtree.leaves() + ht_3_last_obj) 380 | del p_tree[subtree.treeposition()] 381 | # print('parent', parent) 382 | # print('treeloc', tree_location) 383 | parent.append(ParentedTree('INSRT', ['*'])) 384 | 385 | # p_tree.draw() 386 | # print(parts) 387 | 388 | split = [] 389 | rem = p_tree.leaves() 390 | start_idx = rem.index('*') 391 | 392 | for part in reversed(parts): 393 | offset = start_idx 394 | r_clone = rem.copy() 395 | del r_clone[offset] 396 | for i, word in enumerate(part): 397 | r_clone.insert(offset + i, word) 398 | split.append(r_clone) 399 | 400 | # print("split", split) 401 | 402 | split = [" ".join(sent) for sent in split] 403 | 404 | return split 405 | 406 | def split_util(sent): 407 | cc_list = [('and', 'CC'), ('but', 'CC')] 408 | for cc_pair in cc_list: 409 | if cc_pair in pos_tag(tokenize(sent)): 410 | return do_split(sent, cc_pair) 411 | return sent 412 | 413 | def rem_dup(list): 414 | final = [] 415 | for item in list: 416 | if item not in final: 417 | final.append(item) 418 | return final 419 | 420 | def simplify(sent): 421 | initial = [sent] 422 | final = [] 423 | 424 | while (split_needed(initial)): 425 | final = [] 426 | while (initial): 427 | sent = initial.pop(0) 428 | if (split_needed([sent])): 429 | for split_sent in reversed(split_util(sent)): 430 | final.append(split_sent) 431 | else: 432 | final.append(sent) 433 | # print("final -> ", final) 434 | initial = final.copy() 435 | 436 | final = rem_dup(final) 437 | final = list(reversed(final)) 438 | # print(final) 439 | 440 | return final 441 | 442 | def tokenize(sent): 443 | tokenized_sent = nltk.word_tokenize(sent) 444 | if ('If') in tokenized_sent and ('then') in tokenized_sent: 445 | tokenized_sent.remove('If') 446 | tokenized_sent.insert(tokenized_sent.index('then'), 'and') 447 | tokenized_sent.remove('then') 448 | if ('because') in tokenized_sent: 449 | tokenized_sent.insert(tokenized_sent.index('because'), 450 | (',')) # ', 'is used 451 | tokenized_sent.insert(tokenized_sent.index('because') + 1, (',')) 452 | tokenized_sent.insert(tokenized_sent.index('because'), 'and') 453 | tokenized_sent.remove('because') 454 | if ('while') in tokenized_sent: 455 | tokenized_sent.insert(tokenized_sent.index('while'), 'and') 456 | tokenized_sent.remove('while') 457 | if ('which') in 
tokenized_sent: 458 | tokenized_sent.insert(tokenized_sent.index('which'), 'and') 459 | tokenized_sent.remove('which') 460 | if ('or') in tokenized_sent: 461 | tokenized_sent.insert(tokenized_sent.index('or'), 'and') 462 | tokenized_sent.remove('or') 463 | if ('who') in tokenized_sent: 464 | while (',') in tokenized_sent: 465 | tokenized_sent.insert(tokenized_sent.index(','), 'and') 466 | tokenized_sent.remove(',') 467 | tokenized_sent.insert(tokenized_sent.index('who'), 'and') 468 | tokenized_sent.remove('who') 469 | 470 | return tokenized_sent 471 | 472 | sentences = [sentence0.strip()] 473 | for sentence in sentences: 474 | if verbose: 475 | print("Complex Sentence: " + sentence) 476 | tokenized_sent = tokenize(sentence) 477 | # print(tokenized_sent) 478 | 479 | # parse_trees = parser1.tagged_parse(pos_tagged) 480 | 481 | pos_tagged = pos_tag(tokenized_sent) 482 | parse_trees = parser.tagged_parse(pos_tagged) 483 | tree = next(parse_trees) 484 | p_tree = ParentedTree.convert(tree) 485 | # p_tree.draw() 486 | 487 | leaf_values = p_tree.leaves() 488 | # print(leaf_values) 489 | for i in pos_tagged: 490 | if ('and') in i: 491 | n = n + 1 492 | 493 | if ('but') in i: 494 | but = but + 1 495 | tree1 = ParentedTree.convert(tree) 496 | # tree.draw() 497 | m = 0 498 | for t in tree1.subtrees(): 499 | if t.label() == 'SBAR': 500 | m = m + 1 501 | 502 | if (n + but) > 0: 503 | # tokenized_sent=nltk.word_tokenize(sent10) 504 | # pos_tagged=nltk.pos_tag(tokenized_sent) 505 | sent1 = sentence 506 | sent = " ".join(tokenize(sent1)) 507 | # print(sent) 508 | simplified = simplify(sent) 509 | for i in simplified: 510 | i = list(i) 511 | if ord(i[0]) >= 97 and ord(i[0]) <= 122: 512 | i[0] = chr(ord(i[0]) - 32) 513 | while i.count(",") > 0: 514 | # i.pop(i.index(",")) 515 | del (i[i.index(",")]) 516 | if (".") not in (i): 517 | if verbose: 518 | print("Simple sentence: " + "".join(i) + ".") 519 | simple_ztz_list.append("".join(i) + ".") 520 | success = True 521 | else: 522 | if verbose: 523 | print("Simple sentence: " + "".join(i)) 524 | simple_ztz_list.append("".join(i)) 525 | success = True 526 | n = 0 527 | but = 0 528 | # print("."), 529 | 530 | elif n == 0 and m > 0 and len(re.findall(r",", sentence)) == 0 and len( 531 | re.findall(r"While", sentence)) == 0: 532 | try: 533 | sent = sentence 534 | # print(sent) 535 | # print("Hello") 536 | tokenized_sent = tokenize(sent) 537 | pos_tagged = nltk.pos_tag(tokenized_sent) 538 | parse_trees = parser.tagged_parse(pos_tagged) 539 | sent_list = [s for s in sent.split()] 540 | tree = next(parse_trees)[0] 541 | # tree.draw() 542 | t = AnyNode(id='ROOT') 543 | make_tree_sbar(tree, t, sent_list) 544 | sbar = t 545 | vp_sbar = t 546 | vp = t 547 | np = t 548 | vbz = 'asvf' 549 | find_sbar(t) 550 | find_vp_in_sbar(sbar) 551 | f = True 552 | find_vp(t) 553 | f = True 554 | find_np(t) 555 | f = True 556 | find_vbz(t) 557 | simple_sentences = [] 558 | simple_sentences.append([]) 559 | make_sent(np) 560 | make_sent(vp) 561 | simple_sentences.append([]) 562 | make_sent(np) 563 | if vbz != 'asvf': 564 | simple_sentences[-1].append(vbz) 565 | make_sent(vp_sbar) 566 | for i in simple_sentences: 567 | i = list(i) 568 | 569 | # if ord(i[0])>=97 and ord(i[0])<=122: 570 | # i[0]=chr(ord(i[0])-32) 571 | 572 | while i.count(",") > 0: 573 | i.pop(i.index(",")) 574 | if (".") not in (i): 575 | if verbose: 576 | print("Simple sentence: " + " ".join(i) + ".") 577 | simple_ztz_list.append(" ".join(i) + ".") 578 | success = True 579 | else: 580 | if verbose: 581 | print("Simple 
sentence: " + " ".join(i)) 582 | simple_ztz_list.append(" ".join(i)) 583 | success = True 584 | # print("."), 585 | except: 586 | continue 587 | elif m > 0 and (len(re.findall(r",", sentence)) > 0 or len( 588 | re.findall(r"While", sentence)) > 0): 589 | try: 590 | # sent=re.sub(r",","",sentence) 591 | # print("Hello") 592 | tokenized_sent = tokenize(sentence) 593 | simple_sentences = SBAR_simplify(" ".join(tokenized_sent)) 594 | for i in simple_sentences: 595 | # i=list(i) 596 | 597 | # if ord(i[0])>=97 and ord(i[0])<=122: 598 | # i[0]=chr(ord(i[0])-32) 599 | 600 | # while i.count(",")>0: 601 | # i.pop(i.index(",")) 602 | if (".") not in (i): 603 | if verbose: 604 | print("Simple sentence: " + i) 605 | simple_ztz_list.append(i) 606 | success = True 607 | else: 608 | if verbose: 609 | print("Simple sentence: " + i) 610 | simple_ztz_list.append(i) 611 | success = True 612 | # print("."), 613 | except: 614 | continue 615 | if not success: 616 | simple_ztz_list.append(sentence0) 617 | return [ztz for ztz in simple_ztz_list if len(ztz) > 2] 618 | --------------------------------------------------------------------------------
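For orientation, here is a minimal, hypothetical driver for the deprecated simplifier above; it is a sketch, not part of the repository. It assumes the Stanford parser jars and models are installed and that CLASSPATH, STANFORD_MODELS, and JAVA_HOME are exported before the import, since simp_stanford.py reads those variables and instantiates StanfordParser at import time. It also assumes the script is run from the repository root so that simp_deprecated and globals.py are importable. The example sentence is made up.

    # Hypothetical usage sketch for the deprecated Stanford-based simplifier.
    # CLASSPATH, STANFORD_MODELS and JAVA_HOME must already be set, because
    # simp_stanford.py checks them and starts StanfordParser when imported.
    from simp_deprecated.simp_stanford import simplify_ztz

    ztz = "The robot collects trash and it befriends a small probe."

    # simplify_ztz() returns a list of simple sentences; if the sentence
    # cannot be split, it is returned unchanged as the only element.
    for simple in simplify_ztz(ztz, verbose=True):
        print(simple)

With verbose=True the module also prints the original "Complex Sentence" and each extracted "Simple sentence" as it goes, which is handy when comparing this implementation against the other simp_* variants.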