├── pics
│   ├── llm-addons.jpg
│   ├── mm-project.png
│   ├── first_atlas.jpg
│   ├── causal-bridges.png
│   ├── llm-superstitious.png
│   ├── film_script_template.jpg
│   ├── my_first_big_atlas.jpeg
│   ├── universal-pos-tags.jpg
│   ├── causal-bridges-captioned.png
│   ├── spacy-named-entity-tags.png
│   └── nltk_tags.html
├── m_scripts_dag_atlas
│   ├── up.pkl
│   ├── wall-e.pkl
│   └── toy-story.pkl
├── white_paper
│   ├── harris-e-mc2.jpg
│   ├── mappa_mundi.pdf
│   ├── shark-attacks.png
│   ├── mappa_mundi_V2.pdf
│   ├── nacc-nrej-plane.png
│   ├── crossing-bridges.png
│   ├── references.bib
│   └── bayesuvius.sty
├── short_stories_dag_atlas
│   ├── wiltons-holiday.pkl
│   ├── bill-the-bloodhound.pkl
│   └── extricating-young-gussie.pkl
├── spell_checking_test.txt
├── resources.txt
├── miscellaneous
│   ├── nlp-environmental-variables.txt
│   ├── starting-stanford-coreNLP-server.txt
│   ├── predictions.txt.conj
│   ├── parser-choices.py
│   ├── WALL-E-quote-summarized-by-chatgpt.txt
│   └── testing-stan-parser.py
├── requirements.txt
├── openie6_translation_test1.txt
├── README.md
├── MIT-License.txt
├── simp_deprecated
│   ├── simp_spacy-claucy.py
│   ├── simp_openie.py
│   ├── simp_openie6-old.py
│   ├── simp_spacy1.py
│   ├── simp_spacy4.py
│   ├── simp_spacy2.py
│   ├── simp_stanford2.py
│   └── simp_stanford.py
├── similarity_bert.py
├── Node.py
├── globals.py
├── simplifying_test.txt
├── similarity.py
├── similarity_deprecated
│   ├── similarity_spacy2.py
│   └── similarity_spacy.py
├── utils.py
├── WordGuesser.py
├── BatchSimilarity.py
├── simp_spacy3.py
├── similarity_nltk.py
├── simplifying.py
├── downloading_imsdb.py
├── simp_openie6.py
├── jupyter_notebooks
│   └── SUMMARY.ipynb
├── post_cleaning.py
├── stopwords.py
├── DagAtlas.py
├── cleaning.py
├── Dag.py
└── spell_checking.py
/pics/llm-addons.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/llm-addons.jpg
--------------------------------------------------------------------------------
/pics/mm-project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/mm-project.png
--------------------------------------------------------------------------------
/pics/first_atlas.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/first_atlas.jpg
--------------------------------------------------------------------------------
/pics/causal-bridges.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/causal-bridges.png
--------------------------------------------------------------------------------
/m_scripts_dag_atlas/up.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/m_scripts_dag_atlas/up.pkl
--------------------------------------------------------------------------------
/pics/llm-superstitious.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/llm-superstitious.png
--------------------------------------------------------------------------------
/pics/film_script_template.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/film_script_template.jpg
--------------------------------------------------------------------------------
/pics/my_first_big_atlas.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/my_first_big_atlas.jpeg
--------------------------------------------------------------------------------
/pics/universal-pos-tags.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/universal-pos-tags.jpg
--------------------------------------------------------------------------------
/white_paper/harris-e-mc2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/harris-e-mc2.jpg
--------------------------------------------------------------------------------
/white_paper/mappa_mundi.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/mappa_mundi.pdf
--------------------------------------------------------------------------------
/white_paper/shark-attacks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/shark-attacks.png
--------------------------------------------------------------------------------
/m_scripts_dag_atlas/wall-e.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/m_scripts_dag_atlas/wall-e.pkl
--------------------------------------------------------------------------------
/white_paper/mappa_mundi_V2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/mappa_mundi_V2.pdf
--------------------------------------------------------------------------------
/white_paper/nacc-nrej-plane.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/nacc-nrej-plane.png
--------------------------------------------------------------------------------
/m_scripts_dag_atlas/toy-story.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/m_scripts_dag_atlas/toy-story.pkl
--------------------------------------------------------------------------------
/pics/causal-bridges-captioned.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/causal-bridges-captioned.png
--------------------------------------------------------------------------------
/pics/spacy-named-entity-tags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/spacy-named-entity-tags.png
--------------------------------------------------------------------------------
/white_paper/crossing-bridges.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/crossing-bridges.png
--------------------------------------------------------------------------------
/short_stories_dag_atlas/wiltons-holiday.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/short_stories_dag_atlas/wiltons-holiday.pkl
--------------------------------------------------------------------------------
/short_stories_dag_atlas/bill-the-bloodhound.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/short_stories_dag_atlas/bill-the-bloodhound.pkl
--------------------------------------------------------------------------------
/short_stories_dag_atlas/extricating-young-gussie.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/short_stories_dag_atlas/extricating-young-gussie.pkl
--------------------------------------------------------------------------------
/spell_checking_test.txt:
--------------------------------------------------------------------------------
1 | Poul , Poul , Paul
2 | caesar caesar ceesar caisar
3 | Hooww are youu Judy , Poul . Judy
4 | fitnes fitness
5 | how haves you bein ? been
6 | leter beautifull adress addres
7 | letter beautiful address
8 | tomatos
9 |
--------------------------------------------------------------------------------
/resources.txt:
--------------------------------------------------------------------------------
1 | python -m spacy download en_core_web_lg
2 | python -m spacy download en_core_web_sm
3 | python -m spacy download en_core_web_trf
4 |
5 | python -m coreferee install en
6 |
7 | python -m nltk.downloader popular # this includes wordnet
--------------------------------------------------------------------------------
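A quick way to confirm that the downloads listed above succeeded (a sketch, not part of the repo; it assumes the commands were run in the currently active environment):

    import spacy
    from nltk.corpus import wordnet as wn

    nlp = spacy.load("en_core_web_sm")   # any of the three downloaded spaCy models
    print(nlp.pipe_names)
    print(wn.synsets("cat")[0])          # works once the "popular" NLTK data (incl. wordnet) is installed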
/miscellaneous/nlp-environmental-variables.txt:
--------------------------------------------------------------------------------
1 | CLASSPATH
2 | C:\NLP\stanford-parser-full-2018-02-27;C:\NLP\stanford-postagger-full-2015-12-09;C:\NLP\stanford-ner-2015-12-09
3 |
4 | STANFORD_MODELS
5 | C:\NLP\stanford-ner-2015-12-09\classifiers;C:\NLP\stanford-postagger-full-2015-12-09\models
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | graphviz
2 | ipython
3 | Pillow
4 | contractions
5 | spacy
6 | Unidecode
7 | nltk
8 | # pycorenlp~=0.3.0
9 | # coreferee~=1.4.0
10 | # anytree~=2.8.0
11 | numpy
12 | requests
13 | beautifulsoup4
14 | python-slugify
15 | # claucy~=0.0.2.0
16 | pyspellchecker
17 | sentence-transformers
--------------------------------------------------------------------------------
/miscellaneous/starting-stanford-coreNLP-server.txt:
--------------------------------------------------------------------------------
1 | Reference:
2 | https://towardsdatascience.com/natural-language-processing-using-stanfords-corenlp-d9e64c1e1024
3 |
4 | # check java installed properly
5 | java -version
6 |
7 | # starting server
8 | # cd to folder with stanford java code
9 | cd /StanfordParser/stanford-corenlp-4.5.4/
10 |
11 | # no need to deactivate conda virtual environment
12 |
13 | # start server (IMPORTANT: make sure this is one line)
14 | java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 5000
15 |
--------------------------------------------------------------------------------
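Once the server started above is listening on port 9000, it can be exercised directly over HTTP (a sketch, not part of the repo; the endpoint and JSON layout follow the standard CoreNLP server API):

    import requests

    resp = requests.post(
        "http://localhost:9000",
        params={"properties": '{"annotators": "pos", "outputFormat": "json"}'},
        data="The unicorn is in the garden.".encode("utf-8"),
        timeout=10,
    )
    print(resp.status_code)                                 # 200 when the server is up
    print(resp.json()["sentences"][0]["tokens"][0]["pos"])  # POS tag of the first token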
/openie6_translation_test1.txt:
--------------------------------------------------------------------------------
1 | The man , who had never liked the words `` booby '' and `` boobyhatch , '' and who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment .
2 | The man , '' , thought for a moment .
3 | The man , who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment .
4 | The man , who had never liked the words `` booby , thought for a moment .
5 | The man , who had never liked the words `` boobyhatch , thought for a moment .
6 |
7 | I love Luciano Pavarotti and Jose Carreras .
8 | I love Luciano Pavarotti .
9 | I love Jose Carreras .
10 |
11 |
--------------------------------------------------------------------------------
/miscellaneous/predictions.txt.conj:
--------------------------------------------------------------------------------
1 | The man , who had never liked the words `` booby '' and `` boobyhatch , '' and who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment .
2 | The man , '' , thought for a moment .
3 | The man , who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment .
4 | The man , who had never liked the words `` booby , thought for a moment .
5 | The man , who had never liked the words `` boobyhatch , thought for a moment .
6 |
7 | I love Luciano Pavarotti and Jose Carreras .
8 | I love Luciano Pavarotti .
9 | I love Jose Carreras .
10 |
11 |
--------------------------------------------------------------------------------
/miscellaneous/parser-choices.py:
--------------------------------------------------------------------------------
1 | # 1. nltk
2 | # NLTK was unable to find stanford-parser\.jar! Set the CLASSPATH
3 | # environment variable.
4 | # https://stackoverflow.com/questions/13883277/how-to-use-stanford-parser-in-nltk-using-python
5 |
6 | from nltk.parse.stanford import StanfordParser
7 | parser = StanfordParser()
8 |
9 | from nltk.parse.stanford import GenericStanfordParser
10 | parser = GenericStanfordParser()
11 |
12 | # 2. nltk.parse.corenlp
13 | # AttributeError: 'CoreNLPParser' object has no attribute 'tagged_parse'. See https://stackoverflow.com/questions/39320782/corenlp-provide-pos-tags
14 | import nltk
15 | from nltk.parse.corenlp import CoreNLPParser
16 | parser = CoreNLPParser(url='http://localhost:9000')
17 |
18 |
19 | # 3. pycorenlp
20 | from pycorenlp import StanfordCoreNLP
21 | nlp = StanfordCoreNLP('http://localhost:9000')
22 | parser = nlp.parse()
23 |
24 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Mappa_Mundi
2 |
3 | Welcome to Mappa_Mundi (MM)!
4 |
5 | MM is a method that seamlessly combines
6 | Large Language Models (LLM)
7 | and Causal Inference (CI).
8 |
9 | The MM software does causal DEFT
10 | (causal DAG Extraction From Text).
11 | We store each extracted DAG in a separate file, and we put
12 | all DAG files in a directory
13 | that we call
14 | a DAG Atlas.
15 |
16 | I discuss the software in
17 | detail in these white papers:
18 | * [Version 1](https://github.com/rrtucci/mappa_mundi/blob/master/white_paper/mappa_mundi.pdf)
19 | * [Version 2](https://github.com/rrtucci/mappa_mundi/blob/master/white_paper/mappa_mundi_V2.pdf)
20 |
21 | 
22 |
23 | 
24 |
25 | 
26 |
27 | 
28 |
29 | 
30 |
31 |
32 |
--------------------------------------------------------------------------------
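A minimal sketch of how one might peek inside a DAG Atlas entry such as m_scripts_dag_atlas/wall-e.pkl. This assumes the .pkl files were written with Python's pickle module and that the snippet is run from the repo root so that Dag.py is importable; neither detail is spelled out in the README.

    import pickle

    with open("m_scripts_dag_atlas/wall-e.pkl", "rb") as f:
        dag = pickle.load(f)

    print(type(dag))          # expected: a Dag instance (see Dag.py)
    print(sorted(vars(dag)))  # list its attribute names without assuming them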
/MIT-License.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Robert R. Tucci
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/simp_deprecated/simp_spacy-claucy.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains one of several implementations of the function
4 | `simplify_ztz(sentence, verbose=False)` that we considered.
5 |
6 | Refs:
7 | https://spacy.io/usage/spacy-101/
8 |
9 | https://github.com/mmxgn/spacy-clausieec13/splitting-sentences-into-clauses
10 | """
11 | from globals import *
12 | import spacy
13 | import claucy
14 |
15 | nlp = spacy.load('en_core_web_sm')
16 | claucy.add_to_pipe(nlp)
17 |
18 |
19 | def simplify_ztz(sentence, verbose=False):
20 | """
21 | This method simplifies the sentence `sentence`. It returns a list of
22 | simple sentences extracted from the input sentence.
23 |
24 | Parameters
25 | ----------
26 | sentence: str
27 | verbose: bool
28 | kwargs: dict[]
29 |
30 | Returns
31 | -------
32 | list[str]
33 |
34 | """
35 |
36 | doc = nlp(sentence.strip())
37 | if doc._.clauses:
38 | propositions = doc._.clauses[0].to_propositions(as_text=True)
39 | else:
40 | propositions = [sentence]
41 | if verbose:
42 | print(sentence.strip())
43 | print(propositions)
44 | return propositions
45 |
--------------------------------------------------------------------------------
/similarity_bert.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains a function `ztz_similarity(ztz1, ztz2)`
4 | that returns the similarity of sentences `ztz1` and `ztz2`.
5 | ztz = sentence
6 |
7 | It uses BERT sentence embeddings via the sentence-transformers (sBERT) package.
8 |
9 | Ref:
10 | 1. https://www.sbert.net/
11 | 2. https://huggingface.co/tasks/sentence-similarity
12 | 3. https://towardsdatascience.com/bert-for-measuring-text-similarity
13 | -eec91c6bf9e1
14 | """
15 | from sklearn.metrics.pairwise import cosine_similarity
16 |
17 |
18 | def ztz_similarity(ztz1, ztz2, **kwargs):
19 | """
20 | This method returns the similarity between sentences `ztz1` and `ztz2`.
21 | The similarity is measured as odds of a probability, so it ranges from 0
22 | to infinity.
23 |
24 | Parameters
25 | ----------
26 | ztz1: str
27 | ztz2: str
28 | kwargs: dict[]
29 |
30 | Returns
31 | -------
32 | float
33 |
34 | """
35 | model = kwargs['model']
36 | embedding_1 = model.encode(ztz1)
37 | embedding_2 = model.encode(ztz2)
38 |
39 | prob = cosine_similarity([embedding_1], [embedding_2])[0, 0]
40 | if prob < 0:
41 | # print("neg. prob.=", prob)
42 | # print(ztz1)
43 | # print(ztz2)
44 | prob = 0
45 | odds = prob / (1 - prob) if prob < 1 else 1e5
46 | return round(odds, 3)
47 |
--------------------------------------------------------------------------------
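Usage sketch for the function above (mirroring how similarity.py drives it): the caller must pass the sentence transformer through the `model` keyword argument.

    from sentence_transformers import SentenceTransformer
    from similarity_bert import ztz_similarity

    model = SentenceTransformer('all-MiniLM-L6-v2')
    print(ztz_similarity("Cats are beautiful animals.",
                         "Some gorgeous creatures are felines.",
                         model=model))  # cosine similarity re-expressed as odds prob/(1-prob)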
/miscellaneous/WALL-E-quote-summarized-by-chatgpt.txt:
--------------------------------------------------------------------------------
1 | Consider the following quote:
2 | "A range of mountains takes form in the haze.
3 | Moving closer.
4 | The mountains are piles of TRASH.
5 | The entire surface is nothing but waste.
6 |
7 | '..We're gonna find adventure in the evening air...'
8 |
9 | A silhouetted city in the distance.
10 | What looks like skyscrapers turns into trash.
11 | Thousands of neatly stacked CUBES OF TRASH, stories high.
12 | Rows and rows of stacked cubes, like city avenues.
13 | They go on for miles."
14 |
15 | Give an enumerated list summarizing this quote.
16 | Each list item is a simple declarative sentence
17 | consisting of a subject phrase and a verb phrase.
18 | The list items are in chronological order of occurrence
19 | in the quote.
20 |
21 | 1. A range of mountains appears in the haze.
22 | 2. The narrator moves closer.
23 | 3. The mountains are revealed to be piles of trash.
24 | 4. The entire surface is described as nothing but waste.
25 | 5. The narrator speaks of finding adventure in the evening air.
26 | 6. A silhouetted city appears in the distance.
27 | 7. The skyscrapers turn out to be made of trash.
28 | 8. Thousands of neatly stacked cubes of trash are seen, stories high.
29 | 9. Rows and rows of stacked cubes are seen, resembling city avenues.
30 | 10. The expanse of trash goes on for miles.
31 |
--------------------------------------------------------------------------------
/simp_deprecated/simp_openie.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains one of several implementations of the function
4 | `simplify_ztz(sentence, verbose=False)` that we considered.
5 |
6 | References
7 |
8 | 1. https://stanfordnlp.github.io/CoreNLP/openie.html#api
9 | # Default value of openie.affinity_probability_cap was 1/3.
10 | 2. https://pypi.org/project/stanford-openie/
11 |
12 | 3. https://stanfordnlp.github.io/CoreNLP/demo.html
13 |
14 | """
15 | from openie import StanfordOpenIE
16 |
17 | properties = {
18 | 'openie.triple.all_nominals': True,
19 | 'openie.triple.strict': False,
20 | 'openie.splitter.nomodel': True,
21 | 'openie.affinity_probability_cap': 1/ 3
22 | }
23 | client = StanfordOpenIE(properties=properties)
24 |
25 |
26 | def simplify_ztz(sentence, verbose=False):
27 | """
28 | This method simplifies the sentence `sentence`.
29 |
30 | Parameters
31 | ----------
32 | sentence: str
33 | verbose: bool
34 |
35 | Returns
36 | -------
37 | list[str]
38 |
39 | """
40 | ztz_list = []
41 | for triple in client.annotate(sentence):
42 | ztz_list.append(triple['subject'] + " " +
43 | triple['relation'] + " " +
44 | triple['object'])
45 | if verbose:
46 | print(sentence.strip())
47 | print(ztz_list)
48 | return ztz_list
49 |
50 |
51 |
--------------------------------------------------------------------------------
/simp_deprecated/simp_openie6-old.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains one of several implementations of the function
4 | `simplify_ztz(sentence, verbose=False)` that we considered.
5 |
6 | It is called within a jupyter notebook at Google colab
7 | https://colab.research.google.com/drive/1S2EWOGkoCgjfOJzTRJ7PLeu4T8SBwhlF?usp=sharing
8 |
9 | Refs:
10 |
11 | 1. https://github.com/dair-iitd/CaRB
12 |
13 | 2. https://github.com/dair-iitd/imojie
14 |
15 | 3. https://github.com/dair-iitd/openie6
16 |
17 | """
18 | import subprocess
19 | from globals import *
20 |
21 | def simplify_ztz(sentence, verbose=False):
22 | """
23 | This method simplifies the sentence `sentence`. It returns a list of
24 | simple sentences extracted from the input sentence.
25 |
26 | Parameters
27 | ----------
28 | sentence: str
29 | verbose: bool
30 | kwargs: dict[]
31 |
32 | Returns
33 | -------
34 | list[str]
35 |
36 | """
37 |
38 | with open("../openie6_sentences.txt", "w") as f:
39 | f.write(sentence)
40 |
41 | gpu_command = \
42 | "cd ../openie6 && CUDA_DEVICE_ORDER=PCI_BUS_ID " \
43 | "CUDA_VISIBLE_DEVICES=0 " \
44 | "PYTHONPATH=imojie:imojie/allennlp:imojie" \
45 | "/pytorch_transformers:$PYTHONPATH python run.py " \
46 | "--save models/conj_model --mode predict " \
47 | "--inp ../openie6_sentences.txt --batch_size 1 " \
48 | "--model_str bert-large-cased --task conj " \
49 | "--gpus 1 --out ../openie6_predictions.txt"
50 |
51 | cpu_command = gpu_command.replace("--gpus 1", "--gpus 0")
52 |
53 | if USE_GPU:
54 | subprocess.run(gpu_command, shell=True)  # block until openie6 finishes writing predictions
55 | else:
56 | subprocess.run(cpu_command, shell=True)  # block until openie6 finishes writing predictions
57 |
58 | ztz_list = []
59 | with open("../openie6_predictions.txt.conj", "r") as f:
60 | for line in f:
61 | ztz_list.append(line)
62 | # ztz_list has full sentence in first row
63 | return ztz_list[1:]
64 |
--------------------------------------------------------------------------------
/Node.py:
--------------------------------------------------------------------------------
1 | class Node:
2 | """
3 |
4 | This is a very simple class that holds the `time` and `place` of each
5 | node.
6 |
7 | Each simplified clause becomes a node of the DAG.
8 |
9 | For brevity, let us refer to time as `t` and place as `x`. Previously,
10 | we put each full sentence of the movie script into one row of a file.
11 | Then each sentence was replaced by zero, one, two, or more simplified
12 | clauses, separated by separator-tokens. If a simplified clause ( i.e.,
13 | node) appears at the row $t$ of the file (counting starting with 0),
14 | then we say that the node occurs at time $t$. If a simplified clause
15 | appears after zero separator-tokens, we say $x=0$ for it. If it appears
16 | after one separator-token, we say $x=1$ for it, and so forth. Hence each
17 | node ( i.e., simplified clause) can be labeled by its $(t, x)$ coordinates.
18 |
19 | Attributes
20 | ----------
21 | place: int
22 | time: int
23 | """
24 |
25 | def __init__(self, time, place):
26 | """
27 | Constructor
28 |
29 | Parameters
30 | ----------
31 | time: int
32 | place: int
33 | """
34 | self.time = time
35 | self.place = place
36 | assert time >= 0 and place >= 0
37 |
38 | def coords(self):
39 | """
40 | This method returns the coordinates of self as a tuple.
41 |
42 | Returns
43 | -------
44 | tuple(int, int)
45 |
46 | """
47 | return (self.time, self.place)
48 |
49 |
50 | def node_str(node):
51 | """
52 | This method returns a string for Node `node`.
53 |
54 | Parameters
55 | ----------
56 | node: Node
57 |
58 | Returns
59 | -------
60 | str
61 |
62 | """
63 | return "(" + str(node.time) + "," + str(node.place) + ")"
64 |
65 |
66 | def arrow_str(arrow):
67 | """
68 | This method returns a string for an arrow `arrow`
69 |
70 | Parameters
71 | ----------
72 | arrow: tuple[Node, Node]
73 |
74 | Returns
75 | -------
76 | str
77 |
78 | """
79 | return node_str(arrow[0]) + "->" + node_str(arrow[1])
80 |
--------------------------------------------------------------------------------
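Usage sketch for Node.py, tied to the (t, x) convention described in the class docstring: a clause that sits in file row 3, after one separator-token, gets coordinates (3, 1).

    from Node import Node, node_str, arrow_str

    nd1 = Node(time=3, place=1)
    nd2 = Node(time=7, place=0)
    print(node_str(nd1))           # (3,1)
    print(arrow_str((nd1, nd2)))   # (3,1)->(7,0)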
/globals.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains all the global variables used by Mappa Mundi (MM).
4 |
5 | """
6 |
7 | BASE_URL = "https://imsdb.com"
8 |
9 | M_SCRIPTS_DIR = "m_scripts"
10 | CLEAN_DIR = "m_scripts_clean"
11 | CLEAN_RD_DIR = "m_scripts_clean_rd"
12 | SPELL_DIR = "m_scripts_spell"
13 | SPELL_RD_DIR = "m_scripts_spell_rd"
14 | SIMP_DIR = "m_scripts_simp"
15 | SIMP_RD_DIR = "m_scripts_simp_rd"
16 | POST_CLEAN_DIR = "m_scripts_post_clean"
17 | POST_CLEAN_RD_DIR = "m_scripts_post_clean_rd"
18 | DAG_DIR = "m_scripts_dag_atlas"
19 | DAG_RD_DIR = "m_scripts_dag_atlas_rd"
20 |
21 | # ZTZ_SIMPLIFIER = "simp_stanford"
22 | # ZTZ_SIMPLIFIER = "simp_spacy_claucy"
23 | # ZTZ_SIMPLIFIER = "simp_spacy1"
24 | # ZTZ_SIMPLIFIER = "simp_spacy2"
25 | # ZTZ_SIMPLIFIER = "simp_spacy3" # originally recommended
26 | ZTZ_SIMPLIFIER = "simp_openie6" # recommended
27 |
28 | # SIMI_DEF = "similarity_spacy"
29 | # SIMI_DEF = "similarity_spacy2"
30 | # SIMI_DEF = "similarity_nltk" # originally recommended
31 | SIMI_DEF = "similarity_bert" # recommended
32 |
33 | # good threshold values gleaned from similarity.py examples
34 | # SIMI_THRESHOLD = 2.2 for NLTK
35 | # SIMI_THRESHOLD = 2.69 for SpaCy
36 | SIMI_THRESHOLD = 2 # for bert, recommended
37 |
38 | ZTZ_SEPARATOR = "[%@!]"
39 |
40 | SPELLING_CORRECTION_RISK = 1e-8
41 |
42 | # POS (part of speech) in stopwords.py
43 | # ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ',
44 | # 'NOUN', 'NUM', 'PART', 'PRON', 'PUNCT', 'SCONJ', 'VERB']
45 |
46 | # To see full list of POS, see jpg in pics folder
47 |
48 | # ADP (adposition) are mostly prepositions
49 | # AUX contains verbs like 'is'
50 | # DET (determiner) contains 'whose'
51 | # NUM contains number words like 'three'
52 | # PART (particle) contains 'not'
53 |
54 | RETAINED_POS = ['ADJ', 'ADV', 'NOUN', 'VERB']
55 |
56 | # See stopwords.py
57 | # RETAINED_STOPWORD_POS should be subset of RETAINED_POS
58 | # RETAINED_STOPWORD_POS = RETAINED_POS
59 | RETAINED_STOPWORD_POS = [] # recommended
60 |
61 | USE_GPU = True
62 |
63 | class color:
64 | PURPLE = '\033[95m'
65 | CYAN = '\033[96m'
66 | DARKCYAN = '\033[36m'
67 | BLUE = '\033[94m'
68 | GREEN = '\033[92m'
69 | YELLOW = '\033[93m'
70 | RED = '\033[91m'
71 | BOLD = '\033[1m'
72 | UNDERLINE = '\033[4m'
73 | END = '\033[0m'
--------------------------------------------------------------------------------
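How these globals are consumed elsewhere in the repo (see similarity.py and BatchSimilarity.py below): the module named by SIMI_DEF is imported dynamically, so switching similarity definitions is a one-line edit in this file. A minimal sketch, run from the repo root:

    import importlib
    from globals import SIMI_DEF, SIMI_THRESHOLD

    simi_def = importlib.import_module(SIMI_DEF)   # e.g. the module similarity_bert
    print("using", SIMI_DEF, "with SIMI_THRESHOLD =", SIMI_THRESHOLD)

ZTZ_SIMPLIFIER is presumably dispatched the same way by simplifying.py, which is not shown in this section.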
/miscellaneous/testing-stan-parser.py:
--------------------------------------------------------------------------------
1 | """
2 | Reference:
3 | https://towardsdatascience.com/natural-language-processing-using-stanfords-corenlp-d9e64c1e1024
4 |
5 | """
6 | import os
7 | import subprocess
8 | version = subprocess.check_output(
9 | ['java', '-version'], stderr=subprocess.STDOUT)
10 | print("java version=\t", version)
11 | print("CLASSPATH=\t", os.environ['CLASSPATH'])
12 | print("STANFORD_MODELS=\t", os.environ['STANFORD_MODELS'])
13 | print("JAVA_HOME=\t", os.environ['JAVA_HOME'])
14 |
15 | def main1():
16 | from pycorenlp import StanfordCoreNLP
17 | nlp = StanfordCoreNLP('http://localhost:9000')
18 |
19 | text = "This movie was actually neither that funny, nor super witty. The movie was meh. I liked watching that movie. If I had a choice, I would not watch that movie areps."
20 | result = nlp.annotate(text,
21 | properties={
22 | 'annotators': 'sentiment, ner, pos',
23 | 'outputFormat': 'json',
24 | 'timeout': 1000,
25 | })
26 | print(result)
27 |
28 | def main2():
29 | # https://www.nltk.org/api/nltk.parse.corenlp.html
30 | import nltk
31 | from nltk.parse.corenlp import CoreNLPParser
32 |
33 | # Start the CoreNLP server
34 | # nltk.download('punkt')
35 | # nltk.download('corenlp')
36 | parser = CoreNLPParser(url='http://localhost:9000')
37 |
38 | # Parse a sentence
39 | sentence = "The quick brown fox jumps over the lazy dog."
40 | parse_tree = list(
41 | parser.parse(sentence.split())
42 | )[0]
43 | print(parse_tree)
44 |
45 | def main3():
46 | import nltk
47 | from nltk.parse.corenlp import CoreNLPParser
48 |
49 | # Start the CoreNLP server
50 | parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
51 |
52 | # Parse a tagged sentence
53 | tagged_sentence = [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'),
54 | ('fox', 'NN'),
55 | ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'),
56 | ('lazy', 'JJ'),
57 | ('dog', 'NN'), ('.', '.')]
58 | # parse_tree = list(parser.parse(tagged_sentence))[0]
59 | # print(parse_tree)
60 | parser.parse(tagged_sentence)
61 |
62 | if __name__ == "__main__":
63 | # main1()
64 | main2()
65 | # main3() # doesn't work
66 |
67 |
--------------------------------------------------------------------------------
/simplifying_test.txt:
--------------------------------------------------------------------------------
1 | Robert, who lives nearby, was walking his dog.
2 | While eating food Ram is singing a song .
3 | After she ate the cake , Emma visited Tony in his room .
4 | If she is singing then I will sing .
5 | Melanie bought a Batman game for $ 6.95 , a strategy game for $ 7.90 , and a Superman game for $ 7.73 .
6 | Bag A contains 3 white and 2 blue marbles .
7 | Ram has two apples and five bananas .
8 | Ram and Shyam are two brothers .
9 | Ram is a boy and Sita is a girl .
10 | Ram is a boy who is six years old .
11 | Ram eats a banana and an apple but sings a song .
12 | He washed cars over the weekend and now has 86 dollars .
13 | While playing piano Ram is singing a song in a room and Shyam is playing violin .
14 | You are a boy, and Sita is a girl .
15 | Ram sold 6 balls at 10 a.m and 7 balls at 11 a.m .
16 | The restaurant sold 6 slices of pie during the day and 7 slices of pie during the night .
17 | Sam's dad gave Sam 39 nickels and 31 quarters .
18 | Park workers will plant 41 dogwood trees today and 20 dogwood trees tomorrow .
19 | Dan picked 9 limes and gave Sara 4 of the limes .
20 | This year Diane bought some new hives and increased Diane's honey harvest by 6085 pounds .
21 | Sara had 4 quarters and 8 dimes in Sara's bank .
22 | Mike found 6 seashells and 4 starfishes but 4 of the seashells were broken .
23 | Jessica grew 35 watermelons and 30 carrots but the rabbits ate 27 watermelons .
24 | Dan bought a clarinet for $ 130.30 , and a song book which was $ 11.24 .
25 | There are 2 maple trees and 5 popular trees currently in the park .
26 | Dan 's cat had kittens and 5 had spots .
27 | This year, 712261 male salmon and 259378 female salmon, returned to their rivers .
28 | Each day , the polar bear at Richmond 's zoo eats 0.2 bucket of trout and 0.4 bucket of salmon .
29 | While eating food and drinking water Ram is singing a song .
30 | He is eating food and she is playing and they are fighting .
31 | Ram is playing guitar while talking to Sita .
32 | He is playing and she is crying but they are singing .
33 | The embattled Major government survived a crucial vote on coal pits closure as its last-minute concessions curbed the extent of Tony revolt over an issue that generated unusual heat in the House of Commons and brought the miners to London streets.
34 | When Sam is eating food, Alice is singing a song.
35 | Talwinder Singh,who masterminded the Kanishka crash in 1998, was killed in a fierce two hour counter.
36 | Because I was late, I became angry.
--------------------------------------------------------------------------------
/similarity.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file has functions to test the function `ztz_similarity(ztz1, ztz2)`
4 | which measures the similarity of two sentences `ztz1` and `ztz2`.
5 | `ztz_similarity()` has been implemented 4 different ways, in separate files:
6 |
7 | 1. similarity_bert.py (Recommended)
8 | Uses BERT and sentence-transformers
9 |
10 | 2. similarity_nltk.py
11 | Uses NLTK + WordNet
12 |
13 | 3. similarity_spacy.py
14 | Uses SpaCy + WordVec
15 |
16 | 4. similarity_spacy2.py
17 | Attempt to use SpaCy + WordNet
18 |
19 | """
20 | from globals import *
21 | import importlib as imp
22 | from sentence_transformers import SentenceTransformer
23 |
24 | simi_def = imp.import_module(SIMI_DEF)
25 |
26 |
27 | def print_simi_12(str1, str2, **kwargs):
28 | """
29 | Prints similarity of `str1` and `str2`.
30 |
31 | Parameters
32 | ----------
33 | str1: str
34 | str2: str
35 |
36 | Returns
37 | -------
38 | None
39 |
40 | """
41 | print()
42 | print("1.", str1)
43 | print("2.", str2)
44 | simi12 = simi_def.ztz_similarity(str1, str2, **kwargs)
45 | simi21 = simi_def.ztz_similarity(str2, str1, **kwargs)
46 | print("simi(1, 2)=", str(simi12))
47 | print("simi(2, 1)=", str(simi21))
48 |
49 |
50 | if __name__ == "__main__":
51 | def main1():
52 | if SIMI_DEF == "similarity_bert":
53 | model = SentenceTransformer('all-MiniLM-L6-v2')
54 | else:
55 | model = None
56 | print("************ simi definition from:", SIMI_DEF)
57 |
58 | ztzs = [
59 | "Dogs are awesome.",
60 | "Some gorgeous creatures are felines.",
61 | "Dolphins are swimming mammals.",
62 | "Cats are beautiful animals.",
63 | "Cats are beauti animals.",
64 | ]
65 |
66 | focus_ztz = "Cats are beautiful animals."
67 | for ztz in ztzs:
68 | print_simi_12(focus_ztz, ztz, model=model)
69 |
70 |
71 | def main2():
72 | if SIMI_DEF == "similarity_bert":
73 | model = SentenceTransformer('all-MiniLM-L6-v2')
74 | else:
75 | model = None
76 | print("************ simi definition from:", SIMI_DEF)
77 | word1, word2 = "apple", "horse"
78 | print_simi_12(word1, word2, model=model)
79 | print_simi_12("Paul", "John", model=model)
80 |
81 | ztz1 = "The cat sat on the mat."
82 | ztz2 = "The dog lay on the rug."
83 | print_simi_12(ztz1, ztz2, model=model)
84 |
85 |
86 | main1()
87 | main2()
88 |
--------------------------------------------------------------------------------
/pics/nltk_tags.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | - CC: Coordinating conjunction
7 | - CD: Cardinal number
8 | - DT: Determiner
9 | - EX: Existential there
10 | - FW: Foreign word
11 | - IN: Preposition or subordinating conjunction
12 | - JJ: Adjective
13 | - JJR: Adjective, comparative
14 | - JJS: Adjective, superlative
15 | - LS: List item marker
16 | - MD: Modal
17 | - NN: Noun, singular or mass
18 | - NNS: Noun, plural
19 | - NNP: Proper noun, singular
20 | - NNPS: Proper noun, plural
21 | - PDT: Predeterminer
22 | - POS: Possessive ending
23 | - PP: Prepositional phrase
24 | - PRP: Personal pronoun
25 | - RB: Adverb
26 | - RBR: Adverb, comparative
27 | - RBS: Adverb, superlative
28 | - RP: Particle
29 | - S: Simple declarative clause
30 | - SBAR: Clause introduced by a (possibly empty) subordinating conjunction
31 | - SBARQ: Direct question introduced by a wh-word or a wh-phrase.
32 | - SINV: Inverted declarative sentence, i.e. one in which the subject follows the tensed verb or modal.
33 | - SQ: Inverted yes/no question, or main clause of a wh-question, following the wh-phrase in SBARQ.
34 | - SYM: Symbol
35 | - VB: Verb, base form
36 | - VBD: Verb, past tense
37 | - VBG: Verb, gerund or present participle
38 | - VBN: Verb, past participle
39 | - VBP: Verb, non-3rd person singular present
40 | - VBZ: Verb, 3rd person singular present
41 | - VP: Verb Phrase
42 | - WDT: Wh-determiner
43 | - WP: Wh-pronoun
44 | - WP$: Possessive wh-pronoun
45 | - WRB: Wh-adverb
46 |
47 |
48 |
--------------------------------------------------------------------------------
/similarity_deprecated/similarity_spacy2.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains a function `ztz_similarity(ztz1, ztz2)`
4 | that returns the similarity of sentences `ztz1` and `ztz2`.
5 | ztz = sentence
6 |
7 | It uses SpaCy + WordNet
8 |
9 | Ref:
10 |
11 | """
12 | import spacy
13 | import nltk
14 | from nltk.corpus import wordnet as wn
15 | from globals import *
16 | from itertools import product
17 | from collections import defaultdict
18 | from time import time
19 |
20 | nlp = spacy.load("en_core_web_sm")
21 |
22 |
23 | def ztz_similarity(ztz1, ztz2, **kwargs):
24 | """
25 | This method returns the similarity between sentences `ztz1` and `ztz2`.
26 | The similarity is measured as odds of a probability, so it ranges from 0
27 | to infinity.
28 |
29 | Parameters
30 | ----------
31 | ztz1: str
32 | ztz2: str
33 |
34 | Returns
35 | -------
36 | float
37 |
38 | """
39 | do_time = False
40 | if do_time:
41 | print("similarity begins", time())
42 | doc1 = nlp(ztz1)
43 | doc2 = nlp(ztz2)
44 | sp_tokens1 = [token1 for token1 in doc1 \
45 | if token1.pos_ in RETAINED_POS]
46 | sp_tokens2 = [token2 for token2 in doc2 \
47 | if token2.pos_ in RETAINED_POS]
48 | all_ss1 = []
49 | for token1 in sp_tokens1:
50 | if wn.synsets(token1.text):
51 | ss1 = wn.synsets(token1.text)[0]
52 | all_ss1.append(ss1)
53 |
54 | all_ss2 = []
55 | for token2 in sp_tokens2:
56 | if wn.synsets(token2.text):
57 | ss2 = wn.synsets(token2.text)[0]
58 | all_ss2.append(ss2)
59 | ss_pair_to_simi = defaultdict(lambda: 0)
60 | if do_time:
61 | print("beginning of path_similarity()", time())
62 | for ss1, ss2 in product(all_ss1, all_ss2):
63 | simi = ss1.path_similarity(ss2)
64 | if simi is not None:
65 | ss_pair_to_simi[(ss1, ss2)] = simi
66 |
67 | score1 = 0.0
68 | count1 = 0
69 | for ss1 in all_ss1:
70 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss2 in all_ss2]
71 | if simi_list:
72 | best_score = max(simi_list)
73 | score1 += best_score
74 | count1 += 1
75 | if count1:
76 | score1 /= count1
77 |
78 | score2 = 0.0
79 | count2 = 0
80 | for ss2 in all_ss2:
81 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss1 in all_ss1]
82 | if simi_list:
83 | best_score = max(simi_list)
84 | score2 += best_score
85 | count2 += 1
86 | if count2:
87 | score2 /= count2
88 | prob = (score1 + score2) / 2
89 | if prob < 1:
90 | odds = prob / (1 - prob)
91 | else:
92 | odds = 1000
93 | if do_time:
94 | print("similarity ends", time())
95 | return round(odds, 3)
96 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file holds some general purpose functions (utilities).
4 |
5 | """
6 | import os
7 | from globals import *
8 | import shutil
9 |
10 |
11 | def zero_based_position_from_m_title(dir_, title):
12 | """
13 | This method returns the position (zero based, starting from zero) of
14 | title `title` in directory `dir_`.
15 |
16 | Parameters
17 | ----------
18 | dir_: str
19 | title: str
20 |
21 | Returns
22 | -------
23 | int
24 |
25 | """
26 | return list(my_listdir(dir_)).index(title + ".txt")
27 |
28 |
29 | def m_title_from_zero_based_position(dir_, pos):
30 | """
31 | This method returns the title in directory `dir_` of the movie at
32 | position `pos` (zero based, starting from zero).
33 |
34 | Parameters
35 | ----------
36 | dir_: str
37 | pos: int
38 |
39 | Returns
40 | -------
41 | str
42 |
43 | """
44 | return list(my_listdir(dir_))[pos][:-len(".txt")]
45 |
46 |
47 | def argmax_of_list(lista):
48 | """
49 | This method returns the argmax of list `lista`.
50 |
51 | Parameters
52 | ----------
53 | lista: list[X]
54 |
55 | Returns
56 | -------
57 | int
58 |
59 |
60 | """
61 | return max(range(len(lista)), key=(lambda i: lista[i]))
62 |
63 |
64 | def print_welcome_message():
65 | """
66 | This method prints a welcome message.
67 |
68 | Returns
69 | -------
70 | None
71 |
72 | """
73 | print("Welcome Causal AI Navigator. We have been waiting for you for "
74 | "millennia. Where would you like us to go next?")
75 |
76 |
77 | def my_listdir(dir_):
78 | """
79 | Whenever one opens a text file within directory `dir_` using jupyter lab
80 | ( JL), JL writes an annoying `.ipynb.checkpoints` folder inside `dir_`.
81 | This method deletes that checkpoints folder and then returns the usual
82 | `os.listdir( dir_)`
83 |
84 | Parameters
85 | ----------
86 | dir_: str
87 |
88 | Returns
89 | -------
90 | iterable
91 |
92 | """
93 | # listdir includes hidden files like .ipynb_checkpoints
94 | checkpoints = dir_ + "/" + ".ipynb_checkpoints"
95 | shutil.rmtree(checkpoints, ignore_errors=True)
96 | # os.listdir lists in arbitrary order!
97 | return sorted(os.listdir(dir_))
98 |
99 |
100 | def get_prob_acc_and_nsam(num_acc, num_rej, round_digits=2):
101 | """
102 | This method returns the probability of acceptance `prob_acc` and the
103 | number of samples `nsam` used to calculate that probability.
104 |
105 | Parameters
106 | ----------
107 | num_acc: int
108 | number of times an arrow has been accepted
109 | num_rej: int
110 | number of times an arrow has been rejected.
111 | round_digits: int
112 |
113 | Returns
114 | -------
115 | float, int
116 |
117 | """
118 | nsam = num_acc + num_rej
119 | return round(num_acc / nsam, round_digits), nsam
120 |
--------------------------------------------------------------------------------
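Two tiny worked examples for the utilities above (a sketch):

    from utils import argmax_of_list, get_prob_acc_and_nsam

    print(argmax_of_list([2.1, 5.7, 0.3]))   # 1, the index of the largest entry
    print(get_prob_acc_and_nsam(3, 1))       # (0.75, 4): arrow accepted 3 times out of 4 samples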
/WordGuesser.py:
--------------------------------------------------------------------------------
1 | class WordGuesser:
2 | """
3 | This class is used by `spell_checking.py` to store and update the word
4 | `best_guess` which is a guess for the word `word`. Also stored in this
5 | class: the probabilities for `best_guess` and `word`.
6 |
7 |
8 | Attributes
9 | ----------
10 | best_guess: str
11 | a word which is the best guess so far for the word `word`
12 | global_checker: SpellChecker
13 | a class of pyspellchecker that can give global probabilities of words
14 | local_word_count: int
15 | number of different words in the single local document being considered
16 | prob_for_best_guess: float
17 | probability for `best_guess` (average of local and global probs)
18 | prob_for_word: float
19 | probability for `word` (average of local and global probs)
20 | word: str
21 | low probability word, likely a misspelled word. `best_guess` is a
22 | replacement for it.
23 | word_to_reps: dict[str, int]
24 | a dictionary mapping each word in the local document being considered,
25 | to its number of repetitions in that document.
26 |
27 | """
28 |
29 | def __init__(self, word, global_checker,
30 | word_to_reps=None, local_word_count=None):
31 | """
32 | Constructor
33 |
34 | Parameters
35 | ----------
36 | word: str
37 | global_checker: SpellChecker
38 | word_to_reps: dict[str, int]
39 | local_word_count: int
40 |
41 | """
42 | assert word[0].islower()
43 | self.word = word
44 | self.global_checker = global_checker
45 | self.word_to_reps = word_to_reps
46 | self.local_word_count = local_word_count
47 | if word_to_reps:
48 | assert local_word_count
49 |
50 | self.prob_for_word = \
51 | global_checker.word_usage_frequency(word)
52 | if word_to_reps:
53 | local_prob = word_to_reps[word] / local_word_count
54 | self.prob_for_word = (self.prob_for_word + local_prob) / 2
55 |
56 | self.best_guess = word
57 | self.prob_for_best_guess = 0
58 | self.do_update(word)
59 |
60 | def do_update(self, guess):
61 | """
62 | This method finds the probability of the word `guess` (the average of
63 | its global and local probabilities), and if that probability is greater
64 | than `prob_for_best_guess`, it replaces `best_guess` by `guess`. It also
65 | updates `prob_for_best_guess`.
66 |
67 | Parameters
68 | ----------
69 | guess: str
70 |
71 | Returns
72 | -------
73 | None
74 |
75 | """
76 | prob_for_guess = \
77 | self.global_checker.word_usage_frequency(guess)
78 | if self.word_to_reps:
79 | local_prob = self.word_to_reps[guess] / self.local_word_count
80 | prob_for_guess = (prob_for_guess + local_prob) / 2
81 | if prob_for_guess > self.prob_for_best_guess:
82 | self.best_guess = guess
83 | self.prob_for_best_guess = prob_for_guess
84 |
--------------------------------------------------------------------------------
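Usage sketch for WordGuesser. How spell_checking.py actually drives it is not shown in this section, so the loop below is an assumption: feed each candidate correction from pyspellchecker to do_update() and keep whichever guess ends up most probable.

    from spellchecker import SpellChecker
    from WordGuesser import WordGuesser

    checker = SpellChecker()
    guesser = WordGuesser("leter", checker)           # "leter" comes from spell_checking_test.txt
    for cand in (checker.candidates("leter") or []):  # guard: candidates() may return None
        guesser.do_update(cand)
    print(guesser.best_guess, guesser.prob_for_best_guess)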
/BatchSimilarity.py:
--------------------------------------------------------------------------------
1 | import importlib as imp
2 | from globals import *
3 | from Dag import *
4 |
5 | simi_def = imp.import_module(SIMI_DEF)
6 | from sklearn.metrics.pairwise import cosine_similarity
7 |
8 |
9 | class BatchSimilarity:
10 | """
11 | With sentence transformers, one can speed up the evaluation of sentence
12 | similarity by embedding large batches of sentences all at once, rather
13 | than one at a time. Given two DAGs, dag1 and dag2, this class uses a
14 | sentence transformer to evaluate the similarity between all sentences
15 | `all_ztz1` in dag1 and all the sentences `all_ztz2` in dag2. (ztz =
16 | sentence). `all_ztz1 + all_ztz2` are embedded as a batch, in a single
17 | shot.
18 |
19 | Attributes
20 | ----------
21 | all_ztz1: list[str]
22 | all_ztz2: list[str]
23 | cos_mat: np.array[float]
24 | a matrix of cosines corresponding to all_ztz1 X all_ztz2
25 | model: SentenceTransformer
26 | node_to_simple_ztz1: dict[Node, str]
27 | node_to_simple_ztz2: dict[Node, str]
28 |
29 | """
30 |
31 | def __init__(self,
32 | dag1,
33 | dag2,
34 | node_to_simple_ztz1,
35 | node_to_simple_ztz2,
36 | model=None):
37 | """
38 | Constructor
39 |
40 | Parameters
41 | ----------
42 | dag1: Dag
43 | dag2: Dag
44 | node_to_simple_ztz1: dict[Node, str]
45 | node_to_simple_ztz2: dict[Node, str]
46 | model: SentenceTransformer
47 | """
48 | self.node_to_simple_ztz1 = node_to_simple_ztz1
49 | self.node_to_simple_ztz2 = node_to_simple_ztz2
50 | self.all_ztz1 = [node_to_simple_ztz1[nd] for nd in dag1.nodes]
51 | self.all_ztz2 = [node_to_simple_ztz2[nd] for nd in dag2.nodes]
52 | self.model = model
53 | if model:
54 | sent_embeddings = model.encode(self.all_ztz1 + self.all_ztz2)
55 | len1 = len(self.all_ztz1)
56 | self.cos_mat = cosine_similarity(sent_embeddings[:len1],
57 | sent_embeddings[len1:])
58 |
59 | def simi(self, nd1, nd2):
60 | """
61 | This method returns the similarity of the sentences corresponding to
62 | nodes `nd1` and `nd2`.
63 |
64 | Parameters
65 | ----------
66 | nd1: Node
67 | nd2: Node
68 |
69 | Returns
70 | -------
71 | float
72 |
73 | """
74 | ztz1 = self.node_to_simple_ztz1[nd1]
75 | ztz2 = self.node_to_simple_ztz2[nd2]
76 | if not self.model:
77 | return simi_def.ztz_similarity(ztz1, ztz2)
78 | else:
79 | k1 = self.all_ztz1.index(ztz1)
80 | k2 = self.all_ztz2.index(ztz2)
81 | prob = self.cos_mat[k1, k2]
82 | if prob < 0:
83 | # print("neg. prob.=", prob)
84 | # print(ztz1)
85 | # print(ztz2)
86 | prob = 0
87 | odds = prob / (1 - prob) if prob < 1 else 1e5
88 | return round(odds, 3)
89 |
--------------------------------------------------------------------------------
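Toy usage sketch for BatchSimilarity. Dag.py is not shown in this section, so a SimpleNamespace stands in for a real Dag here; the class only touches its `.nodes` attribute.

    from types import SimpleNamespace
    from sentence_transformers import SentenceTransformer
    from Node import Node
    from BatchSimilarity import BatchSimilarity

    nd_a, nd_b = Node(0, 0), Node(1, 0)
    dag1 = SimpleNamespace(nodes=[nd_a])   # stand-in for a Dag
    dag2 = SimpleNamespace(nodes=[nd_b])
    bs = BatchSimilarity(dag1, dag2,
                         {nd_a: "Cats are beautiful animals."},
                         {nd_b: "Some gorgeous creatures are felines."},
                         model=SentenceTransformer('all-MiniLM-L6-v2'))
    print(bs.simi(nd_a, nd_b))   # cosine of the two embeddings, expressed as odds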
/white_paper/references.bib:
--------------------------------------------------------------------------------
1 | @book{book-of-why,
2 | title={The book of why: the new science of cause and effect},
3 | author={Pearl, Judea and Mackenzie, Dana},
4 | year={2018},
5 | publisher={Basic books}
6 | }
7 |
8 | @misc{bayesuvius,
9 | title="Bayesuvius (book)",
10 | author="Robert R. Tucci",
11 | howpublished="\url{https://github.com/rrtucci/Bayesuvius/raw/master/main.pdf}"
12 | }
13 |
14 | @inproceedings{2022opberg,
15 | title={OpBerg: Discovering causal sentences using optimal alignments},
16 | author={Wood, Justin and Matiasz, Nicholas and Silva, Alcino and Hsu, William and Abyzov, Alexej and Wang, Wei},
17 | booktitle={International Conference on Big Data Analytics and Knowledge Discovery},
18 | pages={17--30},
19 | year={2022},
20 | organization={Springer}
21 | }
22 |
23 | @misc{yann-religion,
24 | title="Twitter, Absurd statement about causal inference
25 | and religion",
26 | author="Yann LeCun",
27 | howpublished="\url{https://twitter.com/ylecun/status/1577128801620070400}"
28 | }
29 |
30 | @misc{yann-text,
31 | title="Twitter, Absurd statement
32 | about all
33 | the text ever written",
34 | author="Yann LeCun",
35 | howpublished="\url{https://twitter.com/ylecun/status/1562137291845521408}"
36 | }
37 |
38 | @misc{deft1,
39 | title="Causal DAG extraction from a library of books or videos/movies",
40 | author="Robert R. Tucci",
41 | howpublished="\url{https://arxiv.org/abs/2211.00486}"
42 | }
43 |
44 |
45 | @misc{tic-tac-toe,
46 | title="deft-tic-tac-toe at github",
47 | author="Robert R. Tucci",
48 | howpublished="\url{https://github.com/rrtucci/deft-tic-tac-toe}"
49 | }
50 |
51 | @misc{project-gutenberg,
52 | title="Project {G}utenberg website",
53 | howpublished="\url{https://www.gutenberg.org}"
54 | }
55 |
56 | @misc{imsdb,
57 | title="Internet {M}ovie {S}cript {D}atabase ({IMSDb})",
58 | howpublished="\url{https://imsdb.com/}"
59 | }
60 |
61 | @misc{github-mappa-mundi,
62 | title="Mappa {M}undi at github",
63 | author="Robert R. Tucci",
64 | howpublished="\url{https://github.com/rrtucci/mappa_mundi}"
65 | }
66 |
67 | @misc{audio-description,
68 | title="Audio description",
69 | author="Wikipedia",
70 | howpublished="\url{https://en.wikipedia.org/wiki/Audio_description}"
71 | }
72 |
73 | @misc{scumpy,
74 | title="{SCuMpy} at github",
75 | author="Robert R. Tucci",
76 | howpublished="\url{https://github.com/rrtucci/scumpy}"
77 | }
78 |
79 | @misc{sentence-ax,
80 | title="Sentence{A}x at github",
81 | author="Robert R. Tucci",
82 | howpublished="\url{https://github.com/rrtucci/SentenceAx}"
83 | }
84 |
85 | @misc{fitbit-dataset,
86 | title="FitBit Fitness Tracker Data",
87 | author="Kaggle.com",
88 | howpublished="\url{https://www.kaggle.com/datasets/arashnic/fitbit}"
89 | }
90 |
91 | @misc{causal-fitbit,
92 | title="Causal{F}itbit at github",
93 | author="Robert R. Tucci",
94 | howpublished="\url{https://github.com/rrtucci/CausalFitbit}"
95 | }
96 |
97 | @misc{sbert,
98 | title="{sBERT}",
99 | author="sbert.net",
100 | howpublished="\url{https://www.sbert.net/}"
101 | }
102 |
103 | @misc{openie6,
104 | title="{Openie6}",
105 | author="dair-iitd",
106 | howpublished="\url{https://github.com/dair-iitd/openie6}"
107 | }
108 |
109 |
110 |
--------------------------------------------------------------------------------
/simp_deprecated/simp_spacy1.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains one of several implementations of the function
4 | `simplify_ztz(sentence, verbose=False)` that we considered.
5 |
6 | Refs:
7 | https://spacy.io/usage/spacy-101/
8 |
9 | https://subscription.packtpub.com/book/data/9781838987312/2/ch02lvl1sec13/splitting-sentences-into-clauses
10 | """
11 | from globals import *
12 | import spacy
13 |
14 | nlp = spacy.load('en_core_web_sm')
15 |
16 |
17 | # sentence = "He eats cheese, but he won't eat ice cream."
18 |
19 | def simplify_ztz(sentence, verbose=False):
20 | """
21 | This method simplifies the sentence `sentence`. It returns a list of
22 | simple sentences extracted from the input sentence.
23 |
24 | Parameters
25 | ----------
26 | sentence: str
27 | verbose: bool
28 | kwargs: dict[]
29 |
30 | Returns
31 | -------
32 | list[str]
33 |
34 | """
35 |
36 |
37 | doc = nlp(sentence)
38 |
39 | for token in doc:
40 | ancestors = [t.text for t in token.ancestors]
41 | children = [t.text for t in token.children]
42 | # if verbose:
43 | # print(token.text, "\t", token.i, "\t",
44 | # token.pos_, "\t", token.dep_, "\t",
45 | # ancestors, "\t", children)
46 |
47 | def find_root_of_sentence(doc):
48 | root_token = None
49 | for token in doc:
50 | if (token.dep_ == "ROOT"):
51 | root_token = token
52 | return root_token
53 |
54 | root_token = find_root_of_sentence(doc)
55 |
56 | def find_other_verbs(doc, root_token):
57 | other_verbs = []
58 | for token in doc:
59 | ancestors = list(token.ancestors)
60 | if (token.pos_ == "VERB" and len(ancestors) == 1 \
61 | and ancestors[0] == root_token):
62 | other_verbs.append(token)
63 | return other_verbs
64 |
65 | other_verbs = find_other_verbs(doc, root_token)
66 |
67 | def get_clause_token_span_for_verb(verb, doc, all_verbs):
68 | first_token_index = len(doc)
69 | last_token_index = 0
70 | this_verb_children = list(verb.children)
71 | for child in this_verb_children:
72 | if (child not in all_verbs):
73 | if (child.i < first_token_index):
74 | first_token_index = child.i
75 | if (child.i > last_token_index):
76 | last_token_index = child.i
77 | return (first_token_index, last_token_index)
78 |
79 | token_spans = []
80 | all_verbs = [root_token] + other_verbs
81 | for other_verb in all_verbs:
82 | (first_token_index, last_token_index) = \
83 | get_clause_token_span_for_verb(other_verb,
84 | doc, all_verbs)
85 | token_spans.append((first_token_index,
86 | last_token_index))
87 |
88 | sentence_clauses = []
89 | for token_span in token_spans:
90 | start = token_span[0]
91 | end = token_span[1]
92 | if (start < end):
93 | clause = doc[start:end]
94 | sentence_clauses.append(clause)
95 | sentence_clauses = sorted(sentence_clauses,
96 | key=lambda tup: tup[0])
97 |
98 | clauses_text = [clause.text for clause in sentence_clauses]
99 | if verbose:
100 | print(sentence)
101 | print(clauses_text)
102 |
103 | return clauses_text
104 |
--------------------------------------------------------------------------------
/simp_deprecated/simp_spacy4.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains one of several implementations of the function
4 | `simplify_ztz(sentence, verbose=False)` that we considered.
5 |
6 | References:
7 |
8 | 1. "Knowledge graphs from complex text" by Karthika Vijayan (Solution
9 | Consultant @Sahaj )
10 | https://medium.com/inspiredbrilliance/knowledge-graphs-from-complex-text-eb009aeed48e
11 |
12 | """
13 | import spacy
14 | nlp = spacy.load('en_core_web_lg')
15 | import coreferee
16 | import spacy_transformers
17 |
18 | def coref_resolve(text):
19 | nlp1 = spacy.load('en_core_web_trf')
20 | nlp1.add_pipe('coreferee')
21 | doc1 = nlp1(text)
22 | tok_list = list(token.text for token in doc1)
23 | c = 0
24 | for chain in doc1._.coref_chains:
25 | for mention in chain:
26 | res1 = [doc1._.coref_chains.resolve(doc1[i]) for i in mention]
27 | res = list(filter((None).__ne__, res1))
28 | if len(res) != 0:
29 | if len(res[0]) == 1:
30 | tok_list[mention[0] + c] = str(res[0][0])
31 | elif len(res[0]) > 1:
32 | tok_list[mention[0] + c] = str(res[0][0])
33 | for j in range(1, len(res[0])):
34 | tok_list.insert(mention[0] + c + j, str(res[0][j]))
35 | c = c + 1
36 | textres = " ".join(tok_list)
37 | return textres
38 |
39 |
40 | def compound_to_simple(sentence):
41 | doc = nlp(sentence)
42 |
43 | root_token = None
44 | for token in doc:
45 | if (token.dep_ == "ROOT"):
46 | root_token = token
47 |
48 | other_verbs = []
49 | for token in doc:
50 | ancestors = list(token.ancestors)
51 | if (token.pos_ == "VERB" and len(
52 | ancestors) < 3 and token != root_token):
53 | other_verbs.append(token)
54 |
55 | token_spans = []
56 | all_verbs = [root_token] + other_verbs
57 | for other_verb in all_verbs:
58 | first_token_index = len(doc)
59 | last_token_index = 0
60 | this_verb_children = list(other_verb.children)
61 | for child in this_verb_children:
62 | if (child not in all_verbs):
63 | if (child.i < first_token_index):
64 | first_token_index = child.i
65 | if (child.i > last_token_index):
66 | last_token_index = child.i
67 | token_spans.append((first_token_index, last_token_index))
68 |
69 | sentence_clauses = []
70 | for token_span in token_spans:
71 | start = token_span[0]
72 | end = token_span[1]
73 | if (start < end):
74 | clause = doc[start:end]
75 | sentence_clauses.append(clause)
76 | sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0])
77 | clauses_text = [clause.text for clause in sentence_clauses]
78 | return clauses_text
79 |
80 | def simplify_ztz(sentence, verbose=False):
81 | """
82 | This method simplifies the sentence `sentence`. It returns a list of
83 | simple sentences extracted from the input sentence.
84 |
85 | Parameters
86 | ----------
87 | sentence: str
88 | verbose: bool
89 | kwargs: dict[]
90 |
91 | Returns
92 | -------
93 | list[str]
94 |
95 | """
96 |
97 | textres = coref_resolve(sentence)
98 | ztz_list = compound_to_simple(textres)
99 | if verbose:
100 | print(sentence.strip())
101 | print(ztz_list)
102 | return ztz_list
--------------------------------------------------------------------------------
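A minimal usage sketch for the deprecated simplifier above, assuming the `en_core_web_lg` and `en_core_web_trf` models plus the `coreferee` pipe are installed and that the repo root is on the import path; the exact clauses returned depend on the parses those models produce.

    # Usage sketch (assumes en_core_web_lg, en_core_web_trf and coreferee
    # are installed, and that the repo root is on sys.path).
    from simp_deprecated.simp_spacy4 import simplify_ztz

    ztz = "Jane opened the door and she walked into the dark garden."
    # coref_resolve() rewrites "she" as "Jane"; compound_to_simple() then
    # returns roughly one clause per verb, e.g.
    # ['Jane opened the door', 'Jane walked into the dark garden']
    print(simplify_ztz(ztz, verbose=True))
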
/simp_spacy3.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains one of several implementations of the function
4 | `simplify_ztz(sentence, verbose=False)` that we considered.
5 |
6 | Refs:
7 | https://spacy.io/usage/spacy-101/
8 |
9 | For spacy, here are some values of token.dep_
10 |
11 | cc: coordinating conjunction.
12 | i.e., FANBOYS = for, and, nor, but, or, yet, so
13 |
14 | mark: marker that introduces a subordinate clause
15 |
16 | ADP: adposition, e.g. in, to, during
17 |
18 | """
19 |
20 | import spacy
21 | import re
22 | from globals import *
23 |
24 | nlp = spacy.load("en_core_web_sm")
25 | nlp.add_pipe("merge_entities")
26 |
27 |
28 | def simplify_ztz(sentence, verbose=False):
29 | """
30 | This method simplifies the sentence `sentence`. It returns a list of
31 | simple sentences extracted from the input sentence.
32 |
33 | Parameters
34 | ----------
35 | sentence: str
36 | verbose: bool
37 |
38 | Returns
39 | -------
40 | list[str]
41 |
42 | """
43 | doc = nlp(sentence)
44 | tokenized_clauses_list = []
45 | tokenized_clause = []
46 | for token in doc:
47 | cond = (token.dep_ == "mark") or \
48 | (token.dep_ == "cc") or \
49 | (token.text == ";")
50 | if not cond:
51 | tokenized_clause.append(token)
52 | else:
53 | tokenized_clauses_list.append(tokenized_clause)
54 | tokenized_clause = []
55 | # last clause
56 | tokenized_clauses_list.append(tokenized_clause)
57 |
58 | ztz_list = []
59 | for tokenized_clause in tokenized_clauses_list:
60 |
61 | # replace by empty list any tokenized clause
62 | # that doesn't have a noun/pronoun and a verb
63 | clause_has_noun_or_pronoun = False
64 | clause_has_verb = False
65 | token_str_list = []
66 | for token in tokenized_clause:
67 | x = get_simplified_token_txt(token)
68 | if x:
69 | token_str_list.append(x)
70 | if token.pos_ in ["NOUN", "PRON", "PROPN"] and x:
71 | clause_has_noun_or_pronoun = True
72 | # print("NOUN or PRONOUN", token.text)
73 | if token.pos_ in ["VERB", "AUX"] and x:
74 | clause_has_verb = True
75 | # print("VERB", token.text)
76 | if not (clause_has_noun_or_pronoun and clause_has_verb):
77 | clause_str = ""
78 | else:
79 | clause_str = " ".join(token_str_list)
80 |
81 | if clause_str:
82 | ztz_list.append(clause_str)
83 |
84 | if verbose:
85 | print(sentence.strip())
86 | print(ztz_list)
87 | return ztz_list
88 |
89 |
90 | def get_simplified_token_txt(token):
91 | """
92 | This auxiliary method takes as input a SpaCy Token `token` and returns a
93 | simplified version of the token's text.
94 |
95 | Parameters
96 | ----------
97 | token: Token
98 |
99 | Returns
100 | -------
101 | str
102 |
103 | """
104 | x = token.text
105 | # remove all punctuation marks
106 | x = re.sub(r'[^\w\s]', '', x)
107 | if token.ent_type_:
108 | # replace named entities by their labels
109 | # x = token.ent_type_
110 |
111 | # remove named entities
112 | x = ""
113 | if token.is_stop and (token.pos_ not in RETAINED_STOPWORD_POS):
114 | x = ""
115 | if token.pos_ not in RETAINED_POS:
116 | x = ""
117 | # remove single character tokens
118 | if len(x.strip()) == 1:
119 | x = ""
120 | x = x.strip()
121 | return x
122 |
--------------------------------------------------------------------------------
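The core of the recommended simplifier above is its splitting rule: cut the token stream at every token whose dependency label is "mark" or "cc", or whose text is ";", then keep only the clauses that still contain a noun/pronoun and a verb after token filtering. Below is a small sketch of just the splitting step, assuming `en_core_web_sm` is installed; the example sentence and the printed clauses are illustrative only.

    # Sketch of the clause-splitting rule used by simplify_ztz() above,
    # without the extra filtering done by get_simplified_token_txt().
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The dog barked because the mailman arrived, and the cat hid.")

    clauses, current = [], []
    for token in doc:
        if token.dep_ in ("mark", "cc") or token.text == ";":
            clauses.append(current)   # close the clause at the split token
            current = []
        else:
            current.append(token.text)
    clauses.append(current)           # last clause

    print([" ".join(c) for c in clauses if c])
    # e.g. ['The dog barked', 'the mailman arrived ,', 'the cat hid .']
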
/similarity_deprecated/similarity_spacy.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains a function `ztz_similarity(ztz1, ztz2)`
4 | that returns the similarity of sentences `ztz1` and `ztz2`.
5 | ztz = sentence
6 |
7 | It uses SpaCy word vectors (from the en_core_web_lg model)
8 |
9 | Ref:
10 |
11 |
12 | """
13 |
14 | from itertools import product
15 | import numpy as np
16 | import spacy
17 | from globals import *
18 |
19 | nlp = spacy.load('en_core_web_lg')
20 |
21 |
22 | def ztz_similarity(ztz1, ztz2, **kwargs):
23 | """
24 | This method returns the similarity between sentences `ztz1` and `ztz2`.
25 | The similarity is measured as odds of a probability, so it ranges from 0
26 | to infinity.
27 |
28 | Parameters
29 | ----------
30 | ztz1: str
31 | ztz2: str
32 |
33 | Returns
34 | -------
35 | float
36 |
37 | """
38 |
39 | def same_pos(token1, token2):
40 | # this gives same simi but elapsed time is less
41 | return token1.pos_ == token2.pos_
42 |
43 | doc1 = nlp(ztz1)
44 | doc2 = nlp(ztz2)
45 | sp_tokens1 = [token1 for token1 in doc1 \
46 | if token1.pos_ in RETAINED_POS]
47 | sp_tokens2 = [token2 for token2 in doc2 \
48 | if token2.pos_ in RETAINED_POS]
49 | token_pair_to_simi = {}
50 | for token1, token2 in product(sp_tokens1, sp_tokens2):
51 | if same_pos(token1, token2):
52 | simi = nlp(token1.text.lower()). \
53 | similarity(nlp(token2.text.lower()))
54 | # print("llkj", token1.text, token2.text, token1.pos_, simi)
55 | if simi is not None:
56 | token_pair_to_simi[(token1, token2)] = simi
57 | # print("ffgh", "****************")
58 | # ("mmnk", token_pair_to_simi)
59 | score1 = 0.0
60 | count1 = 0
61 | for token1 in sp_tokens1:
62 | simi_list = [token_pair_to_simi[(token1, token2)]
63 | for token2 in sp_tokens2
64 | if same_pos(token1, token2)]
65 | if simi_list:
66 | best_score = max(simi_list)
67 | score1 += best_score
68 | count1 += 1
69 | if count1:
70 | score1 /= count1
71 |
72 | score2 = 0.0
73 | count2 = 0
74 | for token2 in sp_tokens2:
75 | simi_list = [token_pair_to_simi[(token1, token2)]
76 | for token1 in sp_tokens1
77 | if same_pos(token1, token2)]
78 | if simi_list:
79 | best_score = max(simi_list)
80 | score2 += best_score
81 | count2 += 1
82 | if count2:
83 | score2 /= count2
84 | prob = (score1 + score2) / 2
85 | if prob < 1:
86 | odds = prob / (1 - prob)
87 | else:
88 | odds = 1000
89 | return round(odds, 3)
90 |
91 |
92 | """
93 | ************ simi definition from: similarity_spacy
94 | 1. Cats are beautiful animals.
95 | 2. Dogs are awesome.
96 | simi(1, 2)= 2.578
97 | simi(2, 1)= 2.578
98 |
99 | 1. Cats are beautiful animals.
100 | 2. Some gorgeous creatures are felines.
101 | simi(1, 2)= 2.697
102 | simi(2, 1)= 2.697
103 |
104 | 1. Cats are beautiful animals.
105 | 2. Dolphins are swimming mammals.
106 | simi(1, 2)= 2.535
107 | simi(2, 1)= 2.535
108 |
109 | 1. Cats are beautiful animals.
110 | 2. Cats are beautiful animals.
111 | simi(1, 2)= 1000
112 | simi(2, 1)= 1000
113 |
114 | 1. Cats are beautiful animals.
115 | 2. Cats are beauti animals.
116 | simi(1, 2)= 7.986
117 | simi(2, 1)= 7.986
118 |
119 | ************ simi definition from: similarity_spacy
120 | 1. apple
121 | 2. horse
122 | simi(1, 2)= 0.247
123 | simi(2, 1)= 0.247
124 |
125 | 1. Paul
126 | 2. John
127 | simi(1, 2)= 0.0
128 | simi(2, 1)= 0.0
129 |
130 | 1. The cat sat on the mat.
131 | 2. The dog lay on the rug.
132 | simi(1, 2)= 1.678
133 | simi(2, 1)= 1.678
134 | elapsed time= 0.14391398429870605
135 |
136 | """
137 |
--------------------------------------------------------------------------------
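The score-to-odds step at the end of `ztz_similarity()` above is the same in all the similarity backends: average the two directed best-match scores to get a probability-like number p, then report the odds p/(1-p), capping identical sentences (p = 1) at 1000. A worked example with made-up scores:

    # Worked example of the odds conversion (scores below are hypothetical).
    score1, score2 = 0.70, 0.75      # directed best-match averages
    prob = (score1 + score2) / 2     # 0.725
    odds = prob / (1 - prob) if prob < 1 else 1000
    print(round(odds, 3))            # 2.636
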
/similarity_nltk.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains a function `ztz_similarity(ztz1, ztz2)`
4 | that returns the similarity of sentences `ztz1` and `ztz2`.
5 | ztz = sentence
6 |
7 | It uses NLTK + WordNet
8 |
9 | Ref:
10 | https://nlpforhackers.io/wordnet-sentence-similarity/
11 |
12 | """
13 |
14 | from nltk import word_tokenize, pos_tag
15 | from nltk.corpus import wordnet as wn
16 | from itertools import product
17 | from collections import defaultdict
18 | from time import time
19 |
20 |
21 | def penn_to_wn(tag):
22 | """
23 | Convert a Penn Treebank tag to a simplified Wordnet tag
24 |
25 | Parameters
26 | ----------
27 | tag: str
28 |
29 | Returns
30 | -------
31 | str
32 |
33 | """
34 | if tag.startswith('N'):
35 | return 'n' # noun
36 |
37 | if tag.startswith('V'):
38 | return 'v' # verb
39 |
40 | if tag.startswith('J'):
41 | return 'a' # adjective
42 |
43 | if tag.startswith('R'):
44 | return 'r' # adverb
45 |
46 | return None
47 |
48 |
49 | def synset_for_tgd_word(tgd_word):
50 | """
51 | This private method returns the most likely synset for a tagged word
52 | `tgd_word`. A synset (synonym set) is a sort of equivalence class of
53 | words with very similar meanings.
54 |
55 | Parameters
56 | ----------
57 | tgd_word: tuple(str, str)
58 |
59 | Returns
60 | -------
61 | wn.synset or None
62 |
63 | """
64 | word, tag = tgd_word
65 | wn_tag = penn_to_wn(tag)
66 | if wn_tag is None:
67 | return None
68 |
69 | try:
70 | return wn.synsets(word, wn_tag)[0]
71 | except IndexError:  # no synset found for this word/POS
72 | return None
73 |
74 |
75 | def ztz_similarity(ztz1, ztz2, **kwargs):
76 | """
77 | This method returns the similarity between sentences `ztz1` and `ztz2`.
78 | The similarity is measured as odds of a probability, so it ranges from 0
79 | to infinity.
80 |
81 | Parameters
82 | ----------
83 | ztz1: str
84 | ztz2: str
85 |
86 | Returns
87 | -------
88 | float
89 |
90 | """
91 |
92 | do_time = False
93 | if do_time:
94 | print("similarity start", time())
95 | # Tokenize and tag
96 | tgd_ztz1 = pos_tag(word_tokenize(ztz1.lower()))
97 | tgd_ztz2 = pos_tag(word_tokenize(ztz2.lower()))
98 |
99 | # Get the synsets for the tagged words (tgd_word)
100 | all_ss1 = []
101 | for tgd_word in tgd_ztz1:
102 | ss1 = synset_for_tgd_word(tgd_word)
103 | if ss1:
104 | all_ss1.append(ss1)
105 | all_ss2 = []
106 | for tgd_word in tgd_ztz2:
107 | ss2 = synset_for_tgd_word(tgd_word)
108 | if ss2:
109 | all_ss2.append(ss2)
110 |
111 | ss_pair_to_simi = defaultdict(lambda: 0)
112 | if do_time:
113 | print("similarity begin path_similarity()", time())
114 | for ss1, ss2 in product(all_ss1, all_ss2):
115 | simi = ss1.path_similarity(ss2)
116 | if simi is not None:
117 | ss_pair_to_simi[(ss1, ss2)] = simi
118 |
119 | score1 = 0.0
120 | count1 = 0
121 | for ss1 in all_ss1:
122 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss2 in all_ss2]
123 | if simi_list:
124 | best_score = max(simi_list)
125 | score1 += best_score
126 | count1 += 1
127 | if count1:
128 | score1 /= count1
129 |
130 | score2 = 0.0
131 | count2 = 0
132 | for ss2 in all_ss2:
133 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss1 in all_ss1]
134 | if simi_list:
135 | best_score = max(simi_list)
136 | score2 += best_score
137 | count2 += 1
138 | if count2:
139 | score2 /= count2
140 | prob = (score1 + score2) / 2
141 | if prob < 1:
142 | odds = prob / (1 - prob)
143 | else:
144 | odds = 1000
145 | if do_time:
146 | print("similarity ends", time())
147 | return round(odds, 3)
148 |
149 |
150 | """
151 | ************ simi definition from: similarity_nltk
152 | 1. Cats are beautiful animals.
153 | 2. Dogs are awesome.
154 | simi(1, 2)= 1.045
155 | simi(2, 1)= 1.045
156 |
157 | 1. Cats are beautiful animals.
158 | 2. Some gorgeous creatures are felines.
159 | simi(1, 2)= 2.429
160 | simi(2, 1)= 2.429
161 |
162 | 1. Cats are beautiful animals.
163 | 2. Dolphins are swimming mammals.
164 | simi(1, 2)= 0.733
165 | simi(2, 1)= 0.733
166 |
167 | 1. Cats are beautiful animals.
168 | 2. Cats are beautiful animals.
169 | simi(1, 2)= 1000
170 | simi(2, 1)= 1000
171 |
172 | ************ simi definition from: similarity_nltk
173 | 1. apple
174 | 2. horse
175 | simi(1, 2)= 0.056
176 | simi(2, 1)= 0.056
177 |
178 | 1. Paul
179 | 2. John
180 | simi(1, 2)= 0.083
181 | simi(2, 1)= 0.083
182 |
183 | 1. The cat sat on the mat.
184 | 2. The dog lay on the rug.
185 | simi(1, 2)= 0.353
186 | simi(2, 1)= 0.353
187 | elapsed time= 0.006499767303466797
188 |
189 | """
190 |
--------------------------------------------------------------------------------
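For readers unfamiliar with the WordNet calls above, this is the machinery in miniature: POS-tag a word, map the Penn tag to a WordNet tag, take the first synset, and compare synsets with `path_similarity()`. It assumes the NLTK data packages (`punkt`, `averaged_perceptron_tagger`, `wordnet`) have been downloaded.

    # WordNet machinery used by ztz_similarity() above, in miniature.
    from nltk import pos_tag, word_tokenize
    from nltk.corpus import wordnet as wn

    word, tag = pos_tag(word_tokenize("cats"))[0]    # ('cats', 'NNS')
    wn_tag = 'n' if tag.startswith('N') else None    # penn_to_wn() shortcut
    ss_cat = wn.synsets(word, wn_tag)[0]             # most likely synset
    ss_dog = wn.synsets("dog", 'n')[0]
    print(ss_cat, ss_dog, ss_cat.path_similarity(ss_dog))
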
/simplifying.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains functions for simplifying movie scripts (or short stories).
4 |
5 | input directory: m_scripts_spell or short_stories_spell
6 | output directory: m_scripts_simp or short_stories_simp
7 |
8 | Simplification is done by the function `simplify_ztz()`. This function was
9 | implemented in several ways before we decided to stick with the version in
10 | file `simp_spacy3`.
11 |
12 | simp_spacy1.py
13 | simp_spacy2.py
14 | simp_spacy3.py (recommended)
15 | simp_spacy-claucy.py
16 | simp_stanford.py
17 |
18 | The input files have only one sentence per line. For each file, we use SpaCy
19 | to break each sentence into clauses. Then we simplify the clauses by
20 | removing stop-words, punctuation marks, proper nouns (a.k.a. named entities)
21 | and other excess baggage. Then we replace each clause by its simplified
22 | version. Different simplified clauses from the same sentence are put in the
23 | same line, separated by a separator-token (ZTZ_SEPARATOR). Some sentences are
24 | reduced to nothing by the simplification. Those sentences are replaced by a
25 | line consisting of a single ZTZ_SEPARATOR.
26 |
27 | """
28 | from globals import *
29 | import os
30 | import re
31 | import importlib as imp
32 |
33 | zsimp = imp.import_module(ZTZ_SIMPLIFIER)
34 | from utils import *
35 |
36 |
37 | def simplify_one_m_script(
38 | in_dir, out_dir,
39 | file_name,
40 | verbose=False,
41 | use_gpu=False):
42 | """
43 | in_dir and out_dir can be the same, but this will overwrite the files.
44 |
45 | This method reads a file called `file_name` in the `in_dir` directory
46 | and creates a simplified version in the `out_dir` directory.
47 |
48 |
49 | Parameters
50 | ----------
51 | in_dir: str
52 | out_dir: str
53 | file_name: str
54 | verbose: bool
55 | use_gpu: bool
56 |
57 | Returns
58 | -------
59 | None
60 |
61 | """
62 | inpath = in_dir + "/" + file_name
63 | outpath = out_dir + "/" + file_name
64 | new_lines = []
65 | with open(inpath, "r") as f:
66 | count = 1
67 | for line in f:
68 | if verbose:
69 | print(str(count) + ".")
70 | simple_ztz_list = zsimp.simplify_ztz(line,
71 | verbose=verbose,
72 | use_gpu=use_gpu)
73 |
74 | # remove empty clauses
75 | simple_ztz_list = [ztz for ztz in simple_ztz_list if ztz]
76 |
77 | if simple_ztz_list == []:
78 | simple_ztz_list = [ZTZ_SEPARATOR]
79 |
80 | # replace multiple white spaces by single white space
81 | simple_ztz_list = [re.sub(r'\s+', ' ', ztz) for ztz in
82 | simple_ztz_list]
83 |
84 | if len(simple_ztz_list) > 1:
85 | xx = " " + ZTZ_SEPARATOR + " "
86 | new_lines.append(xx.join(simple_ztz_list))
87 | elif len(simple_ztz_list) == 1:
88 | new_lines.append(simple_ztz_list[0])
89 | else:
90 | assert False
91 |
92 | count += 1
93 | with open(outpath, "w") as f:
94 | for line in new_lines:
95 | f.write(line + "\n")
96 |
97 |
98 | def simplify_batch_of_m_scripts(
99 | in_dir, out_dir,
100 | batch_file_names,
101 | verbose=False):
102 | """
103 | This method calls the method `simplify_one_m_script` for all the file
104 | names in the list of file names `batch_file_names`.
105 |
106 |
107 | Parameters
108 | ----------
109 | in_dir: str
110 | out_dir: str
111 | batch_file_names: list[str]
112 | verbose: bool
113 |
114 | Returns
115 | -------
116 | None
117 |
118 | """
119 | all_file_names = my_listdir(in_dir)
120 | assert set(batch_file_names).issubset(set(all_file_names))
121 | for file_name in batch_file_names:
122 | i = all_file_names.index(file_name)
123 | print('%i.' % (i + 1), file_name)
124 | simplify_one_m_script(in_dir, out_dir, file_name, verbose)
125 |
126 |
127 | if __name__ == "__main__":
128 |
129 | def main1():
130 | print("************ simplifier:", ZTZ_SIMPLIFIER)
131 | ztz = \
132 | 'The man, who had never liked the words' \
133 | ' "booby" and "boobyhatch,"' \
134 | ' and who liked them even less on a shining morning when there' \
135 | ' was a unicorn in the garden, thought for a moment.'
136 | zsimp.simplify_ztz(ztz, verbose=True)
137 |
138 |
139 | def main2():
140 | print("************ simplifier:", ZTZ_SIMPLIFIER)
141 | path = "simplifying_test.txt"
142 | with open(path, "r") as f:
143 | count = 1
144 | for line in f:
145 | print(str(count) + ".")
146 | zsimp.simplify_ztz(line, verbose=True)
147 | count += 1
148 |
149 |
150 | def main3():
151 | print("************ simplifier:", ZTZ_SIMPLIFIER)
152 | in_dir = "short_stories_spell"
153 | out_dir = "short_stories_simp"
154 | batch_file_names = my_listdir(in_dir)[0:3]
155 | simplify_batch_of_m_scripts(
156 | in_dir, out_dir,
157 | batch_file_names,
158 | verbose=False)
159 |
160 |
161 | def main4():
162 | print("************ simplifier:", ZTZ_SIMPLIFIER)
163 | remove_dialogs = False
164 | in_dir = SPELL_DIR if not remove_dialogs else SPELL_RD_DIR
165 | out_dir = SIMP_DIR if not remove_dialogs else SIMP_RD_DIR
166 | batch_file_names = my_listdir(in_dir)[0:3]
167 | simplify_batch_of_m_scripts(
168 | in_dir, out_dir,
169 | batch_file_names)
170 |
171 |
172 | main1()
173 | main2()
174 | # main3()
175 | # main4()
176 |
--------------------------------------------------------------------------------
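To make the output format concrete, here is a small self-contained sketch of the line-building logic in `simplify_one_m_script()` above: every input sentence becomes one output line whose simplified clauses are joined by ZTZ_SEPARATOR, and a sentence that simplifies to nothing becomes a line holding ZTZ_SEPARATOR alone. The separator value below is a stand-in; the real one is defined in globals.py.

    ZTZ_SEPARATOR = "[SEP]"   # hypothetical value, for illustration only

    def to_output_line(simple_ztz_list):
        # mirrors the joining logic of simplify_one_m_script()
        simple_ztz_list = [ztz for ztz in simple_ztz_list if ztz]
        if not simple_ztz_list:
            return ZTZ_SEPARATOR
        return (" " + ZTZ_SEPARATOR + " ").join(simple_ztz_list)

    print(to_output_line(["man thought for moment", "unicorn was in garden"]))
    # man thought for moment [SEP] unicorn was in garden
    print(to_output_line([]))
    # [SEP]
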
/downloading_imsdb.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | The goal of this file is to scrape the 1,100+ movie scripts from the IMSDb
4 | website.
5 |
6 | References:
7 | https://github.com/j2kun/imsdb_download_all_scripts
8 | https://github.com/AdeboyeML/Film_Script_Analysis
9 | https://www.datacamp.com/tutorial/scraping-reddit-python-scrapy
10 |
11 | In Chrome and most web browsers, pressing Ctrl+U opens the current page's
12 | source code in a new window.
13 |
14 | 3 depths, d0, d1, d2
15 |
16 | d0_url
17 | https://imsdb.com/all-scripts.html
18 |
19 | d1_url (depends on movie)
20 | https://imsdb.com/Movie%20Scripts/10%20Things%20I%20Hate%20About%20You%20Script.html
21 |
22 | d2_url (depends on movie)
23 | https://imsdb.com/scripts/10-Things-I-Hate-About-You.html
24 |
25 | find_all() takes you from X->Y
26 | (d0_html, d0_soup)->d1_url
27 | (d1_html, d1_soup)->d2_url
28 | """
29 | from bs4 import BeautifulSoup
30 | import requests
31 | from slugify import slugify # python-slugify
32 | from globals import *
33 |
34 |
35 | def get_d1_urls_and_titles():
36 | """
37 | This auxiliary method returns lists `d1_urls` and `titles`.
38 |
39 | Returns
40 | -------
41 | list[str], list[str]
42 |
43 | """
44 | d1_urls = []
45 | titles = []
46 | d0_url = BASE_URL + "/all-scripts.html"
47 | d0_html = requests.get(d0_url).text
48 | d0_soup = BeautifulSoup(d0_html, "html.parser")
49 | for p_tag in d0_soup.find_all('p'):
50 | d1_url = p_tag.a['href']
51 | cond1 = "/Movie Scripts/" in d1_url
52 | cond2 = ".html" in d1_url
53 | if cond1 and cond2:
54 | title = d1_url.replace("/Movie Scripts/", ""). \
55 | replace(" Script.html", ""). \
56 | replace(".html", "")
57 | d1_urls.append(BASE_URL + d1_url)
58 | titles.append(title)
59 | return d1_urls, titles
60 |
61 |
62 | def get_one_m_script(d1_url, stub_only=False):
63 | """
64 | This method scrapes one movie script with d1-level URL `d1_url`.
65 |
66 | Parameters
67 | ----------
68 | d1_url: str
69 | stub_only: bool
70 | True iff we don't want to scrape the movie script text at all. Instead
71 | of the movie script text, it leaves a message "coming soon to a
72 | theater near you"
73 |
74 | Returns
75 | -------
76 | str, bool
77 | the movie script, and a boolean indicating if it's missing.
78 |
79 | """
80 | missing = False
81 | tail = d1_url.split('/')[-1].replace(".html", "")
82 | if stub_only:
83 | m_script = "coming soon to a theater near you"
84 | else:
85 | # print("nabf", d1_url)
86 | d1_html = requests.get(d1_url).text
87 | d1_soup = BeautifulSoup(d1_html, "html.parser")
88 | p_tags = d1_soup.find_all('p', align="center")
89 | if not p_tags:
90 | print('**************** Missing: %s' % tail)
91 | missing = True
92 | return "coming soon to a theater near you", missing
93 | assert len(p_tags) == 1
94 | d2_url = p_tags[0].a['href']
95 | d2_url = BASE_URL + d2_url
96 | # print("nnfx", d2_url)
97 | d2_html = requests.get(d2_url).text
98 | d2_soup = BeautifulSoup(d2_html, "html.parser")
99 | # tried this. Doesn't always work
100 | # pre_tags = d2_soup.find_all('pre')
101 | pre_tags = d2_soup.find_all('td', {'class': "scrtext"})
102 | if not pre_tags:
103 | print('**************** Missing: %s' % tail)
104 | missing = True
105 | return "coming soon to a theater near you", missing
106 | m_script = pre_tags[0].get_text()
107 | # m_script = clean_m_script(m_script)
108 | return m_script, missing
109 |
110 |
111 | def get_batch_of_m_scripts(first=1, last=5000, stub_only=False):
112 | """
113 | This method scrapes the movie scripts starting at position `first` and
114 | ending at position `last`. If `last` is larger than the number of movie
115 | scripts at IMSDb, then the method ends when it has scraped all movie
116 | scripts.
117 |
118 | Parameters
119 | ----------
120 | first: int
121 | last: int
122 | stub_only: bool
123 |
124 | Returns
125 | -------
126 | None
127 |
128 | """
129 | d1_urls, titles = get_d1_urls_and_titles()
130 | num_titles = len(titles)
131 | missing_m_scripts = []
132 | assert first <= last
133 | if last > num_titles:
134 | last = num_titles
135 | if first < 1:
136 | first = 1
137 | for i in range(first - 1, last):
138 | d1_url = d1_urls[i]
139 | dashed_title = slugify(titles[i])
140 | print('%i. fetching %s' % (i + 1, dashed_title))
141 | m_script, missing = get_one_m_script(d1_url, stub_only=stub_only)
142 | outpath = M_SCRIPTS_DIR + '/' + dashed_title + '.txt'
143 | if missing:
144 | missing_m_scripts.append(dashed_title + '.txt')
145 | else:
146 | written = False
147 | len_script = len(m_script)
148 | print("m_script num of characters=", len_script)
149 | if len_script > 500:
150 | with open(outpath, "w", newline="\n") as f:
151 | f.write(m_script)
152 | written = True
153 | if not written:
154 | # m-scripts with less than 500 char are just stubs
155 | print("------------------ Found just a stub: ", dashed_title)
156 | missing_m_scripts.append(dashed_title + '.txt')
157 | print("missing m_scripts:")
158 | print(missing_m_scripts)
159 | print("number of missing m_scripts=", len(missing_m_scripts))
160 |
161 |
162 | if __name__ == "__main__":
163 | def main1():
164 | urls, titles = get_d1_urls_and_titles()
165 | print(urls)
166 | print(titles)
167 | assert len(urls) == len(titles)
168 | print("number of films=", len(urls)) # 1211
169 | # 75 missing
170 | # 1211-75=1136 expected 238 MB
171 |
172 |
173 | def main2():
174 | get_batch_of_m_scripts(first=1, last=100, stub_only=False)
175 |
176 |
177 | # main1()
178 | main2()
179 |
--------------------------------------------------------------------------------
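A minimal scraping sketch following the d0 -> d1 -> d2 chain described in the module docstring above. It assumes network access, that the IMSDb page layout has not changed, and that it is run from the repo root so globals.py (which defines BASE_URL and M_SCRIPTS_DIR) is importable.

    from downloading_imsdb import get_d1_urls_and_titles, get_one_m_script

    d1_urls, titles = get_d1_urls_and_titles()
    print("number of films =", len(titles))            # main1() above saw 1211
    m_script, missing = get_one_m_script(d1_urls[0])   # follows d1 -> d2
    print("missing" if missing else m_script[:200])
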
/simp_openie6.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | The functions in this file are used inside the following jupyter notebook at
4 | Google Colab
5 |
6 | https://colab.research.google.com/drive/1S2EWOGkoCgjfOJzTRJ7PLeu4T8SBwhlF?usp=sharing
7 |
8 | Refs:
9 |
10 | 1. https://github.com/dair-iitd/CaRB
11 |
12 | 2. https://github.com/dair-iitd/imojie
13 |
14 | 3. https://github.com/dair-iitd/openie6
15 |
16 | """
17 | import subprocess
18 | import os
19 | from globals import *
20 | from utils import my_listdir
21 |
22 |
23 | def openie6_simplify_batch_of_m_scripts(
24 | in_dir, out_dir,
25 | batch_file_names,
26 | verbose=False):
27 | """
28 | This method does the same thing as the method
29 | `simplifying.simplify_batch_of_m_scripts()` but for the case
30 | `ZTZ_SIMPLIFIER = "simp_openie6"`
31 |
32 | Parameters
33 | ----------
34 | in_dir: str
35 | out_dir: str
36 | batch_file_names: list[str]
37 | verbose: bool
38 |
39 | Returns
40 | -------
41 | None
42 |
43 | """
44 | # assume directories `openie6` and `mappa_mundi`
45 | # live side by side inside a bigger folder X
46 | # and that the cwd is X
47 |
48 | m_script_starting_line_nums = \
49 | make_all_sentences_file(in_dir=in_dir,
50 | batch_file_names=batch_file_names)
51 | gpu_command = \
52 | r"cd openie6 && CUDA_DEVICE_ORDER=PCI_BUS_ID " \
53 | r"CUDA_VISIBLE_DEVICES=0 " \
54 | r"PYTHONPATH=imojie:imojie/allennlp:imojie" \
55 | r"/pytorch_transformers:$PYTHONPATH python run.py " \
56 | r"--save models/conj_model --mode predict " \
57 | r"--inp ../all_sentences.txt --batch_size 1 " \
58 | r"--model_str bert-large-cased --task conj " \
59 | r"--gpus 1 --out ../all_predictions.txt"
60 |
61 | cpu_command = gpu_command.replace("--gpus 1", "--gpus 0")
62 |
63 | if USE_GPU:
64 | os.system(gpu_command)
65 | else:
66 | os.system(cpu_command)
67 |
68 | translate_predictions_file_from_openie6_to_mm(
69 | in_fname="all_predictions.txt.conj",
70 | out_fname="all_predictions_in_mm.txt")
71 |
72 | make_m_scripts_simp_dir(out_dir,
73 | batch_file_names,
74 | m_script_starting_line_nums)
75 |
76 | os.remove("all_sentences.txt")
77 | os.remove("all_predictions.txt.conj")
78 | os.remove("all_predictions_in_mm.txt")
79 |
80 |
81 | def make_all_sentences_file(in_dir, batch_file_names):
82 | """
83 | This internal method creates the file `all_sentences.txt`.
84 | `all_sentences.txt` is a concatenation of all the files in
85 | `batch_file_names`.
86 |
87 | Parameters
88 | ----------
89 | in_dir: str
90 | batch_file_names: list[str]
91 |
92 | Returns
93 | -------
94 | m_script_starting_line_nums: list[int]
95 | list of the starting line numbers within the file
96 | `all_sentences.txt` for the file names in the list `batch_file_names`.
97 |
98 | """
99 | m_script_starting_line_nums = []
100 | cum_line_num = 0
101 | with open("all_sentences.txt", "w") as big_f:
102 | for fname in batch_file_names:
103 | in_path = in_dir + '/' + fname
104 | # print("bbng", in_path)
105 | with open(in_path, "r") as f:
106 | # print("hhji", cum_line_num)
107 | m_script_starting_line_nums.append(cum_line_num)
108 | f_len = 0
109 | for line in f:
110 | f_len += 1
111 | # print("llmk", line)
112 | big_f.write(line)
113 | cum_line_num += f_len
114 | # print("nnmj", f_len)
115 | return m_script_starting_line_nums
116 |
117 |
118 | def translate_predictions_file_from_openie6_to_mm(in_fname, out_fname):
119 | """
120 | This internal method reads the file `all_predictions.txt.conj` and
121 | translates it into a new file called `all_predictions_in_mm.txt`. The
122 | input file is in the format of openie6 extractions output and the output
123 | file is in the mappa mundi (mm) simp format.
124 |
125 | openie6 extractions output format: one sentence or empty line ("row
126 | gap") per line. Groups separated by empty lines. Each group consists of
127 | the original sentence followed by the extraction sentences.
128 |
129 | mm simp format: one sentence per line. No row gaps. Each line has all
130 | the extractions from the original sentence, separated by ZTZ_SEPARATOR.
131 |
132 | Parameters
133 | ----------
134 | in_fname: str
135 | out_fname: str
136 |
137 | Returns
138 | -------
139 | None
140 |
141 | """
142 | with open(in_fname, "r") as in_file:
143 | with open(out_fname, "w") as out_file:
144 | in_parts = []
145 | prev_line_is_empty = True
146 | for line in in_file:
147 | if line.strip():
148 | in_parts.append(line.strip())
149 | prev_line_is_empty = False
150 | else:
151 | if not prev_line_is_empty:
152 | if len(in_parts) > 1:
153 | in_parts = in_parts[1:]
154 | if len(in_parts) > 0:
155 | xx = " " + ZTZ_SEPARATOR + " "
156 | out_file.write(xx.join(in_parts) + "\n")
157 | in_parts = []
158 |
159 |
160 | def make_m_scripts_simp_dir(out_dir,
161 | batch_file_names,
162 | m_script_starting_line_nums):
163 | """
164 | This internal method reads the file `all_predictions_in_mm.txt` and it
165 | uses that to create a new directory called `out_dir` populated by files
166 | with the names in list `batch_file_names`.
167 |
168 | Parameters
169 | ----------
170 | out_dir: str
171 | batch_file_names: list[str]
172 | m_script_starting_line_nums: list[int]
173 |
174 | Returns
175 | -------
176 | None
177 |
178 | """
179 | if not os.path.exists(out_dir):
180 | os.makedirs(out_dir)
181 | with open("all_predictions_in_mm.txt", "r") as big_f:
182 | m_script_num = -1
183 | f = None
184 | for line_num, line in enumerate(big_f):
185 | if line_num in m_script_starting_line_nums:
186 | if f:
187 | f.close()
188 | m_script_num += 1
189 | fname = batch_file_names[m_script_num]
190 | out_path = out_dir + "/" + fname
191 | f = open(out_path, "w")
192 | f.write(line)
193 | if f:
194 | f.close()
195 |
196 |
197 | if __name__ == "__main__":
198 | def main():
199 | in_dir = "short_stories_spell"
200 | batch_file_names = my_listdir(in_dir)
201 | make_all_sentences_file(in_dir=in_dir,
202 | batch_file_names=batch_file_names)
203 | translate_predictions_file_from_openie6_to_mm(
204 | "openie6_translation_test.txt",
205 | "openie6_test_answer.txt")
206 |
207 |
208 | main()
209 |
--------------------------------------------------------------------------------
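A toy illustration of `translate_predictions_file_from_openie6_to_mm()` above. The openie6 "conj" output groups each original sentence with its extractions and separates groups by blank lines; the mm simp format keeps one line per original sentence, with the extractions joined by ZTZ_SEPARATOR. The file names and the "[SEP]" value shown in the comments are hypothetical.

    from simp_openie6 import translate_predictions_file_from_openie6_to_mm

    # build a tiny openie6-style .conj file: two groups separated by blank lines
    with open("toy.conj", "w") as f:
        f.write("He sang and he danced.\n"
                "He sang.\n"
                "He danced.\n"
                "\n"
                "She left.\n"
                "\n")

    translate_predictions_file_from_openie6_to_mm("toy.conj", "toy_mm.txt")
    print(open("toy_mm.txt").read())
    # with ZTZ_SEPARATOR == "[SEP]" (say), this prints:
    # He sang. [SEP] He danced.
    # She left.
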
/jupyter_notebooks/SUMMARY.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# SUMMARY notebook\n",
8 | "\n",
9 | "This notebook scans the directory in which it lives to find all jupyter notebooks (other than itself) in that directory. It then prints for every notebook it finds (1) a hyperlink to the notebook, and (2) the first cell (which is always markdown) of the notebook. This way you can read a nice, automatically generated summary of all the notebooks without having to open all of them. If you find a notebook that you want to explore further, you can simply click on its link to open it."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/markdown": [
20 | "\n",
21 | "\n",
22 | "
\n",
23 | "\n",
24 | "Coordination_Analysis_with_IGL_CA_(OpenIE6_paper).ipynb [local link] [github link] 1/4\n",
25 | "\n",
26 | "# IGL-CA: inference pipeline\n",
27 | "Coordination analysis inference using the OpenIE6 model.\n",
28 | "\n",
29 | "* Anton's [OpenIE6 fork](https://github.com/alexeyev/openie6)\n",
30 | "* [OpenIE6 original repo](https://github.com/dair-iitd/openie6)\n",
31 | "* [OpenIE6 original paper](https://aclanthology.org/2020.emnlp-main.306/)\n",
32 | "\n",
33 | "Prepared by [Anton Alekseev](https://github.com/alexeyev) and [Anastasia Predelina](https://github.com/PredelinaAsya). Quite a bit of effort, tbh.\n",
34 | "\n",
35 | "**NOTA BENE**: GPU environment should be enabled before running the code! If not possible, another code cell for CPU-only environment is available at the very end of the notebook.\n",
36 | "\n",
37 | "
\n",
38 | "\n",
39 | "navigating_m_scripts.ipynb [local link] [github link] 2/4\n",
40 | "\n",
41 | "# Navigating Movie Scripts\n",
42 | "\n",
43 | "In this notebook, we explain\n",
44 | "how to use Mappa Mundi (MM) to do causal DEFT (DAG extraction from text)\n",
45 | "using as a test case, the following 3 movie scripts by Pixar/Disney.\n",
46 | "\n",
47 | "* [Toy Story](../m_scripts/toy-story.txt)\n",
48 | "* [Up](../m_scripts/up.txt)\n",
49 | "* [WALL-E](../m_scripts/wall-e.txt)\n",
50 | "\n",
51 | "
\n",
52 | "\n",
53 | "navigating_short_stories.ipynb [local link] [github link] 3/4\n",
54 | "\n",
55 | "# Navigating Short Stories\n",
56 | "\n",
57 | "In this notebook, we explain how to use Mappa Mundi (MM) to do causal DEFT (DAG extraction from text)\n",
58 | "using as a test case, the following 3 short stories by P.G. Wodehouse.\n",
59 | "\n",
60 | "* [Bill the Bloodhound](../short_stories/bill-the-bloodhound.txt)\n",
61 | "* [Extricating Young Gussie](../short_stories/extricating-young-gussie.txt)\n",
62 | "* [Wilton's Holiday](../short_stories/wiltons-holiday.txt)\n",
63 | "\n",
64 | "\n",
65 | "
\n",
66 | "\n",
67 | "simplifying_with_Openie6.ipynb [local link] [github link] 4/4\n",
68 | "\n",
69 | "# Simplifying with Openie6\n",
70 | "\n",
71 | "The Openie6 software takes as input a possibly complex or compound sentence X,\n",
72 | "and it returns a list of simple sentences that contain all the\n",
73 | "information in the original sentence X.\n",
74 | "\n",
75 | "Anton Alekseev (AA) and Anastasia Predelina (AP) wrote a jupyter notebook\n",
76 | "that installs and runs the code in the Openie6 repo https://github.com/alexeyev/openie6\n",
77 | "An exact copy of notebook by AA/AP is included in this folder. It is also publicly available at AA's google drive\n",
78 | "under the URL\n",
79 | "\n",
80 | " https://colab.research.google.com/drive/1samvO-SH6Xgjf9ItlhAF1EmBZo5grBQb?usp=sharing\n",
81 | "\n",
82 | "\n",
83 | "\n",
84 | "This notebook adds new code to the end of the AA/AP notebook. The purpose of the\n",
85 | "new code is\n",
86 | "to simplify short stories and movie scripts."
87 | ],
88 | "text/plain": [
89 | ""
90 | ]
91 | },
92 | "metadata": {},
93 | "output_type": "display_data"
94 | }
95 | ],
96 | "source": [
97 | "# Version: 2\n",
98 | "import os\n",
99 | "import json\n",
100 | "from IPython.display import display, Markdown\n",
101 | "\n",
102 | "# the name of this file\n",
103 | "this_fname = 'SUMMARY.ipynb'\n",
104 | "fname_to_md = {}\n",
105 | "for fname in sorted([x for x in os.listdir('./')]):\n",
106 | " if fname[-6:] == '.ipynb' and fname != this_fname:\n",
107 | " # print('------------', fname)\n",
108 | " with open(fname, 'r', encoding=\"utf-8\") as f:\n",
109 | " fdata = json.load(f)\n",
110 | " fname_to_md[fname] = ''.join(fdata['cells'][0]['source'])\n",
111 | "# print(fname_to_md)\n",
112 | "pre_sep = '\\n\\n
\\n\\n'\n",
113 | "full_md = ''\n",
114 | "k = 1\n",
115 | "num_nb = len(fname_to_md)\n",
116 | "project_name =\"mappa_mundi\"\n",
117 | "who =\"rrtucci\"\n",
118 | "where = \"jupyter_notebooks\"\n",
119 | "for fname, md in fname_to_md.items():\n",
120 | " sep = pre_sep\n",
121 | " local_link = f' [local link] '\n",
122 | " github_link = f' [github link] '\n",
124 | " sep += fname + local_link + github_link + str(k) + '/' + str(num_nb) + '\\n\\n'\n",
125 | " full_md += sep + md\n",
126 | " k += 1\n",
127 | "display(Markdown(full_md))"
128 | ]
129 | }
130 | ],
131 | "metadata": {
132 | "kernelspec": {
133 | "display_name": "Python 3 (ipykernel)",
134 | "language": "python",
135 | "name": "python3"
136 | },
137 | "language_info": {
138 | "codemirror_mode": {
139 | "name": "ipython",
140 | "version": 3
141 | },
142 | "file_extension": ".py",
143 | "mimetype": "text/x-python",
144 | "name": "python",
145 | "nbconvert_exporter": "python",
146 | "pygments_lexer": "ipython3",
147 | "version": "3.10.9"
148 | }
149 | },
150 | "nbformat": 4,
151 | "nbformat_minor": 4
152 | }
153 |
--------------------------------------------------------------------------------
/post_cleaning.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains functions for post-cleaning movie scripts (or short
4 | stories).
5 |
6 | input directory: m_scripts_simp or short_stories_simp
7 | output directory: m_scripts_post_clean or short_stories_post_clean
8 |
9 | The input files have one or more sublines per line. For each file, we
10 | post-clean the sublines by removing stop-words, punctuation marks, proper
11 | nouns (a.k.a. named entities) and other excess baggage. Then we replace each
12 | subline by its post-cleaned version. Different cleaned sublines from the
13 | same sentence are put in the same line, separated by ZTZ_SEPARATOR. Some
14 | sentences are reduced to nothing by the post-cleaning. Those
15 | sentences are replaced by a single ZTZ_SEPARATOR.
16 |
17 |
18 | Refs:
19 | https://spacy.io/usage/spacy-101/
20 |
21 | For spacy, here are some values of token.dep_
22 |
23 | cc: coordinating conjunction.
24 | i.e., FANBOYS = for, and, nor, but, or, yet, so
25 |
26 | mark: marker that introduces a subordinate clause
27 |
28 | ADP: adposition, e.g. in, to, during
29 |
30 | """
31 | from globals import *
32 | import importlib as imp
33 |
34 | zsimp = imp.import_module(ZTZ_SIMPLIFIER)
35 | from utils import *
36 |
37 | import spacy
38 | import re
39 | from globals import *
40 |
41 | nlp = spacy.load("en_core_web_sm")
42 |
43 |
44 | # nlp.add_pipe("merge_entities")
45 |
46 |
47 | def post_clean_line(line, verbose=False):
48 | """
49 | This method cleans the line string `line`. It returns a list of simple
50 | sentences (sublines) extracted from the input sentence (line).
51 |
52 | Parameters
53 | ----------
54 | line: str
55 | verbose: bool
56 |
57 | Returns
58 | -------
59 | list[str]
60 |
61 | """
62 | tokenized_sublines = \
63 | [nlp(subline) for subline in line.split(ZTZ_SEPARATOR)]
64 |
65 | ztz_list = []
66 | for tokenized_subline in tokenized_sublines:
67 |
68 | # replace by empty list any tokenized subline
69 | # that doesn't have a noun/pronoun and a verb
70 | subline_has_noun_or_pronoun = False
71 | subline_has_verb = False
72 | token_str_list = []
73 | for token in tokenized_subline:
74 | x = get_post_cleaned_token_txt(token)
75 | if x:
76 | token_str_list.append(x)
77 | if token.pos_ in ["NOUN", "PRON", "PROPN"] and x:
78 | subline_has_noun_or_pronoun = True
79 | # print("NOUN or PRONOUN", token.text)
80 | if token.pos_ in ["VERB", "AUX"] and x:
81 | subline_has_verb = True
82 | # print("VERB", token.text)
83 | if not (subline_has_noun_or_pronoun and subline_has_verb):
84 | subline_str = ""
85 | else:
86 | subline_str = " ".join(token_str_list)
87 |
88 | if subline_str:
89 | ztz_list.append(subline_str)
90 |
91 | if verbose:
92 | print(line.strip())
93 | print(ztz_list)
94 | return ztz_list
95 |
96 |
97 | def get_post_cleaned_token_txt(token):
98 | """
99 | This auxiliary method takes as input a SpaCy Token `token` and returns a
100 | simplified version of the token's text.
101 |
102 | Parameters
103 | ----------
104 | token: Token
105 |
106 | Returns
107 | -------
108 | str
109 |
110 | """
111 | x = token.text
112 | # remove all punctuation marks
113 | x = re.sub(r'[^\w\s]', '', x)
114 |
115 | # if token.ent_type_:
116 | # # replace named entities by their labels
117 | # # x = token.ent_type_
118 | #
119 | # # remove named entities
120 | # x = ""
121 | # if token.is_stop and (token.pos_ not in RETAINED_STOPWORD_POS):
122 | # x = ""
123 | # if token.pos_ not in RETAINED_POS:
124 | # x = ""
125 |
126 | # remove single character tokens
127 | if len(x.strip()) == 1:
128 | x = ""
129 | x = x.strip()
130 | return x
131 |
132 |
133 | def post_clean_one_m_script(
134 | in_dir, out_dir,
135 | file_name,
136 | verbose=False):
137 | """
138 | in_dir and out_dir can be the same, but this will overwrite the files.
139 |
140 | This method reads a file called `file_name` in the `in_dir` directory
141 | and creates a post-cleaned version in the `out_dir` directory.
142 |
143 |
144 | Parameters
145 | ----------
146 | in_dir: str
147 | out_dir: str
148 | file_name: str
149 | verbose: bool
150 |
151 | Returns
152 | -------
153 | None
154 |
155 | """
156 | inpath = in_dir + "/" + file_name
157 | outpath = out_dir + "/" + file_name
158 | new_lines = []
159 | with open(inpath, "r") as f:
160 | count = 1
161 | for line in f:
162 | if verbose:
163 | print(str(count) + ".")
164 | simple_ztz_list = post_clean_line(line,
165 | verbose=verbose)
166 |
167 | # remove empty simple ztz
168 | simple_ztz_list = [ztz for ztz in simple_ztz_list if ztz]
169 |
170 | if not simple_ztz_list:
171 | simple_ztz_list = [ZTZ_SEPARATOR]
172 |
173 | # replace multiple white spaces by single white space
174 | simple_ztz_list = [re.sub(r'\s+', ' ', ztz) for ztz in
175 | simple_ztz_list]
176 |
177 | if len(simple_ztz_list) > 1:
178 | xx = " " + ZTZ_SEPARATOR + " "
179 | new_lines.append(xx.join(simple_ztz_list))
180 | elif len(simple_ztz_list) == 1:
181 | new_lines.append(simple_ztz_list[0])
182 | else:
183 | assert False
184 |
185 | count += 1
186 | with open(outpath, "w") as f:
187 | for line in new_lines:
188 | f.write(line + "\n")
189 |
190 |
191 | def post_clean_batch_of_m_scripts(
192 | in_dir, out_dir,
193 | batch_file_names,
194 | verbose=False):
195 | """
196 | This method calls the method `post_clean_one_m_script` for all the file
197 | names in the list of file names `batch_file_names`.
198 |
199 |
200 | Parameters
201 | ----------
202 | in_dir: str
203 | out_dir: str
204 | batch_file_names: list[str]
205 | verbose: bool
206 |
207 | Returns
208 | -------
209 | None
210 |
211 | """
212 | all_file_names = my_listdir(in_dir)
213 | assert set(batch_file_names).issubset(set(all_file_names))
214 | for file_name in batch_file_names:
215 | i = all_file_names.index(file_name)
216 | print('%i.' % (i + 1), file_name)
217 | post_clean_one_m_script(in_dir, out_dir, file_name, verbose)
218 |
219 |
220 | if __name__ == "__main__":
221 | def main1():
222 | in_dir = "short_stories_simp"
223 | out_dir = "short_stories_post_clean"
224 | batch_file_names = my_listdir(in_dir)[0:3]
225 | post_clean_batch_of_m_scripts(
226 | in_dir, out_dir,
227 | batch_file_names,
228 | verbose=False)
229 |
230 |
231 | def main2():
232 | remove_dialogs = False
233 | in_dir = SIMP_DIR if not remove_dialogs else SIMP_RD_DIR
234 | out_dir = POST_CLEAN_DIR if not remove_dialogs else POST_CLEAN_RD_DIR
235 | batch_file_names = my_listdir(in_dir)[0:3]
236 | post_clean_batch_of_m_scripts(
237 | in_dir, out_dir,
238 | batch_file_names)
239 |
240 | main1()
241 | main2()
242 |
--------------------------------------------------------------------------------
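A minimal usage sketch for `post_clean_line()` above: feed it one line from a *_simp file (sublines joined by ZTZ_SEPARATOR) and it returns the list of post-cleaned sublines. It assumes `en_core_web_sm` is installed and that the script is run from the repo root so globals.py is importable.

    from globals import ZTZ_SEPARATOR
    from post_cleaning import post_clean_line

    line = "the man thought for a moment " + ZTZ_SEPARATOR + \
           " a unicorn was in the garden"
    # each subline loses punctuation and single-character tokens, and any
    # subline without both a noun/pronoun and a verb is dropped entirely
    print(post_clean_line(line, verbose=True))
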
/stopwords.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file has a function that lists all SpaCy stopwords, classified by POS (
4 | part of speech).
5 |
6 | """
7 |
8 | import spacy
9 | from pprint import pprint
10 |
11 | nlp = spacy.load('en_core_web_sm')
12 |
13 |
14 | def get_stopwords_dict():
15 | """
16 | This method returns a dictionary that maps the parts of speech (POS) to
17 | a list of the stopwords that have that POS.
18 |
19 | Returns
20 | -------
21 | dict[str, list[str]]
22 |
23 | """
24 |
25 | stopwords = nlp.Defaults.stop_words
26 |
27 | pos_to_stopwords = {}
28 |
29 | for word in stopwords:
30 | pos = nlp(word)[0].pos_
31 | if pos in pos_to_stopwords:
32 | pos_to_stopwords[pos].append(word)
33 | else:
34 | pos_to_stopwords[pos] = [word]
35 |
36 | return pos_to_stopwords
37 |
38 |
39 | if __name__ == "__main__":
40 | def main():
41 | d = get_stopwords_dict()
42 | print(sorted(d.keys()))
43 | pprint(d)
44 |
45 |
46 | main()
47 |
48 | """
49 | ['ADJ',
50 | 'ADP',
51 | 'ADV',
52 | 'AUX',
53 | 'CCONJ',
54 | 'DET',
55 | 'INTJ',
56 | 'NOUN',
57 | 'NUM',
58 | 'PART',
59 | 'PRON',
60 | 'PUNCT',
61 | 'SCONJ',
62 | 'VERB']
63 | {'ADJ': ['same',
64 | 'few',
65 | 'former',
66 | 'full',
67 | 'serious',
68 | 'own',
69 | 'empty',
70 | 'such',
71 | 'several',
72 | 'latter',
73 | 'various',
74 | 'other',
75 | 'least',
76 | 'many',
77 | 'whole',
78 | 'top',
79 | 'due',
80 | 'whereafter',
81 | 'last',
82 | 'third'],
83 | 'ADP': ['behind',
84 | 'with',
85 | 'below',
86 | 'of',
87 | 'through',
88 | 'amongst',
89 | 'at',
90 | 'onto',
91 | 'thru',
92 | 'among',
93 | 'throughout',
94 | 'under',
95 | 'into',
96 | 'before',
97 | 'beside',
98 | 'against',
99 | 'within',
100 | 'per',
101 | 'towards',
102 | 'after',
103 | 'without',
104 | 'beyond',
105 | 'from',
106 | 'via',
107 | 'in',
108 | 'on',
109 | 'as',
110 | 'than',
111 | 'during',
112 | 'for',
113 | 'toward',
114 | 'until',
115 | 'above',
116 | 'across',
117 | 'along',
118 | 'between',
119 | 'over',
120 | 'by'],
121 | 'ADV': ['together',
122 | 'so',
123 | 'therein',
124 | 'next',
125 | 'off',
126 | 'meanwhile',
127 | 'whereupon',
128 | 'sometimes',
129 | 'again',
130 | 'rather',
131 | 'enough',
132 | 'thereby',
133 | 'first',
134 | 'too',
135 | 'always',
136 | 'either',
137 | 'somehow',
138 | 'very',
139 | 'perhaps',
140 | 'back',
141 | 'down',
142 | 'elsewhere',
143 | 'latterly',
144 | 'moreover',
145 | 'formerly',
146 | 'about',
147 | 'sometime',
148 | 'really',
149 | 'once',
150 | 'else',
151 | 'anyhow',
152 | 'also',
153 | 'there',
154 | 'most',
155 | 'nowhere',
156 | 'then',
157 | 'up',
158 | 'out',
159 | 'further',
160 | 'however',
161 | 'yet',
162 | 'namely',
163 | 'afterwards',
164 | 'already',
165 | 'hereby',
166 | 'thereupon',
167 | 'still',
168 | 'hence',
169 | 'anyway',
170 | 'even',
171 | 'much',
172 | 'thus',
173 | 'never',
174 | 'almost',
175 | 'alone',
176 | 'somewhere',
177 | 'here',
178 | 'more',
179 | 'hereupon',
180 | 'indeed',
181 | 'now',
182 | 'beforehand',
183 | 'everywhere',
184 | 'just',
185 | 'anywhere',
186 | 'often',
187 | 'thereafter',
188 | 'therefore',
189 | 'nevertheless',
190 | 'ever',
191 | 'quite',
192 | 'mostly',
193 | 'around',
194 | 'only',
195 | 'otherwise',
196 | 'less'],
197 | 'AUX': ['might',
198 | 'will',
199 | 'being',
200 | '’d',
201 | 'can',
202 | '’ll',
203 | 'are',
204 | 'was',
205 | "'ll",
206 | 'ca',
207 | 'could',
208 | 'must',
209 | 'would',
210 | "'re",
211 | 'may',
212 | 'were',
213 | "'s",
214 | 'should',
215 | 'be',
216 | 'cannot',
217 | 'am',
218 | 'is',
219 | 'been'],
220 | 'CCONJ': ['but', 'or', 'and', 'nor', 'neither'],
221 | 'DET': ['whose'],
222 | 'INTJ': ['please', 'well', 'no'],
223 | 'NOUN': ['part',
224 | 'bottom',
225 | 'whither',
226 | 'side',
227 | 'name',
228 | 'thence',
229 | 'amount',
230 | 'whence',
231 | 'yourselves',
232 | 'noone',
233 | 'front',
234 | 'yours',
235 | 'others',
236 | 'none',
237 | 'hers',
238 | '‘s',
239 | 'ours',
240 | 'herein'],
241 | 'NUM': ['one',
242 | 'twenty',
243 | 'nine',
244 | 'sixty',
245 | 'ten',
246 | 'five',
247 | 'fifty',
248 | 'forty',
249 | 'n’t',
250 | 'six',
251 | 'three',
252 | 'hundred',
253 | 'eleven',
254 | 'twelve',
255 | 'fifteen',
256 | 'four',
257 | 'two',
258 | 'eight'],
259 | 'PART': ["n't", 'to', 'not'],
260 | 'PRON': ['nothing',
261 | 'the',
262 | 'my',
263 | 'this',
264 | 'something',
265 | 'they',
266 | 'whom',
267 | 'nobody',
268 | 'her',
269 | 'those',
270 | 'me',
271 | 'he',
272 | 'themselves',
273 | 'us',
274 | 'an',
275 | 'anything',
276 | 'his',
277 | 'i',
278 | 'you',
279 | 'which',
280 | 'him',
281 | 'all',
282 | 'we',
283 | 'them',
284 | 'any',
285 | 'who',
286 | 'everyone',
287 | 'these',
288 | 'someone',
289 | 'some',
290 | 'himself',
291 | 'whoever',
292 | 'what',
293 | 'each',
294 | 'yourself',
295 | 'mine',
296 | 'everything',
297 | 'our',
298 | 'itself',
299 | 'anyone',
300 | 'herself',
301 | 'your',
302 | 'its',
303 | 'every',
304 | 'it',
305 | 'their',
306 | 'both',
307 | 'ourselves',
308 | 'that',
309 | 'another',
310 | 'whatever',
311 | 'she',
312 | 'myself',
313 | 'a'],
314 | 'PUNCT': ['‘ll', '’ve', '‘ve', '‘d', '’m', '‘m', '‘re'],
315 | 'SCONJ': ['where',
316 | 'wherever',
317 | 'unless',
318 | 'wherein',
319 | 'if',
320 | 'how',
321 | 'though',
322 | 'why',
323 | 'except',
324 | 'whether',
325 | 'while',
326 | 'upon',
327 | 'whereas',
328 | 'besides',
329 | 'when',
330 | 'because',
331 | 'whereby',
332 | 'whenever',
333 | 'since',
334 | 'although'],
335 | 'VERB': ['became',
336 | 'made',
337 | 'hereafter',
338 | 'used',
339 | 'using',
340 | 'did',
341 | 'becomes',
342 | 'seem',
343 | 'do',
344 | 'seems',
345 | "'d",
346 | 'done',
347 | 'give',
348 | 'keep',
349 | 'say',
350 | 'has',
351 | 'get',
352 | 'become',
353 | 'have',
354 | 'doing',
355 | 'seemed',
356 | 'make',
357 | 'n‘t',
358 | 'put',
359 | 'take',
360 | 'becoming',
361 | 'show',
362 | "'m",
363 | "'ve",
364 | '’s',
365 | 'see',
366 | 'regarding',
367 | 'move',
368 | 'had',
369 | 'seeming',
370 | 'call',
371 | 're',
372 | 'go',
373 | '’re',
374 | 'does']}
375 | """
376 |
--------------------------------------------------------------------------------
/DagAtlas.py:
--------------------------------------------------------------------------------
1 | from Dag import *
2 | from BatchSimilarity import *
3 | from utils import *
4 | import sys
5 | from itertools import product
6 | from globals import *
7 |
8 | import pickle as pik
9 | from time import time
10 | from sentence_transformers import SentenceTransformer
11 |
12 |
13 | class DagAtlas:
14 | """
15 | This class reads simplified movie-script txt files from the `simp_dir`
16 | directory and creates a pickled file for each movie
17 | script. Each pickled file contains a Dag object for one movie. `dag_dir`
18 | (called the DAG atlas) is the directory containing the pickled files,
19 | one per movie.
20 |
21 | Attributes
22 | ----------
23 | dag_dir: str
24 | directory where this class writes pickled files. One pickled file (
25 | i.e., DAG) per movie.
26 | model: SentenceTransformer
27 | Model returned by SentenceTransformer constructor
28 | simp_dir: str
29 | directory where this class reads txt files.
30 | start_time: float
31 | time in minutes when self is created.
32 | title_to_permission_to_write_new_pickle: dict[str, bool]
33 | A dictionary that maps each movie title to a boolean that grants
34 | permission to overwrite an existing pickled file.
35 |
36 | """
37 |
38 | def __init__(self, simp_dir, dag_dir,
39 | recycled_pickles=None):
40 | """
41 | Constructor
42 |
43 | Parameters
44 | ----------
45 | simp_dir: str
46 | directory with a simplified txt file for each movie script
47 | dag_dir: str
48 | directory with a pickled file containing a Dag object for each
49 | movie script
50 | recycled_pickles: list[str]
51 | titles for which overwriting of pickled files is forbidden, at the
52 | beginning, when self is first constructed.
53 |
54 | """
55 | self.start_time = time()
56 | time_now = (time() - self.start_time) / 60
57 | print(f"Initiating DagAtlas object: {time_now:.2f} minutes\n")
58 |
59 | self.simp_dir = simp_dir
60 | self.dag_dir = dag_dir
61 | all_simp_titles = [file_name[:-len(".txt")] for \
62 | file_name in my_listdir(self.simp_dir)]
63 | all_dag_titles = [file_name[:-len(".pkl")] for \
64 | file_name in my_listdir(self.dag_dir)]
65 | assert set(all_dag_titles).issubset(set(all_simp_titles))
66 |
67 | self.title_to_permission_to_write_new_pickle = {}
68 | for title in all_simp_titles:
69 | self.title_to_permission_to_write_new_pickle[title] = True
70 | if recycled_pickles is None:
71 | recycled_pickles = []
72 | for title in recycled_pickles:
73 | assert title in all_dag_titles
74 | self.title_to_permission_to_write_new_pickle[title] = False
75 |
76 | if SIMI_DEF == "similarity_bert":
77 | self.model = SentenceTransformer('all-MiniLM-L6-v2')
78 | else:
79 | self.model = None
80 |
81 | def update_arrows_for_two_m_scripts(self, title1, title2):
82 | """
83 | This method updates the arrows for 2 movie titles.
84 |
85 | Parameters
86 | ----------
87 | title1: str
88 | title2: str
89 |
90 | Returns
91 | -------
92 | None
93 |
94 | """
95 | time_now = (time() - self.start_time) / 60
96 | print(f"Starting comparison of 2 titles: {time_now:.2f} minutes")
97 |
98 | if self.title_to_permission_to_write_new_pickle[title1]:
99 | dag1 = Dag(title1, simp_dir=self.simp_dir)
100 | else:
101 | path1 = self.dag_dir + "/" + title1 + ".pkl"
102 | try:
103 | with open(path1, "rb") as f:
104 | dag1 = pik.load(f)
105 | except OSError:
106 | print("This file is probably missing:", path1)
107 | sys.exit()
108 |
109 | if self.title_to_permission_to_write_new_pickle[title2]:
110 | dag2 = Dag(title2, simp_dir=self.simp_dir)
111 | else:
112 | path2 = self.dag_dir + "/" + title2 + ".pkl"
113 | try:
114 | with open(path2, "rb") as f:
115 | dag2 = pik.load(f)
116 | except OSError:
117 | print("This file is probably missing:", path2)
118 | sys.exit()
119 | node_to_simple_ztz1 = \
120 | dag1.build_node_to_simple_ztz_dict(self.simp_dir)
121 | node_to_simple_ztz2 = \
122 | dag2.build_node_to_simple_ztz_dict(self.simp_dir)
123 |
124 | print("title1 and its num of nodes:", title1, len(dag1.nodes))
125 | print("title2 and its num of nodes:", title2, len(dag2.nodes))
126 | print("product of numbers of nodes=",
127 | len(dag1.nodes) * len(dag2.nodes))
128 |
129 | time_now = (time() - self.start_time) / 60
130 | print(f"Starting bridges: {time_now:.2f} minutes")
131 |
132 | nd1_nd2_bridges = []
133 | bridge_count = 0
134 | batch_simi = BatchSimilarity(dag1, dag2,
135 | node_to_simple_ztz1,
136 | node_to_simple_ztz2,
137 | model=self.model)
138 | for nd1, nd2 in product(dag1.nodes, dag2.nodes):
139 | if batch_simi.simi(nd1, nd2) > SIMI_THRESHOLD:
140 | nd1_nd2_bridges.append((nd1, nd2))
141 | bridge_count += 1
142 | print(bridge_count, "bridges")
143 | range0 = range(len(nd1_nd2_bridges))
144 | for i, j in product(range0, range0):
145 | if i < j:
146 | bridge_a = nd1_nd2_bridges[i]
147 | bridge_b = nd1_nd2_bridges[j]
148 | arrows = [None, None]
149 | time_gaps = [0, 0]
150 | for movie in range(2):
151 | time_gaps[movie] = \
152 | bridge_a[movie].time - bridge_b[movie].time
153 | if time_gaps[movie] > 0:
154 | arrows[movie] = (bridge_b[movie], bridge_a[movie])
155 | else:
156 | arrows[movie] = (bridge_a[movie], bridge_b[movie])
157 | bridges_do_not_cross = (time_gaps[0] * time_gaps[1] > 0)
158 | if bridges_do_not_cross:
159 | accepted = True
160 | else:
161 | accepted = False
162 | dag1.update_arrow(arrows[0], accepted)
163 | dag2.update_arrow(arrows[1], accepted)
164 |
165 | time_now = (time() - self.start_time) / 60
166 | print(f"Before saving 2 dags: {time_now:.2f} minutes")
167 | dag1.save_self(self.dag_dir)
168 | self.title_to_permission_to_write_new_pickle[title1] = False
169 | dag2.save_self(self.dag_dir)
170 | self.title_to_permission_to_write_new_pickle[title2] = False
171 |
172 | time_now = (time() - self.start_time) / 60
173 | print(f"Exiting 2 titles comparison: {time_now:.2f} minutes\n")
174 |
175 | def update_arrows_in_batch_of_m_scripts(self, batch_titles=None):
176 | """
177 | This method calls the method `update_arrows_for_two_m_scripts` for
178 | every pair '{ title1, title2}' of movie scripts in the list
179 | `batch_titles`.
180 |
181 | Parameters
182 | ----------
183 | batch_titles: list[str] or None
184 |
185 | Returns
186 | -------
187 | None
188 |
189 | """
190 | all_simp_titles = [file_name[:-len(".txt")] for \
191 | file_name in my_listdir(self.simp_dir)]
192 |
193 | if batch_titles is None:
194 | batch_titles = all_simp_titles
195 | assert set(batch_titles).issubset(set(all_simp_titles))
196 | assert len(batch_titles) >= 2
197 | num = len(batch_titles)
198 |
199 | for i, j in product(range(num), range(num)):
200 | if i < j:
201 | self.update_arrows_for_two_m_scripts(batch_titles[i],
202 | batch_titles[j])
203 |
204 | def update_arrows_for_one_m_script_and_others(self,
205 | title,
206 | other_titles):
207 | """
208 | This method calls the method `update_arrows_for_two_m_scripts` for
209 | every pair '{ title, other_title}' of movie scripts,
210 | where `other_title` is in the list `other_titles`.
211 |
212 | Parameters
213 | ----------
214 | title: str
215 | other_titles: list[str]
216 |
217 | Returns
218 | -------
219 | None
220 |
221 | """
222 | all_simp_titles = [file_name[:-len(".txt")] for \
223 | file_name in my_listdir(self.simp_dir)]
224 | assert set(other_titles).issubset(set(all_simp_titles))
225 | assert title not in other_titles
226 |
227 | for j in range(len(other_titles)):
228 | self.update_arrows_for_two_m_scripts(title,
229 | other_titles[j])
230 |
231 |
232 | if __name__ == "__main__":
233 | def main1():
234 | simp_dir = "short_stories_post_clean"
235 | dag_dir = "short_stories_dag_atlas"
236 | atlas = DagAtlas(simp_dir, dag_dir)
237 | all_titles = [file_name[:-len(".txt")] \
238 | for file_name in my_listdir(simp_dir)]
239 | atlas.update_arrows_in_batch_of_m_scripts(
240 | batch_titles=all_titles[0:3])
241 |
242 |
243 | def main2():
244 | remove_dialog = False
245 | atlas = DagAtlas(
246 | simp_dir=POST_CLEAN_DIR if not remove_dialog else
247 | POST_CLEAN_RD_DIR,
248 | dag_dir=DAG_DIR)
249 | all_titles = [file_name[:-len(".txt")] \
250 | for file_name in my_listdir(SIMP_DIR)]
251 | atlas.update_arrows_in_batch_of_m_scripts(
252 | batch_titles=all_titles[0:3])
253 |
254 |
255 | main1()
256 | # main2()
257 |
--------------------------------------------------------------------------------
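The arrow-acceptance rule inside `update_arrows_for_two_m_scripts()` above boils down to a sign test on the two time gaps: if the a/b nodes occur in the same order in both movies, the bridges do not cross and the induced arrow is accepted in both DAGs; otherwise it is rejected. A self-contained sketch (the `Node` stand-in below is hypothetical; the real class lives in Node.py):

    from collections import namedtuple

    # stand-in for the real Node class, which carries a time coordinate
    Node = namedtuple("Node", ["name", "time"])

    # one bridge links a node of DAG 1 (movie 1) to a node of DAG 2 (movie 2)
    bridge_a = (Node("boy meets dog", 3), Node("girl meets cat", 10))
    bridge_b = (Node("boy loses dog", 7), Node("girl loses cat", 25))

    gap0 = bridge_a[0].time - bridge_b[0].time   # -4  (movie 1)
    gap1 = bridge_a[1].time - bridge_b[1].time   # -15 (movie 2)
    accepted = gap0 * gap1 > 0                   # True: bridges do not cross
    print(accepted)
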
/simp_deprecated/simp_spacy2.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains one of several implementations of the function
4 | `simplify_ztz(sentence, verbose=False)` that we considered.
5 |
6 | Refs:
7 |
8 | https://spacy.io/usage/spacy-101/
9 |
10 | https://github.com/ac491/Sentence-simplifier/blob/master/simplifcation.ipynb
11 | """
12 | import spacy
13 | import nltk
14 |
15 | nltk.download('averaged_perceptron_tagger')
16 |
17 | nlp = spacy.load("en_core_web_sm")
18 |
19 | # set of relative pronouns
20 | RELPRON = ['whom', 'whose', 'which', 'who']
21 |
22 |
23 | def transform(parsed):
24 | d = {}
25 | # print(parsed)
26 | # print()
27 | for x in parsed:
28 | rel = x.dep_
29 | parent = x.head.i + 1
30 | dependent = x.i + 1
31 | if parent == dependent and rel == 'ROOT':
32 | parent = 0
33 | if parent not in d.keys():
34 | d[parent] = {}
35 | if rel not in d[parent].keys():
36 | d[parent][rel] = []
37 |
38 | d[parent][rel].append(dependent)
39 |
40 | return d
41 |
42 |
43 | def analyse_rc(sentence):
44 | # check for markers indicating rel_clause
45 | if any([s.lower() in RELPRON for s in sentence]):
46 | mark = []
47 | for s in sentence:
48 | if s.lower() in RELPRON:
49 | mark.append(s.lower())
50 | return True, mark
51 | else:
52 | return False, None
53 |
54 |
55 | def remove_all(aux, item):
56 | for a in aux.keys():
57 | for d in aux[a].keys():
58 | if item in aux[a][d]:
59 | aux[a][d].remove(item)
60 |
61 |
62 | def build(root, dep, aux, words, final, yes_root=True, previous=None):
63 | if previous is None:
64 | previous = []
65 |
66 | if root in previous:
67 | return
68 |
69 | previous.append(root)
70 |
71 | if yes_root:
72 | final[root] = words[root - 1]
73 | previous.append(root)
74 |
75 | for k in dep.keys():
76 | for i in dep[k]:
77 | if i in aux.keys():
78 | deps = aux[i]
79 | build(i, deps, aux, words, final, previous=previous)
80 |
81 | final[i] = words[i - 1]
82 |
83 |
84 | def appositive_phrases(dep_dict, words, root, dep_root, ant):
85 | if 'nsubj' in dep_root:
86 | subj = dep_root['nsubj'][0]
87 |         subj_word = words[subj - 1][0]
88 |
89 | # print(dep_dict)
90 | if subj not in dep_dict:
91 | return False, ant
92 |
93 | deps_subj = dep_dict[subj]
94 | v_tense = words[root - 1][1]
95 | n_num = words[subj - 1][1]
96 |
97 | if 'amod' in deps_subj:
98 | mod = deps_subj['amod'][0]
99 | if mod in dep_dict:
100 | deps_mod = dep_dict[mod]
101 | else:
102 | deps_mod = {}
103 | del dep_dict[subj]['amod']
104 | deps_subj = dep_dict[subj]
105 |
106 | # Treat simple cases such as 'general rule'
107 | if 'JJ' in words[mod - 1][1] and 'punct' not in deps_subj:
108 | return False, ant
109 |
110 | elif 'appos' in deps_subj:
111 | mod = deps_subj['appos'][0]
112 | if mod in dep_dict:
113 | deps_mod = dep_dict[mod]
114 | else:
115 | deps_mod = {}
116 | del dep_dict[subj]['appos']
117 | deps_subj = dep_dict[subj]
118 | else:
119 | return False, ant
120 |
121 | if 'punct' in deps_subj.keys():
122 | del deps_subj['punct']
123 |
124 | final_root = {}
125 | build(root, dep_root, dep_dict, [s[0].lower() for s in words],
126 | final_root)
127 | final_appos = {}
128 | build(mod, deps_mod, dep_dict, [s[0].lower() for s in words],
129 | final_appos)
130 | final_subj = {}
131 | build(subj, deps_subj, dep_dict, [s[0].lower() for s in words],
132 | final_subj)
133 |
134 | # print(final_root)
135 | s1 = []
136 | for i in sorted(final_root):
137 | s1.append(final_root[i])
138 | s1 = ' '.join(s1)
139 | # print(s1)
140 |
141 | # print(final_appos)
142 | s2 = []
143 | for i in sorted(final_appos):
144 | s2.append(final_appos[i])
145 | s2 = ' '.join(s2)
146 | # print(s2)
147 |
148 | # print(final_subj)
149 | s3 = []
150 | for i in sorted(final_subj):
151 | s3.append(final_subj[i])
152 | s3 = ' '.join(s3)
153 | # print(s3)
154 |
155 | if len(final_appos.keys()) < 2:
156 | return False, ant
157 |
158 | if n_num in ["NN", "NNP"]:
159 | if v_tense in ["VBP", "VBZ", "VB"]:
160 | s3 += " is "
161 | elif v_tense in ["VBD", "VBG", "VBN"]:
162 | s3 += " was "
163 |
164 | elif n_num in ["NNS", "NNPS"]:
165 | if v_tense in ["VBP", "VBZ", "VB"]:
166 | s3 += " are "
167 | elif v_tense in ("VBD", "VBG", "VBN"):
168 | s3 += " were "
169 |
170 | elif n_num in ["PRP"] and subj_word.lower() == "they":
171 |
172 | if v_tense in ["VBP", "VBZ", "VB"]:
173 | s3 += " are "
174 | elif v_tense in ["VBD", "VBG", "VBN"]:
175 | s3 += " were "
176 |
177 | elif n_num in ["PRP"]:
178 | if v_tense in ["VBP", "VBZ", "VB"]:
179 | s3 += " is "
180 | elif v_tense in ["VBD", "VBG", "VBN"]:
181 | s3 += " was "
182 |
183 | s2 = s3 + s2
184 |
185 | return True, [s1, s2]
186 |
187 | return False, ant
188 |
189 |
190 | def relative_clauses(dep_dict, words, root, dep_root, rel, ant):
191 | subj = dep_root[rel][0]
192 | if subj in dep_dict:
193 |
194 | dep_subj = dep_dict[subj]
195 |
196 | if 'relcl' in dep_subj or 'rcmod' in dep_subj:
197 | if 'relcl' in dep_subj:
198 | relc = dep_subj['relcl'][0]
199 | type_rc = 'relcl'
200 | else:
201 | relc = dep_subj['rcmod'][0]
202 | type_rc = 'rcmod'
203 | deps_relc = dep_dict[relc]
204 |
205 | if 'nsubj' in deps_relc:
206 | subj_rel = 'nsubj'
207 | elif 'nsubjpass' in deps_relc:
208 | subj_rel = 'nsubjpass'
209 |             else: return False, ant  # no subject in the relative clause
210 | if 'ref' in dep_subj:
211 | to_remove = dep_subj['ref'][0]
212 | mark = words[dep_subj['ref'][0] - 1].lower()
213 | else:
214 | to_remove = deps_relc[subj_rel][0]
215 | mark = words[deps_relc[subj_rel][0] - 1].lower()
216 |
217 | # print(mark)
218 |
219 | if mark in RELPRON:
220 | deps_relc[subj_rel][0] = subj
221 | remove_all(dep_dict, to_remove)
222 | # needed for cases where the subject of
223 | # the relative clause is the object
224 | elif 'dobj' in deps_relc:
225 | obj = deps_relc['dobj'][0]
226 |
227 | if 'poss' in dep_dict[obj]:
228 | mod = dep_dict[obj]['poss'][0]
229 | aux_words = words[mod - 1]
230 | aux_words = words[subj - 1] + '\'s'
231 | words[mod - 1] = aux_words
232 | dep_dict[mod] = dep_dict[subj]
233 | else:
234 | return False, ant
235 | else:
236 | return False, ant # for broken cases -
237 | # " There are some
238 | # situations where it is particularly important
239 | # that you get financial information and
240 | # advice that is independent of us."
241 |
242 | del dep_dict[subj][type_rc]
243 |
244 | if 'punct' in dep_subj:
245 | del dep_dict[subj]['punct']
246 |
247 | final_root = {}
248 | build(root, dep_root, dep_dict, words, final_root)
249 | final_relc = {}
250 | build(relc, deps_relc, dep_dict, words, final_relc)
251 |
252 | # print(final_root)
253 | # print(final_relc)
254 |
255 | s1 = []
256 | for i in sorted(final_root):
257 | s1.append(final_root[i])
258 |
259 | s2 = []
260 | for i in sorted(final_relc):
261 | s2.append(final_relc[i])
262 |
263 | return True, [' '.join(s1), ' '.join(s2)]
264 | return False, ant
265 |
266 |
267 | sentence0 = 'Robert, who lives nearby, was walking his dog'
268 | sentence1 = 'Marcus, my sister\'s hamster, likes to run in a wheel.'
269 |
270 |
271 | def simplify_ztz(ztz, verbose=False):
272 | """
273 |     This method simplifies the sentence `ztz`. It returns a list of
274 |     simple sentences extracted from the input sentence.
275 |
276 |     Parameters
277 |     ----------
278 |     ztz: str
279 |     verbose: bool
280 |
281 |
282 | Returns
283 | -------
284 | list[str]
285 |
286 | """
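    # For instance, for sentence0 above ('Robert, who lives nearby, was walking
    # his dog'), the intent is to split off the relative clause into its own
    # simple sentence, yielding roughly ['robert was walking his dog',
    # 'robert lives nearby'] (the exact tokenization may differ).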
287 |
288 |     ztz = ztz.strip()
289 | sentences = [ztz]
290 | result = []
291 |
292 | for s in sentences:
293 |
294 | output = nlp(s)
295 |
296 | dep_dict = transform(output)
297 | # print(dep_dict)
298 |
299 | # words = [(token.text.lower(), token.pos_) for token in output]
300 |
301 | tokens = [token.text.lower() for token in output]
302 |
303 | words = nltk.pos_tag(tokens)
304 |
305 | # print(words)
306 |
307 | if 0 in dep_dict:
308 |
309 | root = dep_dict[0]['ROOT'][0]
310 |
311 | if root in dep_dict:
312 |
313 | dep_root = dep_dict[root]
314 |
315 | # handle appositive_phrases
316 | flag_appos, res = appositive_phrases(dep_dict, words, root,
317 | dep_root, sentences)
318 | if flag_appos:
319 | result += res
320 | continue
321 |
322 | # check for relative clauses
323 | flag_rc, type_rc = analyse_rc(s.split())
324 |
325 | if flag_rc:
326 |
327 | if 'nsubj' in dep_root:
328 | flag, res = relative_clauses(dep_dict, tokens, root,
329 | dep_root, 'nsubj',
330 | sentences)
331 | if flag:
332 | result += res
333 | continue
334 | elif 'dobj' in dep_root:
335 | flag, res = relative_clauses(dep_dict, tokens, root,
336 | dep_root, 'dobj',
337 | sentences)
338 | if flag:
339 | result += res
340 | continue
341 | result.append(s)
342 |
343 | if verbose:
344 | print(ztz.strip())
345 | print(result)
346 |     return result
--------------------------------------------------------------------------------
/simp_deprecated/simp_stanford2.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Mar 19, 2015
3 |
4 | @author: TPetrou
5 | '''
6 |
7 | from nltk.parse import stanford
8 | import os, sys
9 | import operator
10 |
11 | java_path = r"C:\Program Files\Java\jdk1.8.0_31\bin\java.exe"
12 | os.environ['JAVAHOME'] = java_path
13 | os.environ[
14 | 'STANFORD_PARSER'] = r'/users/ted/stanford nlp/stanford-parser-full-2015-01-30'
15 | os.environ[
16 | 'STANFORD_MODELS'] = r'/users/ted/stanford nlp/stanford-parser-full-2015-01-30'
17 |
18 |
19 | class RDF_Triple():
20 | class RDF_SOP():
21 |
22 | def __init__(self, name, pos=''):
23 | self.name = name
24 | self.word = ''
25 | self.parent = ''
26 | self.grandparent = ''
27 | self.depth = ''
28 | self.predicate_list = []
29 | self.predicate_sibings = []
30 | self.pos = pos
31 | self.attr = []
32 | self.attr_trees = []
33 |
34 | def __init__(self, sentence):
35 | self.sentence = sentence
36 | self.clear_data()
37 |
38 | def clear_data(self):
39 | self.parser = stanford.StanfordParser(
40 | model_path=r"/users/ted/stanford nlp/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
41 | self.first_NP = ''
42 | self.first_VP = ''
43 | self.parse_tree = None
44 | self.subject = RDF_Triple.RDF_SOP('subject')
45 | self.predicate = RDF_Triple.RDF_SOP('predicate', 'VB')
46 | self.Object = RDF_Triple.RDF_SOP('object')
47 |
48 | def find_NP(self, t):
49 | try:
50 | t.label()
51 | except AttributeError:
52 | pass
53 | else:
54 | # Now we know that t.node is defined
55 | if t.label() == 'NP':
56 | if self.first_NP == '':
57 | self.first_NP = t
58 | elif t.label() == 'VP':
59 | if self.first_VP == '':
60 | self.first_VP = t
61 | for child in t:
62 | self.find_NP(child)
63 |
64 | def find_subject(self, t, parent=None, grandparent=None):
65 | if self.subject.word != '':
66 | return
67 | try:
68 | t.label()
69 | except AttributeError:
70 | pass
71 | else:
72 | # Now we know that t.node is defined
73 | if t.label()[:2] == 'NN':
74 | if self.subject.word == '':
75 | self.subject.word = t.leaves()[0]
76 | self.subject.pos = t.label()
77 | self.subject.parent = parent
78 | self.subject.grandparent = grandparent
79 | else:
80 | for child in t:
81 | self.find_subject(child, parent=t, grandparent=parent)
82 |
83 | def find_predicate(self, t, parent=None, grandparent=None, depth=0):
84 | try:
85 | t.label()
86 | except AttributeError:
87 | pass
88 | else:
89 | if t.label()[:2] == 'VB':
90 | self.predicate.predicate_list.append(
91 | (t.leaves()[0], depth, parent, grandparent))
92 |
93 | for child in t:
94 | self.find_predicate(child, parent=t, grandparent=parent,
95 | depth=depth + 1)
96 |
97 | def find_deepest_predicate(self):
98 | if not self.predicate.predicate_list:
99 | return '', '', '', ''
100 | return max(self.predicate.predicate_list, key=operator.itemgetter(1))
101 |
102 | def extract_word_and_pos(self, t, depth=0, words=[]):
103 | try:
104 | t.label()
105 | except AttributeError:
106 | # print t
107 | # print 'error', t
108 | pass
109 | else:
110 | # Now we know that t.node is defined
111 | if t.height() == 2:
112 | # self.word_pos_holder.append((t.label(), t.leaves()[0]))
113 | words.append((t.leaves()[0], t.label()))
114 | for child in t:
115 | self.extract_word_and_pos(child, depth + 1, words)
116 | return words
117 |
118 | def print_tree(self, t, depth=0):
119 | try:
120 | t.label()
121 | except AttributeError:
122 | print(t)
123 | # print 'error', t
124 | pass
125 | else:
126 | # Now we know that t.node is defined
127 | print('(') # , t.label(), t.leaves()[0]
128 | for child in t:
129 | self.print_tree(child, depth + 1)
130 | print(') ')
131 |
132 | def find_object(self):
133 | for t in self.predicate.parent:
134 | if self.Object.word == '':
135 | self.find_object_NP_PP(t, t.label(), self.predicate.parent,
136 | self.predicate.grandparent)
137 |
138 | def find_object_NP_PP(self, t, phrase_type, parent=None, grandparent=None):
139 | '''
140 | finds the object given its a NP or PP or ADJP
141 | '''
142 | if self.Object.word != '':
143 | return
144 | try:
145 | t.label()
146 | except AttributeError:
147 | pass
148 | else:
149 | # Now we know that t.node is defined
150 | if t.label()[:2] == 'NN' and phrase_type in ['NP', 'PP']:
151 | if self.Object.word == '':
152 | self.Object.word = t.leaves()[0]
153 | self.Object.pos = t.label()
154 | self.Object.parent = parent
155 | self.Object.grandparent = grandparent
156 | elif t.label()[:2] == 'JJ' and phrase_type == 'ADJP':
157 | if self.Object.word == '':
158 | self.Object.word = t.leaves()[0]
159 | self.Object.pos = t.label()
160 | self.Object.parent = parent
161 | self.Object.grandparent = grandparent
162 | else:
163 | for child in t:
164 | self.find_object_NP_PP(child, phrase_type, parent=t,
165 | grandparent=parent)
166 |
167 | def get_attributes(self, pos, sibling_tree, grandparent):
168 | rdf_type_attr = []
169 | if pos[:2] == 'JJ':
170 | for item in sibling_tree:
171 | if item.label()[:2] == 'RB':
172 | rdf_type_attr.append((item.leaves()[0], item.label()))
173 | else:
174 | if pos[:2] == 'NN':
175 | for item in sibling_tree:
176 | if item.label()[:2] in ['DT', 'PR', 'PO', 'JJ', 'CD']:
177 | rdf_type_attr.append((item.leaves()[0], item.label()))
178 | if item.label() in ['QP', 'NP']:
179 | # append a tree
180 |                         rdf_type_attr.append((item, item.label()))
181 | elif pos[:2] == 'VB':
182 | for item in sibling_tree:
183 | if item.label()[:2] == 'AD':
184 | rdf_type_attr.append((item, item.label()))
185 |
186 | if grandparent:
187 | if pos[:2] in ['NN', 'JJ']:
188 | for uncle in grandparent:
189 | if uncle.label() == 'PP':
190 | rdf_type_attr.append((uncle, uncle.label()))
191 | elif pos[:2] == 'VB':
192 | for uncle in grandparent:
193 | if uncle.label()[:2] == 'VB':
194 | rdf_type_attr.append((uncle, uncle.label()))
195 |
196 | return self.attr_to_words(rdf_type_attr)
197 |
198 | def attr_to_words(self, attr):
199 | new_attr_words = []
200 | new_attr_trees = []
201 | for tup in attr:
202 | # if type(tup[0]) != unicode:
203 | # if tup[0].height() == 2:
204 | # new_attr_words.append((tup[0].leaves()[0], tup[0].label()))
205 | # else:
206 | # # new_attr_words.extend(self.extract_word_and_pos(tup[0]))
207 | # new_attr_trees.append(tup[0].unicode_repr())
208 | # else:
209 | new_attr_words.append(tup)
210 | return new_attr_words, new_attr_trees
211 |
212 | def jsonify_rdf(self):
213 | return {'sentence': self.sentence,
214 | 'parse_tree': self.parse_tree.unicode_repr(),
215 | 'predicate': {'word': self.predicate.word,
216 | 'POS': self.predicate.pos,
217 | 'Word Attributes': self.predicate.attr,
218 | 'Tree Attributes': self.predicate.attr_trees},
219 | 'subject': {'word': self.subject.word, 'POS': self.subject.pos,
220 | 'Word Attributes': self.subject.attr,
221 | 'Tree Attributes': self.subject.attr_trees},
222 | 'object': {'word': self.Object.word, 'POS': self.Object.pos,
223 | 'Word Attributes': self.Object.attr,
224 | 'Tree Attributes': self.Object.attr_trees},
225 | 'rdf': [self.subject.word, self.predicate.word,
226 | self.Object.word]
227 | }
228 |
229 | def main(self):
230 | self.clear_data()
231 | self.parse_tree = self.parser.raw_parse(self.sentence)[0]
232 | self.find_NP(self.parse_tree)
233 | self.find_subject(self.first_NP)
234 | self.find_predicate(self.first_VP)
235 | if self.subject.word == '' and self.first_NP != '':
236 | self.subject.word = self.first_NP.leaves()[0]
237 | self.predicate.word, self.predicate.depth, self.predicate.parent, self.predicate.grandparent = self.find_deepest_predicate()
238 | self.find_object()
239 | self.subject.attr, self.subject.attr_trees = self.get_attributes(
240 | self.subject.pos, self.subject.parent, self.subject.grandparent)
241 | self.predicate.attr, self.predicate.attr_trees = self.get_attributes(
242 | self.predicate.pos, self.predicate.parent,
243 | self.predicate.grandparent)
244 | self.Object.attr, self.Object.attr_trees = self.get_attributes(
245 | self.Object.pos, self.Object.parent, self.Object.grandparent)
246 | self.answer = self.jsonify_rdf()
247 |
248 |
249 | if __name__ == '__main__':
250 | # try:
251 | # sentence = sys.argv[1]
252 | # sentence = 'A rare black squirrel has become a regular visitor to a suburban garden'
253 | # except IndexError:
254 | # print("Enter in your sentence")
255 | # sentence = 'A rare black squirrel has become a regular visitor to a suburban garden'
256 | # print("Heres an example")
257 | # print(sentence)
258 |
259 | # sentence = 'The boy dunked the basketball'
260 | sentence = 'They also made the substance able to last longer in the bloodstream, which led to more stable blood sugar levels and less frequent injections.'
261 | #sentence = 'A rare black squirrel has become a regular visitor to a
262 | # suburban garden'
263 | rdf = RDF_Triple(sentence)
264 | rdf.main()
265 |
266 | ans = rdf.answer
267 | print(ans)
--------------------------------------------------------------------------------
/cleaning.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains functions for cleaning movie scripts (or short stories)
4 |
5 | input directory: m_scripts or short_stories
6 | output directory: m_scripts_clean or short_stories_clean
7 |
8 | The code in this file cleans one movie script file at a time. It takes each
9 | input movie script file from the folder `m_scripts` and outputs a new file
10 | to the folder `m_scripts_clean`.
11 |
12 | It expands contractions like "didn't", and replaces exclusively-Unicode
13 | symbols by their closest ASCII analogues (e.g., curly quotes are replaced by
14 | straight quotes).
15 |
16 | It uses the software SpaCy to break up the movie script into separate
17 | sentences, and returns a file with only one sentence per line.
18 |
19 | For the case of movie scripts (but not for short stories), it also tries to
20 | distinguish between dialog lines and narration lines. In many but not all
21 | movie scripts, the dialog lines are indented with respect to the narration
22 | lines. Pixar/Disney scripts, for instance, don't indent dialog. In cases where
23 | the movie script does indent, the MM software gives the option of throwing away
24 | all the dialog lines and keeping only the narration ones. Folders ending in
25 | `_rd` hold these remove-dialog versions of the files.
26 |
27 | Occasionally in this file, we use regex (via the Python module `re`).
28 | Here is a nice reference on `re`.
29 |
30 | https://www.datacamp.com/tutorial/python-regular-expression-tutorial
31 |
32 | ChatGPT is also very good at answering regex questions.
33 |
34 | """
35 | import re
36 | import os
37 | # sentence splitting with NLTK
38 | # from nltk.tokenize import sent_tokenize
39 | import collections as co
40 | from globals import *
41 | from unidecode import unidecode
42 | import contractions
43 | from utils import *
44 |
45 | # sentence splitting with spacy
46 | import spacy
47 |
48 | nlp = spacy.load('en_core_web_sm')
49 |
50 |
51 | def expand_contractions(line):
52 | """
53 | This auxiliary method replaces all contractions in the string `line` by
54 | expansions thereof (e.g., replaces "didn't" by "did not".)
55 |
56 | Parameters
57 | ----------
58 | line: str
59 |
60 | Returns
61 | -------
62 | str
63 |
64 | """
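    # e.g., expand_contractions("I didn't go") should return "I did not go"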
65 | str_list = []
66 | for word in line.split():
67 | str_list.append(contractions.fix(word))
68 | return ' '.join(str_list)
69 |
70 |
71 | def clean_one_m_script(in_dir,
72 | out_dir,
73 | file_name,
74 | remove_dialog=False):
75 | """
76 | in_dir and out_dir can be the same, but this will overwrite the files.
77 |
78 | This method reads a file called `file_name` in the `in_dir` directory
79 | and creates a clean version in the `out_dir` directory.
80 |
81 | Parameters
82 | ----------
83 | in_dir: str
84 | out_dir: str
85 | file_name: str
86 | remove_dialog: bool
87 | True iff dialog part of the movie script is removed, leaving only
88 | the narration part.
89 |
90 | Returns
91 | -------
92 | None
93 |
94 | """
95 |
96 | print('fetching %s' % file_name)
97 |
98 | def count_leading_wh_sp(str0):
99 | # wh_sp = white space
100 | count = 0
101 | if str0:
102 | for char in str0:
103 | if char.isspace():
104 | count += 1
105 | else:
106 | break
107 | return count
108 |
109 | inpath = in_dir + "/" + file_name
110 | outpath = out_dir + "/" + file_name
111 |
112 | with open(inpath, "r", encoding='utf-8') as f:
113 | lines = [line for line in f]
114 |
115 |     # Replace exclusively-Unicode characters by ASCII analogues (e.g.,
116 |     # replace curly quotes by straight ones) so we don't have to pass
117 |     # encoding="utf-8" to open() from here on.
118 | lines = [unidecode(line) for line in lines]
119 |
120 | # expand contractions
121 | lines = [expand_contractions(line) for line in lines]
122 |
123 | # strip trailing (i.e., right) white space and newline.
124 | # If this results in an empty line, remove it.
125 | new_lines = []
126 | for line in lines:
127 | line = line.rstrip()
128 | if line:
129 | new_lines.append(line)
130 | lines = new_lines
131 |
132 | # remove everything after and including THE END
133 | new_lines = []
134 | for line in lines:
135 | if line.strip() in ["THE END", "END"]:
136 | break
137 | else:
138 | new_lines.append(line)
139 | lines = new_lines
140 |
141 | # regex for parenthetical remarks
142 | pattern_paren = re.compile(r'\[(.*?)\]|\((.*?)\)|\{(.*?)\}')
143 | # regex for period followed by white spaces + number
144 | pattern_period = r"\.(?=\s*\d)"
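    # e.g., re.sub(pattern_period, "-", "Scene 3. 12 guards enter")
    # yields "Scene 3- 12 guards enter"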
145 |
146 | # Substitutions. If subs results in empty line, remove it.
147 | new_lines = []
148 | for line in lines:
149 | # print("ssdf", line)
150 | # remove parenthetical remarks
151 | line = re.sub(pattern_paren, "", line)
152 | # remove the underscore, which is not
153 | # considered a punctuation mark.
154 | line = re.sub(r'[_]', '', line)
155 | # Replace tabs by 12 blank spaces
156 | line = re.sub(r"\t", " " * 12, line)
157 | # replace period by dash if period followed by number
158 | line = re.sub(pattern_period, "-", line)
159 | # print("\tssdf", line)
160 | if len(line) >= 1:
161 | new_lines.append(line)
162 | lines = new_lines
163 |
164 | # Add missing periods for transitions from dialog to narration or vice
165 | # versa
166 | indent = count_leading_wh_sp(lines[0])
167 | for i in range(len(lines)):
168 | if i != len(lines) - 1:
169 | next_indent = count_leading_wh_sp(lines[i + 1])
170 | if indent != next_indent and \
171 | not lines[i][-1] in [".", "!", "?"]:
172 | lines[i] = lines[i] + "."
173 | else:
174 | next_indent = None
175 | if not lines[i][-1] in [".", "!", "?"]:
176 | lines[i] = lines[i] + "."
177 | indent = next_indent
178 |
179 | # Regex for string that contains at least 2 lower case letters
180 | # Found cases where line was just "is."
181 | pattern_lc = re.compile(r'^(.*[a-z]){2,}.*$')
182 |
183 |     # Reject lines that don't contain at least 2 lower case letters.
184 | # This gets rid of scene directions and character invocations.
185 | lines = [line for line in lines if re.search(pattern_lc, line)]
186 |
187 | white_spaces = [count_leading_wh_sp(line) for line in lines]
188 | # Counter returns dictionary mapping item to its number of repetitions
189 | wh_sp_counter = co.Counter(white_spaces)
190 | # print("llkh", wh_sp_counter)
191 | sum_reps = sum(wh_sp_counter.values())
192 | indent_prob_dist = co.OrderedDict()
193 | indents = []
194 | for indent in sorted(wh_sp_counter,
195 | key=wh_sp_counter.get,
196 | reverse=True):
197 | prob = round(wh_sp_counter[indent] / sum_reps, 3)
198 | indent_prob_dist[indent] = prob
199 | indents.append(indent)
200 | # print("ddfg", indents)
201 | # print("ddfg", indent_prob_dist)
202 | print("indent prob dist =", [(indent, indent_prob_dist[indent]) \
203 | for indent in indents[0:4]])
204 |
205 | # likely dialog indents
206 | # most probable indent = indents[0]
207 | dial_indents = [indent for indent in indents if \
208 | abs(indent - indents[0]) <= 3 and \
209 | indent_prob_dist[indent] >= .01]
210 |
211 | ndial_indents = [indent for indent in indents \
212 | if indent not in dial_indents]
213 | # likely narration indents
214 | narr_indents = [indent for indent in ndial_indents if \
215 | abs(indent - ndial_indents[0]) <= 3 and \
216 | indent_prob_dist[indent] >= .01]
217 |
218 | print("dialog indents=", dial_indents)
219 | print("narration indents=", narr_indents)
220 |
221 |     # Keep only lines whose indent matches the likely dialog or narration
222 |     # indents (drop dialog if remove_dialog), stripping the narration indent.
223 | new_lines = []
224 | for line in lines:
225 | indent = count_leading_wh_sp(line)
226 | if indent in dial_indents + narr_indents:
227 | if not narr_indents or not dial_indents:
228 | # there is no difference in indentation between narr and dial
229 | new_lines.append(line)
230 | else:
231 | if remove_dialog:
232 | if indent in narr_indents:
233 | new_lines.append(line[min(narr_indents):])
234 | else:
235 | new_lines.append(line[min(narr_indents):])
236 | lines = new_lines
237 |
238 | # print("nnuu", lines[0:15])
239 | # print("nnuu", lines[-15:])
240 |
241 | # remove enumeration markers.
242 | # pattern = re.compile(r"^[^a-zA-Z]*")
243 | # lines = [re.sub(pattern, "", line) for line in lines]
244 |
245 | # join lines to create new script
246 | lines = [line.strip() for line in lines if line]
247 | script = ' '.join(lines)
248 |
249 | # split script into sentences with NLTK
250 | # lines = sent_tokenize(script)
251 |
252 | # split script into sentences with spacy
253 | lines = nlp(script).sents
254 | # for line in lines:
255 | # print("zzzxc", line)
256 |
257 | # remove single character sentences
258 | lines = [line.text for line in lines if len(line.text) > 1]
259 |
260 | with open(outpath, "w") as f:
261 | for line in lines:
262 | f.write(line + "\n")
263 |
264 |
265 | def clean_batch_of_m_scripts(
266 | in_dir, out_dir,
267 | batch_file_names,
268 | remove_dialog=False):
269 | """
270 | This method calls the method `clean_one_m_script` for all the file names
271 | in the list of file names `batch_file_names`.
272 |
273 | Parameters
274 | ----------
275 | in_dir: str
276 | out_dir: str
277 | batch_file_names: list[str]
278 | remove_dialog: bool
279 |
280 | Returns
281 | -------
282 | None
283 |
284 | """
285 |
286 | all_file_names = my_listdir(in_dir)
287 | assert set(batch_file_names).issubset(set(all_file_names))
288 | for file_name in batch_file_names:
289 | i = all_file_names.index(file_name)
290 | print('%i.' % (i + 1))
291 | clean_one_m_script(in_dir,
292 | out_dir,
293 | file_name,
294 | remove_dialog=remove_dialog)
295 |
296 |
297 | if __name__ == "__main__":
298 | from globals import *
299 |
300 |
301 | def main1():
302 | in_dir = "short_stories"
303 | out_dir = "short_stories_clean"
304 | batch_file_names = my_listdir(in_dir)[0:3]
305 | clean_batch_of_m_scripts(
306 | in_dir, out_dir,
307 | batch_file_names,
308 | remove_dialog=False)
309 |
310 |
311 | def main2():
312 | remove_dialog = True
313 | clean_one_m_script(
314 | in_dir=M_SCRIPTS_DIR,
315 | out_dir=CLEAN_DIR if not remove_dialog else CLEAN_RD_DIR,
316 | file_name="up.txt",
317 | remove_dialog=remove_dialog)
318 |
319 |
320 | def main3():
321 | remove_dialog = False
322 | # batch_file_names=my_listdir(M_SCRIPTS_DIR)
323 | batch_file_names = ["toy-story.txt", "up.txt", "wall-e.txt"]
324 | clean_batch_of_m_scripts(
325 | in_dir=M_SCRIPTS_DIR,
326 | out_dir=CLEAN_DIR if not remove_dialog else CLEAN_RD_DIR,
327 | batch_file_names=batch_file_names,
328 | remove_dialog=remove_dialog)
329 |
330 |
331 | # main1()
332 | # main2()
333 | main3()
334 |
--------------------------------------------------------------------------------
/Dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | from Node import *
3 | import pickle as pik
4 | from globals import *
5 | from utils import *
6 |
7 | import graphviz as gv
8 | from IPython.display import display, Image
9 | from PIL.Image import open as open_image
10 |
11 |
12 | class Dag:
13 | """
14 | This class creates a DAG (directed acyclic graph) for the movie entitled
15 |     `m_title`. The DAG has nodes `nodes` and arrows `arrows`. Each arrow has
16 |     two weights, `num_acc` and `num_rej`: the number of times the arrow has
17 |     been accepted and rejected, respectively. They are stored in the
18 |     dictionary `arrow_to_acc_rej_nums`.
19 |
20 | Attributes
21 | ----------
22 |     arrow_to_acc_rej_nums: dict[tuple[Node, Node], list[int]]
23 | arrows: list[tuple[Node, Node]]
24 | arrows of self. Arrows are defined as a pair of Node objects.
25 | The first element of the pair is the origin of the arrow and the
26 | second is the target of the arrow.
27 | m_title: str
28 | nodes: list[Node]
29 |
30 | """
31 |
32 | def __init__(self, m_title, simp_dir):
33 | """
34 | Constructor
35 |
36 | Parameters
37 | ----------
38 | m_title: str
39 |             title of the movie to which this DAG refers.
40 | simp_dir: str
41 | the directory in which simplified files are stored, and from
42 | which objects of this class are constructed.
43 | """
44 | self.m_title = m_title
45 | path = simp_dir + "/" + m_title + ".txt"
46 | with open(path, "r") as f:
47 | lines = [line for line in f]
48 | self.nodes = []
49 | for time, line in enumerate(lines):
50 | if line.strip() not in [ZTZ_SEPARATOR, ""]:
51 | ztz_list = line.split(ZTZ_SEPARATOR)
52 | for place in range(len(ztz_list)):
53 | self.nodes.append(Node(time, place))
54 | self.arrows = []
55 | self.arrow_to_acc_rej_nums = {}
56 |
57 | def save_self(self, dag_dir):
58 | """
59 | This method stores self as a pickled file.
60 |
61 | Parameters
62 | ----------
63 | dag_dir: str
64 | Directory in which pickled file is stored.
65 |
66 | Returns
67 | -------
68 | None
69 |
70 | """
71 | path = dag_dir + "/" + self.m_title + ".pkl"
72 | with open(path, "wb") as f:
73 | pik.dump(self, f, protocol=pik.HIGHEST_PROTOCOL)
74 |
75 | def update_arrow(self, arrow, accepted):
76 | """
77 | This method changes the tuple (num_accepted, num_rejected) of
78 | `arrow`. If accepted=True, num_accepted is increased by one. If
79 | accepted=False, num_rejected is increased by one.
80 |
81 | Parameters
82 | ----------
83 | arrow: tuple[Node, Node]
84 | accepted: bool
85 |
86 | Returns
87 | -------
88 | None
89 |
90 | """
91 | if arrow not in self.arrows:
92 | self.arrows.append(arrow)
93 | self.arrow_to_acc_rej_nums[arrow] = [0, 0]
94 | if accepted:
95 | self.arrow_to_acc_rej_nums[arrow][0] += 1
96 | else:
97 | self.arrow_to_acc_rej_nums[arrow][1] += 1
98 |
99 | def build_node_to_clean_ztz_dict(self,
100 | clean_dir,
101 | skip_1st_line=False):
102 | """
103 | This method builds from scratch and returns a dictionary called
104 | `nd_to_clean_ztz` that maps each node to a clean sentence. ztz
105 | stands for sentence.
106 |
107 | Parameters
108 | ----------
109 | clean_dir: str
110 | directory of movie scripts after cleaning.
111 |
112 | Returns
113 | -------
114 | dict(Node, str)
115 |
116 | """
117 | path = clean_dir + "/" + self.m_title + ".txt"
118 | is_csv = False
119 | if not os.path.isfile(path):
120 | path = path.replace(".txt", ".csv")
121 | is_csv = True
122 | assert os.path.isfile(path)
123 |
124 | time_to_clean_ztz = {}
125 | with open(path, "r") as f:
126 | time = -1
127 | for line in f:
128 | time += 1
129 | if is_csv:
130 | if time == 0:
131 | continue
132 | else:
133 | time_to_clean_ztz[time - 1] = line.strip()
134 | else:
135 | time_to_clean_ztz[time] = line.strip()
136 |
137 | nd_to_clean_ztz = {}
138 | for nd in self.nodes:
139 | nd_to_clean_ztz[nd] = time_to_clean_ztz[nd.time]
140 |
141 | return nd_to_clean_ztz
142 |
143 | def build_node_to_simple_ztz_dict(self, simp_dir):
144 | """
145 | This method builds from scratch and returns a dictionary called
146 | `nd_to_simple_ztz` that maps each node to a simplified sentence. ztz
147 | stands for sentence.
148 |
149 |
150 | Parameters
151 | ----------
152 | simp_dir: str
153 | directory of movie scripts after simplifying.
154 |
155 | Returns
156 | -------
157 | dict(Node, str)
158 |
159 | """
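        # For example, if the line at time index 7 of the simplified file reads
        # "he ran <SEP> she hid" (with <SEP> standing for ZTZ_SEPARATOR), then
        # Node(7, 0) maps to "he ran" and Node(7, 1) maps to "she hid".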
160 | path = simp_dir + "/" + self.m_title + ".txt"
161 |
162 | time_to_simp_ztz_list = {}
163 | with open(path, "r") as f:
164 | time = -1
165 | for line in f:
166 | time += 1
167 | if line.strip() != ZTZ_SEPARATOR:
168 | time_to_simp_ztz_list[time] = \
169 | line.split(ZTZ_SEPARATOR)
170 |
171 | nd_to_simp_ztz = {}
172 | for nd in self.nodes:
173 | nd_to_simp_ztz[nd] = \
174 | time_to_simp_ztz_list[nd.time][nd.place].strip()
175 |
176 | return nd_to_simp_ztz
177 |
178 | def build_high_prob_acc_arrows(self,
179 | prob_acc_thold,
180 | nsam_thold):
181 | """
182 | This method builds from scratch and returns a list of all arrows
183 | whose `prob_acc` (i.e., probability of acceptance) is >=
184 | `prob_acc_thold` with `nsam` (i.e., number of samples used to
185 | calculate that probability) >= `nsam_thold`. thold = threshold
186 |
187 | Parameters
188 | ----------
189 | prob_acc_thold: float
190 | nsam_thold: int
191 |
192 | Returns
193 | -------
194 | list[tuple[Node, Node]]
195 |
196 | """
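        # Note: get_prob_acc_and_nsam() is defined in utils.py. Presumably it
        # returns prob_acc ~ num_acc / (num_acc + num_rej) and
        # nsam = num_acc + num_rej, but that is an assumption, not shown here.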
197 | high_prob_arrows = []
198 | for arrow in self.arrows:
199 | prob_acc, nsam = get_prob_acc_and_nsam(
200 | *self.arrow_to_acc_rej_nums[arrow])
201 | if prob_acc >= prob_acc_thold and \
202 | nsam >= nsam_thold:
203 | high_prob_arrows.append(arrow)
204 | return high_prob_arrows
205 |
206 | def print_map_legend(self,
207 | clean_dir,
208 | simp_dir,
209 | prob_acc_thold,
210 | nsam_thold):
211 | """
212 | This method prints the DAG Rosetta stone (map legend).
213 |
214 |         For each node labeled `(time, place)`, this method prints the
215 |         simplified clause (i.e., simplified sentence) in line `time` of the
216 |         simplified file, after a number `place` of separator tokens. It also
217 |         prints the original sentence from which that simplified clause
218 |         came. The full sentence is preceded by the label `(full)` and the
219 | simplified sentence by the label `(part)`.
220 |
221 | It only prints the `(full)` and `(part)` for those nodes that appear
222 | in the DAG, after removing all arrows with probability of acceptance
223 |         < `prob_acc_thold` or number of samples used to calculate that
224 | probability < `nsam_thold`.
225 |
226 | Parameters
227 | ----------
228 | clean_dir: str
229 | directory of movie scripts after cleaning
230 | simp_dir: str
231 | directory of movie scripts after simplification
232 | prob_acc_thold: float
233 | nsam_thold: int
234 |
235 | Returns
236 | -------
237 | None
238 |
239 | """
240 | hprob_arrows = self.build_high_prob_acc_arrows(
241 | prob_acc_thold, nsam_thold)
242 | print("MAP LEGEND")
243 | print("title:", self.m_title)
244 | print("prob of acceptance threshold:", prob_acc_thold)
245 | print("number of samples threshold:", nsam_thold)
246 | print("number of arrows shown:", len(hprob_arrows))
247 | print("number of arrows dropped:",
248 | len(self.arrows) - len(hprob_arrows))
249 |
250 | hprob_nodes = []
251 | for arrow in hprob_arrows:
252 | if arrow[0] not in hprob_nodes:
253 | hprob_nodes.append(arrow[0])
254 | if arrow[1] not in hprob_nodes:
255 | hprob_nodes.append(arrow[1])
256 |
257 | hprob_nodes = sorted(hprob_nodes, key=lambda x: x.time)
258 | if clean_dir:
259 | nd_to_clean_ztz = self.build_node_to_clean_ztz_dict(clean_dir)
260 | else:
261 | nd_to_clean_ztz = None
262 | nd_to_simple_ztz = self.build_node_to_simple_ztz_dict(simp_dir)
263 |
264 | for nd in hprob_nodes:
265 | print(color.GREEN + color.BOLD + node_str(nd) + ":" + color.END)
266 | ztz0 = ""
267 | if nd_to_clean_ztz:
268 | ztz0 = nd_to_clean_ztz[nd]
269 | print("(FULL)", ztz0)
270 | print("(PART)", nd_to_simple_ztz[nd])
271 |
272 | @staticmethod
273 | def draw_dot(s, j_embed):
274 | """
275 | This method draws a dot string.
276 |
277 |         Using display(s) will draw the graph but will not embed it permanently
278 |         in the notebook. To embed it permanently, we must render a temporary
279 |         image file and call display(Image(x)) on it.
280 |
281 | Parameters
282 | ----------
283 | s: output of graphviz Source(dot_str)
284 | j_embed: bool
285 | True iff want to embed image in jupyter notebook. If you are
286 | using a python terminal instead of a jupyter notebook,
287 | only j_embed=False will draw image.
288 |
289 | Returns
290 | -------
291 | None
292 | """
293 | x = s.render("tempo", format='png', view=False)
294 | if j_embed:
295 | display(Image(x))
296 | else:
297 | open_image("tempo.png").show()
298 |
299 | def draw(self, prob_acc_thold, nsam_thold, jupyter=False):
300 | """
301 |         This method draws the graph for self. Only arrows with
302 |         `prob_acc` >= `prob_acc_thold` and `nsam` >= `nsam_thold` are drawn.
303 |
304 | Parameters
305 | ----------
306 | prob_acc_thold: float
307 | nsam_thold: int
308 | jupyter: bool
309 |
310 | Returns
311 | -------
312 | None
313 |
314 | """
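        # Assuming node_str() renders a node as "(time, place)", a typical
        # arrow line in the generated dot string looks like:
        #   "(3, 0)"->"(7, 1)" [label="0.9 (5)"];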
315 | hprob_arrows = self.build_high_prob_acc_arrows(
316 | prob_acc_thold, nsam_thold)
317 |
318 | dot = "digraph {\n"
319 | for arrow in hprob_arrows:
320 | prob_acc, nsam = get_prob_acc_and_nsam(
321 | *self.arrow_to_acc_rej_nums[arrow])
322 | X = '"' + str(prob_acc) + " (" + str(nsam) + ")" + '"'
323 | dot += '"' + node_str(arrow[0]) + '"' + "->" + \
324 | '"' + node_str(arrow[1]) + '"' + \
325 | ' [label=' + X + "];\n"
326 | dot += 'labelloc="b";\n'
327 | dot += 'label="' + self.m_title + '";\n'
328 | dot += "}\n"
329 | # print("vvbn", dot)
330 | Dag.draw_dot(gv.Source(dot), j_embed=jupyter)
331 |
332 |
333 | if __name__ == "__main__":
334 | def main1(prob_acc_thold, nsam_thold, draw):
335 | dag_dir = "short_stories_dag_atlas"
336 | simp_dir = "short_stories_simp"
337 | clean_dir = "short_stories_clean"
338 | file_names = [file_name for
339 | file_name in my_listdir(dag_dir)[0:3]]
340 | dags = []
341 | for fname in file_names:
342 | path = dag_dir + "/" + fname
343 | # print("ghty", path)
344 | with open(path, "rb") as f:
345 | dag = pik.load(f)
346 | dags.append(dag)
347 | for dag in dags:
348 | print("==================================")
349 | print(dag.m_title)
350 | hprob_arrows = dag.build_high_prob_acc_arrows(
351 | prob_acc_thold, nsam_thold)
352 | print({arrow_str(arrow):
353 | dag.arrow_to_acc_rej_nums[arrow] \
354 | for arrow in hprob_arrows})
355 | print()
356 | if draw:
357 | dag.draw(prob_acc_thold, nsam_thold)
358 |         dag.print_map_legend(clean_dir, simp_dir, prob_acc_thold, nsam_thold)
359 |
360 |
361 | main1(prob_acc_thold=.90, nsam_thold=2, draw=True)
362 |
--------------------------------------------------------------------------------
/spell_checking.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains functions for spell-checking movie scripts (or short
4 | stories)
5 |
6 | input directory: m_scripts_clean or short_stories_clean
7 | output directory: m_scripts_spell or short_stories_spell
8 |
9 | Refs:
10 | https://pyspellchecker.readthedocs.io/en/latest/code.html
11 |
12 | Spell checkers that don't take context into consideration don't work too
13 | well. They replace very infrequent words (probably a misspelled word) by
14 | more frequent ones---a very error-prone strategy. Spell checkers that do take
15 | context into consideration are better, but we didn't have access to them.
16 |
17 | This is a very conservative spell checker that doesn't know about context.
18 | We make its corrections conservative by constraining it with the
19 | following rules.
20 |
21 | 1. It doesn't change the spelling if a word starts with a capital letter.
22 |
23 | 2. If the word ends in "s" or "ed", it only considers replacements that
24 | also end in "s" or "ed".
25 |
26 | 3. It assumes that the first 2 letters of all words are always correct.
27 |
28 | 4. It retains the capitalization of the first letter of a word.
29 |
30 | 5. It retains punctuation.
31 |
32 | 6. When looking for a double letter that should be a single letter or
33 | vice versa, it only considers guesses that have the same set of characters
34 | (e.g., "pool" and "pol").
35 |
36 | This spell checker also uses two word-to-reps dictionaries: a global one
37 | compiled from global usage, and a local one compiled from the local document
38 | (i.e., movie script or short story) being corrected.
39 |
40 | This spell checker also uses up to two agents (WordGuesser objects), one
41 | per error type, each working separately to produce its best possible guess.
42 |
43 | """
44 | from globals import *
45 | from spellchecker import SpellChecker # package is called pyspellchecker
46 | import os
47 | import re
48 | from WordGuesser import *
49 | from collections import defaultdict
50 | from utils import *
51 |
52 |
53 | def has_double_letter(word):
54 | """
55 | This method returns True iff `word` has two consecutive letters that are
56 | the same.
57 |
58 | Parameters
59 | ----------
60 | word: str
61 |
62 | Returns
63 | -------
64 | bool
65 |
66 | """
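    # e.g., has_double_letter("pool") -> True, has_double_letter("pol") -> False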
67 | pattern = r'(\w)\1'
68 | match = re.search(pattern, word)
69 | if match:
70 | return True
71 | else:
72 | return False
73 |
74 |
75 | def fancy_split(in_ztz):
76 | """
77 | This method first adds a white space before and after punctuation marks
78 | in `in_ztz`, then it applies a `split()` on the new sentence and returns
79 | the list generated by the split().
80 |
81 | Parameters
82 | ----------
83 | in_ztz: str
84 |
85 | Returns
86 | -------
87 | list[str]
88 |
89 | """
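    # e.g., fancy_split("Hello, world!") -> ['Hello', ',', 'world', '!']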
90 | # Match any pattern that is not a word character
91 | # or a white space.
92 | # This is the same as a punctuation mark.
93 | punctuation_pattern = re.compile(r'([^\w\s])+')
94 | # add a whitespace before and after each punctuation mark
95 | in_ztz0 = punctuation_pattern.sub(r' \1 ', in_ztz)
96 | return in_ztz0.split()
97 |
98 |
99 | def get_word_to_reps(in_file_path):
100 | """
101 | This method returns a dictionary `word_to_reps` and an int
102 | `local_word_count`. `word_to_reps` is a dictionary mapping each word to
103 | its number of repetitions, for the file located at `in_file_path`.
104 |     The method also returns an int `local_word_count` which equals the total
105 |     number of counted word tokens (not distinct words) in that file.
106 |
107 |
108 | Parameters
109 | ----------
110 | in_file_path: str
111 |
112 | Returns
113 | -------
114 | dict[str, int], int
115 |
116 | """
117 | # tempo dictionary words are lower case
118 | word_to_reps = defaultdict(lambda: 0)
119 | with open(in_file_path, "r") as f:
120 | local_word_count = 0
121 | for line in f:
122 | words = fancy_split(line)
123 | for word in words:
124 | word = word.lower()
125 | if word.isalpha() and len(word) >= 2:
126 | local_word_count += 1
127 | if word in word_to_reps:
128 | word_to_reps[word] += 1
129 | else:
130 | word_to_reps[word] = 1
131 |
132 | return word_to_reps, local_word_count
133 |
134 |
135 | def get_corrected_sentence(in_ztz,
136 | global_checker,
137 | error_type,
138 | word_to_reps=None,
139 | local_word_count=None):
140 | """
141 |     This method takes a sentence `in_ztz` as input and returns a corrected
142 | sentence. It uses two dictionaries to guess an answer: global_checker,
143 | word_to_reps.
144 |
145 | If `word_to_reps` is kept at None, no local dictionary is used. The
146 | function `get_word_to_reps()` returns both `word_to_reps` and
147 | `local_word_count`.
148 |
149 | `error_type` must be in the list ["tt", "random", "all"]
150 |
151 | Parameters
152 | ----------
153 | in_ztz: str
154 | global_checker: SpellChecker
155 | error_type: str
156 | must be in ["tt", "random", "all"]
157 | word_to_reps: dict[str, int]
158 | local_word_count: int
159 |
160 | Returns
161 | -------
162 | str, list[tuple(str, str)]
163 |
164 | """
165 | if word_to_reps:
166 | assert local_word_count
167 |
168 | def implies(x, y):
169 | return (not x) or y
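    # e.g., rule 2 relies on implies(): for word "tomatos" and guess "tomato",
    # cond2a = implies(True, False) = False, so that guess is rejected.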
170 |
171 | words = fancy_split(in_ztz)
172 | # print("dfgh", words)
173 | best_guesses = []
174 | changes = []
175 | for word in words:
176 | capitalized = word[0].isupper()
177 | word = word.lower()
178 | best_guess = word
179 | prob_global_for_word = global_checker.word_usage_frequency(word)
180 | if word.isalpha() and len(word) >= 2 and \
181 | prob_global_for_word < SPELLING_CORRECTION_RISK \
182 | and not capitalized:
183 | word_guessers = {}
184 | simple_error_types = ["tt", "random"]
185 | if error_type in simple_error_types:
186 | word_guessers[error_type] = \
187 | WordGuesser(word, global_checker,
188 | word_to_reps, local_word_count)
189 | if error_type == "all":
190 | for err in simple_error_types:
191 | word_guessers[err] = \
192 | WordGuesser(word, global_checker,
193 | word_to_reps, local_word_count)
194 | assert word_guessers
195 |
196 | for guess in global_checker.edit_distance_1(word):
197 | cond1 = (guess[0:2] == word[0:2])
198 | cond2a = implies(word[-1] == "s", guess[-1] == "s")
199 | cond2b = implies(word[-2:] == "ed", guess[-2:] == "ed")
200 |
201 | if cond1 and cond2a and cond2b:
202 | # this fixes tt, ss, dd, ll, errors
203 | if error_type in ["tt", "all"]:
204 | cond4 = (has_double_letter(guess) or has_double_letter(
205 | word)) and (len(guess) != len(word)) and set(
206 | guess) == set(word)
207 | if cond4:
208 | word_guessers['tt'].do_update(guess)
209 | if error_type in ["random", "all"]:
210 | word_guessers["random"].do_update(guess)
211 | guesser0 = None
212 | prob0 = -1
213 | for guesser in word_guessers.values():
214 | # print("fgyt", guesser)
215 | if guesser.prob_for_best_guess > prob0:
216 | guesser0 = guesser
217 | prob0 = guesser.prob_for_best_guess
218 | best_guess = guesser0.best_guess
219 | if capitalized:
220 | word = word[0].upper() + word[1:]
221 | best_guess = best_guess[0].upper() + best_guess[1:]
222 | best_guesses.append(best_guess)
223 | if word != best_guess:
224 | changes.append((word, best_guess))
225 |
226 | return " ".join(best_guesses), changes
227 |
228 |
229 | def correct_this_file(in_dir,
230 | out_dir,
231 | file_name,
232 | error_type,
233 | verbose=True,
234 | use_local_dict=False):
235 | """
236 | This method reads a file called `file_name` in the `in_dir` directory
237 | and creates a spelling corrected version in the `out_dir` directory.
238 |
239 | in_dir and out_dir can be the same, but this will overwrite the files
240 |
241 | Parameters
242 | ----------
243 | in_dir: str
244 | out_dir: str
245 | file_name: str
246 | error_type: str
247 | verbose: bool
248 | use_local_dict: bool
249 |
250 | Returns
251 | -------
252 | None
253 |
254 | """
255 | inpath = in_dir + "/" + file_name
256 | if out_dir:
257 | outpath = out_dir + "/" + file_name
258 | else:
259 | outpath = None
260 |
261 | global_checker = SpellChecker(distance=1)
262 | if use_local_dict:
263 | word_to_reps, local_word_count = get_word_to_reps(inpath)
264 | else:
265 | word_to_reps, local_word_count = None, None
266 | # print("nmjk", local_word_count, word_to_reps)
267 |
268 | # this didn't work. It merges TEMPO_DICT_FILE with global dict
269 |     # instead of producing a dict solely from TEMPO_DICT_FILE
270 | # checker_local.word_frequency.load_dictionary("./" + TEMPO_DICT_FILE)
271 |
272 | if verbose:
273 | def print_probs(word1, word2):
274 | print()
275 | print("global probs:")
276 | print(word1, global_checker.word_usage_frequency(word1))
277 | print(word2, global_checker.word_usage_frequency(word2))
278 | print("local_probs:")
279 | if word_to_reps:
280 | print(word1, word_to_reps[word1])
281 | print(word2, word_to_reps[word2])
282 | else:
283 | print("N/A")
284 | print()
285 |
286 | print_probs("beautifull", "beautiful")
287 | print_probs("tomatos", "tomatoes")
288 | print_probs("mitty", "misty")
289 |
290 | corrected_lines = []
291 | all_changes = []
292 | with open(inpath, "r") as f:
293 | for line in f:
294 | corr_line, changes = get_corrected_sentence(
295 | line, global_checker, error_type,
296 | word_to_reps, local_word_count)
297 | corrected_lines.append(corr_line)
298 | all_changes += changes
299 | if verbose:
300 | print(line.strip())
301 | print(corr_line)
302 | print()
303 | print("all changes:", all_changes)
304 |
305 | if outpath:
306 | with open(outpath, "w") as f:
307 | for corr_line in corrected_lines:
308 | f.write(corr_line + "\n")
309 |
310 |
311 | def correct_this_batch_of_files(in_dir,
312 | out_dir,
313 | batch_file_names,
314 | error_type,
315 | verbose=True,
316 | use_local_dict=False):
317 | """
318 | This method calls the method `correct_this_file` for all the file names
319 | in the list of file names `batch_file_names`.
320 |
321 | Parameters
322 | ----------
323 | in_dir: str
324 | out_dir: str
325 | batch_file_names: list[str]
326 | error_type: str
327 | verbose: bool
328 | use_local_dict: bool
329 |
330 | Returns
331 | -------
332 | None
333 |
334 | """
335 | all_file_names = my_listdir(in_dir)
336 | assert set(batch_file_names).issubset(set(all_file_names))
337 | for file_name in batch_file_names:
338 | i = all_file_names.index(file_name)
339 | print(str(i + 1) + ".")
340 | print(file_name)
341 | correct_this_file(in_dir, out_dir, file_name,
342 | error_type,
343 | verbose,
344 | use_local_dict)
345 |
346 |
347 | if __name__ == "__main__":
348 | def main1(use_local_dict, error_type):
349 | print("**************************")
350 | print("use_local_dict=", use_local_dict)
351 | print("error_type=", error_type)
352 | print("SPELLING_CORRECTION_RISK=", SPELLING_CORRECTION_RISK)
353 | print()
354 |
355 | in_dir = "."
356 | out_dir = "" # if empty out_dir, won't write to a file
357 | file_name = "spell_checking_test.txt"
358 |
359 | correct_this_file(in_dir,
360 | out_dir,
361 | file_name,
362 | error_type,
363 | verbose=True,
364 | use_local_dict=use_local_dict)
365 |
366 |
367 | def main2(use_local_dict, error_type):
368 | print("**************************")
369 | print("use_local_dict=", use_local_dict)
370 | print("error_type=", error_type)
371 | print("SPELLING_CORRECTION_RISK=", SPELLING_CORRECTION_RISK)
372 | print()
373 |
374 | in_dir = "short_stories_clean"
375 | out_dir = "short_stories_spell"
376 | batch_file_names = my_listdir(in_dir)
377 | correct_this_batch_of_files(in_dir,
378 | out_dir,
379 | batch_file_names,
380 | error_type=error_type,
381 | verbose=False,
382 | use_local_dict=use_local_dict)
383 |
384 |
385 | def main3(use_local_dict, error_type):
386 | print("**************************")
387 | print("use_local_dict=", use_local_dict)
388 | print("error_type=", error_type)
389 | print("SPELLING_CORRECTION_RISK=", SPELLING_CORRECTION_RISK)
390 | print()
391 |
392 | remove_dialogs = False
393 | in_dir = CLEAN_DIR if not remove_dialogs else CLEAN_RD_DIR
394 | out_dir = SPELL_DIR if not remove_dialogs else SPELL_RD_DIR
395 | batch_file_names = my_listdir(in_dir)[0:3]
396 | correct_this_batch_of_files(in_dir,
397 | out_dir,
398 | batch_file_names,
399 | error_type=error_type,
400 | verbose=False,
401 | use_local_dict=use_local_dict)
402 |
403 |
404 | # main1(use_local_dict=True, error_type="all")
405 | # main2(use_local_dict=True, error_type="all")
406 | main3(use_local_dict=True, error_type="all")
407 |
--------------------------------------------------------------------------------
/white_paper/bayesuvius.sty:
--------------------------------------------------------------------------------
1 | \usepackage{graphicx} %standard package
554 |
555 | \beqa
556 | P(y. | \cald \rvx.=x.)&=&\sum_{m.}
557 | \underbrace{\left[\sum_{x'.}
558 | P(y.|x'., m.)P(x'.)\right]}_
559 | {P(y.|\cald \rvm.=m.)}
560 | \underbrace{P(m.|x.)}_
561 | {P(m.|\cald \rvx.=x.)}
562 | \\
563 | &=&
564 | \xymatrix{
565 | &\sum x'.\ar[dr]
566 | \\
567 | x.\ar[r]
568 | &\sum m.\ar[r]&y.
569 | }
570 | \eeqa
571 | where $\sum x'.$ and
572 | $\sum m.$
573 | means nodes
574 | $\rvx'.$ and $\rvm.$
575 | are summed over.
576 | }
577 |
578 |
579 | %Symmetry
580 | \newcommand{\symrule}[0]{
581 | $\rva\perp_P\rvb\implies \rvb\perp_P\rva$}
582 |
583 | \newcommand{\symruleH}[0]{
584 | $H(\rva:\rvb)=0\implies H(\rvb:\rva)=0$}
585 |
586 | %Decomposition
587 | \newcommand{\decrule}[0]{
588 | $\rva\perp_P\rvb, \rvc\implies
589 | \rva\perp_P\rvb \text{ and } \rva\perp_P\rvc$}
590 |
591 | \newcommand{\decruleH}[0]{
592 | $H(\rva:\rvb, \rvc)=0\implies
593 | H(\rva:\rvb)=0 \text{ and } H(\rva:\rvc)=0$}
594 |
595 | %Weak Union
596 | \newcommand{\wearule}[0]{
597 | $\rva\perp_P \rvb, \rvc \implies
598 | \rva\perp_P\rvb|\rvc\text{ and }\rva\perp_P\rvc|\rvb$}
599 |
600 | \newcommand{\wearuleH}[0]{
601 | $H(\rva:\rvb, \rvc)=0 \implies
602 | H(\rva:\rvb|\rvc)=0\text{ and }H(\rva:\rvc|\rvb)=0$}
603 |
604 | %Contraction
605 | \newcommand{\conrule}[0]{
606 | $\rva\perp_P\rvb|\rvc\text{ and }\rva\perp_P \rvc
607 | \implies \rva\perp_P \rvb, \rvc$}
608 |
609 | \newcommand{\conruleH}[0]{
610 | $H(\rva:\rvb|\rvc)=0\text{ and }H(\rva:\rvc)=0
611 | \implies H(\rva:\rvb, \rvc)=0$}
612 |
613 | %Intersection
614 | \newcommand{\intrule}[0]{
615 | $\rva\perp_P\rvb|\rvc, \rvd\text{ and }
616 | \rva\perp_P \rvd|\rvc, \rvb\implies
617 | \rva\perp_P \rvb,\rvd|\rvc$}
618 |
619 | \newcommand{\intruleH}[0]{
620 | $H(\rva:\rvb|\rvc, \rvd)=0\text{ and }
621 | H(\rva:\rvd|\rvc, \rvb)=0\implies
622 | H(\rva:\rvb,\rvd|\rvc)=0$}
623 |
624 | \newcommand{\dotbarmu}[0]{{\cdot|\mu}}
625 | \newcommand{\dotmu}[0]{{\cdot, \mu}}
626 | \newcommand{\kbarmu}[0]{{k|\mu}}
627 | \newcommand{\kmu}[0]{{k,\mu}}
628 | \newcommand{\plusbarmu}[0]{{+|\mu}}
629 | \newcommand{\plusmu}[0]{{+,\mu}}
630 |
631 | \newcommand{\bnlearn}[0]{{\tt bnlearn\;}}
632 |
633 | \newcommand{\sqsig}[0]{{[\sigma]}}
634 |
635 | \newcommand{\misscellone}[0]{
636 | \begin{array}{c}
637 | \frac{1}{nsam}
638 | P(x_0=0, x_2=0\cond x_1=1, \theta)
639 | \\
640 | \frac{1}{nsam}
641 | P(x_0=0, x_2=1\cond x_1=1, \theta)
642 | \\
643 | \frac{1}{nsam}
644 | P(x_0=1, x_2=0\cond x_1=1, \theta)
645 | \\
646 | \frac{1}{nsam}
647 | P(x_0=1, x_2=1\cond x_1=1, \theta)
648 | \end{array}
649 | }
650 |
651 | \newcommand{\misscelltwo}[0]{
652 | \begin{array}{c}
653 | \frac{1}{nsam}
654 | P(x_1=0\cond x_0=0,x_2=1, \theta)
655 | \\
656 | \frac{1}{nsam}
657 | P(x_1=1\cond x_0=0,x_2=1, \theta)
658 | \end{array}
659 | }
660 |
661 |
662 | \newcommand{\td}[0]{{\TIL{d}}}
663 | \newcommand{\rvtd}[0]{{\ul{\TIL{d}}}}
664 | \newcommand{\tx}[0]{{\TIL{x}}}
665 | \newcommand{\tmu}[0]{{\TIL{\mu}}}
666 | \newcommand{\rvtx}[0]{{\ul{\TIL{x}}}}
667 |
668 | \newcommand{\mlarr}[0]{\xrightarrow{\rm ML-fit}}
669 | \newcommand{\lrarr}[0]{\xrightarrow{\rm LR-fit}}
670 |
671 | \newcommand{\setprob}[3]
672 | {{\begin{array}{c}S=\{#1\}
673 | \\P(S)=#2\\ \haty(x^\s_S)=\$#3 K
674 | \end{array}}}
675 |
676 | \newcommand{\Gno}[0]{\xymatrix{\;\ar[r]|\parallel_G&}}
677 | \newcommand{\Gyes}[0]{\xymatrix{\;\ar[r]_G&}}
678 |
679 | \newcommand{\calypso}[0]{\ol{\caly}}
680 |
681 | \newcommand{\SeqBdoorDef}[0]{
682 | Suppose that we have access to data
683 | that allows us to
684 | estimate a probability
685 | distribution
686 | $P(x^n, y, z^n)$.
687 | Hence, the variables
688 | $\rvx^n, \rvy, \rvz^n$ are
689 | ALL observed (i.e., not hidden).
690 | Then we say that
691 | the multinode
692 | of ``covariates" $\rvz^n$
693 | satisfies the
694 | {\bf sequential backdoor (SBD) adjustment criterion}
695 | relative to $(\rvx^n, \rvy)$
696 | if for all $t\in\{0,1, \ldots, n-1\}$,
697 |
698 | \begin{enumerate}
699 | \item
700 | $\rvy\perp\rvx_t|
701 | \underbrace{(\rvx_0, \rvx_1, \ldots,\rvx_{t-1},
702 | \rvz_0, \rvz_1, \ldots, \rvz_t)}
703 | _{\text{Past of $\rvx_t$}}$
704 | in $\call_{\rvx_{t}}
705 | \cald_{\rvx_{t+1},\rvx_{t+2}
706 | ,\ldots, \rvx_{n-1}}G$.
707 | \item
708 | $\rvz_t \cap de(\rvx_t)=\emptyset$.
709 | \end{enumerate}
710 | }
711 |
712 | \newcommand{\SeqBdoorClaim}[0]{
713 | If $\rvz^n$ satisfies the
714 | sequential backdoor criterion relative to
715 | $(\rvx^n, \rvy)$, then
716 |
717 | \beq
718 | P(y | \cald \rvx^n=x^n)=
719 | \calq(y|x^n)
720 | \;,
721 | \eeq
722 | where $\calq(y|x^n)$
723 | is defined by
724 | Eq.(\ref{def-q-y-xn-seqbdoor}).
725 | }
726 |
--------------------------------------------------------------------------------
/simp_deprecated/simp_stanford.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | This file contains one of several implementations of the function
4 | `simplify_ztz(sentence, verbose=False)` that we considered.
5 |
6 | Refs:
7 | https://github.com/garain/Sentence-Simplification
8 | ztz = sentence
9 |
10 | """
11 | import nltk
12 | from nltk.tree import ParentedTree
13 | from anytree import NodeMixin, Node, AnyNode, RenderTree
14 | import re
15 | import os
16 | import subprocess
17 | from globals import *
18 |
19 | version = subprocess.check_output(
20 | ['java', '-version'], stderr=subprocess.STDOUT)
21 | print("java version=\t", version)
22 | print("CLASSPATH=\t", os.environ['CLASSPATH'])
23 | print("STANFORD_MODELS=\t", os.environ['STANFORD_MODELS'])
24 | print("JAVA_HOME=\t", os.environ['JAVA_HOME'])
25 |
26 | from nltk.parse.stanford import StanfordParser
27 |
28 | parser = StanfordParser()
29 |
30 |
31 | def simplify_ztz(sentence0, verbose=False):
32 | """
33 |     This method simplifies the sentence `sentence0`. It returns a list of
34 |     simple sentences extracted from the input sentence.
35 |
36 |     Parameters
37 |     ----------
38 |     sentence0: str
39 |     verbose: bool
40 |
41 |
42 | Returns
43 | -------
44 | list[str]
45 |
46 | """
47 |
48 | simple_ztz_list = []
49 | success = False
50 |
51 | # split = []
52 | # simple_sent = []
53 | # index = []
54 | # index1 = 0
55 | n = 0
56 | but = 0
57 |
58 | # scount = 0
59 | # parts = []
60 | # ht_3_last_obj = []
61 |
62 | def SBAR_simplify(sent):  # split `sent` into simple sentences at its SBAR (subordinate clause) node, if any
63 |
64 | def make_tree(tree, t, sent_list):
65 | # this fn. converts nltk tree to anytree
66 | if tree not in sent_list:
67 | ttt = AnyNode(id=str(tree.label()), parent=t)
68 | for tt in tree:
69 | make_tree(tt, ttt, sent_list)
70 | else:
71 | AnyNode(id=str(tree), parent=t)
72 |
73 | # SBAR CASE
74 | def find_sbar(t):
75 | if t.id == 'SBAR':
76 | global sbar
77 | sbar = t
78 | for tt in t.children:
79 | find_sbar(tt)
80 |
81 | def find_vp_in_sbar(t):
82 | if t.id == 'VP':
83 | global vp_sbar
84 | vp_sbar.append(t)
85 | for tt in t.children:
86 | find_vp_in_sbar(tt)
87 |
88 | def find_np_in_sbar(t):
89 | global f
90 | global ff
91 | if t.id == 'VP':
92 | ff = False
93 | if (t.id == 'NP') and f == True and ff == True:
94 | global np_sbar
95 | np_sbar = t
96 | f = False
97 | for tt in t.children:
98 | find_np_in_sbar(tt)
99 |
100 | def find_vp(t):
101 | if t.id == 'SBAR':
102 | return
103 | global f
104 | if t.id == 'VP' and f == True:
105 | global vp
106 | vp = t
107 | f = False
108 | for tt in t.children:
109 | find_vp(tt)
110 |
111 | def find_np(t):
112 | if t.id == 'SBAR':
113 | return
114 | global f
115 | if t.id == 'NP' and f == True:
116 | global np
117 | np = t
118 | f = False
119 | for tt in t.children:
120 | find_np(tt)
121 |
122 | def find_vbz(t):
123 | if t.id == 'SBAR':
124 | return
125 | global f
126 | if t.id == 'VBZ' and f == True:
127 | global vbz
128 | vbz = t.children[0].id
129 | f = False
130 | for tt in t.children:
131 | find_vbz(tt)
132 |
133 | def make_sent(t):
134 | global simple_sentences
135 | if t.id in sent_list:
136 | simple_sentences[-1].append(t.id)
137 | for tt in t.children:
138 | make_sent(tt)
139 |
140 | # sent=sent8
141 |
142 | parse_trees = parser.raw_parse(sent)
143 | global sent_list
144 | sent_list = [s for s in sent.split()]
145 | tree = next(parse_trees)[0]
146 | # tree.draw()
147 | t = AnyNode(id='ROOT')
148 | make_tree(tree, t, sent_list)
149 | global sbar
150 | sbar = t
151 | global vp_sbar
152 | global f
153 | global ff
154 | global np_sbar
155 | global vp
156 | global np
157 | global vbz
158 | vp_sbar = []
159 | vp = t
160 | np = t
161 | vbz = 'bn2'
162 | np_sbar = t
163 | find_sbar(t)
164 | find_vp_in_sbar(sbar)
165 | f = True
166 | ff = True
167 | find_np_in_sbar(sbar)
168 | f = True
169 | find_vp(t)
170 | f = True
171 | find_np(t)
172 | f = True
173 | find_vbz(t)
174 | global simple_sentences
175 | simple_sentences = []
176 | simple_sentences.append([])
177 | make_sent(np)
178 | make_sent(vp)
179 | for i in range(len(vp_sbar)):
180 | simple_sentences.append([])
181 | if np_sbar == t:
182 | make_sent(np)
183 | else:
184 | make_sent(np_sbar)
185 | if vbz != 'bn2':
186 | simple_sentences[-1].append(vbz)
187 | make_sent(vp_sbar[i])
188 | # print (simple_sentences)
189 | simple = []
190 | for sentence in simple_sentences:
191 | string = ''
192 | for word in sentence:
193 | string += word + ' '
194 | string += '.'
195 | simple.append(string)
196 |
197 | def is_any_sbar(t):
198 | if t.id == 'SBAR':
199 | global f
200 | f = True
201 | return
202 | for tt in t.children:
203 | is_any_sbar(tt)
204 |
205 | f = False
206 | is_any_sbar(t)
207 | if f == False:
208 | simple = [sent]
209 | return simple
210 |
211 | # print(pos_tagged)
212 | # SBAR functions start here
213 | def make_tree_sbar(tree, t, sent_list):
214 | # this fn. converts nltk tree to anytree
215 | if tree not in sent_list:
216 | ttt = AnyNode(id=str(tree.label()), parent=t)
217 | for tt in tree:
218 | make_tree_sbar(tt, ttt, sent_list)
219 | else:
220 | AnyNode(id=str(tree), parent=t)
221 |
222 | def find_sbar(t):
223 | if t.id == 'SBAR':
224 | global sbar
225 | sbar = t
226 | for tt in t.children:
227 | find_sbar(tt)
228 |
229 | def find_vp_in_sbar(t):
230 | if t.id == 'VP':
231 | global vp_sbar
232 | vp_sbar = t
233 | for tt in t.children:
234 | find_vp_in_sbar(tt)
235 |
236 | def find_vp(t):
237 | if t.id == 'SBAR':
238 | return
239 | global f
240 | if t.id == 'VP' and f == True:
241 | global vp
242 | vp = t
243 | f = False
244 | for tt in t.children:
245 | find_vp(tt)
246 |
247 | def find_np(t):
248 | if t.id == 'SBAR':
249 | return
250 | global f
251 | if t.id == 'NP' and f == True:
252 | global np
253 | np = t
254 | f = False
255 | for tt in t.children:
256 | find_np(tt)
257 |
258 | def find_vbz(t):
259 | if t.id == 'SBAR':
260 | return
261 | global f
262 | if t.id == 'VBZ' and f == True:
263 | global vbz
264 | vbz = t.children[0].id
265 | f = False
266 | for tt in t.children:
267 | find_vbz(tt)
268 |
269 | def make_sent(t):
270 | global simple_sentences
271 | if t.id in sent_list:
272 | simple_sentences[-1].append(t.id)
273 | for tt in t.children:
274 | make_sent(tt)
275 |
276 | # SBAR functions end here
277 | # Multiple CC functions start here
278 | def pos_tag(tokenized_sent):
279 | return nltk.pos_tag(tokenized_sent)
280 |
281 | def has_conj(tagged_sent):
282 | cc_list = [('and', 'CC'), ('but', 'CC')]
283 | for cc_pair in cc_list:
284 | if cc_pair in tagged_sent:
285 | return True
286 | return False
287 |
288 | def split_needed(sent_list):
289 | for sent in sent_list:
290 | if has_conj(pos_tag(tokenize(sent))):
291 | return True
292 | return False
293 |
294 | def do_split(sent, cc_tuple):  # split `sent` at the coordinating conjunction given by `cc_tuple`
295 | pos_tagged = pos_tag(tokenize(sent))
296 | tree = next(parser.tagged_parse(pos_tagged))
297 | tree1 = ParentedTree.convert(tree)
298 | # tree.draw()
299 | count = 0
300 | m = 0
301 | for t in tree1.subtrees():
302 | if t.label() == 'PP':
303 | count = count + 1
304 |
305 | index = []
306 | index1 = 0
307 | if count > 0 and (('to') not in tokenized_sent and (
308 | 'washed') not in tokenized_sent) and (
309 | tokenized_sent.count(",") < 2):
310 | for i in range(len(pos_tagged) - 3):
311 | if (pos_tagged[i][1] == 'VBD' or pos_tagged[i][1] == 'VBZ') and \
312 | pos_tagged[i + 1][1] != 'VBG' and pos_tagged[i + 3][
313 | 1] != 'CC' and pos_tagged[i + 1][1] != 'NNP' and \
314 | pos_tagged[i - 1][1] != 'CC':
315 | pos_tagged.insert(i + 1, (',', ','))
316 |
317 | for j in range(len(pos_tagged)):
318 | if pos_tagged[j][1] == 'CC':
319 | index.append(j)
320 |
321 | for t in tree1.subtrees():
322 | if t.label() == 'SBAR':
323 | m = m + 1
324 | if len(index) > 0 and count > 0 and m == 0:
325 | c = 0
326 | for i in range(len(index)):
327 | pos_tagged.insert(index[i] + c, (',', ','))
328 | c = c + 1
329 | if m > 0:
330 | for j in range(len(pos_tagged)):
331 | if pos_tagged[j][1] == 'CC':
332 | index1 = j
333 |
334 | if (index1 > 0 and m > 0) and count == 0:
335 | pos_tagged.insert(index1, (' ,', ','))  # inserts the token ' ,' (leading space)
336 | pos_tagged.insert(index1 + 2, (', ', ','))  # inserts the token ', ' (trailing space)
337 | # print(pos_tagged)
338 | tree = next(parser.tagged_parse(pos_tagged))
339 | p_tree = ParentedTree.convert(tree)
340 |
341 | leaf_values = p_tree.leaves()
342 | parts = []
343 | ht_3_last_obj = []
344 |
345 | if cc_tuple in pos_tagged:
346 | leaf_index = leaf_values.index(cc_tuple[0])
347 | tree_location = p_tree.leaf_treeposition(leaf_index)
348 | parent = p_tree[tree_location[:-2]]
349 | # print(parent.height())
350 |
351 | if parent.height() == 3:
352 | # find the noun being referred to
353 | for subtree in reversed(list(parent.subtrees())):
354 | if subtree.parent() == parent:
355 | if subtree.label() == 'NN' or subtree.label() == 'NNS':
356 | ht_3_last_obj = subtree.leaves() + ht_3_last_obj
357 | del p_tree[subtree.treeposition()]
358 | # print("ht 3 last obj -> ", ht_3_last_obj)
359 | part = []
360 | for subtree in reversed(list(parent.subtrees())):
361 | if subtree.parent() == parent:
362 | # print(subtree)
363 | if subtree.label() != ',' and subtree.label() != 'CC':
364 | part = subtree.leaves() + part
365 | else:
366 | parts.append(part + ht_3_last_obj)
367 | part = []
368 | del p_tree[subtree.treeposition()]
369 | parts.append(part + ht_3_last_obj)
370 | # print('parent', parent)
371 | # print('treeloc', tree_location)
372 | parent.append(ParentedTree('INSRT', ['*']))
373 |
374 | else:
375 | for subtree in reversed(list(parent.subtrees())):
376 | if subtree.parent() == parent:
377 | # print(subtree)
378 | if subtree.label() != ',' and subtree.label() != 'CC':
379 | parts.append(subtree.leaves() + ht_3_last_obj)
380 | del p_tree[subtree.treeposition()]
381 | # print('parent', parent)
382 | # print('treeloc', tree_location)
383 | parent.append(ParentedTree('INSRT', ['*']))
384 |
385 | # p_tree.draw()
386 | # print(parts)
387 |
388 | split = []
389 | rem = p_tree.leaves()
390 | start_idx = rem.index('*')
391 |
392 | for part in reversed(parts):
393 | offset = start_idx
394 | r_clone = rem.copy()
395 | del r_clone[offset]
396 | for i, word in enumerate(part):
397 | r_clone.insert(offset + i, word)
398 | split.append(r_clone)
399 |
400 | # print("split", split)
401 |
402 | split = [" ".join(sent) for sent in split]
403 |
404 | return split
405 |
406 | def split_util(sent):
407 | cc_list = [('and', 'CC'), ('but', 'CC')]
408 | for cc_pair in cc_list:
409 | if cc_pair in pos_tag(tokenize(sent)):
410 | return do_split(sent, cc_pair)
411 | return [sent]  # no coordinating conjunction found; return the sentence unchanged
412 |
413 | def rem_dup(list):
414 | final = []
415 | for item in list:
416 | if item not in final:
417 | final.append(item)
418 | return final
419 |
420 | def simplify(sent):  # repeatedly split at conjunctions until no further split is needed
421 | initial = [sent]
422 | final = []
423 |
424 | while (split_needed(initial)):
425 | final = []
426 | while (initial):
427 | sent = initial.pop(0)
428 | if (split_needed([sent])):
429 | for split_sent in reversed(split_util(sent)):
430 | final.append(split_sent)
431 | else:
432 | final.append(sent)
433 | # print("final -> ", final)
434 | initial = final.copy()
435 |
436 | final = rem_dup(final)
437 | final = list(reversed(final))
438 | # print(final)
439 |
440 | return final
441 |
442 | def tokenize(sent):  # tokenize; rewrite connectives (If...then, because, while, which, or, who) as 'and'
443 | tokenized_sent = nltk.word_tokenize(sent)
444 | if ('If') in tokenized_sent and ('then') in tokenized_sent:
445 | tokenized_sent.remove('If')
446 | tokenized_sent.insert(tokenized_sent.index('then'), 'and')
447 | tokenized_sent.remove('then')
448 | if ('because') in tokenized_sent:
449 | tokenized_sent.insert(tokenized_sent.index('because'),
450 | (',')) # ', 'is used
451 | tokenized_sent.insert(tokenized_sent.index('because') + 1, (','))
452 | tokenized_sent.insert(tokenized_sent.index('because'), 'and')
453 | tokenized_sent.remove('because')
454 | if ('while') in tokenized_sent:
455 | tokenized_sent.insert(tokenized_sent.index('while'), 'and')
456 | tokenized_sent.remove('while')
457 | if ('which') in tokenized_sent:
458 | tokenized_sent.insert(tokenized_sent.index('which'), 'and')
459 | tokenized_sent.remove('which')
460 | if ('or') in tokenized_sent:
461 | tokenized_sent.insert(tokenized_sent.index('or'), 'and')
462 | tokenized_sent.remove('or')
463 | if ('who') in tokenized_sent:
464 | while (',') in tokenized_sent:
465 | tokenized_sent.insert(tokenized_sent.index(','), 'and')
466 | tokenized_sent.remove(',')
467 | tokenized_sent.insert(tokenized_sent.index('who'), 'and')
468 | tokenized_sent.remove('who')
469 |
470 | return tokenized_sent
471 |
472 | sentences = [sentence0.strip()]
473 | for sentence in sentences:
474 | if verbose:
475 | print("Complex Sentence: " + sentence)
476 | tokenized_sent = tokenize(sentence)
477 | # print(tokenized_sent)
478 |
479 | # parse_trees = parser1.tagged_parse(pos_tagged)
480 |
481 | pos_tagged = pos_tag(tokenized_sent)
482 | parse_trees = parser.tagged_parse(pos_tagged)
483 | tree = next(parse_trees)
484 | p_tree = ParentedTree.convert(tree)
485 | # p_tree.draw()
486 |
487 | leaf_values = p_tree.leaves()
488 | # print(leaf_values)
489 | for i in pos_tagged:
490 | if ('and') in i:
491 | n = n + 1
492 |
493 | if ('but') in i:
494 | but = but + 1
495 | tree1 = ParentedTree.convert(tree)
496 | # tree.draw()
497 | m = 0
498 | for t in tree1.subtrees():
499 | if t.label() == 'SBAR':
500 | m = m + 1
501 |
502 | if (n + but) > 0:
503 | # tokenized_sent=nltk.word_tokenize(sent10)
504 | # pos_tagged=nltk.pos_tag(tokenized_sent)
505 | sent1 = sentence
506 | sent = " ".join(tokenize(sent1))
507 | # print(sent)
508 | simplified = simplify(sent)
509 | for i in simplified:
510 | i = list(i)
511 | if ord(i[0]) >= 97 and ord(i[0]) <= 122:  # capitalize a lowercase first letter
512 | i[0] = chr(ord(i[0]) - 32)
513 | while i.count(",") > 0:
514 | # i.pop(i.index(","))
515 | del (i[i.index(",")])
516 | if (".") not in (i):
517 | if verbose:
518 | print("Simple sentence: " + "".join(i) + ".")
519 | simple_ztz_list.append("".join(i) + ".")
520 | success = True
521 | else:
522 | if verbose:
523 | print("Simple sentence: " + "".join(i))
524 | simple_ztz_list.append("".join(i))
525 | success = True
526 | n = 0
527 | but = 0
528 | # print("."),
529 |
530 | elif n == 0 and m > 0 and len(re.findall(r",", sentence)) == 0 and len(
531 | re.findall(r"While", sentence)) == 0:
532 | try:
533 | sent = sentence
534 | # print(sent)
535 | # print("Hello")
536 | tokenized_sent = tokenize(sent)
537 | pos_tagged = nltk.pos_tag(tokenized_sent)
538 | parse_trees = parser.tagged_parse(pos_tagged)
539 | sent_list = [s for s in sent.split()]
540 | tree = next(parse_trees)[0]
541 | # tree.draw()
542 | t = AnyNode(id='ROOT')
543 | make_tree_sbar(tree, t, sent_list)
544 | sbar = t
545 | vp_sbar = t
546 | vp = t
547 | np = t
548 | vbz = 'asvf'
549 | find_sbar(t)
550 | find_vp_in_sbar(sbar)
551 | f = True
552 | find_vp(t)
553 | f = True
554 | find_np(t)
555 | f = True
556 | find_vbz(t)
557 | simple_sentences = []
558 | simple_sentences.append([])
559 | make_sent(np)
560 | make_sent(vp)
561 | simple_sentences.append([])
562 | make_sent(np)
563 | if vbz != 'asvf':
564 | simple_sentences[-1].append(vbz)
565 | make_sent(vp_sbar)
566 | for i in simple_sentences:
567 | i = list(i)
568 |
569 | # if ord(i[0])>=97 and ord(i[0])<=122:
570 | # i[0]=chr(ord(i[0])-32)
571 |
572 | while i.count(",") > 0:
573 | i.pop(i.index(","))
574 | if (".") not in (i):
575 | if verbose:
576 | print("Simple sentence: " + " ".join(i) + ".")
577 | simple_ztz_list.append(" ".join(i) + ".")
578 | success = True
579 | else:
580 | if verbose:
581 | print("Simple sentence: " + " ".join(i))
582 | simple_ztz_list.append(" ".join(i))
583 | success = True
584 | # print("."),
585 | except Exception:  # parsing failed; fall back to the original sentence below
586 | continue
587 | elif m > 0 and (len(re.findall(r",", sentence)) > 0 or len(
588 | re.findall(r"While", sentence)) > 0):
589 | try:
590 | # sent=re.sub(r",","",sentence)
591 | # print("Hello")
592 | tokenized_sent = tokenize(sentence)
593 | simple_sentences = SBAR_simplify(" ".join(tokenized_sent))
594 | for i in simple_sentences:
595 | # i=list(i)
596 |
597 | # if ord(i[0])>=97 and ord(i[0])<=122:
598 | # i[0]=chr(ord(i[0])-32)
599 |
600 | # while i.count(",")>0:
601 | # i.pop(i.index(","))
602 | if (".") not in (i):
603 | if verbose:
604 | print("Simple sentence: " + i)
605 | simple_ztz_list.append(i)
606 | success = True
607 | else:
608 | if verbose:
609 | print("Simple sentence: " + i)
610 | simple_ztz_list.append(i)
611 | success = True
612 | # print("."),
613 | except Exception:  # parsing failed; fall back to the original sentence below
614 | continue
615 | if not success:
616 | simple_ztz_list.append(sentence0)
617 | return [ztz for ztz in simple_ztz_list if len(ztz) > 2]
618 |
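619 | # Editor's usage sketch (not part of the original module). Running it
620 | # requires a working Stanford parser setup, i.e. the CLASSPATH,
621 | # STANFORD_MODELS and JAVA_HOME variables checked at the top of this file.
622 | if __name__ == "__main__":
623 |     # a sentence with both a conjunction ('and') and a subordinate clause
624 |     for simple in simplify_ztz(
625 |             "Jane sings and Bob dances because he is happy.",
626 |             verbose=True):
627 |         print(simple)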
--------------------------------------------------------------------------------