├── pics ├── llm-addons.jpg ├── mm-project.png ├── first_atlas.jpg ├── causal-bridges.png ├── llm-superstitious.png ├── film_script_template.jpg ├── my_first_big_atlas.jpeg ├── universal-pos-tags.jpg ├── causal-bridges-captioned.png ├── spacy-named-entity-tags.png └── nltk_tags.html ├── m_scripts_dag_atlas ├── up.pkl ├── wall-e.pkl └── toy-story.pkl ├── white_paper ├── harris-e-mc2.jpg ├── mappa_mundi.pdf ├── shark-attacks.png ├── mappa_mundi_V2.pdf ├── nacc-nrej-plane.png ├── crossing-bridges.png ├── references.bib └── bayesuvius.sty ├── short_stories_dag_atlas ├── wiltons-holiday.pkl ├── bill-the-bloodhound.pkl └── extricating-young-gussie.pkl ├── spell_checking_test.txt ├── resources.txt ├── miscellaneous ├── nlp-environmental-variables.txt ├── starting-stanford-coreNLP-server.txt ├── predictions.txt.conj ├── parser-choices.py ├── WALL-E-quote-summarized-by-chatgpt.txt └── testing-stan-parser.py ├── requirements.txt ├── openie6_translation_test1.txt ├── README.md ├── MIT-License.txt ├── simp_deprecated ├── simp_spacy-claucy.py ├── simp_openie.py ├── simp_openie6-old.py ├── simp_spacy1.py ├── simp_spacy4.py ├── simp_spacy2.py ├── simp_stanford2.py └── simp_stanford.py ├── similarity_bert.py ├── Node.py ├── globals.py ├── simplifying_test.txt ├── similarity.py ├── similarity_deprecated ├── similarity_spacy2.py └── similarity_spacy.py ├── utils.py ├── WordGuesser.py ├── BatchSimilarity.py ├── simp_spacy3.py ├── similarity_nltk.py ├── simplifying.py ├── downloading_imsdb.py ├── simp_openie6.py ├── jupyter_notebooks └── SUMMARY.ipynb ├── post_cleaning.py ├── stopwords.py ├── DagAtlas.py ├── cleaning.py ├── Dag.py └── spell_checking.py /pics/llm-addons.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/llm-addons.jpg -------------------------------------------------------------------------------- /pics/mm-project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/mm-project.png -------------------------------------------------------------------------------- /pics/first_atlas.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/first_atlas.jpg -------------------------------------------------------------------------------- /pics/causal-bridges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/causal-bridges.png -------------------------------------------------------------------------------- /m_scripts_dag_atlas/up.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/m_scripts_dag_atlas/up.pkl -------------------------------------------------------------------------------- /pics/llm-superstitious.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/llm-superstitious.png -------------------------------------------------------------------------------- /pics/film_script_template.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/film_script_template.jpg 
-------------------------------------------------------------------------------- /pics/my_first_big_atlas.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/my_first_big_atlas.jpeg -------------------------------------------------------------------------------- /pics/universal-pos-tags.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/universal-pos-tags.jpg -------------------------------------------------------------------------------- /white_paper/harris-e-mc2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/harris-e-mc2.jpg -------------------------------------------------------------------------------- /white_paper/mappa_mundi.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/mappa_mundi.pdf -------------------------------------------------------------------------------- /white_paper/shark-attacks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/shark-attacks.png -------------------------------------------------------------------------------- /m_scripts_dag_atlas/wall-e.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/m_scripts_dag_atlas/wall-e.pkl -------------------------------------------------------------------------------- /white_paper/mappa_mundi_V2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/mappa_mundi_V2.pdf -------------------------------------------------------------------------------- /white_paper/nacc-nrej-plane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/nacc-nrej-plane.png -------------------------------------------------------------------------------- /m_scripts_dag_atlas/toy-story.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/m_scripts_dag_atlas/toy-story.pkl -------------------------------------------------------------------------------- /pics/causal-bridges-captioned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/causal-bridges-captioned.png -------------------------------------------------------------------------------- /pics/spacy-named-entity-tags.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/pics/spacy-named-entity-tags.png -------------------------------------------------------------------------------- /white_paper/crossing-bridges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/white_paper/crossing-bridges.png -------------------------------------------------------------------------------- /short_stories_dag_atlas/wiltons-holiday.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/short_stories_dag_atlas/wiltons-holiday.pkl -------------------------------------------------------------------------------- /short_stories_dag_atlas/bill-the-bloodhound.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/short_stories_dag_atlas/bill-the-bloodhound.pkl -------------------------------------------------------------------------------- /short_stories_dag_atlas/extricating-young-gussie.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrtucci/mappa_mundi/HEAD/short_stories_dag_atlas/extricating-young-gussie.pkl -------------------------------------------------------------------------------- /spell_checking_test.txt: -------------------------------------------------------------------------------- 1 | Poul , Poul , Paul 2 | caesar caesar ceesar caisar 3 | Hooww are youu Judy , Poul . Judy 4 | fitnes fitness 5 | how haves you bein ? been 6 | leter beautifull adress addres 7 | letter beautiful address 8 | tomatos 9 | -------------------------------------------------------------------------------- /resources.txt: -------------------------------------------------------------------------------- 1 | python -m spacy download en_core_web_lg 2 | python -m spacy download en_core_web_sm 3 | python -m spacy download en_core_web_trf 4 | 5 | python -m coreferee install en 6 | 7 | python -m nltk.downloader popular # this includes wordnet -------------------------------------------------------------------------------- /miscellaneous/nlp-environmental-variables.txt: -------------------------------------------------------------------------------- 1 | CLASSPATH 2 | C:\NLP\stanford-parser-full-2018-02-27;C:\NLP\stanford-postagger-full-2015-12-09;C:\NLP\stanford-ner-2015-12-09 3 | 4 | STANFORD_MODELS 5 | C:\NLP\stanford-ner-2015-12-09\classifiers;C:\NLP\stanford-postagger-full-2015-12-09\models 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz 2 | ipython 3 | Pillow 4 | contractions 5 | spacy 6 | Unidecode 7 | nltk 8 | # pycorenlp~=0.3.0 9 | # coreferee~=1.4.0 10 | # anytree~=2.8.0 11 | numpy 12 | requests 13 | beautifulsoup4 14 | python-slugify 15 | # claucy~=0.0.2.0 16 | pyspellchecker 17 | sentence-transformers -------------------------------------------------------------------------------- /miscellaneous/starting-stanford-coreNLP-server.txt: -------------------------------------------------------------------------------- 1 | Reference: 2 | https://towardsdatascience.com/natural-language-processing-using-stanfords-corenlp-d9e64c1e1024 3 | 4 | # check java installed properly 5 | java -version 6 | 7 | # starting server 8 | # cd to folder with stanford java code 9 | cd /StanfordParser/stanford-corenlp-4.5.4/ 10 | 11 | # no need to deactivate conda virtual environment 12 | 13 | # start server (IMPORTANT: make sure this is one line) 14 | java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 5000 15 | -------------------------------------------------------------------------------- /openie6_translation_test1.txt: -------------------------------------------------------------------------------- 1 | The man , who had never liked the words `` 
booby '' and `` boobyhatch , '' and who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment . 2 | The man , '' , thought for a moment . 3 | The man , who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment . 4 | The man , who had never liked the words `` booby , thought for a moment . 5 | The man , who had never liked the words `` boobyhatch , thought for a moment . 6 | 7 | I love Luciano Pavarotti and Jose Carreras . 8 | I love Luciano Pavarotti . 9 | I love Jose Carreras . 10 | 11 | -------------------------------------------------------------------------------- /miscellaneous/predictions.txt.conj: -------------------------------------------------------------------------------- 1 | The man , who had never liked the words `` booby '' and `` boobyhatch , '' and who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment . 2 | The man , '' , thought for a moment . 3 | The man , who liked them even less on a shining morning when there was a unicorn in the garden , thought for a moment . 4 | The man , who had never liked the words `` booby , thought for a moment . 5 | The man , who had never liked the words `` boobyhatch , thought for a moment . 6 | 7 | I love Luciano Pavarotti and Jose Carreras . 8 | I love Luciano Pavarotti . 9 | I love Jose Carreras . 10 | 11 | -------------------------------------------------------------------------------- /miscellaneous/parser-choices.py: -------------------------------------------------------------------------------- 1 | # 1. nltk 2 | # NLTK was unable to find stanford-parser\.jar! Set the CLASSPATH 3 | # environment variable. 4 | # https://stackoverflow.com/questions/13883277/how-to-use-stanford-parser-in-nltk-using-python 5 | 6 | from nltk.parse.stanford import StanfordParser 7 | parser = StanfordParser() 8 | 9 | from nltk.parse.stanford import GenericStanfordParser 10 | parser = GenericStanfordParser() 11 | 12 | # 2. nltk.parse.corenlp 13 | # AttributeError: 'CoreNLPParser' object has no attribute 'tagged_parse'https://stackoverflow.com/questions/39320782/corenlp-provide-pos-tags 14 | import nltk 15 | from nltk.parse.corenlp import CoreNLPParser 16 | parser = CoreNLPParser(url='http://localhost:9000') 17 | 18 | 19 | # 3. pycorenlp 20 | from pycorenlp import StanfordCoreNLP 21 | nlp = StanfordCoreNLP('http://localhost:9000') 22 | parser = nlp.parse() 23 | 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mappa_Mundi 2 | 3 | Welcome to Mappa_Mundi (MM)! 4 | 5 | MM is a method that combines seamlessly 6 | Large Language Models (LLM) 7 | and Causal Inference (CI). 8 | 9 | The MM software does causal DEFT 10 | (causal DAG Extraction From Text). 11 | We store each extracted DAG in a separate file, and we put 12 | all DAG files in a directory 13 | that we call 14 | a DAG Atlas. 
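(Illustrative aside, not part of the original README: each atlas entry is one pickled DAG per movie script or short story. Assuming the `.pkl` files hold pickled `Dag` objects, a single entry can be inspected roughly as below; `dag.nodes` is the attribute that BatchSimilarity.py relies on, the rest is a sketch.)

    import pickle

    # Run from the repo root so that Dag.py is importable when unpickling.
    with open("m_scripts_dag_atlas/up.pkl", "rb") as f:
        dag = pickle.load(f)
    print(len(dag.nodes))  # number of simplified clauses extracted from "Up"
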
15 | 16 | I discuss the software in 17 | detail in this white paper: 18 | * [Version 1](https://github.com/rrtucci/mappa_mundi/blob/master/white_paper/mappa_mundi.pdf) 19 | * [Version 2](https://github.com/rrtucci/mappa_mundi/blob/master/white_paper/mappa_mundi_V2.pdf) 20 | 21 | ![LLM add-ons](pics/llm-addons.jpg) 22 | 23 | ![LLM are supertitious](https://github.com/rrtucci/mappa_mundi/blob/master/pics/llm-superstitious.png) 24 | 25 | ![Mappa Mundi Causal Bridges](https://github.com/rrtucci/mappa_mundi/blob/master/pics/causal-bridges-captioned.png) 26 | 27 | ![Mappa Mundi Project](https://github.com/rrtucci/mappa_mundi/blob/master/pics/mm-project.png) 28 | 29 | ![My First Big Atlas](pics/my_first_big_atlas.jpeg) 30 | 31 | 32 | -------------------------------------------------------------------------------- /MIT-License.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Robert R. Tucci 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /simp_deprecated/simp_spacy-claucy.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | Refs: 7 | https://spacy.io/usage/spacy-101/ 8 | 9 | https://github.com/mmxgn/spacy-clausieec13/splitting-sentences-into-clauses 10 | """ 11 | from globals import * 12 | import spacy 13 | import claucy 14 | 15 | nlp = spacy.load('en_core_web_sm') 16 | claucy.add_to_pipe(nlp) 17 | 18 | 19 | def simplify_ztz(sentence, verbose=False): 20 | """ 21 | This method simplifies the sentence `sentence`. It returns a list of 22 | simple sentences extracted from the input sentence. 
23 | 24 | Parameters 25 | ---------- 26 | sentence: str 27 | verbose: bool 28 | kwargs: dict[] 29 | 30 | Returns 31 | ------- 32 | list[str] 33 | 34 | """ 35 | 36 | doc = nlp(sentence.strip()) 37 | if doc._.clauses: 38 | propositions = doc._.clauses[0].to_propositions(as_text=True) 39 | else: 40 | propositions = [sentence] 41 | if verbose: 42 | print(sentence.strip()) 43 | print(propositions) 44 | return propositions 45 | -------------------------------------------------------------------------------- /similarity_bert.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains a function `ztz_similarity(ztz1, ztz2)` 4 | that returns the similarity of sentences `ztz1` and `ztz2`. 5 | ztz = sentence 6 | 7 | It uses 8 | 9 | Ref: 10 | 1. https://www.sbert.net/ 11 | 2. https://huggingface.co/tasks/sentence-similarity 12 | 3. https://towardsdatascience.com/bert-for-measuring-text-similarity 13 | -eec91c6bf9e1 14 | """ 15 | from sklearn.metrics.pairwise import cosine_similarity 16 | 17 | 18 | def ztz_similarity(ztz1, ztz2, **kwargs): 19 | """ 20 | This method returns the similarity between sentences `ztz1` and `ztz2`. 21 | The similarity is measured as odds of a probability, so it ranges from 0 22 | to infinity. 23 | 24 | Parameters 25 | ---------- 26 | ztz1: str 27 | ztz2: str 28 | kwargs: dict[] 29 | 30 | Returns 31 | ------- 32 | float 33 | 34 | """ 35 | model = kwargs['model'] 36 | embedding_1 = model.encode(ztz1) 37 | embedding_2 = model.encode(ztz2) 38 | 39 | prob = cosine_similarity([embedding_1], [embedding_2])[0, 0] 40 | if prob < 0: 41 | # print("neg. prob.=", prob) 42 | # print(ztz1) 43 | # print(ztz2) 44 | prob = 0 45 | odds = prob / (1 - prob) if prob < 1 else 1e5 46 | return round(odds, 3) 47 | -------------------------------------------------------------------------------- /miscellaneous/WALL-E-quote-summarized-by-chatgpt.txt: -------------------------------------------------------------------------------- 1 | Consider the following quote: 2 | "A range of mountains takes form in the haze. 3 | Moving closer. 4 | The mountains are piles of TRASH. 5 | The entire surface is nothing but waste. 6 | 7 | '..We're gonna find adventure in the evening air...' 8 | 9 | A silhouetted city in the distance. 10 | What looks like skyscrapers turns into trash. 11 | Thousands of neatly stacked CUBES OF TRASH, stories high. 12 | Rows and rows of stacked cubes, like city avenues. 13 | They go on for miles." 14 | 15 | Give an enumerated list summarizing this quote. 16 | Each list item is a simple declarative sentence 17 | consisting of a subject phrase and a verb phrase. 18 | The list items are in chronological order of occurrence 19 | in the quote. 20 | 21 | 1. A range of mountains appears in the haze. 22 | 2. The narrator moves closer. 23 | 3. The mountains are revealed to be piles of trash. 24 | 4. The entire surface is described as nothing but waste. 25 | 5. The narrator speaks of finding adventure in the evening air. 26 | 6. A silhouetted city appears in the distance. 27 | 7. The skyscrapers turn out to be made of trash. 28 | 8. Thousands of neatly stacked cubes of trash are seen, stories high. 29 | 9. Rows and rows of stacked cubes are seen, resembling city avenues. 30 | 10. The expanse of trash goes on for miles. 
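(Aside: the prompt above was written by hand. A minimal sketch of how the same prompt could be assembled programmatically is shown below; the instruction text is copied from this file, while the function name is made up and the LLM call that would consume the prompt is deliberately left out.)

    # Sketch only: build the summarization prompt used above for an arbitrary quote.
    PROMPT_TAIL = (
        "Give an enumerated list summarizing this quote.\n"
        "Each list item is a simple declarative sentence\n"
        "consisting of a subject phrase and a verb phrase.\n"
        "The list items are in chronological order of occurrence\n"
        "in the quote.\n"
    )

    def build_summary_prompt(quote):
        # `quote` would be an excerpt of a movie script, like the WALL-E scene above.
        return 'Consider the following quote:\n"' + quote.strip() + '"\n\n' + PROMPT_TAIL
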
31 | -------------------------------------------------------------------------------- /simp_deprecated/simp_openie.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | References 7 | 8 | 1. https://stanfordnlp.github.io/CoreNLP/openie.html#api 9 | # Default value of openie.affinity_probability_cap was 1/3. 10 | 2. https://pypi.org/project/stanford-openie/ 11 | 12 | 2. https://stanfordnlp.github.io/CoreNLP/demo.html 13 | 14 | """ 15 | from openie import StanfordOpenIE 16 | 17 | properties = { 18 | 'openie.triple.all_nominals': True, 19 | 'openie.triple.strict': False, 20 | 'openie.splitter.nomodel': True, 21 | 'openie.affinity_probability_cap': 1/ 3 22 | } 23 | client = StanfordOpenIE(properties=properties) 24 | 25 | 26 | def simplify_ztz(sentence, verbose=False): 27 | """ 28 | This method simplifies the sentence `sentence`. 29 | 30 | Parameters 31 | ---------- 32 | sentence: str 33 | verbose: bool 34 | 35 | Returns 36 | ------- 37 | str 38 | 39 | """ 40 | ztz_list = [] 41 | for triple in client.annotate(sentence): 42 | ztz_list.append(triple['subject'] + " " + 43 | triple['relation'] + " " + 44 | triple['object']) 45 | if verbose: 46 | print(sentence.strip()) 47 | print(ztz_list) 48 | return ztz_list 49 | 50 | 51 | -------------------------------------------------------------------------------- /simp_deprecated/simp_openie6-old.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | It is called within a jupyter notebook at Google colab 7 | https://colab.research.google.com/drive/1S2EWOGkoCgjfOJzTRJ7PLeu4T8SBwhlF?usp=sharing 8 | 9 | Refs: 10 | 11 | 1. https://github.com/dair-iitd/CaRB 12 | 13 | 2. https://github.com/dair-iitd/imojie 14 | 15 | 3. https://github.com/dair-iitd/openie6 16 | 17 | """ 18 | import subprocess 19 | from globals import * 20 | 21 | def simplify_ztz(sentence, verbose=False): 22 | """ 23 | This method simplifies the sentence `sentence`. It returns a list of 24 | simple sentences extracted from the input sentence. 
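    For instance, the conjunction-splitting behaviour expected from Openie6
    is the one recorded in openie6_translation_test1.txt (illustrative; the
    exact strings come from the Openie6 model):

        simplify_ztz("I love Luciano Pavarotti and Jose Carreras .")
        # -> ["I love Luciano Pavarotti .",
        #     "I love Jose Carreras ."]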
25 | 26 | Parameters 27 | ---------- 28 | sentence: str 29 | verbose: bool 30 | kwargs: dict[] 31 | 32 | Returns 33 | ------- 34 | list[str] 35 | 36 | """ 37 | 38 | with open("../openie6_sentences.txt", "w") as f: 39 | f.write(sentence) 40 | 41 | gpu_command = \ 42 | "cd ../openie6 && CUDA_DEVICE_ORDER=PCI_BUS_ID " \ 43 | "CUDA_VISIBLE_DEVICES=0 " \ 44 | "PYTHONPATH=imojie:imojie/allennlp:imojie" \ 45 | "/pytorch_transformers:$PYTHONPATH python run.py " \ 46 | "--save models/conj_model --mode predict " \ 47 | "--inp ../openie6_sentences.txt --batch_size 1 " \ 48 | "--model_str bert-large-cased --task conj " \ 49 | "--gpus 1 --out ../openie6_predictions.txt" 50 | 51 | cpu_command = gpu_command.replace("--gpus 1", "--gpus 0") 52 | 53 | if USE_GPU: 54 | subprocess.Popen(gpu_command, shell=True) 55 | else: 56 | subprocess.Popen(cpu_command, shell=True) 57 | 58 | ztz_list = [] 59 | with open("../openie6_predictions.txt.conj", "r") as f: 60 | for line in f: 61 | ztz_list.append(line) 62 | # ztz_list has full sentence in first row 63 | return ztz_list[1:] 64 | -------------------------------------------------------------------------------- /Node.py: -------------------------------------------------------------------------------- 1 | class Node: 2 | """ 3 | 4 | This is a very simple class that holds the `time` and `place` of each 5 | node. 6 | 7 | Each simplified clause becomes a node of the DAG. 8 | 9 | For brevity, let us refer to time as `t` and place as `x`. Previously, 10 | we put each full sentence of the movie script into one row of a file. 11 | Then each sentence was replaced by zero, one, two, or more simplified 12 | clauses, separated by separator-tokens. If a simplified clause ( i.e., 13 | node) appears at the row $t$ of the file (counting starting with 0), 14 | then we say that the node occurs at time $t$. If a simplified clause 15 | appears after zero separator-tokens, we say $x=0$ for it. If it appears 16 | after one separator-token, we say $x=1$ for it, and so forth. Hence each 17 | node ( i.e., simplified clause) can be labeled by its $(t, x)$ coordinates. 18 | 19 | Attributes 20 | ---------- 21 | place: int 22 | time: int 23 | """ 24 | 25 | def __init__(self, time, place): 26 | """ 27 | Constructor 28 | 29 | Parameters 30 | ---------- 31 | time: int 32 | place: int 33 | """ 34 | self.time = time 35 | self.place = place 36 | assert time >= 0 and place >= 0 37 | 38 | def coords(self): 39 | """ 40 | This method returns the coordinates of self as a tuple. 41 | 42 | Returns 43 | ------- 44 | tuple(int, int) 45 | 46 | """ 47 | return (self.time, self.place) 48 | 49 | 50 | def node_str(node): 51 | """ 52 | This method returns a string for Node `node`. 53 | 54 | Parameters 55 | ---------- 56 | node: Node 57 | 58 | Returns 59 | ------- 60 | str 61 | 62 | """ 63 | return "(" + str(node.time) + "," + str(node.place) + ")" 64 | 65 | 66 | def arrow_str(arrow): 67 | """ 68 | This method returns a string for an arrow `arrow` 69 | 70 | Parameters 71 | ---------- 72 | arrow: tuple[Node, Node] 73 | 74 | Returns 75 | ------- 76 | str 77 | 78 | """ 79 | return node_str(arrow[0]) + "->" + node_str(arrow[1]) 80 | -------------------------------------------------------------------------------- /globals.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains all the global variables used by Mappa Mundi (MM). 
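    Most entries are plain constants. The string-valued switches such as
    ZTZ_SIMPLIFIER and SIMI_DEF name modules that other files load
    dynamically, e.g. (pattern copied from similarity.py and
    BatchSimilarity.py):

        from globals import *
        import importlib as imp

        simi_def = imp.import_module(SIMI_DEF)  # e.g. "similarity_bert"
        # simi_def.ztz_similarity(ztz1, ztz2, **kwargs) can then be called
        # without caring which implementation was selected here.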
4 | 5 | """ 6 | 7 | BASE_URL = "https://imsdb.com" 8 | 9 | M_SCRIPTS_DIR = "m_scripts" 10 | CLEAN_DIR = "m_scripts_clean" 11 | CLEAN_RD_DIR = "m_scripts_clean_rd" 12 | SPELL_DIR = "m_scripts_spell" 13 | SPELL_RD_DIR = "m_scripts_spell_rd" 14 | SIMP_DIR = "m_scripts_simp" 15 | SIMP_RD_DIR = "m_scripts_simp_rd" 16 | POST_CLEAN_DIR = "m_scripts_post_clean" 17 | POST_CLEAN_RD_DIR = "m_scripts_post_clean_rd" 18 | DAG_DIR = "m_scripts_dag_atlas" 19 | DAG_RD_DIR = "m_scripts_dag_atlas_rd" 20 | 21 | # ZTZ_SIMPLIFIER = "simp_stanford" 22 | # ZTZ_SIMPLIFIER = "simp_spacy_claucy" 23 | # ZTZ_SIMPLIFIER = "simp_spacy1" 24 | # ZTZ_SIMPLIFIER = "simp_spacy2" 25 | # ZTZ_SIMPLIFIER = "simp_spacy3" # originally recommended 26 | ZTZ_SIMPLIFIER = "simp_openie6" # recommended 27 | 28 | # SIMI_DEF = "similarity_spacy" 29 | # SIMI_DEF = "similarity_spacy2" 30 | # SIMI_DEF = "similarity_nltk" # originally recommended 31 | SIMI_DEF = "similarity_bert" # recommended 32 | 33 | # good threshold values gleaned from similarity.py examples 34 | # SIMI_THRESHOLD = 2.2 for NLTK 35 | # SIMI_THRESHOLD = 2.69 for SpaCy 36 | SIMI_THRESHOLD = 2 # for bert, recommended 37 | 38 | ZTZ_SEPARATOR = "[%@!]" 39 | 40 | SPELLING_CORRECTION_RISK = 1e-8 41 | 42 | # POS (part of speech) in stopwords.py 43 | # ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 44 | # 'NOUN', 'NUM', 'PART', 'PRON', 'PUNCT', 'SCONJ', 'VERB'] 45 | 46 | # To see full list of POS, see jpg in pics folder 47 | 48 | # ADP (adposition) are mostly prepositions 49 | # AUX contains verbs like 'is' 50 | # DET (determiner) contains 'whose' 51 | # NUM contains number words like 'three' 52 | # PART (particle) contains 'not' 53 | 54 | RETAINED_POS = ['ADJ', 'ADV', 'NOUN', 'VERB'] 55 | 56 | # See stopwords.py 57 | # RETAINED_STOPWORD_POS should be subset of RETAINED_POS 58 | # RETAINED_STOPWORD_POS = RETAINED_POS 59 | RETAINED_STOPWORD_POS = [] # recommended 60 | 61 | USE_GPU = True 62 | 63 | class color: 64 | PURPLE = '\033[95m' 65 | CYAN = '\033[96m' 66 | DARKCYAN = '\033[36m' 67 | BLUE = '\033[94m' 68 | GREEN = '\033[92m' 69 | YELLOW = '\033[93m' 70 | RED = '\033[91m' 71 | BOLD = '\033[1m' 72 | UNDERLINE = '\033[4m' 73 | END = '\033[0m' -------------------------------------------------------------------------------- /miscellaneous/testing-stan-parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: 3 | https://towardsdatascience.com/natural-language-processing-using-stanfords-corenlp-d9e64c1e1024 4 | 5 | """ 6 | import os 7 | import subprocess 8 | version = subprocess.check_output( 9 | ['java', '-version'], stderr=subprocess.STDOUT) 10 | print("java version=\t", version) 11 | print("CLASSPATH=\t", os.environ['CLASSPATH']) 12 | print("STANFORD_MODELS=\t", os.environ['STANFORD_MODELS']) 13 | print("JAVA_HOME=\t", os.environ['JAVA_HOME']) 14 | 15 | def main1(): 16 | from pycorenlp import StanfordCoreNLP 17 | nlp = StanfordCoreNLP('http://localhost:9000') 18 | 19 | text = "This movie was actually neither that funny, nor super witty. The movie was meh. I liked watching that movie. If I had a choice, I would not watch that movie areps." 
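    # The annotate() call below posts `text` to the local CoreNLP server
    # (launched as described in starting-stanford-coreNLP-server.txt, on
    # localhost:9000) and requests the sentiment, ner and pos annotators.
    # With outputFormat 'json' the reply is typically parsed into a dict;
    # its exact fields depend on the annotators requested.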
20 | result = nlp.annotate(text, 21 | properties={ 22 | 'annotators': 'sentiment, ner, pos', 23 | 'outputFormat': 'json', 24 | 'timeout': 1000, 25 | }) 26 | print(result) 27 | 28 | def main2(): 29 | #ttps://www.nltk.org/api/nltk.parse.corenlp.html 30 | import nltk 31 | from nltk.parse.corenlp import CoreNLPParser 32 | 33 | # Start the CoreNLP server 34 | # nltk.download('punkt') 35 | # nltk.download('corenlp') 36 | parser = CoreNLPParser(url='http://localhost:9000') 37 | 38 | # Parse a sentence 39 | sentence = "The quick brown fox jumps over the lazy dog." 40 | parse_tree = list( 41 | parser.parse(sentence.split()) 42 | )[0] 43 | print(parse_tree) 44 | 45 | def main3(): 46 | import nltk 47 | from nltk.parse.corenlp import CoreNLPParser 48 | 49 | # Start the CoreNLP server 50 | parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos') 51 | 52 | # Parse a tagged sentence 53 | tagged_sentence = [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), 54 | ('fox', 'NN'), 55 | ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), 56 | ('lazy', 'JJ'), 57 | ('dog', 'NN'), ('.', '.')] 58 | # parse_tree = list(parser.parse(tagged_sentence))[0] 59 | # print(parse_tree) 60 | parser.parse(tagged_sentence) 61 | 62 | if __name__ == "__main__": 63 | # main1() 64 | main2() 65 | # main3() # doesn't work 66 | 67 | -------------------------------------------------------------------------------- /simplifying_test.txt: -------------------------------------------------------------------------------- 1 | Robert, who lives nearby, was walking his dog. 2 | While eating food Ram is singing a song . 3 | After she ate the cake , Emma visited Tony in his room . 4 | If she is singing then I will sing . 5 | Melanie bought a Batman game for $ 6.95 , a strategy game for $ 7.90 , and a Superman game for $ 7.73 . 6 | Bag A contains 3 white and 2 blue marbles . 7 | Ram has two apples and five bananas . 8 | Ram and Shyam are two brothers . 9 | Ram is a boy and Sita is a girl . 10 | Ram is a boy who is six years old . 11 | Ram eats a banana and an apple but sings a song . 12 | He washed cars over the weekend and now has 86 dollars . 13 | While playing piano Ram is singing a song in a room and Shyam is playing violin . 14 | You are a boy, and Sita is a girl . 15 | Ram sold 6 balls at 10 a.m and 7 balls at 11 a.m . 16 | The restaurant sold 6 slices of pie during the day and 7 slices of pie during the night . 17 | Sam's dad gave Sam 39 nickels and 31 quarters . 18 | Park workers will plant 41 dogwood trees today and 20 dogwood trees tomorrow . 19 | Dan picked 9 limes and gave Sara 4 of the limes . 20 | This year Diane bought some new hives and increased Diane's honey harvest by 6085 pounds . 21 | Sara had 4 quarters and 8 dimes in Sara's bank . 22 | Mike found 6 seashells and 4 starfishes but 4 of the seashells were broken . 23 | Jessica grew 35 watermelons and 30 carrots but the rabbits ate 27 watermelons . 24 | Dan bought a clarinet for $ 130.30 , and a song book which was $ 11.24 . 25 | There are 2 maple trees and 5 popular trees currently in the park . 26 | Dan 's cat had kittens and 5 had spots . 27 | This year, 712261 male salmon and 259378 female salmon, returned to their rivers . 28 | Each day , the polar bear at Richmond 's zoo eats 0.2 bucket of trout and 0.4 bucket of salmon . 29 | While eating food and drinking water Ram is singing a song . 30 | He is eating food and she is playing and they are fighting . 31 | Ram is playing guitar while talking to Sita . 32 | He is playing and she is crying but they are singing . 
33 | The embattled Major government survived a crucial vote on coal pits closure as its last-minute concessions curbed the extent of Tony revolt over an issue that generated unusual heat in the House of Commons and brought the miners to London streets. 34 | When Sam is eating food, Alice is singing a song. 35 | Talwinder Singh,who masterminded the Kanishka crash in 1998, was killed in a fierce two hour counter. 36 | Because I was late, I became angry. -------------------------------------------------------------------------------- /similarity.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file has functions to test the function `ztz_similarity(str1, str2)` 4 | which measures the similarity of two sentences `ztz1` and `ztz2`. 5 | `ztz_similarity()` has been implemented 4 different ways, in separate files 6 | 7 | 1. similarity_bert.py (Recommended) 8 | Uses BERT and sentence-transformers 9 | 10 | 2. similarity_nltk.py 11 | Uses NLTK + WordNet 12 | 13 | 3. similarity_spacy.py 14 | Uses SpaCy + WordVec 15 | 16 | 4. similarity_spacy2.py 17 | Attempt to use SpaCy + WordNet 18 | 19 | """ 20 | from globals import * 21 | import importlib as imp 22 | from sentence_transformers import SentenceTransformer 23 | 24 | simi_def = imp.import_module(SIMI_DEF) 25 | 26 | 27 | def print_simi_12(str1, str2, **kwargs): 28 | """ 29 | Prints similarity of `str1` and `str2`. 30 | 31 | Parameters 32 | ---------- 33 | str1: str 34 | str2: str 35 | 36 | Returns 37 | ------- 38 | None 39 | 40 | """ 41 | print() 42 | print("1.", str1) 43 | print("2.", str2) 44 | simi12 = simi_def.ztz_similarity(str1, str2, **kwargs) 45 | simi21 = simi_def.ztz_similarity(str2, str1, **kwargs) 46 | print("simi(1, 2)=", str(simi12)) 47 | print("simi(2, 1)=", str(simi21)) 48 | 49 | 50 | if __name__ == "__main__": 51 | def main1(): 52 | if SIMI_DEF == "similarity_bert": 53 | model = SentenceTransformer('all-MiniLM-L6-v2') 54 | else: 55 | model = None 56 | print("************ simi definition from:", SIMI_DEF) 57 | 58 | ztzs = [ 59 | "Dogs are awesome.", 60 | "Some gorgeous creatures are felines.", 61 | "Dolphins are swimming mammals.", 62 | "Cats are beautiful animals.", 63 | "Cats are beauti animals.", 64 | ] 65 | 66 | focus_ztz = "Cats are beautiful animals." 67 | for ztz in ztzs: 68 | print_simi_12(focus_ztz, ztz, model=model) 69 | 70 | 71 | def main2(): 72 | if SIMI_DEF == "similarity_bert": 73 | model = SentenceTransformer('all-MiniLM-L6-v2') 74 | else: 75 | model = None 76 | print("************ simi definition from:", SIMI_DEF) 77 | word1, word2 = "apple", "horse" 78 | print_simi_12(word1, word2, model=model) 79 | print_simi_12("Paul", "John", model=model) 80 | 81 | ztz1 = "The cat sat on the mat." 82 | ztz2 = "The dog lay on the rug." 83 | print_simi_12(ztz1, ztz2, model=model) 84 | 85 | 86 | main1() 87 | main2() 88 | -------------------------------------------------------------------------------- /pics/nltk_tags.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 |  1. CC: Coordinating conjunction
7 |  2. CD: Cardinal number
8 |  3. DT: Determiner
9 |  4. EX: Existential there
10 |  5. FW: Foreign word
11 |  6. IN: Preposition or subordinating conjunction
12 |  7. JJ: Adjective
13 |  8. JJR: Adjective, comparative
14 |  9. JJS: Adjective, superlative
15 | 10. LS: List item marker
16 | 11. MD: Modal
17 | 12. NN: Noun, singular or mass
18 | 13. NNS: Noun, plural
19 | 14. NNP: Proper noun, singular
20 | 15. NNPS: Proper noun, plural
21 | 16. PDT: Predeterminer
22 | 17. POS: Possessive ending
23 | 18. PP: Prepositional Phrase
24 | 19. PRP: Personal pronoun
25 | 20. RB: Adverb
26 | 21. RBR: Adverb, comparative
27 | 22. RBS: Adverb, superlative
28 | 23. RP: Particle
29 | 24. S: Simple declarative clause
30 | 25. SBAR: Clause introduced by a (possibly empty) subordinating conjunction
31 | 26. SBARQ: Direct question introduced by a wh-word or a wh-phrase.
32 | 27. SINV: Inverted declarative sentence, i.e. one in which the subject follows the tensed verb or modal.
33 | 28. SQ: Inverted yes/no question, or main clause of a wh-question, following the wh-phrase in SBARQ.
34 | 29. SYM: Symbol
35 | 30. VB: Verb, base form
36 | 31. VBD: Verb, past tense
37 | 32. VBG: Verb, gerund or present participle
38 | 33. VBN: Verb, past participle
39 | 34. VBP: Verb, non-3rd person singular present
40 | 35. VBZ: Verb, 3rd person singular present
41 | 36. VP: Verb Phrase
42 | 37. WDT: Wh-determiner
43 | 38. WP: Wh-pronoun
44 | 39. WP$: Possessive wh-pronoun
45 | 40. WRB: Wh-adverb
46 |
47 | 48 | -------------------------------------------------------------------------------- /similarity_deprecated/similarity_spacy2.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains a function `ztz_similarity(ztz1, ztz2)` 4 | that returns the similarity of sentences `ztz1` and `ztz2`. 5 | ztz = sentence 6 | 7 | It uses SpaCy + WordNet 8 | 9 | Ref: 10 | 11 | """ 12 | import spacy 13 | import nltk 14 | from nltk.corpus import wordnet as wn 15 | from globals import * 16 | from itertools import product 17 | from collections import defaultdict 18 | from time import time 19 | 20 | nlp = spacy.load("en_core_web_sm") 21 | 22 | 23 | def ztz_similarity(ztz1, ztz2, **kwargs): 24 | """ 25 | This method returns the similarity between sentences `ztz1` and `ztz2`. 26 | The similarity is measured as odds of a probability, so it ranges from 0 27 | to infinity. 28 | 29 | Parameters 30 | ---------- 31 | ztz1: str 32 | ztz2: str 33 | 34 | Returns 35 | ------- 36 | float 37 | 38 | """ 39 | do_time = False 40 | if do_time: 41 | print("similarity begins", time()) 42 | doc1 = nlp(ztz1) 43 | doc2 = nlp(ztz2) 44 | sp_tokens1 = [token1 for token1 in doc1 \ 45 | if token1.pos_ in RETAINED_POS] 46 | sp_tokens2 = [token2 for token2 in doc2 \ 47 | if token2.pos_ in RETAINED_POS] 48 | all_ss1 = [] 49 | for token1 in sp_tokens1: 50 | if wn.synsets(token1.text): 51 | ss1 = wn.synsets(token1.text)[0] 52 | all_ss1.append(ss1) 53 | 54 | all_ss2 = [] 55 | for token2 in sp_tokens2: 56 | if wn.synsets(token2.text): 57 | ss2 = wn.synsets(token2.text)[0] 58 | all_ss2.append(ss2) 59 | ss_pair_to_simi = defaultdict(lambda: 0) 60 | if do_time: 61 | print("beginning of path_similarity()", time()) 62 | for ss1, ss2 in product(all_ss1, all_ss2): 63 | simi = ss1.path_similarity(ss2) 64 | if simi is not None: 65 | ss_pair_to_simi[(ss1, ss2)] = simi 66 | 67 | score1 = 0.0 68 | count1 = 0 69 | for ss1 in all_ss1: 70 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss2 in all_ss2] 71 | if simi_list: 72 | best_score = max(simi_list) 73 | score1 += best_score 74 | count1 += 1 75 | if count1: 76 | score1 /= count1 77 | 78 | score2 = 0.0 79 | count2 = 0 80 | for ss2 in all_ss2: 81 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss1 in all_ss1] 82 | if simi_list: 83 | best_score = max(simi_list) 84 | score2 += best_score 85 | count2 += 1 86 | if count2: 87 | score2 /= count2 88 | prob = (score1 + score2) / 2 89 | if prob < 1: 90 | odds = prob / (1 - prob) 91 | else: 92 | odds = 1000 93 | if do_time: 94 | print("similarity ends", time()) 95 | return round(odds, 3) 96 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file holds some general purpose functions (utilities). 4 | 5 | """ 6 | import os 7 | from cfitbit_globals import * 8 | import shutil 9 | 10 | 11 | def zero_based_position_from_m_title(dir_, title): 12 | """ 13 | This method returns the position (zero based, starting from zero) of 14 | title `title` in directory `dir_`. 15 | 16 | Parameters 17 | ---------- 18 | dir_: str 19 | title: str 20 | 21 | Returns 22 | ------- 23 | int 24 | 25 | """ 26 | return list(my_listdir(dir_)).index(title + ".txt") 27 | 28 | 29 | def m_title_from_zero_based_position(dir_, pos): 30 | """ 31 | This method returns the title in directory `dir_` of the movie at 32 | position `pos` (zero based, starting from zero). 
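    For example (hypothetical directory contents): if my_listdir(dir_)
    returns ['alien.txt', 'up.txt', 'wall-e.txt'], then

        m_title_from_zero_based_position(dir_, 1)        # -> 'up'
        zero_based_position_from_m_title(dir_, 'up')     # -> 1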
33 | 34 | Parameters 35 | ---------- 36 | dir_: str 37 | pos: int 38 | 39 | Returns 40 | ------- 41 | str 42 | 43 | """ 44 | return list(my_listdir(dir_))[pos][:-len(".txt")] 45 | 46 | 47 | def argmax_of_list(lista): 48 | """ 49 | This method returns the argmax of list `lista`. 50 | 51 | Parameters 52 | ---------- 53 | lista: list[X] 54 | 55 | Returns 56 | ------- 57 | type(X) 58 | 59 | 60 | """ 61 | return max(range(len(lista)), key=(lambda i: lista[i])) 62 | 63 | 64 | def print_welcome_message(): 65 | """ 66 | This method prints a welcome message. 67 | 68 | Returns 69 | ------- 70 | None 71 | 72 | """ 73 | print("Welcome Causal AI Navigator. We have been waiting for you for " 74 | "millennia. Where would you like us to go next?") 75 | 76 | 77 | def my_listdir(dir_): 78 | """ 79 | Whenever one opens a text file within directory `dir_` using jupyter lab 80 | ( JL), JL writes an annoying `.ipynb.checkpoints` folder inside `dir_`. 81 | This method deletes that checkpoints folder and then returns the usual 82 | `os.listdir( dir_)` 83 | 84 | Parameters 85 | ---------- 86 | dir_: str 87 | 88 | Returns 89 | ------- 90 | iterable 91 | 92 | """ 93 | # listdir includes hidden files like .ipynb_checkpoints 94 | checkpoints = dir_ + "/" + ".ipynb_checkpoints" 95 | shutil.rmtree(checkpoints, ignore_errors=True) 96 | # os.listdir list in arbitrary order! 97 | return sorted(os.listdir(dir_)) 98 | 99 | 100 | def get_prob_acc_and_nsam(num_acc, num_rej, round_digits=2): 101 | """ 102 | This method returns the probability of acceptance `prob_acc` and the 103 | number of samples `nsam` used to calculate that probability. 104 | 105 | Parameters 106 | ---------- 107 | num_acc: int 108 | number of times an arrow has been accepted 109 | num_rej: int 110 | number of times an arrow has been rejected. 111 | round_digits: int 112 | 113 | Returns 114 | ------- 115 | float, int 116 | 117 | """ 118 | nsam = num_acc + num_rej 119 | return round(num_acc / nsam, round_digits), nsam 120 | -------------------------------------------------------------------------------- /WordGuesser.py: -------------------------------------------------------------------------------- 1 | class WordGuesser: 2 | """ 3 | This class is used by `spell_checking.py` to store and update the word 4 | `best_guess` which is a guess for the word `word`. Also stored in this 5 | class: the probabilities for `best_guess` and `word`. 6 | 7 | 8 | Attributes 9 | ---------- 10 | best_guess: str 11 | a word which is the best guess so far for the word `word` 12 | global_checker: SpellChecker 13 | a class of pyspellchecker that can give global probabilities of words 14 | local_word_count: int 15 | number of different words in the single local document being considered 16 | prob_for_best_guess: float 17 | probability for `best_guess` (average of local and global probs) 18 | prob_for_word: float 19 | probability for `word` (average of local and global probs) 20 | word: str 21 | low probability word, likely a misspelled word. `best_guess` is a 22 | replacement for it. 23 | word_to_reps: dict[str, int] 24 | a dictionary mapping each word in the local document being considered, 25 | to its number of repetitions in that document. 
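    When a local document is supplied, the probabilities above are a 50/50
    blend of global and local usage, as in the constructor below
    (illustrative numbers):

        global_prob = global_checker.word_usage_frequency(word)
        local_prob = word_to_reps[word] / local_word_count
        prob_for_word = (global_prob + local_prob) / 2
        # e.g. global_prob = 1e-6, local_prob = 3/1500 -> prob_for_word ~ 0.001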
26 | 27 | """ 28 | 29 | def __init__(self, word, global_checker, 30 | word_to_reps=None, local_word_count=None): 31 | """ 32 | Constructor 33 | 34 | Parameters 35 | ---------- 36 | word: str 37 | global_checker: SpellChecker 38 | word_to_reps: dict[str, int] 39 | local_word_count: int 40 | 41 | """ 42 | assert word[0].islower() 43 | self.word = word 44 | self.global_checker = global_checker 45 | self.word_to_reps = word_to_reps 46 | self.local_word_count = local_word_count 47 | if word_to_reps: 48 | assert local_word_count 49 | 50 | self.prob_for_word = \ 51 | global_checker.word_usage_frequency(word) 52 | if word_to_reps: 53 | local_prob = word_to_reps[word] / local_word_count 54 | self.prob_for_word = (self.prob_for_word + local_prob) / 2 55 | 56 | self.best_guess = word 57 | self.prob_for_best_guess = 0 58 | self.do_update(word) 59 | 60 | def do_update(self, guess): 61 | """ 62 | This method finds the probability of the word `guess` in the local 63 | dictionary, and if that probability is greater that 64 | `prob_best_guess`, it replaces `best_guess` by `guess`. It also 65 | updates `prob_for_best_guess`. 66 | 67 | Parameters 68 | ---------- 69 | guess: str 70 | 71 | Returns 72 | ------- 73 | None 74 | 75 | """ 76 | prob_for_guess = \ 77 | self.global_checker.word_usage_frequency(guess) 78 | if self.word_to_reps: 79 | local_prob = self.word_to_reps[guess] / self.local_word_count 80 | prob_for_guess = (prob_for_guess + local_prob) / 2 81 | if prob_for_guess > self.prob_for_best_guess: 82 | self.best_guess = guess 83 | self.prob_for_best_guess = prob_for_guess 84 | -------------------------------------------------------------------------------- /BatchSimilarity.py: -------------------------------------------------------------------------------- 1 | import importlib as imp 2 | from globals import * 3 | from Dag import * 4 | 5 | simi_def = imp.import_module(SIMI_DEF) 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | 9 | class BatchSimilarity: 10 | """ 11 | With sentence transformers, one can speed up the evaluation of sentence 12 | similarity by embedding large batches of sentences all at once, rather 13 | than one at a time. Given two DAGs, dag1 and dag2, this class uses a 14 | sentence transformer to evaluate the similarity between all sentences 15 | `all_ztz1` in dag1 and all the sentences `all_ztz2` in dag2. (ztz = 16 | sentence). `all_ztz1 + all_ztz2` are embedded as a batch, in a single 17 | shot. 
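    Schematically, the constructor below boils down to (a sketch of the code
    in this file, not additional API):

        sent_embeddings = model.encode(all_ztz1 + all_ztz2)   # one batched call
        len1 = len(all_ztz1)
        cos_mat = cosine_similarity(sent_embeddings[:len1],
                                    sent_embeddings[len1:])
        # cos_mat[i, j] is the cosine between sentence i of dag1 and
        # sentence j of dag2; simi() converts that cosine to odds.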
18 | 19 | Attributes 20 | ---------- 21 | all_ztz1: list[str] 22 | all_ztz2: list[str] 23 | cos_mat: np.array[float] 24 | a matrix of cosines corresponding to all_ztz1 X all_ztz2 25 | model: SentenceTransformer 26 | node_to_simple_ztz1: dict[Node, str] 27 | node_to_simple_ztz2: dict[Node, str] 28 | 29 | """ 30 | 31 | def __init__(self, 32 | dag1, 33 | dag2, 34 | node_to_simple_ztz1, 35 | node_to_simple_ztz2, 36 | model=None): 37 | """ 38 | Constructor 39 | 40 | Parameters 41 | ---------- 42 | dag1: Dag 43 | dag2: Dag 44 | node_to_simple_ztz1: dict[Node, str] 45 | node_to_simple_ztz2: dict[Node, str] 46 | model: SentenceTransformer 47 | """ 48 | self.node_to_simple_ztz1 = node_to_simple_ztz1 49 | self.node_to_simple_ztz2 = node_to_simple_ztz2 50 | self.all_ztz1 = [node_to_simple_ztz1[nd] for nd in dag1.nodes] 51 | self.all_ztz2 = [node_to_simple_ztz2[nd] for nd in dag2.nodes] 52 | self.model = model 53 | if model: 54 | sent_embeddings = model.encode(self.all_ztz1 + self.all_ztz2) 55 | len1 = len(self.all_ztz1) 56 | self.cos_mat = cosine_similarity(sent_embeddings[:len1], 57 | sent_embeddings[len1:]) 58 | 59 | def simi(self, nd1, nd2): 60 | """ 61 | This method returns the similarity of the sentences corresponding to 62 | nodes `nd1` and `nd2`. 63 | 64 | Parameters 65 | ---------- 66 | nd1: Node 67 | nd2: Node 68 | 69 | Returns 70 | ------- 71 | float 72 | 73 | """ 74 | ztz1 = self.node_to_simple_ztz1[nd1] 75 | ztz2 = self.node_to_simple_ztz2[nd2] 76 | if not self.model: 77 | return simi_def.ztz_similarity(ztz1, ztz2) 78 | else: 79 | k1 = self.all_ztz1.index(ztz1) 80 | k2 = self.all_ztz2.index(ztz2) 81 | prob = self.cos_mat[k1, k2] 82 | if prob < 0: 83 | # print("neg. prob.=", prob) 84 | # print(ztz1) 85 | # print(ztz2) 86 | prob = 0 87 | odds = prob / (1 - prob) if prob < 1 else 1e5 88 | return round(odds, 3) 89 | -------------------------------------------------------------------------------- /white_paper/references.bib: -------------------------------------------------------------------------------- 1 | @book{book-of-why , 2 | title={The book of why: the new science of cause and effect}, 3 | author={Pearl, Judea and Mackenzie, Dana}, 4 | year={2018}, 5 | publisher={Basic books} 6 | } 7 | 8 | @misc{bayesuvius, 9 | title="Bayesuvius (book)", 10 | author="Robert R. Tucci", 11 | howpublished="\url{https://github.com/rrtucci/Bayesuvius/raw/master/main.pdf}" 12 | } 13 | 14 | @inproceedings{2022opberg, 15 | title={OpBerg: Discovering causal sentences using optimal alignments}, 16 | author={Wood, Justin and Matiasz, Nicholas and Silva, Alcino and Hsu, William and Abyzov, Alexej and Wang, Wei}, 17 | booktitle={International Conference on Big Data Analytics and Knowledge Discovery}, 18 | pages={17--30}, 19 | year={2022}, 20 | organization={Springer} 21 | } 22 | 23 | @misc{yann-religion, 24 | title="Twitter, Absurd statement about causal inference 25 | and religion", 26 | author="Yann LeCun", 27 | howpublished="\url{https://twitter.com/ylecun/status/1577128801620070400}" 28 | } 29 | 30 | @misc{yann-text, 31 | title="Twitter, Absurd statement 32 | about all 33 | the text ever written", 34 | author="Yann LeCun", 35 | howpublished="\url{https://twitter.com/ylecun/status/1562137291845521408}" 36 | } 37 | 38 | @misc{deft1, 39 | title="Causal DAG extraction from a library of books or videos/movies", 40 | author="Robert R. Tucci", 41 | howpublished="\url{https://arxiv.org/abs/2211.00486}" 42 | } 43 | 44 | 45 | @misc{tic-tac-toe, 46 | title="deft-tic-tac-toe at github", 47 | author="Robert R. 
Tucci", 48 | howpublished="\url{https://github.com/rrtucci/deft-tic-tac-toe}" 49 | } 50 | 51 | @misc{project-gutenberg, 52 | title="Project {G}utenberg website", 53 | howpublished="\url{https://www.gutenberg.org}" 54 | } 55 | 56 | @misc{imsdb, 57 | title="Internet {M}ovie {S}cript {D}atabase ({IMSDb})", 58 | howpublished="\url{https://imsdb.com/}" 59 | } 60 | 61 | @misc{github-mappa-mundi, 62 | title="Mappa {M}undi at github", 63 | author="Robert R. Tucci", 64 | howpublished="\url{https://github.com/rrtucci/mappa_mundi}" 65 | } 66 | 67 | @misc{audio-description, 68 | title="Audio description", 69 | author="Wikipedia", 70 | howpublished="\url{https://en.wikipedia.org/wiki/Audio_description}" 71 | } 72 | 73 | @misc{scumpy, 74 | title="{SCuMpy} at github", 75 | author="Robert R. Tucci", 76 | howpublished="\url{https://github.com/rrtucci/scumpy}" 77 | } 78 | 79 | @misc{sentence-ax, 80 | title="Sentence{A}x at github", 81 | author="Robert R. Tucci", 82 | howpublished="\url{https://github.com/rrtucci/SentenceAx}" 83 | } 84 | 85 | @misc{fitbit-dataset, 86 | title="FitBit Fitness Tracker Data", 87 | author="Kaggle.com", 88 | howpublished="\url{https://www.kaggle.com/datasets/arashnic/fitbit}" 89 | } 90 | 91 | @misc{causal-fitbit, 92 | title="Causal{F}itbit at github", 93 | author="Robert R. Tucci", 94 | howpublished="\url{https://github.com/rrtucci/CausalFitbit}" 95 | } 96 | 97 | @misc{sbert, 98 | title="{sBERT}", 99 | author="sbert.net", 100 | howpublished="\url{https://www.sbert.net/}" 101 | } 102 | 103 | @misc{openie6, 104 | title="{Openie6}", 105 | author="dair-iitd", 106 | howpublished="\url{https://github.com/dair-iitd/openie6}" 107 | } 108 | 109 | 110 | -------------------------------------------------------------------------------- /simp_deprecated/simp_spacy1.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | Refs: 7 | https://spacy.io/usage/spacy-101/ 8 | 9 | https://subscription.packtpub.com/book/data/9781838987312/2/ch02lvl1sec13/splitting-sentences-into-clauses 10 | """ 11 | from globals import * 12 | import spacy 13 | 14 | nlp = spacy.load('en_core_web_sm') 15 | 16 | 17 | # sentence = "He eats cheese, but he won't eat ice cream." 18 | 19 | def simplify_ztz(sentence, verbose=False): 20 | """ 21 | This method simplifies the sentence `sentence`. It returns a list of 22 | simple sentences extracted from the input sentence. 
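    For the sentence commented out above, the intended behaviour is roughly
    (illustrative only; the exact clause boundaries depend on the spaCy
    model):

        simplify_ztz("He eats cheese, but he won't eat ice cream.")
        # -> something like ['He eats cheese', "he won't eat ice cream"]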
23 | 24 | Parameters 25 | ---------- 26 | sentence: str 27 | verbose: bool 28 | kwargs: dict[] 29 | 30 | Returns 31 | ------- 32 | list[str] 33 | 34 | """ 35 | 36 | 37 | doc = nlp(sentence) 38 | 39 | for token in doc: 40 | ancestors = [t.text for t in token.ancestors] 41 | children = [t.text for t in token.children] 42 | # if verbose: 43 | # print(token.text, "\t", token.i, "\t", 44 | # token.pos_, "\t", token.dep_, "\t", 45 | # ancestors, "\t", children) 46 | 47 | def find_root_of_sentence(doc): 48 | root_token = None 49 | for token in doc: 50 | if (token.dep_ == "ROOT"): 51 | root_token = token 52 | return root_token 53 | 54 | root_token = find_root_of_sentence(doc) 55 | 56 | def find_other_verbs(doc, root_token): 57 | other_verbs = [] 58 | for token in doc: 59 | ancestors = list(token.ancestors) 60 | if (token.pos_ == "VERB" and len(ancestors) == 1 \ 61 | and ancestors[0] == root_token): 62 | other_verbs.append(token) 63 | return other_verbs 64 | 65 | other_verbs = find_other_verbs(doc, root_token) 66 | 67 | def get_clause_token_span_for_verb(verb, doc, all_verbs): 68 | first_token_index = len(doc) 69 | last_token_index = 0 70 | this_verb_children = list(verb.children) 71 | for child in this_verb_children: 72 | if (child not in all_verbs): 73 | if (child.i < first_token_index): 74 | first_token_index = child.i 75 | if (child.i > last_token_index): 76 | last_token_index = child.i 77 | return (first_token_index, last_token_index) 78 | 79 | token_spans = [] 80 | all_verbs = [root_token] + other_verbs 81 | for other_verb in all_verbs: 82 | (first_token_index, last_token_index) = \ 83 | get_clause_token_span_for_verb(other_verb, 84 | doc, all_verbs) 85 | token_spans.append((first_token_index, 86 | last_token_index)) 87 | 88 | sentence_clauses = [] 89 | for token_span in token_spans: 90 | start = token_span[0] 91 | end = token_span[1] 92 | if (start < end): 93 | clause = doc[start:end] 94 | sentence_clauses.append(clause) 95 | sentence_clauses = sorted(sentence_clauses, 96 | key=lambda tup: tup[0]) 97 | 98 | clauses_text = [clause.text for clause in sentence_clauses] 99 | if verbose: 100 | print(sentence) 101 | print(clauses_text) 102 | 103 | return clauses_text 104 | -------------------------------------------------------------------------------- /simp_deprecated/simp_spacy4.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | References: 7 | 8 | 1. 
"Knowledge graphs from complex text" by Karthika Vijayan (Solution 9 | Consultant @Sahaj ) 10 | https://medium.com/inspiredbrilliance/knowledge-graphs-from-complex-text-eb009aeed48e 11 | 12 | """ 13 | import spacy 14 | nlp = spacy.load('en_core_web_lg') 15 | import coreferee 16 | import spacy_transformers 17 | 18 | def coref_resolve(text): 19 | nlp1 = spacy.load('en_core_web_trf') 20 | nlp1.add_pipe('coreferee') 21 | doc1 = nlp1(text) 22 | tok_list = list(token.text for token in doc1) 23 | c = 0 24 | for chain in doc1._.coref_chains: 25 | for mention in chain: 26 | res1 = [doc1._.coref_chains.resolve(doc1[i]) for i in mention] 27 | res = list(filter((None).__ne__, res1)) 28 | if len(res) != 0: 29 | if len(res[0]) == 1: 30 | tok_list[mention[0] + c] = str(res[0][0]) 31 | elif len(res[0]) > 1: 32 | tok_list[mention[0] + c] = str(res[0][0]) 33 | for j in range(1, len(res[0])): 34 | tok_list.insert(mention[0] + c + j, str(res[0][j])) 35 | c = c + 1 36 | textres = " ".join(tok_list) 37 | return textres 38 | 39 | 40 | def compound_to_simple(sentence): 41 | doc = nlp(sentence) 42 | 43 | root_token = None 44 | for token in doc: 45 | if (token.dep_ == "ROOT"): 46 | root_token = token 47 | 48 | other_verbs = [] 49 | for token in doc: 50 | ancestors = list(token.ancestors) 51 | if (token.pos_ == "VERB" and len( 52 | ancestors) < 3 and token != root_token): 53 | other_verbs.append(token) 54 | 55 | token_spans = [] 56 | all_verbs = [root_token] + other_verbs 57 | for other_verb in all_verbs: 58 | first_token_index = len(doc) 59 | last_token_index = 0 60 | this_verb_children = list(other_verb.children) 61 | for child in this_verb_children: 62 | if (child not in all_verbs): 63 | if (child.i < first_token_index): 64 | first_token_index = child.i 65 | if (child.i > last_token_index): 66 | last_token_index = child.i 67 | token_spans.append((first_token_index, last_token_index)) 68 | 69 | sentence_clauses = [] 70 | for token_span in token_spans: 71 | start = token_span[0] 72 | end = token_span[1] 73 | if (start < end): 74 | clause = doc[start:end] 75 | sentence_clauses.append(clause) 76 | sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0]) 77 | clauses_text = [clause.text for clause in sentence_clauses] 78 | return clauses_text 79 | 80 | def simplify_ztz(sentence, verbose=False): 81 | """ 82 | This method simplifies the sentence `sentence`. It returns a list of 83 | simple sentences extracted from the input sentence. 84 | 85 | Parameters 86 | ---------- 87 | sentence: str 88 | verbose: bool 89 | kwargs: dict[] 90 | 91 | Returns 92 | ------- 93 | list[str] 94 | 95 | """ 96 | 97 | textres = coref_resolve(sentence) 98 | ztz_list = compound_to_simple(textres) 99 | if verbose: 100 | print(sentence.strip()) 101 | print(ztz_list) 102 | return ztz_list -------------------------------------------------------------------------------- /simp_spacy3.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | Refs: 7 | https://spacy.io/usage/spacy-101/ 8 | 9 | For spacy, here are some values of token.dep_ 10 | 11 | cc: coordinating conjunction. 12 | i.e., FANBOYS = for, and, nor, but, or, yet, so 13 | 14 | mark: marker that introduces a subordinate clause 15 | 16 | ADP: adposition, e.g. 
in, to, during 17 | 18 | """ 19 | 20 | import spacy 21 | import re 22 | from globals import * 23 | 24 | nlp = spacy.load("en_core_web_sm") 25 | nlp.add_pipe("merge_entities") 26 | 27 | 28 | def simplify_ztz(sentence, verbose=False): 29 | """ 30 | This method simplifies the sentence `sentence`. It returns a list of 31 | simple sentences extracted from the input sentence. 32 | 33 | Parameters 34 | ---------- 35 | sentence: str 36 | verbose: bool 37 | 38 | Returns 39 | ------- 40 | list[str] 41 | 42 | """ 43 | doc = nlp(sentence) 44 | tokenized_clauses_list = [] 45 | tokenized_clause = [] 46 | for token in doc: 47 | cond = (token.dep_ == "mark") or \ 48 | (token.dep_ == "cc") or \ 49 | (token.text == ";") 50 | if not cond: 51 | tokenized_clause.append(token) 52 | else: 53 | tokenized_clauses_list.append(tokenized_clause) 54 | tokenized_clause = [] 55 | # last clause 56 | tokenized_clauses_list.append(tokenized_clause) 57 | 58 | ztz_list = [] 59 | for tokenized_clause in tokenized_clauses_list: 60 | 61 | # replace by empty list any tokenized clause 62 | # that doesn't have a noun/pronoun and a verb 63 | clause_has_noun_or_pronoun = False 64 | clause_has_verb = False 65 | token_str_list = [] 66 | for token in tokenized_clause: 67 | x = get_simplified_token_txt(token) 68 | if x: 69 | token_str_list.append(x) 70 | if token.pos_ in ["NOUN", "PRON", "PROPN"] and x: 71 | clause_has_noun_or_pronoun = True 72 | # print("NOUN or PRONOUN", token.text) 73 | if token.pos_ in ["VERB", "AUX"] and x: 74 | clause_has_verb = True 75 | # print("VERB", token.text) 76 | if not (clause_has_noun_or_pronoun and clause_has_verb): 77 | clause_str = [] 78 | else: 79 | clause_str = " ".join(token_str_list) 80 | 81 | if clause_str: 82 | ztz_list.append(clause_str) 83 | 84 | if verbose: 85 | print(sentence.strip()) 86 | print(ztz_list) 87 | return ztz_list 88 | 89 | 90 | def get_simplified_token_txt(token): 91 | """ 92 | This auxiliary method takes as input a SpaCy Token `token` and returns a 93 | simplified version of the token's text. 94 | 95 | Parameters 96 | ---------- 97 | token: Token 98 | 99 | Returns 100 | ------- 101 | str 102 | 103 | """ 104 | x = token.text 105 | # remove all punctuation marks 106 | x = re.sub(r'[^\w\s]', '', x) 107 | if token.ent_type_: 108 | # replace named entities by their labels 109 | # x = token.ent_type_ 110 | 111 | # remove named entities 112 | x = "" 113 | if token.is_stop and (token.pos_ not in RETAINED_STOPWORD_POS): 114 | x = "" 115 | if token.pos_ not in RETAINED_POS: 116 | x = "" 117 | # remove single character tokens 118 | if len(x.strip()) == 1: 119 | x = "" 120 | x = x.strip() 121 | return x 122 | -------------------------------------------------------------------------------- /similarity_deprecated/similarity_spacy.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains a function `ztz_similarity(ztz1, ztz2)` 4 | that returns the similarity of sentences `ztz1` and `ztz2`. 5 | ztz = sentence 6 | 7 | It uses SpaCy + WordVec 8 | 9 | Ref: 10 | 11 | 12 | """ 13 | 14 | from itertools import product 15 | import numpy as np 16 | import spacy 17 | from globals import * 18 | 19 | nlp = spacy.load('en_core_web_lg') 20 | 21 | 22 | def ztz_similarity(ztz1, ztz2, **kwargs): 23 | """ 24 | This method returns the similarity between sentences `ztz1` and `ztz2`. 25 | The similarity is measured as odds of a probability, so it ranges from 0 26 | to infinity. 
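    As a purely arithmetic illustration of the odds mapping used below (the
    probability value is invented, not produced by the code):

        prob = 0.75               # hypothetical mean best-match similarity
        odds = prob / (1 - prob)  # = 3.0

    Identical sentences reach prob == 1 and are capped at the fixed odds
    value of 1000.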
27 | 28 | Parameters 29 | ---------- 30 | ztz1: str 31 | ztz2: str 32 | 33 | Returns 34 | ------- 35 | float 36 | 37 | """ 38 | 39 | def same_pos(token1, token2): 40 | # this gives same simi but elapsed time is less 41 | return token1.pos_ == token2.pos_ 42 | 43 | doc1 = nlp(ztz1) 44 | doc2 = nlp(ztz2) 45 | sp_tokens1 = [token1 for token1 in doc1 \ 46 | if token1.pos_ in RETAINED_POS] 47 | sp_tokens2 = [token2 for token2 in doc2 \ 48 | if token2.pos_ in RETAINED_POS] 49 | token_pair_to_simi = {} 50 | for token1, token2 in product(sp_tokens1, sp_tokens2): 51 | if same_pos(token1, token2): 52 | simi = nlp(token1.text.lower()). \ 53 | similarity(nlp(token2.text.lower())) 54 | # print("llkj", token1.text, token2.text, token1.pos_, simi) 55 | if simi is not None: 56 | token_pair_to_simi[(token1, token2)] = simi 57 | # print("ffgh", "****************") 58 | # ("mmnk", token_pair_to_simi) 59 | score1 = 0.0 60 | count1 = 0 61 | for token1 in sp_tokens1: 62 | simi_list = [token_pair_to_simi[(token1, token2)] 63 | for token2 in sp_tokens2 64 | if same_pos(token1, token2)] 65 | if simi_list: 66 | best_score = max(simi_list) 67 | score1 += best_score 68 | count1 += 1 69 | if count1: 70 | score1 /= count1 71 | 72 | score2 = 0.0 73 | count2 = 0 74 | for token2 in sp_tokens2: 75 | simi_list = [token_pair_to_simi[(token1, token2)] 76 | for token1 in sp_tokens1 77 | if same_pos(token1, token2)] 78 | if simi_list: 79 | best_score = max(simi_list) 80 | score2 += best_score 81 | count2 += 1 82 | if count2: 83 | score2 /= count2 84 | prob = (score1 + score2) / 2 85 | if prob < 1: 86 | odds = prob / (1 - prob) 87 | else: 88 | odds = 1000 89 | return round(odds, 3) 90 | 91 | 92 | """ 93 | ************ simi definition from: similarity_spacy 94 | 1. Cats are beautiful animals. 95 | 2. Dogs are awesome. 96 | simi(1, 2)= 2.578 97 | simi(2, 1)= 2.578 98 | 99 | 1. Cats are beautiful animals. 100 | 2. Some gorgeous creatures are felines. 101 | simi(1, 2)= 2.697 102 | simi(2, 1)= 2.697 103 | 104 | 1. Cats are beautiful animals. 105 | 2. Dolphins are swimming mammals. 106 | simi(1, 2)= 2.535 107 | simi(2, 1)= 2.535 108 | 109 | 1. Cats are beautiful animals. 110 | 2. Cats are beautiful animals. 111 | simi(1, 2)= 1000 112 | simi(2, 1)= 1000 113 | 114 | 1. Cats are beautiful animals. 115 | 2. Cats are beauti animals. 116 | simi(1, 2)= 7.986 117 | simi(2, 1)= 7.986 118 | 119 | ************ simi definition from: similarity_spacy 120 | 1. apple 121 | 2. horse 122 | simi(1, 2)= 0.247 123 | simi(2, 1)= 0.247 124 | 125 | 1. Paul 126 | 2. John 127 | simi(1, 2)= 0.0 128 | simi(2, 1)= 0.0 129 | 130 | 1. The cat sat on the mat. 131 | 2. The dog lay on the rug. 132 | simi(1, 2)= 1.678 133 | simi(2, 1)= 1.678 134 | elapsed time= 0.14391398429870605 135 | 136 | """ 137 | -------------------------------------------------------------------------------- /similarity_nltk.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains a function `ztz_similarity(ztz1, ztz2)` 4 | that returns the similarity of sentences `ztz1` and `ztz2`. 
5 | ztz = sentence 6 | 7 | It uses NLTK + WordNet 8 | 9 | Ref: 10 | https://nlpforhackers.io/wordnet-sentence-similarity/ 11 | 12 | """ 13 | 14 | from nltk import word_tokenize, pos_tag 15 | from nltk.corpus import wordnet as wn 16 | from itertools import product 17 | from collections import defaultdict 18 | from time import time 19 | 20 | 21 | def penn_to_wn(tag): 22 | """ 23 | Convert a Penn Treebank tag to a simplified Wordnet tag 24 | 25 | Parameters 26 | ---------- 27 | tag: str 28 | 29 | Returns 30 | ------- 31 | str 32 | 33 | """ 34 | if tag.startswith('N'): 35 | return 'n' # noun 36 | 37 | if tag.startswith('V'): 38 | return 'v' # verb 39 | 40 | if tag.startswith('J'): 41 | return 'a' # adjective 42 | 43 | if tag.startswith('R'): 44 | return 'r' # adverb 45 | 46 | return None 47 | 48 | 49 | def synset_for_tgd_word(tgd_word): 50 | """ 51 | This private method returns the most likely synset for a tagged word 52 | `tgd_word`. A synset (synonym set) is a sort of equivalence class of 53 | words with very similar meanings. 54 | 55 | Parameters 56 | ---------- 57 | tgd_word: tuple(str, str) 58 | 59 | Returns 60 | ------- 61 | wn.synset or None 62 | 63 | """ 64 | word, tag = tgd_word 65 | wn_tag = penn_to_wn(tag) 66 | if wn_tag is None: 67 | return None 68 | 69 | try: 70 | return wn.synsets(word, wn_tag)[0] 71 | except: 72 | return None 73 | 74 | 75 | def ztz_similarity(ztz1, ztz2, **kwargs): 76 | """ 77 | This method returns the similarity between sentences `ztz1` and `ztz2`. 78 | The similarity is measured as odds of a probability, so it ranges from 0 79 | to infinity. 80 | 81 | Parameters 82 | ---------- 83 | ztz1: str 84 | ztz2: str 85 | 86 | Returns 87 | ------- 88 | float 89 | 90 | """ 91 | 92 | do_time = False 93 | if do_time: 94 | print("similarity start", time()) 95 | # Tokenize and tag 96 | tgd_ztz1 = pos_tag(word_tokenize(ztz1.lower())) 97 | tgd_ztz2 = pos_tag(word_tokenize(ztz2.lower())) 98 | 99 | # Get the synsets for the tagged words (tgd_word) 100 | all_ss1 = [] 101 | for tgd_word in tgd_ztz1: 102 | ss1 = synset_for_tgd_word(tgd_word) 103 | if ss1: 104 | all_ss1.append(ss1) 105 | all_ss2 = [] 106 | for tgd_word in tgd_ztz2: 107 | ss2 = synset_for_tgd_word(tgd_word) 108 | if ss2: 109 | all_ss2.append(ss2) 110 | 111 | ss_pair_to_simi = defaultdict(lambda: 0) 112 | if do_time: 113 | print("similarity begin path_similarity()", time()) 114 | for ss1, ss2 in product(all_ss1, all_ss2): 115 | simi = ss1.path_similarity(ss2) 116 | if simi is not None: 117 | ss_pair_to_simi[(ss1, ss2)] = simi 118 | 119 | score1 = 0.0 120 | count1 = 0 121 | for ss1 in all_ss1: 122 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss2 in all_ss2] 123 | if simi_list: 124 | best_score = max(simi_list) 125 | score1 += best_score 126 | count1 += 1 127 | if count1: 128 | score1 /= count1 129 | 130 | score2 = 0.0 131 | count2 = 0 132 | for ss2 in all_ss2: 133 | simi_list = [ss_pair_to_simi[(ss1, ss2)] for ss1 in all_ss1] 134 | if simi_list: 135 | best_score = max(simi_list) 136 | score2 += best_score 137 | count2 += 1 138 | if count2: 139 | score2 /= count2 140 | prob = (score1 + score2) / 2 141 | if prob < 1: 142 | odds = prob / (1 - prob) 143 | else: 144 | odds = 1000 145 | if do_time: 146 | print("similarity ends", time()) 147 | return round(odds, 3) 148 | 149 | 150 | """ 151 | ************ simi definition from: similarity_nltk 152 | 1. Cats are beautiful animals. 153 | 2. Dogs are awesome. 154 | simi(1, 2)= 1.045 155 | simi(2, 1)= 1.045 156 | 157 | 1. Cats are beautiful animals. 158 | 2. 
Some gorgeous creatures are felines. 159 | simi(1, 2)= 2.429 160 | simi(2, 1)= 2.429 161 | 162 | 1. Cats are beautiful animals. 163 | 2. Dolphins are swimming mammals. 164 | simi(1, 2)= 0.733 165 | simi(2, 1)= 0.733 166 | 167 | 1. Cats are beautiful animals. 168 | 2. Cats are beautiful animals. 169 | simi(1, 2)= 1000 170 | simi(2, 1)= 1000 171 | 172 | ************ simi definition from: similarity_nltk 173 | 1. apple 174 | 2. horse 175 | simi(1, 2)= 0.056 176 | simi(2, 1)= 0.056 177 | 178 | 1. Paul 179 | 2. John 180 | simi(1, 2)= 0.083 181 | simi(2, 1)= 0.083 182 | 183 | 1. The cat sat on the mat. 184 | 2. The dog lay on the rug. 185 | simi(1, 2)= 0.353 186 | simi(2, 1)= 0.353 187 | elapsed time= 0.006499767303466797 188 | 189 | """ 190 | -------------------------------------------------------------------------------- /simplifying.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains functions for simplifying movie scripts (or short stories). 4 | 5 | input directory: m_scripts_spell or short_stories_spell 6 | output directory: m_scripts_simp or short_stories_simp 7 | 8 | Simplification is done by the function `simplify_ztz()`. This function was 9 | implemented in several ways before we decided to stick with the version in 10 | file `simp_spacy3`. 11 | 12 | simp_spacy1.py 13 | simp_spacy2.py 14 | simp_spacy3.py (recommended) 15 | simp_spacy-claucy.py 16 | simp_stanford.py 17 | 18 | The input files have only one sentence per line. For each file, we use SpaCy 19 | to break each sentence into clauses. Then we simplify the clauses by 20 | removing stop-words, punctuation marks, proper nouns (a.k.a. named entities) 21 | and other excess baggage. Then we replace each clause by its simplified 22 | version. Different simplified clauses from the same sentence are put in the 23 | same line, separated by a separator-token. Some sentences are diminished to 24 | nothing after the simplification. Those sentences are replaced by a single 25 | asterisk. 26 | 27 | """ 28 | from globals import * 29 | import os 30 | import re 31 | import importlib as imp 32 | 33 | zsimp = imp.import_module(ZTZ_SIMPLIFIER) 34 | from utils import * 35 | 36 | 37 | def simplify_one_m_script( 38 | in_dir, out_dir, 39 | file_name, 40 | verbose=False, 41 | use_gpu=False): 42 | """ 43 | in_dir and out_dir can be the same, but this will overwrite the files. 44 | 45 | This method reads a file called `file_name` in the `in_dir` directory 46 | and creates a simplified version in the `out_dir` directory. 
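    A typical call, using the directory layout described in the module
    docstring above (the file name is only an illustration and need not be
    present in every checkout):

        simplify_one_m_script(in_dir="m_scripts_spell",
                              out_dir="m_scripts_simp",
                              file_name="toy-story.txt",
                              verbose=True)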
47 | 48 | 49 | Parameters 50 | ---------- 51 | in_dir: str 52 | out_dir: str 53 | file_name: str 54 | verbose: bool 55 | use_gpu: bool 56 | 57 | Returns 58 | ------- 59 | None 60 | 61 | """ 62 | inpath = in_dir + "/" + file_name 63 | outpath = out_dir + "/" + file_name 64 | new_lines = [] 65 | with open(inpath, "r") as f: 66 | count = 1 67 | for line in f: 68 | if verbose: 69 | print(str(count) + ".") 70 | simple_ztz_list = zsimp.simplify_ztz(line, 71 | verbose=verbose, 72 | use_gpu=use_gpu) 73 | 74 | # remove empty clauses 75 | simple_ztz_list = [ztz for ztz in simple_ztz_list if ztz] 76 | 77 | if simple_ztz_list == []: 78 | simple_ztz_list = [ZTZ_SEPARATOR] 79 | 80 | # replace multiple white spaces by single white space 81 | simple_ztz_list = [re.sub(r'\s+', ' ', ztz) for ztz in 82 | simple_ztz_list] 83 | 84 | if len(simple_ztz_list) > 1: 85 | xx = " " + ZTZ_SEPARATOR + " " 86 | new_lines.append(xx.join(simple_ztz_list)) 87 | elif len(simple_ztz_list) == 1: 88 | new_lines.append(simple_ztz_list[0]) 89 | else: 90 | assert False 91 | 92 | count += 1 93 | with open(outpath, "w") as f: 94 | for line in new_lines: 95 | f.write(line + "\n") 96 | 97 | 98 | def simplify_batch_of_m_scripts( 99 | in_dir, out_dir, 100 | batch_file_names, 101 | verbose=False): 102 | """ 103 | This method calls the method `simplify_one_m_script` for all the file 104 | names in the list of file names `batch_file_names`. 105 | 106 | 107 | Parameters 108 | ---------- 109 | in_dir: str 110 | out_dir: str 111 | batch_file_names: list[str] 112 | verbose: bool 113 | 114 | Returns 115 | ------- 116 | None 117 | 118 | """ 119 | all_file_names = my_listdir(in_dir) 120 | assert set(batch_file_names).issubset(set(all_file_names)) 121 | for file_name in batch_file_names: 122 | i = all_file_names.index(file_name) 123 | print('%i.' % (i + 1), file_name) 124 | simplify_one_m_script(in_dir, out_dir, file_name, verbose) 125 | 126 | 127 | if __name__ == "__main__": 128 | 129 | def main1(): 130 | print("************ simplifier:", ZTZ_SIMPLIFIER) 131 | ztz = \ 132 | 'The man, who had never liked the words' \ 133 | ' "booby" and "boobyhatch,"' \ 134 | ' and who liked them even less on a shining morning when there' \ 135 | ' was a unicorn in the garden, thought for a moment.' 
136 | zsimp.simplify_ztz(ztz, verbose=True) 137 | 138 | 139 | def main2(): 140 | print("************ simplifier:", ZTZ_SIMPLIFIER) 141 | path = "simplifying_test.txt" 142 | with open(path, "r") as f: 143 | count = 1 144 | for line in f: 145 | print(str(count) + ".") 146 | zsimp.simplify_ztz(line, verbose=True) 147 | count += 1 148 | 149 | 150 | def main3(): 151 | print("************ simplifier:", ZTZ_SIMPLIFIER) 152 | in_dir = "short_stories_spell" 153 | out_dir = "short_stories_simp" 154 | batch_file_names = my_listdir(in_dir)[0:3] 155 | simplify_batch_of_m_scripts( 156 | in_dir, out_dir, 157 | batch_file_names, 158 | verbose=False) 159 | 160 | 161 | def main4(): 162 | print("************ simplifier:", ZTZ_SIMPLIFIER) 163 | remove_dialogs = False 164 | in_dir = SPELL_DIR if not remove_dialogs else SPELL_RD_DIR 165 | out_dir = SIMP_DIR if not remove_dialogs else SIMP_RD_DIR 166 | batch_file_names = my_listdir(in_dir)[0:3] 167 | simplify_batch_of_m_scripts( 168 | in_dir, out_dir, 169 | batch_file_names) 170 | 171 | 172 | main1() 173 | main2() 174 | # main3() 175 | # main4() 176 | -------------------------------------------------------------------------------- /downloading_imsdb.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | The goal of this file is to scrape the 1,100+ movie scripts from the IMSDb 4 | website. 5 | 6 | References: 7 | https://github.com/j2kun/imsdb_download_all_scripts 8 | https://github.com/AdeboyeML/Film_Script_Analysis 9 | https://www.datacamp.com/tutorial/scraping-reddit-python-scrapy 10 | 11 | In Chrome and most web browsers, pressing Ctrl+U opens the current page's 12 | source code in a new window. 13 | 14 | 3 depths, d0, d1, d2 15 | 16 | d0_url 17 | https://imsdb.com/all-scripts.html 18 | 19 | d1_url (depends on movie) 20 | https://imsdb.com/Movie%20Scripts/10%20Things%20I%20Hate%20About%20You%20Script.html 21 | 22 | d2_url (depends on movie) 23 | https://imsdb.com/scripts/10-Things-I-Hate-About-You.html 24 | 25 | find_all() takes you from X->Y 26 | (d0_html, d0_soup)->d1_url 27 | (d1_html, d1_soup)->d2_url 28 | """ 29 | from bs4 import BeautifulSoup 30 | import requests 31 | from slugify import slugify # python-slugify 32 | from globals import * 33 | 34 | 35 | def get_d1_urls_and_titles(): 36 | """ 37 | This auxiliary method returns lists `d1_urls` and `titles`. 38 | 39 | Returns 40 | ------- 41 | list[str], list[str] 42 | 43 | """ 44 | d1_urls = [] 45 | titles = [] 46 | d0_url = BASE_URL + "/all-scripts.html" 47 | d0_html = requests.get(d0_url).text 48 | d0_soup = BeautifulSoup(d0_html, "html.parser") 49 | for p_tag in d0_soup.find_all('p'): 50 | d1_url = p_tag.a['href'] 51 | cond1 = "/Movie Scripts/" in d1_url 52 | cond2 = ".html" in d1_url 53 | if cond1 and cond2: 54 | title = d1_url.replace("/Movie Scripts/", ""). \ 55 | replace(" Script.html", ""). \ 56 | replace(".html", "") 57 | d1_urls.append(BASE_URL + d1_url) 58 | titles.append(title) 59 | return d1_urls, titles 60 | 61 | 62 | def get_one_m_script(d1_url, stub_only=False): 63 | """ 64 | This method scrapes one movie script with d1-level URL `d1_url`. 65 | 66 | Parameters 67 | ---------- 68 | d1_url: str 69 | stub_only: bool 70 | True iff don't want to scrape the movie script text at all. Instead 71 | of the movie script text, it leaves a message "coming soon to a 72 | theater near you" 73 | 74 | Returns 75 | ------- 76 | str, bool 77 | the movie script, and a boolean indicating if it's missing. 
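    For instance, a hypothetical call such as

        m_script, missing = get_one_m_script(
            BASE_URL + "/Movie Scripts/Toy Story Script.html")

    would return the script text together with missing == False when the
    scrape succeeds; the URL above merely mimics the d1_url pattern shown
    in the module docstring and is not guaranteed to exist.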
78 | 79 | """ 80 | missing = False 81 | tail = d1_url.split('/')[-1].replace(".html", "") 82 | if stub_only: 83 | m_script = "coming soon to a theater near you" 84 | else: 85 | # print("nabf", d1_url) 86 | d1_html = requests.get(d1_url).text 87 | d1_soup = BeautifulSoup(d1_html, "html.parser") 88 | p_tags = d1_soup.find_all('p', align="center") 89 | if not p_tags: 90 | print('**************** Missing: %s' % tail) 91 | missing = True 92 | return "coming soon to a theater near you", missing 93 | assert len(p_tags) == 1 94 | d2_url = p_tags[0].a['href'] 95 | d2_url = BASE_URL + d2_url 96 | # print("nnfx", d2_url) 97 | d2_html = requests.get(d2_url).text 98 | d2_soup = BeautifulSoup(d2_html, "html.parser") 99 | # tried this. Doesn't always work 100 | # pre_tags = d2_soup.find_all('pre') 101 | pre_tags = d2_soup.find_all('td', {'class': "scrtext"}) 102 | if not pre_tags: 103 | print('**************** Missing: %s' % tail) 104 | missing = True 105 | return "coming soon to a theater near you", missing 106 | m_script = pre_tags[0].get_text() 107 | # m_script = clean_m_script(m_script) 108 | return m_script, missing 109 | 110 | 111 | def get_batch_of_m_scripts(first=1, last=5000, stub_only=False): 112 | """ 113 | This method scrapes the movie scripts starting at position `first` and 114 | ending at position `last`. If `last` is larger than the number of movie 115 | scripts at IMSDb, then the method ends when it has scraped all movie 116 | scripts. 117 | 118 | Parameters 119 | ---------- 120 | first: int 121 | last: int 122 | stub_only: bool 123 | 124 | Returns 125 | ------- 126 | None 127 | 128 | """ 129 | d1_urls, titles = get_d1_urls_and_titles() 130 | num_titles = len(titles) 131 | missing_m_scripts = [] 132 | assert first <= last 133 | if last > num_titles: 134 | last = num_titles 135 | if first < 1: 136 | first = 1 137 | for i in range(first - 1, last): 138 | d1_url = d1_urls[i] 139 | dashed_title = slugify(titles[i]) 140 | print('%i. 
fetching %s' % (i + 1, dashed_title)) 141 | m_script, missing = get_one_m_script(d1_url, stub_only=stub_only) 142 | outpath = M_SCRIPTS_DIR + '/' + dashed_title + '.txt' 143 | if missing: 144 | missing_m_scripts.append(dashed_title + '.txt') 145 | else: 146 | written = False 147 | len_script = len(m_script) 148 | print("m_script num of characters=", len_script) 149 | if len_script > 500: 150 | with open(outpath, "w", newline="\n") as f: 151 | f.write(m_script) 152 | written = True 153 | if not written: 154 | # m-scripts with less than 500 char are just stubs 155 | print("------------------ Found just a stub: ", dashed_title) 156 | missing_m_scripts.append(dashed_title + '.txt') 157 | print("missing m_scripts:") 158 | print(missing_m_scripts) 159 | print("number of missing m_scripts=", len(missing_m_scripts)) 160 | 161 | 162 | if __name__ == "__main__": 163 | def main1(): 164 | urls, titles = get_d1_urls_and_titles() 165 | print(urls) 166 | print(titles) 167 | assert len(urls) == len(titles) 168 | print("number of films=", len(urls)) # 1211 169 | # 75 missing 170 | # 1211-75=1136 expected 238 MB 171 | 172 | 173 | def main2(): 174 | get_batch_of_m_scripts(first=1, last=100, stub_only=False) 175 | 176 | 177 | # main1() 178 | main2() 179 | -------------------------------------------------------------------------------- /simp_openie6.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | The functions in this file are used inside the following jupyter notebook at 4 | Google Colab 5 | 6 | https://colab.research.google.com/drive/1S2EWOGkoCgjfOJzTRJ7PLeu4T8SBwhlF?usp=sharing 7 | 8 | Refs: 9 | 10 | 1. https://github.com/dair-iitd/CaRB 11 | 12 | 2. https://github.com/dair-iitd/imojie 13 | 14 | 3. https://github.com/dair-iitd/openie6 15 | 16 | """ 17 | import subprocess 18 | import os 19 | from globals import * 20 | from utils import my_listdir 21 | 22 | 23 | def openie6_simplify_batch_of_m_scripts( 24 | in_dir, out_dir, 25 | batch_file_names, 26 | verbose=False): 27 | """ 28 | This method does the same thing as the method 29 | `simplifying.simplify_batch_of_m_scripts()` but for the case 30 | `ZTZ_SIMPLIFIER = "simp_openie6"` 31 | 32 | Parameters 33 | ---------- 34 | in_dir: str 35 | out_dir: str 36 | batch_file_names: list[str] 37 | verbose: bool 38 | 39 | Returns 40 | ------- 41 | None 42 | 43 | """ 44 | # assume directories `openie6` and `mappa_mundi` 45 | # live side by side inside a bigger folder X 46 | # and that the cwd is X 47 | 48 | m_script_starting_line_nums = \ 49 | make_all_sentences_file(in_dir=in_dir, 50 | batch_file_names=batch_file_names) 51 | gpu_command = \ 52 | r"cd openie6 && CUDA_DEVICE_ORDER=PCI_BUS_ID " \ 53 | r"CUDA_VISIBLE_DEVICES=0 " \ 54 | r"PYTHONPATH=imojie:imojie/allennlp:imojie" \ 55 | r"/pytorch_transformers:$PYTHONPATH python run.py " \ 56 | r"--save models/conj_model --mode predict " \ 57 | r"--inp ../all_sentences.txt --batch_size 1 " \ 58 | r"--model_str bert-large-cased --task conj " \ 59 | r"--gpus 1 --out ../all_predictions.txt" 60 | 61 | cpu_command = gpu_command.replace("--gpus 1", "--gpus 0") 62 | 63 | if USE_GPU: 64 | os.system(gpu_command) 65 | else: 66 | os.system(cpu_command) 67 | 68 | translate_predictions_file_from_openie6_to_mm( 69 | in_fname="all_predictions.txt.conj", 70 | out_fname="all_predictions_in_mm.txt") 71 | 72 | make_m_scripts_simp_dir(out_dir, 73 | batch_file_names, 74 | m_script_starting_line_nums) 75 | 76 | os.remove("all_sentences.txt") 77 | os.remove("all_predictions.txt.conj") 78 | 
os.remove("all_predictions_in_mm.txt") 79 | 80 | 81 | def make_all_sentences_file(in_dir, batch_file_names): 82 | """ 83 | This internal method creates the file `all_sentences.txt`. 84 | `all_sentences.txt` is a concatenation of all the files in 85 | `batch_file_names`. 86 | 87 | Parameters 88 | ---------- 89 | in_dir: str 90 | batch_file_names: list[str] 91 | 92 | Returns 93 | ------- 94 | m_script_starting_line_nums: list[int] 95 | list of the starting line numbers within the file 96 | `all_sentences.txt` for the file names in the list `batch_file_names`. 97 | 98 | """ 99 | m_script_starting_line_nums = [] 100 | cum_line_num = 0 101 | with open("all_sentences.txt", "w") as big_f: 102 | for fname in batch_file_names: 103 | in_path = in_dir + '/' + fname 104 | # print("bbng", in_path) 105 | with open(in_path, "r") as f: 106 | # print("hhji", cum_line_num) 107 | m_script_starting_line_nums.append(cum_line_num) 108 | f_len = 0 109 | for line in f: 110 | f_len += 1 111 | # print("llmk", line) 112 | big_f.write(line) 113 | cum_line_num += f_len 114 | # print("nnmj", f_len) 115 | return m_script_starting_line_nums 116 | 117 | 118 | def translate_predictions_file_from_openie6_to_mm(in_fname, out_fname): 119 | """ 120 | This internal method reads the file `all_predictions.txt.conj` and 121 | translates it into a new file called `all_predictions_in_mm.txt`. The 122 | input file is in the format of openie6 extractions output and the output 123 | file is in the mappa mundi (mm) simp format. 124 | 125 | openie6 extractions output format: one sentence or empty line ("row 126 | gap") per line. Groups separated by empty lines. Each group consists of 127 | the original sentence followed by the extraction sentences. 128 | 129 | mm simp format: one sentence per line. No row gaps. Each line has all 130 | the extractions from the original sentence, separated by ZTZ_SEPARATOR. 131 | 132 | Parameters 133 | ---------- 134 | in_fname: str 135 | out_fname: str 136 | 137 | Returns 138 | ------- 139 | None 140 | 141 | """ 142 | with open(in_fname, "r") as in_file: 143 | with open(out_fname, "w") as out_file: 144 | in_parts = [] 145 | prev_line_is_empty = True 146 | for line in in_file: 147 | if line.strip(): 148 | in_parts.append(line.strip()) 149 | prev_line_is_empty = False 150 | else: 151 | if not prev_line_is_empty: 152 | if len(in_parts) > 1: 153 | in_parts = in_parts[1:] 154 | if len(in_parts) > 0: 155 | xx = " " + ZTZ_SEPARATOR + " " 156 | out_file.write(xx.join(in_parts) + "\n") 157 | in_parts = [] 158 | 159 | 160 | def make_m_scripts_simp_dir(out_dir, 161 | batch_file_names, 162 | m_script_starting_line_nums): 163 | """ 164 | This internal method reads the file `all_predictions_in_mm.txt` and it 165 | uses that to create a new directory called `out_dir` populated by files 166 | with the names in list `batch_file_names`. 
167 | 168 | Parameters 169 | ---------- 170 | out_dir: str 171 | batch_file_names: list[str] 172 | m_script_starting_line_nums: list[int] 173 | 174 | Returns 175 | ------- 176 | None 177 | 178 | """ 179 | if not os.path.exists(out_dir): 180 | os.makedirs(out_dir) 181 | with open("all_predictions_in_mm.txt", "r") as big_f: 182 | m_script_num = -1 183 | f = None 184 | for line_num, line in enumerate(big_f): 185 | if line_num in m_script_starting_line_nums: 186 | if f: 187 | f.close() 188 | m_script_num += 1 189 | fname = batch_file_names[m_script_num] 190 | out_path = out_dir + "/" + fname 191 | f = open(out_path, "w") 192 | f.write(line) 193 | if f: 194 | f.close() 195 | 196 | 197 | if __name__ == "__main__": 198 | def main(): 199 | in_dir = "short_stories_spell" 200 | batch_file_names = my_listdir(in_dir) 201 | make_all_sentences_file(in_dir=in_dir, 202 | batch_file_names=batch_file_names) 203 | translate_predictions_file_from_openie6_to_mm( 204 | "openie6_translation_test.txt", 205 | "openie6_test_answer.txt") 206 | 207 | 208 | main() 209 | -------------------------------------------------------------------------------- /jupyter_notebooks/SUMMARY.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SUMMARY notebook\n", 8 | "\n", 9 | "This notebook scans the directory in which it lives to find all jupyter notebooks (other than itself) in that directory. It then prints for every notebook it finds (1) a hyperlink to the notebook, and (2) the first cell (which is always markdown) of the notebook. This way you can read a nice, automatically generated summary of all the notebooks without having to open all of them. If you find a notebook that you want to explore further, you can simply click on its link to open it." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/markdown": [ 20 | "\n", 21 | "\n", 22 | "
\n", 23 | "\n", 24 | "Coordination_Analysis_with_IGL_CA_(OpenIE6_paper).ipynb [local link] [github link] 1/4\n", 25 | "\n", 26 | "# IGL-CA: inference pipeline\n", 27 | "Coordination analysis inference using the OpenIE6 model.\n", 28 | "\n", 29 | "* Anton's [OpenIE6 fork](https://github.com/alexeyev/openie6)\n", 30 | "* [OpenIE6 original repo](https://github.com/dair-iitd/openie6)\n", 31 | "* [OpenIE6 original paper](https://aclanthology.org/2020.emnlp-main.306/)\n", 32 | "\n", 33 | "Prepared by [Anton Alekseev](https://github.com/alexeyev) and [Anastasia Predelina](https://github.com/PredelinaAsya). Quite a bit of effort, tbh.\n", 34 | "\n", 35 | "**NOTA BENE**: GPU environment should be enabled before running the code! If not possible, another code cell for CPU-only environment is available at the very end of the notebook.\n", 36 | "\n", 37 | "
\n", 38 | "\n", 39 | "navigating_m_scripts.ipynb [local link] [github link] 2/4\n", 40 | "\n", 41 | "# Navigating Movie Scripts\n", 42 | "\n", 43 | "In this notebook, we explain\n", 44 | "how to use Mappa Mundi (MM) to do causal DEFT (DAG extraction from text)\n", 45 | "using as a test case, the following 3 movie scripts by Pixar/Disney.\n", 46 | "\n", 47 | "* [Toy Story](../m_scripts/toy-story.txt)\n", 48 | "* [Up](../m_scripts/up.txt)\n", 49 | "* [WALL-E](../m_scripts/wall-e.txt)\n", 50 | "\n", 51 | "
\n", 52 | "\n", 53 | "navigating_short_stories.ipynb [local link] [github link] 3/4\n", 54 | "\n", 55 | "# Navigating Short Stories\n", 56 | "\n", 57 | "In this notebook, we explain how to use Mappa Mundi (MM) to do causal DEFT (DAG extraction from text)\n", 58 | "using as a test case, the following 3 short stories by P.G. Wodehouse.\n", 59 | "\n", 60 | "* [Bill the Bloodhound](../short_stories/bill-the-bloodhound.txt)\n", 61 | "* [Extricating Young Gussie](../short_stories/extricating-young-gussie.txt)\n", 62 | "* [Wilton's Holiday](../short_stories/wiltons-holiday.txt)\n", 63 | "\n", 64 | "\n", 65 | "
\n", 66 | "\n", 67 | "simplifying_with_Openie6.ipynb [local link] [github link] 4/4\n", 68 | "\n", 69 | "# Simplifying with Openie6\n", 70 | "\n", 71 | "The Openie6 software takes as input a possibly complex or compound sentence X,\n", 72 | "and it returns a list of simple sentences that contain all the\n", 73 | "information in the original sentence X.\n", 74 | "\n", 75 | "Anton Alekseev (AA) and Anastasia Predelina (AP) wrote a jupyter notebook\n", 76 | "that installs and runs the code in the Openie6 repo https://github.com/alexeyev/openie6\n", 77 | "An exact copy of notebook by AA/AP is included in this folder. It is also publicly available at AA's google drive\n", 78 | "under the URL\n", 79 | "\n", 80 | " https://colab.research.google.com/drive/1samvO-SH6Xgjf9ItlhAF1EmBZo5grBQb?usp=sharing\n", 81 | "\n", 82 | "\n", 83 | "\n", 84 | "This notebook adds new code to the end of the AA/AP notebook. The purpose of the\n", 85 | "new code is\n", 86 | "to simplify short stories and movie scripts." 87 | ], 88 | "text/plain": [ 89 | "" 90 | ] 91 | }, 92 | "metadata": {}, 93 | "output_type": "display_data" 94 | } 95 | ], 96 | "source": [ 97 | "# Version: 2\n", 98 | "import os\n", 99 | "import json\n", 100 | "from IPython.display import display, Markdown\n", 101 | "\n", 102 | "# the name of this file\n", 103 | "this_fname = 'SUMMARY.ipynb'\n", 104 | "fname_to_md = {}\n", 105 | "for fname in sorted([x for x in os.listdir('./')]):\n", 106 | " if fname[-6:] == '.ipynb' and fname != this_fname:\n", 107 | " # print('------------', fname)\n", 108 | " with open(fname, 'r', encoding=\"utf-8\") as f:\n", 109 | " fdata = json.load(f)\n", 110 | " fname_to_md[fname] = ''.join(fdata['cells'][0]['source'])\n", 111 | "# print(fname_to_md)\n", 112 | "pre_sep = '\\n\\n
\\n\\n'\n", 113 | "full_md = ''\n", 114 | "k = 1\n", 115 | "num_nb = len(fname_to_md)\n", 116 | "project_name =\"mappa_mundi\"\n", 117 | "who =\"rrtucci\"\n", 118 | "where = \"jupyter_notebooks\"\n", 119 | "for fname, md in fname_to_md.items():\n", 120 | " sep = pre_sep\n", 121 | " local_link = f' [local link] '\n", 122 | " github_link = f' [github link] '\n", 124 | " sep += fname + local_link + github_link + str(k) + '/' + str(num_nb) + '\\n\\n'\n", 125 | " full_md += sep + md\n", 126 | " k += 1\n", 127 | "display(Markdown(full_md))" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3 (ipykernel)", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.10.9" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 4 152 | } 153 | -------------------------------------------------------------------------------- /post_cleaning.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains functions for post-cleaning movie scripts (or short 4 | stories). 5 | 6 | input directory: m_scripts_simp or short_stories_simp 7 | output directory: m_scripts_post_clean or short_stories_post_clean 8 | 9 | The input files have one or more sublines per line. For each file, we use 10 | post clean the sublines by removing stop-words, punctuation marks, proper 11 | nouns (a.k.a. named entities) and other excess baggage. Then we replace each 12 | subline by its post-clean version. Different clean sublines from the 13 | same sentence are put in the same line, separated by ZTZ_SEPARATOR . Some 14 | sentences are diminished to nothing after the post-cleaning. Those 15 | sentences are replaced by a single ZTZ_SEPARATOR. 16 | 17 | 18 | Refs: 19 | https://spacy.io/usage/spacy-101/ 20 | 21 | For spacy, here are some values of token.dep_ 22 | 23 | cc: coordinating conjunction. 24 | i.e., FANBOYS = for, and, nor, but, or, yet, so 25 | 26 | mark: marker that introduces a subordinate subline 27 | 28 | ADP: adposition, e.g. in, to, during 29 | 30 | """ 31 | from globals import * 32 | import importlib as imp 33 | 34 | zsimp = imp.import_module(ZTZ_SIMPLIFIER) 35 | from utils import * 36 | 37 | import spacy 38 | import re 39 | from globals import * 40 | 41 | nlp = spacy.load("en_core_web_sm") 42 | 43 | 44 | # nlp.add_pipe("merge_entities") 45 | 46 | 47 | def post_clean_line(line, verbose=False): 48 | """ 49 | This method cleans the line string `line`. It returns a list of simple 50 | sentences (sublines) extracted from the input sentence (line). 
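    A schematic example (ZTZ_SEPARATOR is the separator token imported via
    globals; the sentences are invented and the exact output depends on the
    spaCy model):

        line = "the dog barked " + ZTZ_SEPARATOR + " the cat ran away"
        post_clean_line(line)  # roughly ['the dog barked', 'the cat ran away']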
51 | 52 | Parameters 53 | ---------- 54 | line: str 55 | verbose: bool 56 | 57 | Returns 58 | ------- 59 | list[str] 60 | 61 | """ 62 | tokenized_sublines = \ 63 | [nlp(subline) for subline in line.split(ZTZ_SEPARATOR)] 64 | 65 | ztz_list = [] 66 | for tokenized_subline in tokenized_sublines: 67 | 68 | # replace by empty list any tokenized subline 69 | # that doesn't have a noun/pronoun and a verb 70 | subline_has_noun_or_pronoun = False 71 | subline_has_verb = False 72 | token_str_list = [] 73 | for token in tokenized_subline: 74 | x = get_post_cleaned_token_txt(token) 75 | if x: 76 | token_str_list.append(x) 77 | if token.pos_ in ["NOUN", "PRON", "PROPN"] and x: 78 | subline_has_noun_or_pronoun = True 79 | # print("NOUN or PRONOUN", token.text) 80 | if token.pos_ in ["VERB", "AUX"] and x: 81 | subline_has_verb = True 82 | # print("VERB", token.text) 83 | if not (subline_has_noun_or_pronoun and subline_has_verb): 84 | subline_str = [] 85 | else: 86 | subline_str = " ".join(token_str_list) 87 | 88 | if subline_str: 89 | ztz_list.append(subline_str) 90 | 91 | if verbose: 92 | print(line.strip()) 93 | print(ztz_list) 94 | return ztz_list 95 | 96 | 97 | def get_post_cleaned_token_txt(token): 98 | """ 99 | This auxiliary method takes as input a SpaCy Token `token` and returns a 100 | simplified version of the token's text. 101 | 102 | Parameters 103 | ---------- 104 | token: Token 105 | 106 | Returns 107 | ------- 108 | str 109 | 110 | """ 111 | x = token.text 112 | # remove all punctuation marks 113 | x = re.sub(r'[^\w\s]', '', x) 114 | 115 | # if token.ent_type_: 116 | # # replace named entities by their labels 117 | # # x = token.ent_type_ 118 | # 119 | # # remove named entities 120 | # x = "" 121 | # if token.is_stop and (token.pos_ not in RETAINED_STOPWORD_POS): 122 | # x = "" 123 | # if token.pos_ not in RETAINED_POS: 124 | # x = "" 125 | 126 | # remove single character tokens 127 | if len(x.strip()) == 1: 128 | x = "" 129 | x = x.strip() 130 | return x 131 | 132 | 133 | def post_clean_one_m_script( 134 | in_dir, out_dir, 135 | file_name, 136 | verbose=False): 137 | """ 138 | in_dir and out_dir can be the same, but this will overwrite the files. 139 | 140 | This method reads a file called `file_name` in the `in_dir` directory 141 | and creates a post-cleaned version in the `out_dir` directory. 
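    A typical call, using the directory names from the module docstring
    (the file name is only an illustration):

        post_clean_one_m_script(in_dir="short_stories_simp",
                                out_dir="short_stories_post_clean",
                                file_name="wiltons-holiday.txt",
                                verbose=True)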
142 | 143 | 144 | Parameters 145 | ---------- 146 | in_dir: str 147 | out_dir: str 148 | file_name: str 149 | verbose: bool 150 | 151 | Returns 152 | ------- 153 | None 154 | 155 | """ 156 | inpath = in_dir + "/" + file_name 157 | outpath = out_dir + "/" + file_name 158 | new_lines = [] 159 | with open(inpath, "r") as f: 160 | count = 1 161 | for line in f: 162 | if verbose: 163 | print(str(count) + ".") 164 | simple_ztz_list = post_clean_line(line, 165 | verbose=verbose) 166 | 167 | # remove empty simple ztz 168 | simple_ztz_list = [ztz for ztz in simple_ztz_list if ztz] 169 | 170 | if not simple_ztz_list: 171 | simple_ztz_list = [ZTZ_SEPARATOR] 172 | 173 | # replace multiple white spaces by single white space 174 | simple_ztz_list = [re.sub(r'\s+', ' ', ztz) for ztz in 175 | simple_ztz_list] 176 | 177 | if len(simple_ztz_list) > 1: 178 | xx = " " + ZTZ_SEPARATOR + " " 179 | new_lines.append(xx.join(simple_ztz_list)) 180 | elif len(simple_ztz_list) == 1: 181 | new_lines.append(simple_ztz_list[0]) 182 | else: 183 | assert False 184 | 185 | count += 1 186 | with open(outpath, "w") as f: 187 | for line in new_lines: 188 | f.write(line + "\n") 189 | 190 | 191 | def post_clean_batch_of_m_scripts( 192 | in_dir, out_dir, 193 | batch_file_names, 194 | verbose=False): 195 | """ 196 | This method calls the method `post_clean_one_m_script` for all the file 197 | names in the list of file names `batch_file_names`. 198 | 199 | 200 | Parameters 201 | ---------- 202 | in_dir: str 203 | out_dir: str 204 | batch_file_names: list[str] 205 | verbose: bool 206 | 207 | Returns 208 | ------- 209 | None 210 | 211 | """ 212 | all_file_names = my_listdir(in_dir) 213 | assert set(batch_file_names).issubset(set(all_file_names)) 214 | for file_name in batch_file_names: 215 | i = all_file_names.index(file_name) 216 | print('%i.' % (i + 1), file_name) 217 | post_clean_one_m_script(in_dir, out_dir, file_name, verbose) 218 | 219 | 220 | if __name__ == "__main__": 221 | def main1(): 222 | in_dir = "short_stories_simp" 223 | out_dir = "short_stories_post_clean" 224 | batch_file_names = my_listdir(in_dir)[0:3] 225 | post_clean_batch_of_m_scripts( 226 | in_dir, out_dir, 227 | batch_file_names, 228 | verbose=False) 229 | 230 | 231 | def main2(): 232 | remove_dialogs = False 233 | in_dir = SIMP_DIR if not remove_dialogs else SIMP_RD_DIR 234 | out_dir = POST_CLEAN_DIR if not remove_dialogs else POST_CLEAN_RD_DIR 235 | batch_file_names = my_listdir(in_dir)[0:3] 236 | post_clean_batch_of_m_scripts( 237 | in_dir, out_dir, 238 | batch_file_names) 239 | 240 | main1() 241 | main2() 242 | -------------------------------------------------------------------------------- /stopwords.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file has a function that lists all SpaCy stopwords, classified by POS ( 4 | part of speech). 5 | 6 | """ 7 | 8 | import spacy 9 | from pprint import pprint 10 | 11 | nlp = spacy.load('en_core_web_sm') 12 | 13 | 14 | def get_stopwords_dict(): 15 | """ 16 | This method returns a dictionary that maps the parts of speech (POS) to 17 | a list of the stopwords that have that POS. 
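    For example, indexing the returned dictionary by a POS tag gives the
    stopwords carrying that tag; per the sample output reproduced at the
    bottom of this file:

        get_stopwords_dict()["CCONJ"]  # ['but', 'or', 'and', 'nor', 'neither']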
18 | 19 | Returns 20 | ------- 21 | dict[str, list[str]] 22 | 23 | """ 24 | 25 | stopwords = nlp.Defaults.stopwords 26 | 27 | pos_to_stopwords = {} 28 | 29 | for word in stopwords: 30 | pos = nlp(word)[0].pos_ 31 | if pos in pos_to_stopwords: 32 | pos_to_stopwords[pos].append(word) 33 | else: 34 | pos_to_stopwords[pos] = [word] 35 | 36 | return pos_to_stopwords 37 | 38 | 39 | if __name__ == "__main__": 40 | def main(): 41 | d = get_stopwords_dict() 42 | print(sorted(d.keys())) 43 | pprint(d) 44 | 45 | 46 | main() 47 | 48 | """ 49 | ['ADJ', 50 | 'ADP', 51 | 'ADV', 52 | 'AUX', 53 | 'CCONJ', 54 | 'DET', 55 | 'INTJ', 56 | 'NOUN', 57 | 'NUM', 58 | 'PART', 59 | 'PRON', 60 | 'PUNCT', 61 | 'SCONJ', 62 | 'VERB'] 63 | {'ADJ': ['same', 64 | 'few', 65 | 'former', 66 | 'full', 67 | 'serious', 68 | 'own', 69 | 'empty', 70 | 'such', 71 | 'several', 72 | 'latter', 73 | 'various', 74 | 'other', 75 | 'least', 76 | 'many', 77 | 'whole', 78 | 'top', 79 | 'due', 80 | 'whereafter', 81 | 'last', 82 | 'third'], 83 | 'ADP': ['behind', 84 | 'with', 85 | 'below', 86 | 'of', 87 | 'through', 88 | 'amongst', 89 | 'at', 90 | 'onto', 91 | 'thru', 92 | 'among', 93 | 'throughout', 94 | 'under', 95 | 'into', 96 | 'before', 97 | 'beside', 98 | 'against', 99 | 'within', 100 | 'per', 101 | 'towards', 102 | 'after', 103 | 'without', 104 | 'beyond', 105 | 'from', 106 | 'via', 107 | 'in', 108 | 'on', 109 | 'as', 110 | 'than', 111 | 'during', 112 | 'for', 113 | 'toward', 114 | 'until', 115 | 'above', 116 | 'across', 117 | 'along', 118 | 'between', 119 | 'over', 120 | 'by'], 121 | 'ADV': ['together', 122 | 'so', 123 | 'therein', 124 | 'next', 125 | 'off', 126 | 'meanwhile', 127 | 'whereupon', 128 | 'sometimes', 129 | 'again', 130 | 'rather', 131 | 'enough', 132 | 'thereby', 133 | 'first', 134 | 'too', 135 | 'always', 136 | 'either', 137 | 'somehow', 138 | 'very', 139 | 'perhaps', 140 | 'back', 141 | 'down', 142 | 'elsewhere', 143 | 'latterly', 144 | 'moreover', 145 | 'formerly', 146 | 'about', 147 | 'sometime', 148 | 'really', 149 | 'once', 150 | 'else', 151 | 'anyhow', 152 | 'also', 153 | 'there', 154 | 'most', 155 | 'nowhere', 156 | 'then', 157 | 'up', 158 | 'out', 159 | 'further', 160 | 'however', 161 | 'yet', 162 | 'namely', 163 | 'afterwards', 164 | 'already', 165 | 'hereby', 166 | 'thereupon', 167 | 'still', 168 | 'hence', 169 | 'anyway', 170 | 'even', 171 | 'much', 172 | 'thus', 173 | 'never', 174 | 'almost', 175 | 'alone', 176 | 'somewhere', 177 | 'here', 178 | 'more', 179 | 'hereupon', 180 | 'indeed', 181 | 'now', 182 | 'beforehand', 183 | 'everywhere', 184 | 'just', 185 | 'anywhere', 186 | 'often', 187 | 'thereafter', 188 | 'therefore', 189 | 'nevertheless', 190 | 'ever', 191 | 'quite', 192 | 'mostly', 193 | 'around', 194 | 'only', 195 | 'otherwise', 196 | 'less'], 197 | 'AUX': ['might', 198 | 'will', 199 | 'being', 200 | '’d', 201 | 'can', 202 | '’ll', 203 | 'are', 204 | 'was', 205 | "'ll", 206 | 'ca', 207 | 'could', 208 | 'must', 209 | 'would', 210 | "'re", 211 | 'may', 212 | 'were', 213 | "'s", 214 | 'should', 215 | 'be', 216 | 'cannot', 217 | 'am', 218 | 'is', 219 | 'been'], 220 | 'CCONJ': ['but', 'or', 'and', 'nor', 'neither'], 221 | 'DET': ['whose'], 222 | 'INTJ': ['please', 'well', 'no'], 223 | 'NOUN': ['part', 224 | 'bottom', 225 | 'whither', 226 | 'side', 227 | 'name', 228 | 'thence', 229 | 'amount', 230 | 'whence', 231 | 'yourselves', 232 | 'noone', 233 | 'front', 234 | 'yours', 235 | 'others', 236 | 'none', 237 | 'hers', 238 | '‘s', 239 | 'ours', 240 | 'herein'], 241 | 'NUM': ['one', 242 | 'twenty', 243 | 
'nine', 244 | 'sixty', 245 | 'ten', 246 | 'five', 247 | 'fifty', 248 | 'forty', 249 | 'n’t', 250 | 'six', 251 | 'three', 252 | 'hundred', 253 | 'eleven', 254 | 'twelve', 255 | 'fifteen', 256 | 'four', 257 | 'two', 258 | 'eight'], 259 | 'PART': ["n't", 'to', 'not'], 260 | 'PRON': ['nothing', 261 | 'the', 262 | 'my', 263 | 'this', 264 | 'something', 265 | 'they', 266 | 'whom', 267 | 'nobody', 268 | 'her', 269 | 'those', 270 | 'me', 271 | 'he', 272 | 'themselves', 273 | 'us', 274 | 'an', 275 | 'anything', 276 | 'his', 277 | 'i', 278 | 'you', 279 | 'which', 280 | 'him', 281 | 'all', 282 | 'we', 283 | 'them', 284 | 'any', 285 | 'who', 286 | 'everyone', 287 | 'these', 288 | 'someone', 289 | 'some', 290 | 'himself', 291 | 'whoever', 292 | 'what', 293 | 'each', 294 | 'yourself', 295 | 'mine', 296 | 'everything', 297 | 'our', 298 | 'itself', 299 | 'anyone', 300 | 'herself', 301 | 'your', 302 | 'its', 303 | 'every', 304 | 'it', 305 | 'their', 306 | 'both', 307 | 'ourselves', 308 | 'that', 309 | 'another', 310 | 'whatever', 311 | 'she', 312 | 'myself', 313 | 'a'], 314 | 'PUNCT': ['‘ll', '’ve', '‘ve', '‘d', '’m', '‘m', '‘re'], 315 | 'SCONJ': ['where', 316 | 'wherever', 317 | 'unless', 318 | 'wherein', 319 | 'if', 320 | 'how', 321 | 'though', 322 | 'why', 323 | 'except', 324 | 'whether', 325 | 'while', 326 | 'upon', 327 | 'whereas', 328 | 'besides', 329 | 'when', 330 | 'because', 331 | 'whereby', 332 | 'whenever', 333 | 'since', 334 | 'although'], 335 | 'VERB': ['became', 336 | 'made', 337 | 'hereafter', 338 | 'used', 339 | 'using', 340 | 'did', 341 | 'becomes', 342 | 'seem', 343 | 'do', 344 | 'seems', 345 | "'d", 346 | 'done', 347 | 'give', 348 | 'keep', 349 | 'say', 350 | 'has', 351 | 'get', 352 | 'become', 353 | 'have', 354 | 'doing', 355 | 'seemed', 356 | 'make', 357 | 'n‘t', 358 | 'put', 359 | 'take', 360 | 'becoming', 361 | 'show', 362 | "'m", 363 | "'ve", 364 | '’s', 365 | 'see', 366 | 'regarding', 367 | 'move', 368 | 'had', 369 | 'seeming', 370 | 'call', 371 | 're', 372 | 'go', 373 | '’re', 374 | 'does']} 375 | """ 376 | -------------------------------------------------------------------------------- /DagAtlas.py: -------------------------------------------------------------------------------- 1 | from Dag import * 2 | from BatchSimilarity import * 3 | from utils import * 4 | import sys 5 | from itertools import product 6 | from globals import * 7 | 8 | import pickle as pik 9 | from time import time 10 | from sentence_transformers import SentenceTransformer 11 | 12 | 13 | class DagAtlas: 14 | """ 15 | This class reads movie script txt files from the `out_dir` directory ( 16 | simplified movie scripts) and creates a pickled file for each movie 17 | script. Each pickled file contains a Dag object for one movie. `dag_dir` 18 | (called the DAG atlas) is the directory containing the pickled files. 19 | This class is also called `DagAtlas`. 20 | 21 | Attributes 22 | ---------- 23 | dag_dir: str 24 | directory where this class writes pickled files. One pickled file ( 25 | i.e., DAG) per movie. 26 | model: SentenceTransformer 27 | Model returned by SentenceTransformer constructor 28 | simp_dir: str 29 | directory where this class reads txt files. 30 | start_time: float 31 | time in minutes when self is created. 32 | title_to_permission_to_write_new_pickle: dict[str, bool] 33 | A dictionary that maps each movie title to a boolean that grants 34 | permission to overwrite an existing pickled file. 
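    A minimal construction, mirroring main1() at the bottom of this file
    (the directory names are taken from that example):

        atlas = DagAtlas(simp_dir="short_stories_post_clean",
                         dag_dir="short_stories_dag_atlas")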
35 | 36 | """ 37 | 38 | def __init__(self, simp_dir, dag_dir, 39 | recycled_pickles=None): 40 | """ 41 | Constructor 42 | 43 | Parameters 44 | ---------- 45 | simp_dir: str 46 | directory with a simplified txt file for each movie script 47 | dag_dir: str 48 | directory with a pickled file containing a Dag object for each 49 | movie script 50 | recycled_pickles: list[str] 51 | titles for which overwriting of pickled files is forbidden, at the 52 | beginning, when self is first constructed. 53 | 54 | """ 55 | self.start_time = time() 56 | time_now = (time() - self.start_time) / 60 57 | print(f"Initiating DagAtlas object: {time_now:.2f} minutes\n") 58 | 59 | self.simp_dir = simp_dir 60 | self.dag_dir = dag_dir 61 | all_simp_titles = [file_name[:-len(".txt")] for \ 62 | file_name in my_listdir(self.simp_dir)] 63 | all_dag_titles = [file_name[:-len(".pkl")] for \ 64 | file_name in my_listdir(self.dag_dir)] 65 | assert set(all_dag_titles).issubset(set(all_simp_titles)) 66 | 67 | self.title_to_permission_to_write_new_pickle = {} 68 | for title in all_simp_titles: 69 | self.title_to_permission_to_write_new_pickle[title] = True 70 | if recycled_pickles is None: 71 | recycled_pickles = [] 72 | for title in recycled_pickles: 73 | assert title in all_dag_titles 74 | self.title_to_permission_to_write_new_pickle[title] = False 75 | 76 | if SIMI_DEF == "similarity_bert": 77 | self.model = SentenceTransformer('all-MiniLM-L6-v2') 78 | else: 79 | self.model = None 80 | 81 | def update_arrows_for_two_m_scripts(self, title1, title2): 82 | """ 83 | This method updates the arrows for 2 movie titles. 84 | 85 | Parameters 86 | ---------- 87 | title1: str 88 | title2: str 89 | 90 | Returns 91 | ------- 92 | None 93 | 94 | """ 95 | time_now = (time() - self.start_time) / 60 96 | print(f"Starting comparison of 2 titles: {time_now:.2f} minutes") 97 | 98 | if self.title_to_permission_to_write_new_pickle[title1]: 99 | dag1 = Dag(title1, simp_dir=self.simp_dir) 100 | else: 101 | path1 = self.dag_dir + "/" + title1 + ".pkl" 102 | try: 103 | with open(path1, "rb") as f: 104 | dag1 = pik.load(f) 105 | except OSError: 106 | print("This file is probably missing:", path1) 107 | sys.exit() 108 | 109 | if self.title_to_permission_to_write_new_pickle[title2]: 110 | dag2 = Dag(title2, simp_dir=self.simp_dir) 111 | else: 112 | path2 = self.dag_dir + "/" + title2 + ".pkl" 113 | try: 114 | with open(path2, "rb") as f: 115 | dag2 = pik.load(f) 116 | except OSError: 117 | print("This file is probably missing:", path2) 118 | sys.exit() 119 | node_to_simple_ztz1 = \ 120 | dag1.build_node_to_simple_ztz_dict(self.simp_dir) 121 | node_to_simple_ztz2 = \ 122 | dag2.build_node_to_simple_ztz_dict(self.simp_dir) 123 | 124 | print("title1 and its num of nodes:", title1, len(dag1.nodes)) 125 | print("title2 and its num of nodes:", title2, len(dag2.nodes)) 126 | print("product of numbers of nodes=", 127 | len(dag1.nodes) * len(dag2.nodes)) 128 | 129 | time_now = (time() - self.start_time) / 60 130 | print(f"Starting bridges: {time_now:.2f} minutes") 131 | 132 | nd1_nd2_bridges = [] 133 | bridge_count = 0 134 | batch_simi = BatchSimilarity(dag1, dag2, 135 | node_to_simple_ztz1, 136 | node_to_simple_ztz2, 137 | model=self.model) 138 | for nd1, nd2 in product(dag1.nodes, dag2.nodes): 139 | if batch_simi.simi(nd1, nd2) > SIMI_THRESHOLD: 140 | nd1_nd2_bridges.append((nd1, nd2)) 141 | bridge_count += 1 142 | print(bridge_count, "bridges") 143 | range0 = range(len(nd1_nd2_bridges)) 144 | for i, j in product(range0, range0): 145 | if i < j: 146 | 
bridge_a = nd1_nd2_bridges[i] 147 | bridge_b = nd1_nd2_bridges[j] 148 | arrows = [None, None] 149 | time_gaps = [0, 0] 150 | for movie in range(2): 151 | time_gaps[movie] = \ 152 | bridge_a[movie].time - bridge_b[movie].time 153 | if time_gaps[movie] > 0: 154 | arrows[movie] = (bridge_b[movie], bridge_a[movie]) 155 | else: 156 | arrows[movie] = (bridge_a[movie], bridge_b[movie]) 157 | bridges_do_not_cross = (time_gaps[0] * time_gaps[1] > 0) 158 | if bridges_do_not_cross: 159 | accepted = True 160 | else: 161 | accepted = False 162 | dag1.update_arrow(arrows[0], accepted) 163 | dag2.update_arrow(arrows[1], accepted) 164 | 165 | time_now = (time() - self.start_time) / 60 166 | print(f"Before saving 2 dags: {time_now:.2f} minutes") 167 | dag1.save_self(self.dag_dir) 168 | self.title_to_permission_to_write_new_pickle[title1] = False 169 | dag2.save_self(self.dag_dir) 170 | self.title_to_permission_to_write_new_pickle[title2] = False 171 | 172 | time_now = (time() - self.start_time) / 60 173 | print(f"Exiting 2 titles comparison: {time_now:.2f} minutes\n") 174 | 175 | def update_arrows_in_batch_of_m_scripts(self, batch_titles=None): 176 | """ 177 | This method calls the method `update_arrows_for_two_m_scripts` for 178 | every pair '{ title1, title2}' of movie scripts in the list 179 | `batch_titles`. 180 | 181 | Parameters 182 | ---------- 183 | batch_titles: list[str] or None 184 | 185 | Returns 186 | ------- 187 | None 188 | 189 | """ 190 | all_simp_titles = [file_name[:-len(".txt")] for \ 191 | file_name in my_listdir(self.simp_dir)] 192 | 193 | if batch_titles is None: 194 | batch_titles = all_simp_titles 195 | assert set(batch_titles).issubset(set(all_simp_titles)) 196 | assert len(batch_titles) >= 2 197 | num = len(batch_titles) 198 | 199 | for i, j in product(range(num), range(num)): 200 | if i < j: 201 | self.update_arrows_for_two_m_scripts(batch_titles[i], 202 | batch_titles[j]) 203 | 204 | def update_arrows_for_one_m_script_and_others(self, 205 | title, 206 | other_titles): 207 | """ 208 | This method calls the method `update_arrows_for_two_m_scripts` for 209 | every pair '{ title, other_title}' of movie scripts, 210 | where `other_title` is in the list `other_titles`. 
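    For instance (the titles are illustrative; each must match a .txt file
    name, minus the extension, in `simp_dir`):

        atlas.update_arrows_for_one_m_script_and_others(
            "up", other_titles=["toy-story", "wall-e"])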
211 | 212 | Parameters 213 | ---------- 214 | title: str 215 | other_titles: list[str] 216 | 217 | Returns 218 | ------- 219 | None 220 | 221 | """ 222 | all_simp_titles = [file_name[:-len(".txt")] for \ 223 | file_name in my_listdir(self.simp_dir)] 224 | assert set(other_titles).issubset(set(all_simp_titles)) 225 | assert title not in other_titles 226 | 227 | for j in range(len(other_titles)): 228 | self.update_arrows_for_two_m_scripts(title, 229 | other_titles[j]) 230 | 231 | 232 | if __name__ == "__main__": 233 | def main1(): 234 | simp_dir = "short_stories_post_clean" 235 | dag_dir = "short_stories_dag_atlas" 236 | atlas = DagAtlas(simp_dir, dag_dir) 237 | all_titles = [file_name[:-len(".txt")] \ 238 | for file_name in my_listdir(simp_dir)] 239 | atlas.update_arrows_in_batch_of_m_scripts( 240 | batch_titles=all_titles[0:3]) 241 | 242 | 243 | def main2(): 244 | remove_dialog = False 245 | atlas = DagAtlas( 246 | simp_dir=POST_CLEAN_DIR if not remove_dialog else 247 | POST_CLEAN_RD_DIR, 248 | dag_dir=DAG_DIR) 249 | all_titles = [file_name[:-len(".txt")] \ 250 | for file_name in my_listdir(SIMP_DIR)] 251 | atlas.update_arrows_in_batch_of_m_scripts( 252 | batch_titles=all_titles[0:3]) 253 | 254 | 255 | main1() 256 | # main2() 257 | -------------------------------------------------------------------------------- /simp_deprecated/simp_spacy2.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | Refs: 7 | 8 | https://spacy.io/usage/spacy-101/ 9 | 10 | https://github.com/ac491/Sentence-simplifier/blob/master/simplifcation.ipynb 11 | """ 12 | import spacy 13 | import nltk 14 | 15 | nltk.download('averaged_perceptron_tagger') 16 | 17 | nlp = spacy.load("en_core_web_sm") 18 | 19 | # set of relative pronouns 20 | RELPRON = ['whom', 'whose', 'which', 'who'] 21 | 22 | 23 | def transform(parsed): 24 | d = {} 25 | # print(parsed) 26 | # print() 27 | for x in parsed: 28 | rel = x.dep_ 29 | parent = x.head.i + 1 30 | dependent = x.i + 1 31 | if parent == dependent and rel == 'ROOT': 32 | parent = 0 33 | if parent not in d.keys(): 34 | d[parent] = {} 35 | if rel not in d[parent].keys(): 36 | d[parent][rel] = [] 37 | 38 | d[parent][rel].append(dependent) 39 | 40 | return d 41 | 42 | 43 | def analyse_rc(sentence): 44 | # check for markers indicating rel_clause 45 | if any([s.lower() in RELPRON for s in sentence]): 46 | mark = [] 47 | for s in sentence: 48 | if s.lower() in RELPRON: 49 | mark.append(s.lower()) 50 | return True, mark 51 | else: 52 | return False, None 53 | 54 | 55 | def remove_all(aux, item): 56 | for a in aux.keys(): 57 | for d in aux[a].keys(): 58 | if item in aux[a][d]: 59 | aux[a][d].remove(item) 60 | 61 | 62 | def build(root, dep, aux, words, final, yes_root=True, previous=None): 63 | if previous is None: 64 | previous = [] 65 | 66 | if root in previous: 67 | return 68 | 69 | previous.append(root) 70 | 71 | if yes_root: 72 | final[root] = words[root - 1] 73 | previous.append(root) 74 | 75 | for k in dep.keys(): 76 | for i in dep[k]: 77 | if i in aux.keys(): 78 | deps = aux[i] 79 | build(i, deps, aux, words, final, previous=previous) 80 | 81 | final[i] = words[i - 1] 82 | 83 | 84 | def appositive_phrases(dep_dict, words, root, dep_root, ant): 85 | if 'nsubj' in dep_root: 86 | subj = dep_root['nsubj'][0] 87 | subj_word = words[subj - 1] 88 | 89 | # print(dep_dict) 90 | if subj not in dep_dict: 91 | return 
False, ant 92 | 93 | deps_subj = dep_dict[subj] 94 | v_tense = words[root - 1][1] 95 | n_num = words[subj - 1][1] 96 | 97 | if 'amod' in deps_subj: 98 | mod = deps_subj['amod'][0] 99 | if mod in dep_dict: 100 | deps_mod = dep_dict[mod] 101 | else: 102 | deps_mod = {} 103 | del dep_dict[subj]['amod'] 104 | deps_subj = dep_dict[subj] 105 | 106 | # Treat simple cases such as 'general rule' 107 | if 'JJ' in words[mod - 1][1] and 'punct' not in deps_subj: 108 | return False, ant 109 | 110 | elif 'appos' in deps_subj: 111 | mod = deps_subj['appos'][0] 112 | if mod in dep_dict: 113 | deps_mod = dep_dict[mod] 114 | else: 115 | deps_mod = {} 116 | del dep_dict[subj]['appos'] 117 | deps_subj = dep_dict[subj] 118 | else: 119 | return False, ant 120 | 121 | if 'punct' in deps_subj.keys(): 122 | del deps_subj['punct'] 123 | 124 | final_root = {} 125 | build(root, dep_root, dep_dict, [s[0].lower() for s in words], 126 | final_root) 127 | final_appos = {} 128 | build(mod, deps_mod, dep_dict, [s[0].lower() for s in words], 129 | final_appos) 130 | final_subj = {} 131 | build(subj, deps_subj, dep_dict, [s[0].lower() for s in words], 132 | final_subj) 133 | 134 | # print(final_root) 135 | s1 = [] 136 | for i in sorted(final_root): 137 | s1.append(final_root[i]) 138 | s1 = ' '.join(s1) 139 | # print(s1) 140 | 141 | # print(final_appos) 142 | s2 = [] 143 | for i in sorted(final_appos): 144 | s2.append(final_appos[i]) 145 | s2 = ' '.join(s2) 146 | # print(s2) 147 | 148 | # print(final_subj) 149 | s3 = [] 150 | for i in sorted(final_subj): 151 | s3.append(final_subj[i]) 152 | s3 = ' '.join(s3) 153 | # print(s3) 154 | 155 | if len(final_appos.keys()) < 2: 156 | return False, ant 157 | 158 | if n_num in ["NN", "NNP"]: 159 | if v_tense in ["VBP", "VBZ", "VB"]: 160 | s3 += " is " 161 | elif v_tense in ["VBD", "VBG", "VBN"]: 162 | s3 += " was " 163 | 164 | elif n_num in ["NNS", "NNPS"]: 165 | if v_tense in ["VBP", "VBZ", "VB"]: 166 | s3 += " are " 167 | elif v_tense in ("VBD", "VBG", "VBN"): 168 | s3 += " were " 169 | 170 | elif n_num in ["PRP"] and subj_word.lower() == "they": 171 | 172 | if v_tense in ["VBP", "VBZ", "VB"]: 173 | s3 += " are " 174 | elif v_tense in ["VBD", "VBG", "VBN"]: 175 | s3 += " were " 176 | 177 | elif n_num in ["PRP"]: 178 | if v_tense in ["VBP", "VBZ", "VB"]: 179 | s3 += " is " 180 | elif v_tense in ["VBD", "VBG", "VBN"]: 181 | s3 += " was " 182 | 183 | s2 = s3 + s2 184 | 185 | return True, [s1, s2] 186 | 187 | return False, ant 188 | 189 | 190 | def relative_clauses(dep_dict, words, root, dep_root, rel, ant): 191 | subj = dep_root[rel][0] 192 | if subj in dep_dict: 193 | 194 | dep_subj = dep_dict[subj] 195 | 196 | if 'relcl' in dep_subj or 'rcmod' in dep_subj: 197 | if 'relcl' in dep_subj: 198 | relc = dep_subj['relcl'][0] 199 | type_rc = 'relcl' 200 | else: 201 | relc = dep_subj['rcmod'][0] 202 | type_rc = 'rcmod' 203 | deps_relc = dep_dict[relc] 204 | 205 | if 'nsubj' in deps_relc: 206 | subj_rel = 'nsubj' 207 | elif 'nsubjpass' in deps_relc: 208 | subj_rel = 'nsubjpass' 209 | 210 | if 'ref' in dep_subj: 211 | to_remove = dep_subj['ref'][0] 212 | mark = words[dep_subj['ref'][0] - 1].lower() 213 | else: 214 | to_remove = deps_relc[subj_rel][0] 215 | mark = words[deps_relc[subj_rel][0] - 1].lower() 216 | 217 | # print(mark) 218 | 219 | if mark in RELPRON: 220 | deps_relc[subj_rel][0] = subj 221 | remove_all(dep_dict, to_remove) 222 | # needed for cases where the subject of 223 | # the relative clause is the object 224 | elif 'dobj' in deps_relc: 225 | obj = deps_relc['dobj'][0] 226 | 227 | 
if 'poss' in dep_dict[obj]: 228 | mod = dep_dict[obj]['poss'][0] 229 | aux_words = words[mod - 1] 230 | aux_words = words[subj - 1] + '\'s' 231 | words[mod - 1] = aux_words 232 | dep_dict[mod] = dep_dict[subj] 233 | else: 234 | return False, ant 235 | else: 236 | return False, ant # for broken cases - 237 | # " There are some 238 | # situations where it is particularly important 239 | # that you get financial information and 240 | # advice that is independent of us." 241 | 242 | del dep_dict[subj][type_rc] 243 | 244 | if 'punct' in dep_subj: 245 | del dep_dict[subj]['punct'] 246 | 247 | final_root = {} 248 | build(root, dep_root, dep_dict, words, final_root) 249 | final_relc = {} 250 | build(relc, deps_relc, dep_dict, words, final_relc) 251 | 252 | # print(final_root) 253 | # print(final_relc) 254 | 255 | s1 = [] 256 | for i in sorted(final_root): 257 | s1.append(final_root[i]) 258 | 259 | s2 = [] 260 | for i in sorted(final_relc): 261 | s2.append(final_relc[i]) 262 | 263 | return True, [' '.join(s1), ' '.join(s2)] 264 | return False, ant 265 | 266 | 267 | sentence0 = 'Robert, who lives nearby, was walking his dog' 268 | sentence1 = 'Marcus, my sister\'s hamster, likes to run in a wheel.' 269 | 270 | 271 | def simplify_ztz(ztz, verbose=False): 272 | """ 273 | This method simplifies the sentence `sentence`. It returns a list of 274 | simple sentences extracted from the input sentence. 275 | 276 | Parameters 277 | ---------- 278 | ztz: str 279 | verbose: bool 280 | kwargs: dict[] 281 | 282 | Returns 283 | ------- 284 | list[str] 285 | 286 | """ 287 | 288 | ztz.strip() 289 | sentences = [ztz] 290 | result = [] 291 | 292 | for s in sentences: 293 | 294 | output = nlp(s) 295 | 296 | dep_dict = transform(output) 297 | # print(dep_dict) 298 | 299 | # words = [(token.text.lower(), token.pos_) for token in output] 300 | 301 | tokens = [token.text.lower() for token in output] 302 | 303 | words = nltk.pos_tag(tokens) 304 | 305 | # print(words) 306 | 307 | if 0 in dep_dict: 308 | 309 | root = dep_dict[0]['ROOT'][0] 310 | 311 | if root in dep_dict: 312 | 313 | dep_root = dep_dict[root] 314 | 315 | # handle appositive_phrases 316 | flag_appos, res = appositive_phrases(dep_dict, words, root, 317 | dep_root, sentences) 318 | if flag_appos: 319 | result += res 320 | continue 321 | 322 | # check for relative clauses 323 | flag_rc, type_rc = analyse_rc(s.split()) 324 | 325 | if flag_rc: 326 | 327 | if 'nsubj' in dep_root: 328 | flag, res = relative_clauses(dep_dict, tokens, root, 329 | dep_root, 'nsubj', 330 | sentences) 331 | if flag: 332 | result += res 333 | continue 334 | elif 'dobj' in dep_root: 335 | flag, res = relative_clauses(dep_dict, tokens, root, 336 | dep_root, 'dobj', 337 | sentences) 338 | if flag: 339 | result += res 340 | continue 341 | result.append(s) 342 | 343 | if verbose: 344 | print(ztz.strip()) 345 | print(result) 346 | -------------------------------------------------------------------------------- /simp_deprecated/simp_stanford2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 19, 2015 3 | 4 | @author: TPetrou 5 | ''' 6 | 7 | from nltk.parse import stanford 8 | import os, sys 9 | import operator 10 | 11 | java_path = r"C:\Program Files\Java\jdk1.8.0_31\bin\java.exe" 12 | os.environ['JAVAHOME'] = java_path 13 | os.environ[ 14 | 'STANFORD_PARSER'] = r'/users/ted/stanford nlp/stanford-parser-full-2015-01-30' 15 | os.environ[ 16 | 'STANFORD_MODELS'] = r'/users/ted/stanford nlp/stanford-parser-full-2015-01-30' 17 | 18 | 19 | 
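# The hard-coded java_path, STANFORD_PARSER and STANFORD_MODELS values above
# are machine-specific leftovers from the original 2015 script and will not
# exist elsewhere. A minimal sketch of pointing them at a local install,
# assuming the parser archive was unpacked under ~/stanford-parser (a
# hypothetical location):
#
#     import os
#     stanford_home = os.path.expanduser("~/stanford-parser")
#     os.environ["STANFORD_PARSER"] = stanford_home
#     os.environ["STANFORD_MODELS"] = stanford_home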
class RDF_Triple(): 20 | class RDF_SOP(): 21 | 22 | def __init__(self, name, pos=''): 23 | self.name = name 24 | self.word = '' 25 | self.parent = '' 26 | self.grandparent = '' 27 | self.depth = '' 28 | self.predicate_list = [] 29 | self.predicate_sibings = [] 30 | self.pos = pos 31 | self.attr = [] 32 | self.attr_trees = [] 33 | 34 | def __init__(self, sentence): 35 | self.sentence = sentence 36 | self.clear_data() 37 | 38 | def clear_data(self): 39 | self.parser = stanford.StanfordParser( 40 | model_path=r"/users/ted/stanford nlp/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") 41 | self.first_NP = '' 42 | self.first_VP = '' 43 | self.parse_tree = None 44 | self.subject = RDF_Triple.RDF_SOP('subject') 45 | self.predicate = RDF_Triple.RDF_SOP('predicate', 'VB') 46 | self.Object = RDF_Triple.RDF_SOP('object') 47 | 48 | def find_NP(self, t): 49 | try: 50 | t.label() 51 | except AttributeError: 52 | pass 53 | else: 54 | # Now we know that t.node is defined 55 | if t.label() == 'NP': 56 | if self.first_NP == '': 57 | self.first_NP = t 58 | elif t.label() == 'VP': 59 | if self.first_VP == '': 60 | self.first_VP = t 61 | for child in t: 62 | self.find_NP(child) 63 | 64 | def find_subject(self, t, parent=None, grandparent=None): 65 | if self.subject.word != '': 66 | return 67 | try: 68 | t.label() 69 | except AttributeError: 70 | pass 71 | else: 72 | # Now we know that t.node is defined 73 | if t.label()[:2] == 'NN': 74 | if self.subject.word == '': 75 | self.subject.word = t.leaves()[0] 76 | self.subject.pos = t.label() 77 | self.subject.parent = parent 78 | self.subject.grandparent = grandparent 79 | else: 80 | for child in t: 81 | self.find_subject(child, parent=t, grandparent=parent) 82 | 83 | def find_predicate(self, t, parent=None, grandparent=None, depth=0): 84 | try: 85 | t.label() 86 | except AttributeError: 87 | pass 88 | else: 89 | if t.label()[:2] == 'VB': 90 | self.predicate.predicate_list.append( 91 | (t.leaves()[0], depth, parent, grandparent)) 92 | 93 | for child in t: 94 | self.find_predicate(child, parent=t, grandparent=parent, 95 | depth=depth + 1) 96 | 97 | def find_deepest_predicate(self): 98 | if not self.predicate.predicate_list: 99 | return '', '', '', '' 100 | return max(self.predicate.predicate_list, key=operator.itemgetter(1)) 101 | 102 | def extract_word_and_pos(self, t, depth=0, words=[]): 103 | try: 104 | t.label() 105 | except AttributeError: 106 | # print t 107 | # print 'error', t 108 | pass 109 | else: 110 | # Now we know that t.node is defined 111 | if t.height() == 2: 112 | # self.word_pos_holder.append((t.label(), t.leaves()[0])) 113 | words.append((t.leaves()[0], t.label())) 114 | for child in t: 115 | self.extract_word_and_pos(child, depth + 1, words) 116 | return words 117 | 118 | def print_tree(self, t, depth=0): 119 | try: 120 | t.label() 121 | except AttributeError: 122 | print(t) 123 | # print 'error', t 124 | pass 125 | else: 126 | # Now we know that t.node is defined 127 | print('(') # , t.label(), t.leaves()[0] 128 | for child in t: 129 | self.print_tree(child, depth + 1) 130 | print(') ') 131 | 132 | def find_object(self): 133 | for t in self.predicate.parent: 134 | if self.Object.word == '': 135 | self.find_object_NP_PP(t, t.label(), self.predicate.parent, 136 | self.predicate.grandparent) 137 | 138 | def find_object_NP_PP(self, t, phrase_type, parent=None, grandparent=None): 139 | ''' 140 | finds the object given its a NP or PP or ADJP 141 | ''' 142 | if self.Object.word != '': 
143 | return 144 | try: 145 | t.label() 146 | except AttributeError: 147 | pass 148 | else: 149 | # Now we know that t.node is defined 150 | if t.label()[:2] == 'NN' and phrase_type in ['NP', 'PP']: 151 | if self.Object.word == '': 152 | self.Object.word = t.leaves()[0] 153 | self.Object.pos = t.label() 154 | self.Object.parent = parent 155 | self.Object.grandparent = grandparent 156 | elif t.label()[:2] == 'JJ' and phrase_type == 'ADJP': 157 | if self.Object.word == '': 158 | self.Object.word = t.leaves()[0] 159 | self.Object.pos = t.label() 160 | self.Object.parent = parent 161 | self.Object.grandparent = grandparent 162 | else: 163 | for child in t: 164 | self.find_object_NP_PP(child, phrase_type, parent=t, 165 | grandparent=parent) 166 | 167 | def get_attributes(self, pos, sibling_tree, grandparent): 168 | rdf_type_attr = [] 169 | if pos[:2] == 'JJ': 170 | for item in sibling_tree: 171 | if item.label()[:2] == 'RB': 172 | rdf_type_attr.append((item.leaves()[0], item.label())) 173 | else: 174 | if pos[:2] == 'NN': 175 | for item in sibling_tree: 176 | if item.label()[:2] in ['DT', 'PR', 'PO', 'JJ', 'CD']: 177 | rdf_type_attr.append((item.leaves()[0], item.label())) 178 | if item.label() in ['QP', 'NP']: 179 | # append a tree 180 | rdf_type_attr.append(item, item.label()) 181 | elif pos[:2] == 'VB': 182 | for item in sibling_tree: 183 | if item.label()[:2] == 'AD': 184 | rdf_type_attr.append((item, item.label())) 185 | 186 | if grandparent: 187 | if pos[:2] in ['NN', 'JJ']: 188 | for uncle in grandparent: 189 | if uncle.label() == 'PP': 190 | rdf_type_attr.append((uncle, uncle.label())) 191 | elif pos[:2] == 'VB': 192 | for uncle in grandparent: 193 | if uncle.label()[:2] == 'VB': 194 | rdf_type_attr.append((uncle, uncle.label())) 195 | 196 | return self.attr_to_words(rdf_type_attr) 197 | 198 | def attr_to_words(self, attr): 199 | new_attr_words = [] 200 | new_attr_trees = [] 201 | for tup in attr: 202 | # if type(tup[0]) != unicode: 203 | # if tup[0].height() == 2: 204 | # new_attr_words.append((tup[0].leaves()[0], tup[0].label())) 205 | # else: 206 | # # new_attr_words.extend(self.extract_word_and_pos(tup[0])) 207 | # new_attr_trees.append(tup[0].unicode_repr()) 208 | # else: 209 | new_attr_words.append(tup) 210 | return new_attr_words, new_attr_trees 211 | 212 | def jsonify_rdf(self): 213 | return {'sentence': self.sentence, 214 | 'parse_tree': self.parse_tree.unicode_repr(), 215 | 'predicate': {'word': self.predicate.word, 216 | 'POS': self.predicate.pos, 217 | 'Word Attributes': self.predicate.attr, 218 | 'Tree Attributes': self.predicate.attr_trees}, 219 | 'subject': {'word': self.subject.word, 'POS': self.subject.pos, 220 | 'Word Attributes': self.subject.attr, 221 | 'Tree Attributes': self.subject.attr_trees}, 222 | 'object': {'word': self.Object.word, 'POS': self.Object.pos, 223 | 'Word Attributes': self.Object.attr, 224 | 'Tree Attributes': self.Object.attr_trees}, 225 | 'rdf': [self.subject.word, self.predicate.word, 226 | self.Object.word] 227 | } 228 | 229 | def main(self): 230 | self.clear_data() 231 | self.parse_tree = self.parser.raw_parse(self.sentence)[0] 232 | self.find_NP(self.parse_tree) 233 | self.find_subject(self.first_NP) 234 | self.find_predicate(self.first_VP) 235 | if self.subject.word == '' and self.first_NP != '': 236 | self.subject.word = self.first_NP.leaves()[0] 237 | self.predicate.word, self.predicate.depth, self.predicate.parent, self.predicate.grandparent = self.find_deepest_predicate() 238 | self.find_object() 239 | self.subject.attr, 
self.subject.attr_trees = self.get_attributes( 240 | self.subject.pos, self.subject.parent, self.subject.grandparent) 241 | self.predicate.attr, self.predicate.attr_trees = self.get_attributes( 242 | self.predicate.pos, self.predicate.parent, 243 | self.predicate.grandparent) 244 | self.Object.attr, self.Object.attr_trees = self.get_attributes( 245 | self.Object.pos, self.Object.parent, self.Object.grandparent) 246 | self.answer = self.jsonify_rdf() 247 | 248 | 249 | if __name__ == '__main__': 250 | # try: 251 | # sentence = sys.argv[1] 252 | # sentence = 'A rare black squirrel has become a regular visitor to a suburban garden' 253 | # except IndexError: 254 | # print("Enter in your sentence") 255 | # sentence = 'A rare black squirrel has become a regular visitor to a suburban garden' 256 | # print("Heres an example") 257 | # print(sentence) 258 | 259 | # sentence = 'The boy dunked the basketball' 260 | sentence = 'They also made the substance able to last longer in the bloodstream, which led to more stable blood sugar levels and less frequent injections.' 261 | #sentence = 'A rare black squirrel has become a regular visitor to a 262 | # suburban garden' 263 | rdf = RDF_Triple(sentence) 264 | rdf.main() 265 | 266 | ans = rdf.answer 267 | print(ans) -------------------------------------------------------------------------------- /cleaning.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains functions for cleaning movie scripts (or short stories) 4 | 5 | input directory: m_scripts or short_stories 6 | output directory: m_scripts_clean or short_stories_clean 7 | 8 | The code in this file cleans one movie script file at a time. It takes each 9 | input movie script file from the folder `m_scripts` and outputs a new file 10 | to the folder `m_scripts_clean`. 11 | 12 | It removes contractions like "didn't", and replaces exclusively unicode 13 | symbols by their closest ANSII analogues (e.g., curly quotes are replaced by 14 | straight quotes). 15 | 16 | It uses the software SpaCy to break up the movie script into separate 17 | sentences, and returns a file with only one sentence per line. 18 | 19 | For the case of movie scripts (but not for short stories), it also tries to 20 | distinguish between dialog lines and narration lines. In many but not all 21 | movie scripts, the dialog lines are indented with respect to the narration 22 | lines. In the case of Pixar/Disney, they don't indent dialog. In cases where 23 | the movie script indents, the MM software gives the option of throwing away 24 | all the dialog lines and keeping only the narration ones. Folders ending in 25 | `_rd` are for remove dialog files. 26 | 27 | Occasionally in this file, we use regex (via the Python module `re`). 28 | Here is a nice reference on `re`. 29 | 30 | https://www.datacamp.com/tutorial/python-regular-expression-tutorial 31 | 32 | ChatGPT is also very good at answering regex questions. 33 | 34 | """ 35 | import re 36 | import os 37 | # sentence splitting with NLTK 38 | # from nltk.tokenize import sent_tokenize 39 | import collections as co 40 | from globals import * 41 | from unidecode import unidecode 42 | import contractions 43 | from utils import * 44 | 45 | # sentence splitting with spacy 46 | import spacy 47 | 48 | nlp = spacy.load('en_core_web_sm') 49 | 50 | 51 | def expand_contractions(line): 52 | """ 53 | This auxiliary method replaces all contractions in the string `line` by 54 | expansions thereof (e.g., replaces "didn't" by "did not".) 
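    For example (illustrative; the replacement table comes from the
    `contractions` package):

        expand_contractions("She didn't know")  # -> "She did not know"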
55 | 56 | Parameters 57 | ---------- 58 | line: str 59 | 60 | Returns 61 | ------- 62 | str 63 | 64 | """ 65 | str_list = [] 66 | for word in line.split(): 67 | str_list.append(contractions.fix(word)) 68 | return ' '.join(str_list) 69 | 70 | 71 | def clean_one_m_script(in_dir, 72 | out_dir, 73 | file_name, 74 | remove_dialog=False): 75 | """ 76 | in_dir and out_dir can be the same, but this will overwrite the files. 77 | 78 | This method reads a file called `file_name` in the `in_dir` directory 79 | and creates a clean version in the `out_dir` directory. 80 | 81 | Parameters 82 | ---------- 83 | in_dir: str 84 | out_dir: str 85 | file_name: str 86 | remove_dialog: bool 87 | True iff dialog part of the movie script is removed, leaving only 88 | the narration part. 89 | 90 | Returns 91 | ------- 92 | None 93 | 94 | """ 95 | 96 | print('fetching %s' % file_name) 97 | 98 | def count_leading_wh_sp(str0): 99 | # wh_sp = white space 100 | count = 0 101 | if str0: 102 | for char in str0: 103 | if char.isspace(): 104 | count += 1 105 | else: 106 | break 107 | return count 108 | 109 | inpath = in_dir + "/" + file_name 110 | outpath = out_dir + "/" + file_name 111 | 112 | with open(inpath, "r", encoding='utf-8') as f: 113 | lines = [line for line in f] 114 | 115 | # Replace exclusively unicode characters by ascii analogues (e.g., 116 | # replace curly quotes by straight ones) so don't have to use 117 | # encoding="utf-8" as a parameter in open() from here on. 118 | lines = [unidecode(line) for line in lines] 119 | 120 | # expand contractions 121 | lines = [expand_contractions(line) for line in lines] 122 | 123 | # strip trailing (i.e., right) white space and newline. 124 | # If this results in an empty line, remove it. 125 | new_lines = [] 126 | for line in lines: 127 | line = line.rstrip() 128 | if line: 129 | new_lines.append(line) 130 | lines = new_lines 131 | 132 | # remove everything after and including THE END 133 | new_lines = [] 134 | for line in lines: 135 | if line.strip() in ["THE END", "END"]: 136 | break 137 | else: 138 | new_lines.append(line) 139 | lines = new_lines 140 | 141 | # regex for parenthetical remarks 142 | pattern_paren = re.compile(r'\[(.*?)\]|\((.*?)\)|\{(.*?)\}') 143 | # regex for period followed by white spaces + number 144 | pattern_period = r"\.(?=\s*\d)" 145 | 146 | # Substitutions. If subs results in empty line, remove it. 147 | new_lines = [] 148 | for line in lines: 149 | # print("ssdf", line) 150 | # remove parenthetical remarks 151 | line = re.sub(pattern_paren, "", line) 152 | # remove the underscore, which is not 153 | # considered a punctuation mark. 154 | line = re.sub(r'[_]', '', line) 155 | # Replace tabs by 12 blank spaces 156 | line = re.sub(r"\t", " " * 12, line) 157 | # replace period by dash if period followed by number 158 | line = re.sub(pattern_period, "-", line) 159 | # print("\tssdf", line) 160 | if len(line) >= 1: 161 | new_lines.append(line) 162 | lines = new_lines 163 | 164 | # Add missing periods for transitions from dialog to narration or vice 165 | # versa 166 | indent = count_leading_wh_sp(lines[0]) 167 | for i in range(len(lines)): 168 | if i != len(lines) - 1: 169 | next_indent = count_leading_wh_sp(lines[i + 1]) 170 | if indent != next_indent and \ 171 | not lines[i][-1] in [".", "!", "?"]: 172 | lines[i] = lines[i] + "." 173 | else: 174 | next_indent = None 175 | if not lines[i][-1] in [".", "!", "?"]: 176 | lines[i] = lines[i] + "." 
177 | indent = next_indent 178 | 179 | # Regex for string that contains at least 2 lower case letters 180 | # Found cases where line was just "is." 181 | pattern_lc = re.compile(r'^(.*[a-z]){2,}.*$') 182 | 183 | # Reject lines that don't contain at least 2 lower case letters string. 184 | # This gets rid of scene directions and character invocations. 185 | lines = [line for line in lines if re.search(pattern_lc, line)] 186 | 187 | white_spaces = [count_leading_wh_sp(line) for line in lines] 188 | # Counter returns dictionary mapping item to its number of repetitions 189 | wh_sp_counter = co.Counter(white_spaces) 190 | # print("llkh", wh_sp_counter) 191 | sum_reps = sum(wh_sp_counter.values()) 192 | indent_prob_dist = co.OrderedDict() 193 | indents = [] 194 | for indent in sorted(wh_sp_counter, 195 | key=wh_sp_counter.get, 196 | reverse=True): 197 | prob = round(wh_sp_counter[indent] / sum_reps, 3) 198 | indent_prob_dist[indent] = prob 199 | indents.append(indent) 200 | # print("ddfg", indents) 201 | # print("ddfg", indent_prob_dist) 202 | print("indent prob dist =", [(indent, indent_prob_dist[indent]) \ 203 | for indent in indents[0:4]]) 204 | 205 | # likely dialog indents 206 | # most probable indent = indents[0] 207 | dial_indents = [indent for indent in indents if \ 208 | abs(indent - indents[0]) <= 3 and \ 209 | indent_prob_dist[indent] >= .01] 210 | 211 | ndial_indents = [indent for indent in indents \ 212 | if indent not in dial_indents] 213 | # likely narration indents 214 | narr_indents = [indent for indent in ndial_indents if \ 215 | abs(indent - ndial_indents[0]) <= 3 and \ 216 | indent_prob_dist[indent] >= .01] 217 | 218 | print("dialog indents=", dial_indents) 219 | print("narration indents=", narr_indents) 220 | 221 | # keep only narration (less likely than dialog) indentations. Also 222 | # remove smallest indentation. 223 | new_lines = [] 224 | for line in lines: 225 | indent = count_leading_wh_sp(line) 226 | if indent in dial_indents + narr_indents: 227 | if not narr_indents or not dial_indents: 228 | # there is no difference in indentation between narr and dial 229 | new_lines.append(line) 230 | else: 231 | if remove_dialog: 232 | if indent in narr_indents: 233 | new_lines.append(line[min(narr_indents):]) 234 | else: 235 | new_lines.append(line[min(narr_indents):]) 236 | lines = new_lines 237 | 238 | # print("nnuu", lines[0:15]) 239 | # print("nnuu", lines[-15:]) 240 | 241 | # remove enumeration markers. 242 | # pattern = re.compile(r"^[^a-zA-Z]*") 243 | # lines = [re.sub(pattern, "", line) for line in lines] 244 | 245 | # join lines to create new script 246 | lines = [line.strip() for line in lines if line] 247 | script = ' '.join(lines) 248 | 249 | # split script into sentences with NLTK 250 | # lines = sent_tokenize(script) 251 | 252 | # split script into sentences with spacy 253 | lines = nlp(script).sents 254 | # for line in lines: 255 | # print("zzzxc", line) 256 | 257 | # remove single character sentences 258 | lines = [line.text for line in lines if len(line.text) > 1] 259 | 260 | with open(outpath, "w") as f: 261 | for line in lines: 262 | f.write(line + "\n") 263 | 264 | 265 | def clean_batch_of_m_scripts( 266 | in_dir, out_dir, 267 | batch_file_names, 268 | remove_dialog=False): 269 | """ 270 | This method calls the method `clean_one_m_script` for all the file names 271 | in the list of file names `batch_file_names`. 
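    For example (a hypothetical call, assuming "up.txt" is present in the
    input directory):

        clean_batch_of_m_scripts("m_scripts", "m_scripts_clean",
                                 ["up.txt"], remove_dialog=True)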
272 | 273 | Parameters 274 | ---------- 275 | in_dir: str 276 | out_dir: str 277 | batch_file_names: list[str] 278 | remove_dialog: bool 279 | 280 | Returns 281 | ------- 282 | None 283 | 284 | """ 285 | 286 | all_file_names = my_listdir(in_dir) 287 | assert set(batch_file_names).issubset(set(all_file_names)) 288 | for file_name in batch_file_names: 289 | i = all_file_names.index(file_name) 290 | print('%i.' % (i + 1)) 291 | clean_one_m_script(in_dir, 292 | out_dir, 293 | file_name, 294 | remove_dialog=remove_dialog) 295 | 296 | 297 | if __name__ == "__main__": 298 | from globals import * 299 | 300 | 301 | def main1(): 302 | in_dir = "short_stories" 303 | out_dir = "short_stories_clean" 304 | batch_file_names = my_listdir(in_dir)[0:3] 305 | clean_batch_of_m_scripts( 306 | in_dir, out_dir, 307 | batch_file_names, 308 | remove_dialog=False) 309 | 310 | 311 | def main2(): 312 | remove_dialog = True 313 | clean_one_m_script( 314 | in_dir=M_SCRIPTS_DIR, 315 | out_dir=CLEAN_DIR if not remove_dialog else CLEAN_RD_DIR, 316 | file_name="up.txt", 317 | remove_dialog=remove_dialog) 318 | 319 | 320 | def main3(): 321 | remove_dialog = False 322 | # batch_file_names=my_listdir(M_SCRIPTS_DIR) 323 | batch_file_names = ["toy-story.txt", "up.txt", "wall-e.txt"] 324 | clean_batch_of_m_scripts( 325 | in_dir=M_SCRIPTS_DIR, 326 | out_dir=CLEAN_DIR if not remove_dialog else CLEAN_RD_DIR, 327 | batch_file_names=batch_file_names, 328 | remove_dialog=remove_dialog) 329 | 330 | 331 | # main1() 332 | # main2() 333 | main3() 334 | -------------------------------------------------------------------------------- /Dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from Node import * 3 | import pickle as pik 4 | from globals import * 5 | from utils import * 6 | 7 | import graphviz as gv 8 | from IPython.display import display, Image 9 | from PIL.Image import open as open_image 10 | 11 | 12 | class Dag: 13 | """ 14 | This class creates a DAG (directed acyclic graph) for the movie entitled 15 | `m_title`. The DAG has nodes `nodes` and arrows `arrows`. Each arrow has 16 | a two weights `num_acc` and `num_rej`. Those weights are the number of 17 | times the arrow has been accepted and rejected. They are stored in the 18 | dictionary `arrow_to_acc_rej_nums`. 19 | 20 | Attributes 21 | ---------- 22 | arrow_to_acc_rej_nums: dict[tuple(Node), tuple(int)] 23 | arrows: list[tuple[Node, Node]] 24 | arrows of self. Arrows are defined as a pair of Node objects. 25 | The first element of the pair is the origin of the arrow and the 26 | second is the target of the arrow. 27 | m_title: str 28 | nodes: list[Node] 29 | 30 | """ 31 | 32 | def __init__(self, m_title, simp_dir): 33 | """ 34 | Constructor 35 | 36 | Parameters 37 | ---------- 38 | m_title: str 39 | title of movie to which this DAG refers to. 40 | simp_dir: str 41 | the directory in which simplified files are stored, and from 42 | which objects of this class are constructed. 43 | """ 44 | self.m_title = m_title 45 | path = simp_dir + "/" + m_title + ".txt" 46 | with open(path, "r") as f: 47 | lines = [line for line in f] 48 | self.nodes = [] 49 | for time, line in enumerate(lines): 50 | if line.strip() not in [ZTZ_SEPARATOR, ""]: 51 | ztz_list = line.split(ZTZ_SEPARATOR) 52 | for place in range(len(ztz_list)): 53 | self.nodes.append(Node(time, place)) 54 | self.arrows = [] 55 | self.arrow_to_acc_rej_nums = {} 56 | 57 | def save_self(self, dag_dir): 58 | """ 59 | This method stores self as a pickled file. 
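        For example, if `self.m_title` is "up" and `dag_dir` is
        "m_scripts_dag_atlas", this writes the file
        "m_scripts_dag_atlas/up.pkl".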
60 | 61 | Parameters 62 | ---------- 63 | dag_dir: str 64 | Directory in which pickled file is stored. 65 | 66 | Returns 67 | ------- 68 | None 69 | 70 | """ 71 | path = dag_dir + "/" + self.m_title + ".pkl" 72 | with open(path, "wb") as f: 73 | pik.dump(self, f, protocol=pik.HIGHEST_PROTOCOL) 74 | 75 | def update_arrow(self, arrow, accepted): 76 | """ 77 | This method changes the tuple (num_accepted, num_rejected) of 78 | `arrow`. If accepted=True, num_accepted is increased by one. If 79 | accepted=False, num_rejected is increased by one. 80 | 81 | Parameters 82 | ---------- 83 | arrow: tuple[Node, Node] 84 | accepted: bool 85 | 86 | Returns 87 | ------- 88 | None 89 | 90 | """ 91 | if arrow not in self.arrows: 92 | self.arrows.append(arrow) 93 | self.arrow_to_acc_rej_nums[arrow] = [0, 0] 94 | if accepted: 95 | self.arrow_to_acc_rej_nums[arrow][0] += 1 96 | else: 97 | self.arrow_to_acc_rej_nums[arrow][1] += 1 98 | 99 | def build_node_to_clean_ztz_dict(self, 100 | clean_dir, 101 | skip_1st_line=False): 102 | """ 103 | This method builds from scratch and returns a dictionary called 104 | `nd_to_clean_ztz` that maps each node to a clean sentence. ztz 105 | stands for sentence. 106 | 107 | Parameters 108 | ---------- 109 | clean_dir: str 110 | directory of movie scripts after cleaning. 111 | 112 | Returns 113 | ------- 114 | dict(Node, str) 115 | 116 | """ 117 | path = clean_dir + "/" + self.m_title + ".txt" 118 | is_csv = False 119 | if not os.path.isfile(path): 120 | path = path.replace(".txt", ".csv") 121 | is_csv = True 122 | assert os.path.isfile(path) 123 | 124 | time_to_clean_ztz = {} 125 | with open(path, "r") as f: 126 | time = -1 127 | for line in f: 128 | time += 1 129 | if is_csv: 130 | if time == 0: 131 | continue 132 | else: 133 | time_to_clean_ztz[time - 1] = line.strip() 134 | else: 135 | time_to_clean_ztz[time] = line.strip() 136 | 137 | nd_to_clean_ztz = {} 138 | for nd in self.nodes: 139 | nd_to_clean_ztz[nd] = time_to_clean_ztz[nd.time] 140 | 141 | return nd_to_clean_ztz 142 | 143 | def build_node_to_simple_ztz_dict(self, simp_dir): 144 | """ 145 | This method builds from scratch and returns a dictionary called 146 | `nd_to_simple_ztz` that maps each node to a simplified sentence. ztz 147 | stands for sentence. 148 | 149 | 150 | Parameters 151 | ---------- 152 | simp_dir: str 153 | directory of movie scripts after simplifying. 154 | 155 | Returns 156 | ------- 157 | dict(Node, str) 158 | 159 | """ 160 | path = simp_dir + "/" + self.m_title + ".txt" 161 | 162 | time_to_simp_ztz_list = {} 163 | with open(path, "r") as f: 164 | time = -1 165 | for line in f: 166 | time += 1 167 | if line.strip() != ZTZ_SEPARATOR: 168 | time_to_simp_ztz_list[time] = \ 169 | line.split(ZTZ_SEPARATOR) 170 | 171 | nd_to_simp_ztz = {} 172 | for nd in self.nodes: 173 | nd_to_simp_ztz[nd] = \ 174 | time_to_simp_ztz_list[nd.time][nd.place].strip() 175 | 176 | return nd_to_simp_ztz 177 | 178 | def build_high_prob_acc_arrows(self, 179 | prob_acc_thold, 180 | nsam_thold): 181 | """ 182 | This method builds from scratch and returns a list of all arrows 183 | whose `prob_acc` (i.e., probability of acceptance) is >= 184 | `prob_acc_thold` with `nsam` (i.e., number of samples used to 185 | calculate that probability) >= `nsam_thold`. 
thold = threshold 186 | 187 | Parameters 188 | ---------- 189 | prob_acc_thold: float 190 | nsam_thold: int 191 | 192 | Returns 193 | ------- 194 | list[tuple[Node, Node]] 195 | 196 | """ 197 | high_prob_arrows = [] 198 | for arrow in self.arrows: 199 | prob_acc, nsam = get_prob_acc_and_nsam( 200 | *self.arrow_to_acc_rej_nums[arrow]) 201 | if prob_acc >= prob_acc_thold and \ 202 | nsam >= nsam_thold: 203 | high_prob_arrows.append(arrow) 204 | return high_prob_arrows 205 | 206 | def print_map_legend(self, 207 | clean_dir, 208 | simp_dir, 209 | prob_acc_thold, 210 | nsam_thold): 211 | """ 212 | This method prints the DAG Rosetta stone (map legend). 213 | 214 | For each node labeled `( time, place)`, this method prints the 215 | simplified clause ( i.e., simplified sentence) in line `time` of the 216 | simplified file, after a number `place` of separator-tokens. It also 217 | prints the original sentence from which that simplified clause came 218 | from. The full sentence is preceded by the label `(full)` and the 219 | simplified sentence by the label `(part)`. 220 | 221 | It only prints the `(full)` and `(part)` for those nodes that appear 222 | in the DAG, after removing all arrows with probability of acceptance 223 | < `prob_acc_thold` or number of sample used to calculate that 224 | probability < `nsam_thold`. 225 | 226 | Parameters 227 | ---------- 228 | clean_dir: str 229 | directory of movie scripts after cleaning 230 | simp_dir: str 231 | directory of movie scripts after simplification 232 | prob_acc_thold: float 233 | nsam_thold: int 234 | 235 | Returns 236 | ------- 237 | None 238 | 239 | """ 240 | hprob_arrows = self.build_high_prob_acc_arrows( 241 | prob_acc_thold, nsam_thold) 242 | print("MAP LEGEND") 243 | print("title:", self.m_title) 244 | print("prob of acceptance threshold:", prob_acc_thold) 245 | print("number of samples threshold:", nsam_thold) 246 | print("number of arrows shown:", len(hprob_arrows)) 247 | print("number of arrows dropped:", 248 | len(self.arrows) - len(hprob_arrows)) 249 | 250 | hprob_nodes = [] 251 | for arrow in hprob_arrows: 252 | if arrow[0] not in hprob_nodes: 253 | hprob_nodes.append(arrow[0]) 254 | if arrow[1] not in hprob_nodes: 255 | hprob_nodes.append(arrow[1]) 256 | 257 | hprob_nodes = sorted(hprob_nodes, key=lambda x: x.time) 258 | if clean_dir: 259 | nd_to_clean_ztz = self.build_node_to_clean_ztz_dict(clean_dir) 260 | else: 261 | nd_to_clean_ztz = None 262 | nd_to_simple_ztz = self.build_node_to_simple_ztz_dict(simp_dir) 263 | 264 | for nd in hprob_nodes: 265 | print(color.GREEN + color.BOLD + node_str(nd) + ":" + color.END) 266 | ztz0 = "" 267 | if nd_to_clean_ztz: 268 | ztz0 = nd_to_clean_ztz[nd] 269 | print("(FULL)", ztz0) 270 | print("(PART)", nd_to_simple_ztz[nd]) 271 | 272 | @staticmethod 273 | def draw_dot(s, j_embed): 274 | """ 275 | This method draws a dot string. 276 | 277 | Using display(s) will draw the graph but will not embed it permanently 278 | in the notebook. To embed it permanently, must generate temporary image 279 | file and use Image().display(s) 280 | 281 | Parameters 282 | ---------- 283 | s: output of graphviz Source(dot_str) 284 | j_embed: bool 285 | True iff want to embed image in jupyter notebook. If you are 286 | using a python terminal instead of a jupyter notebook, 287 | only j_embed=False will draw image. 
288 | 289 | Returns 290 | ------- 291 | None 292 | """ 293 | x = s.render("tempo", format='png', view=False) 294 | if j_embed: 295 | display(Image(x)) 296 | else: 297 | open_image("tempo.png").show() 298 | 299 | def draw(self, prob_acc_thold, nsam_thold, jupyter=False): 300 | """ 301 | This method draws the graph for self. Only arrows with 302 | `prob_acceptance` >= `prob_acc_thold` are drawn. 303 | 304 | Parameters 305 | ---------- 306 | prob_acc_thold: float 307 | nsam_thold: int 308 | jupyter: bool 309 | 310 | Returns 311 | ------- 312 | None 313 | 314 | """ 315 | hprob_arrows = self.build_high_prob_acc_arrows( 316 | prob_acc_thold, nsam_thold) 317 | 318 | dot = "digraph {\n" 319 | for arrow in hprob_arrows: 320 | prob_acc, nsam = get_prob_acc_and_nsam( 321 | *self.arrow_to_acc_rej_nums[arrow]) 322 | X = '"' + str(prob_acc) + " (" + str(nsam) + ")" + '"' 323 | dot += '"' + node_str(arrow[0]) + '"' + "->" + \ 324 | '"' + node_str(arrow[1]) + '"' + \ 325 | ' [label=' + X + "];\n" 326 | dot += 'labelloc="b";\n' 327 | dot += 'label="' + self.m_title + '";\n' 328 | dot += "}\n" 329 | # print("vvbn", dot) 330 | Dag.draw_dot(gv.Source(dot), j_embed=jupyter) 331 | 332 | 333 | if __name__ == "__main__": 334 | def main1(prob_acc_thold, nsam_thold, draw): 335 | dag_dir = "short_stories_dag_atlas" 336 | simp_dir = "short_stories_simp" 337 | clean_dir = "short_stories_clean" 338 | file_names = [file_name for 339 | file_name in my_listdir(dag_dir)[0:3]] 340 | dags = [] 341 | for fname in file_names: 342 | path = dag_dir + "/" + fname 343 | # print("ghty", path) 344 | with open(path, "rb") as f: 345 | dag = pik.load(f) 346 | dags.append(dag) 347 | for dag in dags: 348 | print("==================================") 349 | print(dag.m_title) 350 | hprob_arrows = dag.build_high_prob_acc_arrows( 351 | prob_acc_thold, nsam_thold) 352 | print({arrow_str(arrow): 353 | dag.arrow_to_acc_rej_nums[arrow] \ 354 | for arrow in hprob_arrows}) 355 | print() 356 | if draw: 357 | dag.draw(prob_acc_thold, nsam_thold) 358 | dag.print_map_legend(clean_dir, simp_dir, prob_acc_thold) 359 | 360 | 361 | main1(prob_acc_thold=.90, nsam_thold=2, draw=True) 362 | -------------------------------------------------------------------------------- /spell_checking.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains functions for spell-checking movie scripts (or short 4 | stories) 5 | 6 | input directory: m_scripts_clean or short_stories_clean 7 | output directory: m_scripts_spell or short_stories_spell 8 | 9 | Refs: 10 | https://pyspellchecker.readthedocs.io/en/latest/code.html 11 | 12 | Spell checkers that don't take context into consideration don't work too 13 | well. They replace very infrequent words (probably a misspelled word) by 14 | more frequent ones---a very error prone stategy. Spell checkers that do take 15 | context into consideration are better, but we didn't have access to them. 16 | 17 | This is a very conservative spell checker that doesn't know about context. 18 | We make its corrections conservative by constraining it to follow the 19 | following rules. 20 | 21 | 1. It doesn't change the spelling if a word starts with a capital letter. 22 | 23 | 2. if the word ends in "s" or "ed", it only considers replacements that 24 | also end in "s" or "ed" 25 | 26 | 3. it assumes that the first 2 letters of all words are always correct. 27 | 28 | 4. It retains the capitalization of the first letter of a word. 29 | 30 | 5. It retains punctuation 31 | 32 | 6. 
When looking for a double letter that should be a single letter or 33 | vice versa, it only considers guesses that have the same set of characters 34 | (e.g., "pool" and "pol") 35 | 36 | This spell checker also uses two word-to-reps dictionaries: a global one 37 | compiled from global usage, and a local one compiled from the local document 38 | (i.e., movie script or short story) being corrected. 39 | 40 | This spell checker also uses two agents (WordGuess objects) working 41 | separately to produce their best possible guess. 42 | 43 | """ 44 | from globals import * 45 | from spellchecker import SpellChecker # package is called pyspellchecker 46 | import os 47 | import re 48 | from WordGuesser import * 49 | from collections import defaultdict 50 | from utils import * 51 | 52 | 53 | def has_double_letter(word): 54 | """ 55 | This method returns True iff `word` has two consecutive letters that are 56 | the same. 57 | 58 | Parameters 59 | ---------- 60 | word: str 61 | 62 | Returns 63 | ------- 64 | bool 65 | 66 | """ 67 | pattern = r'(\w)\1' 68 | match = re.search(pattern, word) 69 | if match: 70 | return True 71 | else: 72 | return False 73 | 74 | 75 | def fancy_split(in_ztz): 76 | """ 77 | This method first adds a white space before and after punctuation marks 78 | in `in_ztz`, then it applies a `split()` on the new sentence and returns 79 | the list generated by the split(). 80 | 81 | Parameters 82 | ---------- 83 | in_ztz: str 84 | 85 | Returns 86 | ------- 87 | list[str] 88 | 89 | """ 90 | # Match any pattern that is not a word character 91 | # or a white space. 92 | # This is the same as a punctuation mark. 93 | punctuation_pattern = re.compile(r'([^\w\s])+') 94 | # add a whitespace before and after each punctuation mark 95 | in_ztz0 = punctuation_pattern.sub(r' \1 ', in_ztz) 96 | return in_ztz0.split() 97 | 98 | 99 | def get_word_to_reps(in_file_path): 100 | """ 101 | This method returns a dictionary `word_to_reps` and an int 102 | `local_word_count`. `word_to_reps` is a dictionary mapping each word to 103 | its number of repetitions, for the file located at `in_file_path`. 104 | The method also returns an int `local_word_count` which equals the number 105 | of distinct words in the file located at `in_file_path`. 106 | 107 | 108 | Parameters 109 | ---------- 110 | in_file_path: str 111 | 112 | Returns 113 | ------- 114 | dict[str, int], int 115 | 116 | """ 117 | # tempo dictionary words are lower case 118 | word_to_reps = defaultdict(lambda: 0) 119 | with open(in_file_path, "r") as f: 120 | local_word_count = 0 121 | for line in f: 122 | words = fancy_split(line) 123 | for word in words: 124 | word = word.lower() 125 | if word.isalpha() and len(word) >= 2: 126 | local_word_count += 1 127 | if word in word_to_reps: 128 | word_to_reps[word] += 1 129 | else: 130 | word_to_reps[word] = 1 131 | 132 | return word_to_reps, local_word_count 133 | 134 | 135 | def get_corrected_sentence(in_ztz, 136 | global_checker, 137 | error_type, 138 | word_to_reps=None, 139 | local_word_count=None): 140 | """ 141 | This method takes a sentence `in_zstz` as input and returns a corrected 142 | sentence. It uses two dictionaries to guess an answer: global_checker, 143 | word_to_reps. 144 | 145 | If `word_to_reps` is kept at None, no local dictionary is used. The 146 | function `get_word_to_reps()` returns both `word_to_reps` and 147 | `local_word_count`. 
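    For example (a hypothetical call; which words actually get corrected
    depends on the global dictionary and on SPELLING_CORRECTION_RISK):

        checker = SpellChecker(distance=1)
        corr_ztz, changes = get_corrected_sentence(
            "It was a beautifull day.", checker, error_type="all")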
148 | 149 | `error_type` must be in the list ["tt", "random", "all"] 150 | 151 | Parameters 152 | ---------- 153 | in_ztz: str 154 | global_checker: SpellChecker 155 | error_type: str 156 | must be in ["tt", "random", "all"] 157 | word_to_reps: dict[str, int] 158 | local_word_count: int 159 | 160 | Returns 161 | ------- 162 | str, list[tuple(str, str)] 163 | 164 | """ 165 | if word_to_reps: 166 | assert local_word_count 167 | 168 | def implies(x, y): 169 | return (not x) or y 170 | 171 | words = fancy_split(in_ztz) 172 | # print("dfgh", words) 173 | best_guesses = [] 174 | changes = [] 175 | for word in words: 176 | capitalized = word[0].isupper() 177 | word = word.lower() 178 | best_guess = word 179 | prob_global_for_word = global_checker.word_usage_frequency(word) 180 | if word.isalpha() and len(word) >= 2 and \ 181 | prob_global_for_word < SPELLING_CORRECTION_RISK \ 182 | and not capitalized: 183 | word_guessers = {} 184 | simple_error_types = ["tt", "random"] 185 | if error_type in simple_error_types: 186 | word_guessers[error_type] = \ 187 | WordGuesser(word, global_checker, 188 | word_to_reps, local_word_count) 189 | if error_type == "all": 190 | for err in simple_error_types: 191 | word_guessers[err] = \ 192 | WordGuesser(word, global_checker, 193 | word_to_reps, local_word_count) 194 | assert word_guessers 195 | 196 | for guess in global_checker.edit_distance_1(word): 197 | cond1 = (guess[0:2] == word[0:2]) 198 | cond2a = implies(word[-1] == "s", guess[-1] == "s") 199 | cond2b = implies(word[-2:] == "ed", guess[-2:] == "ed") 200 | 201 | if cond1 and cond2a and cond2b: 202 | # this fixes tt, ss, dd, ll, errors 203 | if error_type in ["tt", "all"]: 204 | cond4 = (has_double_letter(guess) or has_double_letter( 205 | word)) and (len(guess) != len(word)) and set( 206 | guess) == set(word) 207 | if cond4: 208 | word_guessers['tt'].do_update(guess) 209 | if error_type in ["random", "all"]: 210 | word_guessers["random"].do_update(guess) 211 | guesser0 = None 212 | prob0 = -1 213 | for guesser in word_guessers.values(): 214 | # print("fgyt", guesser) 215 | if guesser.prob_for_best_guess > prob0: 216 | guesser0 = guesser 217 | prob0 = guesser.prob_for_best_guess 218 | best_guess = guesser0.best_guess 219 | if capitalized: 220 | word = word[0].upper() + word[1:] 221 | best_guess = best_guess[0].upper() + best_guess[1:] 222 | best_guesses.append(best_guess) 223 | if word != best_guess: 224 | changes.append((word, best_guess)) 225 | 226 | return " ".join(best_guesses), changes 227 | 228 | 229 | def correct_this_file(in_dir, 230 | out_dir, 231 | file_name, 232 | error_type, 233 | verbose=True, 234 | use_local_dict=False): 235 | """ 236 | This method reads a file called `file_name` in the `in_dir` directory 237 | and creates a spelling corrected version in the `out_dir` directory. 
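    For example (a hypothetical call, assuming "wiltons-holiday.txt" exists
    in "short_stories_clean"):

        correct_this_file("short_stories_clean", "short_stories_spell",
                          "wiltons-holiday.txt", error_type="all",
                          verbose=False, use_local_dict=True)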
238 | 239 | in_dir and out_dir can be the same, but this will overwrite the files 240 | 241 | Parameters 242 | ---------- 243 | in_dir: str 244 | out_dir: str 245 | file_name: str 246 | error_type: str 247 | verbose: bool 248 | use_local_dict: bool 249 | 250 | Returns 251 | ------- 252 | None 253 | 254 | """ 255 | inpath = in_dir + "/" + file_name 256 | if out_dir: 257 | outpath = out_dir + "/" + file_name 258 | else: 259 | outpath = None 260 | 261 | global_checker = SpellChecker(distance=1) 262 | if use_local_dict: 263 | word_to_reps, local_word_count = get_word_to_reps(inpath) 264 | else: 265 | word_to_reps, local_word_count = None, None 266 | # print("nmjk", local_word_count, word_to_reps) 267 | 268 | # this didn't work. It merges TEMPO_DICT_FILE with global dict 269 | # instead of producing a dict solely from TEMP0_DICT_FILE 270 | # checker_local.word_frequency.load_dictionary("./" + TEMPO_DICT_FILE) 271 | 272 | if verbose: 273 | def print_probs(word1, word2): 274 | print() 275 | print("global probs:") 276 | print(word1, global_checker.word_usage_frequency(word1)) 277 | print(word2, global_checker.word_usage_frequency(word2)) 278 | print("local_probs:") 279 | if word_to_reps: 280 | print(word1, word_to_reps[word1]) 281 | print(word2, word_to_reps[word2]) 282 | else: 283 | print("N/A") 284 | print() 285 | 286 | print_probs("beautifull", "beautiful") 287 | print_probs("tomatos", "tomatoes") 288 | print_probs("mitty", "misty") 289 | 290 | corrected_lines = [] 291 | all_changes = [] 292 | with open(inpath, "r") as f: 293 | for line in f: 294 | corr_line, changes = get_corrected_sentence( 295 | line, global_checker, error_type, 296 | word_to_reps, local_word_count) 297 | corrected_lines.append(corr_line) 298 | all_changes += changes 299 | if verbose: 300 | print(line.strip()) 301 | print(corr_line) 302 | print() 303 | print("all changes:", all_changes) 304 | 305 | if outpath: 306 | with open(outpath, "w") as f: 307 | for corr_line in corrected_lines: 308 | f.write(corr_line + "\n") 309 | 310 | 311 | def correct_this_batch_of_files(in_dir, 312 | out_dir, 313 | batch_file_names, 314 | error_type, 315 | verbose=True, 316 | use_local_dict=False): 317 | """ 318 | This method calls the method `correct_this_file` for all the file names 319 | in the list of file names `batch_file_names`. 320 | 321 | Parameters 322 | ---------- 323 | in_dir: str 324 | out_dir: str 325 | batch_file_names: list[str] 326 | error_type: str 327 | verbose: bool 328 | use_local_dict: bool 329 | 330 | Returns 331 | ------- 332 | None 333 | 334 | """ 335 | all_file_names = my_listdir(in_dir) 336 | assert set(batch_file_names).issubset(set(all_file_names)) 337 | for file_name in batch_file_names: 338 | i = all_file_names.index(file_name) 339 | print(str(i + 1) + ".") 340 | print(file_name) 341 | correct_this_file(in_dir, out_dir, file_name, 342 | error_type, 343 | verbose, 344 | use_local_dict) 345 | 346 | 347 | if __name__ == "__main__": 348 | def main1(use_local_dict, error_type): 349 | print("**************************") 350 | print("use_local_dict=", use_local_dict) 351 | print("error_type=", error_type) 352 | print("SPELLING_CORRECTION_RISK=", SPELLING_CORRECTION_RISK) 353 | print() 354 | 355 | in_dir = "." 
356 | out_dir = "" # if empty out_dir, won't write to a file 357 | file_name = "spell_checking_test.txt" 358 | 359 | correct_this_file(in_dir, 360 | out_dir, 361 | file_name, 362 | error_type, 363 | verbose=True, 364 | use_local_dict=use_local_dict) 365 | 366 | 367 | def main2(use_local_dict, error_type): 368 | print("**************************") 369 | print("use_local_dict=", use_local_dict) 370 | print("error_type=", error_type) 371 | print("SPELLING_CORRECTION_RISK=", SPELLING_CORRECTION_RISK) 372 | print() 373 | 374 | in_dir = "short_stories_clean" 375 | out_dir = "short_stories_spell" 376 | batch_file_names = my_listdir(in_dir) 377 | correct_this_batch_of_files(in_dir, 378 | out_dir, 379 | batch_file_names, 380 | error_type=error_type, 381 | verbose=False, 382 | use_local_dict=use_local_dict) 383 | 384 | 385 | def main3(use_local_dict, error_type): 386 | print("**************************") 387 | print("use_local_dict=", use_local_dict) 388 | print("error_type=", error_type) 389 | print("SPELLING_CORRECTION_RISK=", SPELLING_CORRECTION_RISK) 390 | print() 391 | 392 | remove_dialogs = False 393 | in_dir = CLEAN_DIR if not remove_dialogs else CLEAN_RD_DIR 394 | out_dir = SPELL_DIR if not remove_dialogs else SPELL_RD_DIR 395 | batch_file_names = my_listdir(in_dir)[0:3] 396 | correct_this_batch_of_files(in_dir, 397 | out_dir, 398 | batch_file_names, 399 | error_type=error_type, 400 | verbose=False, 401 | use_local_dict=use_local_dict) 402 | 403 | 404 | # main1(use_local_dict=True, error_type="all") 405 | # main2(use_local_dict=True, error_type="all") 406 | main3(use_local_dict=True, error_type="all") 407 | -------------------------------------------------------------------------------- /white_paper/bayesuvius.sty: -------------------------------------------------------------------------------- 1 | \usepackage{graphicx} %standard package. Note graphics0$, then 554 | 555 | \beqa 556 | P(y. | \cald \rvx.=x.)&=&\sum_{m.} 557 | \underbrace{\left[\sum_{x'.} 558 | P(y.|x'., m.)P(x'.)\right]}_ 559 | {P(y.|\cald \rvm.=m.)} 560 | \underbrace{P(m.|x.)}_ 561 | {P(m.|\cald \rvx.=x.)} 562 | \\ 563 | &=& 564 | \xymatrix{ 565 | &\sum x'.\ar[dr] 566 | \\ 567 | x.´\ar[r] 568 | &\sum m.\ar[r]&y. 569 | } 570 | \eeqa 571 | where $\sum x'.$ and 572 | $\sum m.$ 573 | means nodes 574 | $\rvx'.$ and $\rvm.$ 575 | are summed over. 
576 | } 577 | 578 | 579 | %Symmetry 580 | \newcommand{\symrule}[0]{ 581 | $\rva\perp_P\rvb\implies \rvb\perp_P\rva$} 582 | 583 | \newcommand{\symruleH}[0]{ 584 | $H(\rva:\rvb)=0\implies H(\rvb:\rva)=0$} 585 | 586 | %Decomposition 587 | \newcommand{\decrule}[0]{ 588 | $\rva\perp_P\rvb, \rvc\implies 589 | \rva\perp_P\rvb \text{ and } \rva\perp_P\rvc$} 590 | 591 | \newcommand{\decruleH}[0]{ 592 | $H(\rva:\rvb, \rvc)=0\implies 593 | H(\rva:\rvb)=0 \text{ and } H(\rva:\rvc)=0$} 594 | 595 | %Weak Union 596 | \newcommand{\wearule}[0]{ 597 | $\rva\perp_P \rvb, \rvc \implies 598 | \rva\perp_P\rvb|\rvc\text{ and }\rva\perp_P\rvc|\rvb$} 599 | 600 | \newcommand{\wearuleH}[0]{ 601 | $H(\rva:\rvb, \rvc)=0 \implies 602 | H(\rva:\rvb|\rvc)=0\text{ and }H(\rva:\rvc|\rvb)=0$} 603 | 604 | %Contraction 605 | \newcommand{\conrule}[0]{ 606 | $\rva\perp_P\rvb|\rvc\text{ and }\rva\perp_P \rvc 607 | \implies \rva\perp_P \rvb, \rvc$} 608 | 609 | \newcommand{\conruleH}[0]{ 610 | $H(\rva:\rvb|\rvc)=0\text{ and }H(\rva:\rvc)=0 611 | \implies H(\rva:\rvb, \rvc)=0$} 612 | 613 | %Intersection 614 | \newcommand{\intrule}[0]{ 615 | $\rva\perp_P\rvb|\rvc, \rvd\text{ and } 616 | \rva\perp_P \rvd|\rvc, \rvb\implies 617 | \rva\perp_P \rvb,\rvd|\rvc$} 618 | 619 | \newcommand{\intruleH}[0]{ 620 | $H(\rva:\rvb|\rvc, \rvd)=0\text{ and } 621 | H(\rva:\rvd|\rvc, \rvb)=0\implies 622 | H(\rva:\rvb,\rvd|\rvc)=0$} 623 | 624 | \newcommand{\dotbarmu}[0]{{\cdot|\mu}} 625 | \newcommand{\dotmu}[0]{{\cdot, \mu}} 626 | \newcommand{\kbarmu}[0]{{k|\mu}} 627 | \newcommand{\kmu}[0]{{k,\mu}} 628 | \newcommand{\plusbarmu}[0]{{+|\mu}} 629 | \newcommand{\plusmu}[0]{{+,\mu}} 630 | 631 | \newcommand{\bnlearn}[0]{{\tt bnlearn\;}} 632 | 633 | \newcommand{\sqsig}[0]{{[\sigma]}} 634 | 635 | \newcommand{\misscellone}[0]{ 636 | \begin{array}{c} 637 | \frac{1}{nsam} 638 | P(x_0=0, x_2=0\cond x_1=1, \theta) 639 | \\ 640 | \frac{1}{nsam} 641 | P(x_0=0, x_2=1\cond x_1=1, \theta) 642 | \\ 643 | \frac{1}{nsam} 644 | P(x_0=1, x_2=0\cond x_1=1, \theta) 645 | \\ 646 | \frac{1}{nsam} 647 | P(x_0=1, x_2=1\cond x_1=1, \theta) 648 | \end{array} 649 | } 650 | 651 | \newcommand{\misscelltwo}[0]{ 652 | \begin{array}{c} 653 | \frac{1}{nsam} 654 | P(x_1=0\cond x_0=0,x_2=1, \theta) 655 | \\ 656 | \frac{1}{nsam} 657 | P(x_1=1\cond x_0=0,x_2=1, \theta) 658 | \end{array} 659 | } 660 | 661 | 662 | \newcommand{\td}[0]{{\TIL{d}}} 663 | \newcommand{\rvtd}[0]{{\ul{\TIL{d}}}} 664 | \newcommand{\tx}[0]{{\TIL{x}}} 665 | \newcommand{\tmu}[0]{{\TIL{\mu}}} 666 | \newcommand{\rvtx}[0]{{\ul{\TIL{x}}}} 667 | 668 | \newcommand{\mlarr}[0]{\xrightarrow{\rm ML-fit}} 669 | \newcommand{\lrarr}[0]{\xrightarrow{\rm LR-fit}} 670 | 671 | \newcommand{\setprob}[3] 672 | {{\begin{array}{c}S=\{#1\} 673 | \\P(S)=#2\\ \haty(x^\s_S)=\$#3 K 674 | \end{array}}} 675 | 676 | \newcommand{\Gno}[0]{\xymatrix{\;\ar[r]|\parallel_G&}} 677 | \newcommand{\Gyes}[0]{\xymatrix{\;\ar[r]_G&}} 678 | 679 | \newcommand{\calypso}[0]{\ol{\caly}} 680 | 681 | \newcommand{\SeqBdoorDef}[0]{ 682 | Suppose that we have access to data 683 | that allows us to 684 | estimate a probability 685 | distribution 686 | $P(x^n, y, z^n)$. 687 | Hence, the variables 688 | $\rvx^n, \rvy, \rvz^n$ are 689 | ALL observed (i.e, not hidden). 
690 | Then we say that the 691 | the multinode 692 | of ``covariates" $\rvz^n$ 693 | satisfies the 694 | {\bf sequential backdoor (SBD) adjustment criterion} 695 | relative to $(\rvx^n, \rvy)$ 696 | if for all $t\in\{0,1, \ldots, n-1\}$, 697 | 698 | \begin{enumerate} 699 | \item 700 | $\rvy\perp\rvx_t| 701 | \underbrace{(\rvx_0, \rvx_1, \ldots,\rvx_{t-1}, 702 | \rvz_0, \rvz_1, \ldots, \rvz_t)} 703 | _{\text{Past of $\rvx_t$}}$ 704 | in $\call_{\rvx_{t}} 705 | \cald_{\rvx_{t+1},\rvx_{t+2} 706 | ,\ldots, \rvx_{n-1}}G$. 707 | \item 708 | $\rvz_t \cap de(\rvx_t)=\emptyset$. 709 | \end{enumerate} 710 | } 711 | 712 | \newcommand{\SeqBdoorClaim}[0]{ 713 | If $\rvz^n$ satisfies the 714 | sequential backdoor criterion relative to 715 | $(\rvx^n, \rvy)$, then 716 | 717 | \beq 718 | P(y | \cald \rvx^n=x^n)= 719 | \calq(y|x^n) 720 | \;, 721 | \eeq 722 | where $\calq(y|x^n)$ 723 | is defined by 724 | Eq.(\ref{def-q-y-xn-seqbdoor}). 725 | } 726 | -------------------------------------------------------------------------------- /simp_deprecated/simp_stanford.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This file contains one of several implementations of the function 4 | `simplify_ztz(sentence, verbose=False)` that we considered. 5 | 6 | Refs: 7 | https://github.com/garain/Sentence-Simplification 8 | ztz = sentence 9 | 10 | """ 11 | import nltk 12 | from nltk.tree import ParentedTree 13 | from anytree import NodeMixin, Node, AnyNode, RenderTree 14 | import re 15 | import os 16 | import subprocess 17 | from globals import * 18 | 19 | version = subprocess.check_output( 20 | ['java', '-version'], stderr=subprocess.STDOUT) 21 | print("java version=\t", version) 22 | print("CLASSPATH=\t", os.environ['CLASSPATH']) 23 | print("STANFORD_MODELS=\t", os.environ['STANFORD_MODELS']) 24 | print("JAVA_HOME=\t", os.environ['JAVA_HOME']) 25 | 26 | from nltk.parse.stanford import StanfordParser 27 | 28 | parser = StanfordParser() 29 | 30 | 31 | def simplify_ztz(sentence0, verbose=False): 32 | """ 33 | This method simplifies the sentence `sentence`. It returns a list of 34 | simple sentences extracted from the input sentence. 35 | 36 | Parameters 37 | ---------- 38 | sentence0: str 39 | verbose: bool 40 | kwargs: dict[] 41 | 42 | Returns 43 | ------- 44 | list[str] 45 | 46 | """ 47 | 48 | simple_ztz_list = [] 49 | success = False 50 | 51 | # split = [] 52 | # simple_sent = [] 53 | # index = [] 54 | # index1 = 0 55 | n = 0 56 | but = 0 57 | 58 | # scount = 0 59 | # parts = [] 60 | # ht_3_last_obj = [] 61 | 62 | def SBAR_simplify(sent): 63 | 64 | def make_tree(tree, t, sent_list): 65 | # this fn. 
converts nltk tree to anytree 66 | if tree not in sent_list: 67 | ttt = AnyNode(id=str(tree.label()), parent=t) 68 | for tt in tree: 69 | make_tree(tt, ttt, sent_list) 70 | else: 71 | AnyNode(id=str(tree), parent=t) 72 | 73 | # SBAR CASE 74 | def find_sbar(t): 75 | if t.id == 'SBAR': 76 | global sbar 77 | sbar = t 78 | for tt in t.children: 79 | find_sbar(tt) 80 | 81 | def find_vp_in_sbar(t): 82 | if t.id == 'VP': 83 | global vp_sbar 84 | vp_sbar.append(t) 85 | for tt in t.children: 86 | find_vp_in_sbar(tt) 87 | 88 | def find_np_in_sbar(t): 89 | global f 90 | global ff 91 | if t.id == 'VP': 92 | ff = False 93 | if (t.id == 'NP') and f == True and ff == True: 94 | global np_sbar 95 | np_sbar = t 96 | f = False 97 | for tt in t.children: 98 | find_np_in_sbar(tt) 99 | 100 | def find_vp(t): 101 | if t.id == 'SBAR': 102 | return 103 | global f 104 | if t.id == 'VP' and f == True: 105 | global vp 106 | vp = t 107 | f = False 108 | for tt in t.children: 109 | find_vp(tt) 110 | 111 | def find_np(t): 112 | if t.id == 'SBAR': 113 | return 114 | global f 115 | if t.id == 'NP' and f == True: 116 | global np 117 | np = t 118 | f = False 119 | for tt in t.children: 120 | find_np(tt) 121 | 122 | def find_vbz(t): 123 | if t.id == 'SBAR': 124 | return 125 | global f 126 | if t.id == 'VBZ' and f == True: 127 | global vbz 128 | vbz = t.children[0].id 129 | f = False 130 | for tt in t.children: 131 | find_vbz(tt) 132 | 133 | def make_sent(t): 134 | global simple_sentences 135 | if t.id in sent_list: 136 | simple_sentences[-1].append(t.id) 137 | for tt in t.children: 138 | make_sent(tt) 139 | 140 | # sent=sent8 141 | 142 | parse_trees = parser.raw_parse(sent) 143 | global sent_list 144 | sent_list = [s for s in sent.split()] 145 | tree = next(parse_trees)[0] 146 | # tree.draw() 147 | t = AnyNode(id='ROOT') 148 | make_tree(tree, t, sent_list) 149 | global sbar 150 | sbar = t 151 | global vp_sbar 152 | global f 153 | global ff 154 | global np_sbar 155 | global vp 156 | global np 157 | global vbz 158 | vp_sbar = [] 159 | vp = t 160 | np = t 161 | vbz = 'bn2' 162 | np_sbar = t 163 | find_sbar(t) 164 | find_vp_in_sbar(sbar) 165 | f = True 166 | ff = True 167 | find_np_in_sbar(sbar) 168 | f = True 169 | find_vp(t) 170 | f = True 171 | find_np(t) 172 | f = True 173 | find_vbz(t) 174 | global simple_sentences 175 | simple_sentences = [] 176 | simple_sentences.append([]) 177 | make_sent(np) 178 | make_sent(vp) 179 | for i in range(len(vp_sbar)): 180 | simple_sentences.append([]) 181 | if np_sbar == t: 182 | make_sent(np) 183 | else: 184 | make_sent(np_sbar) 185 | if vbz != 'bn2': 186 | simple_sentences[-1].append(vbz) 187 | make_sent(vp_sbar[i]) 188 | # print (simple_sentences) 189 | simple = [] 190 | for sentence in simple_sentences: 191 | string = '' 192 | for word in sentence: 193 | string += word + ' ' 194 | string += '.' 195 | simple.append(string) 196 | 197 | def is_any_sbar(t): 198 | if t.id == 'SBAR': 199 | global f 200 | f = True 201 | return 202 | for tt in t.children: 203 | is_any_sbar(tt) 204 | 205 | f = False 206 | is_any_sbar(t) 207 | if f == False: 208 | simple = [sent] 209 | return simple 210 | 211 | # print(pos_tagged) 212 | # SBAR functions start here 213 | def make_tree_sbar(tree, t, sent_list): 214 | # this fn. 
converts nltk tree to anytree 215 | if tree not in sent_list: 216 | ttt = AnyNode(id=str(tree.label()), parent=t) 217 | for tt in tree: 218 | make_tree_sbar(tt, ttt, sent_list) 219 | else: 220 | AnyNode(id=str(tree), parent=t) 221 | 222 | def find_sbar(t): 223 | if t.id == 'SBAR': 224 | global sbar 225 | sbar = t 226 | for tt in t.children: 227 | find_sbar(tt) 228 | 229 | def find_vp_in_sbar(t): 230 | if t.id == 'VP': 231 | global vp_sbar 232 | vp_sbar = t 233 | for tt in t.children: 234 | find_vp_in_sbar(tt) 235 | 236 | def find_vp(t): 237 | if t.id == 'SBAR': 238 | return 239 | global f 240 | if t.id == 'VP' and f == True: 241 | global vp 242 | vp = t 243 | f = False 244 | for tt in t.children: 245 | find_vp(tt) 246 | 247 | def find_np(t): 248 | if t.id == 'SBAR': 249 | return 250 | global f 251 | if t.id == 'NP' and f == True: 252 | global np 253 | np = t 254 | f = False 255 | for tt in t.children: 256 | find_np(tt) 257 | 258 | def find_vbz(t): 259 | if t.id == 'SBAR': 260 | return 261 | global f 262 | if t.id == 'VBZ' and f == True: 263 | global vbz 264 | vbz = t.children[0].id 265 | f = False 266 | for tt in t.children: 267 | find_vbz(tt) 268 | 269 | def make_sent(t): 270 | global simple_sentences 271 | if t.id in sent_list: 272 | simple_sentences[-1].append(t.id) 273 | for tt in t.children: 274 | make_sent(tt) 275 | 276 | # SBAR functions end here 277 | # Multiple CC functions start here 278 | def pos_tag(tokenized_sent): 279 | return nltk.pos_tag(tokenized_sent) 280 | 281 | def has_conj(tagged_sent): 282 | cc_list = [('and', 'CC'), ('but', 'CC')] 283 | for cc_pair in cc_list: 284 | if cc_pair in tagged_sent: 285 | return True 286 | return False 287 | 288 | def split_needed(sent_list): 289 | for sent in sent_list: 290 | if has_conj(pos_tag(tokenize(sent))): 291 | return True 292 | return False 293 | 294 | def do_split(sent, cc_tuple): 295 | pos_tagged = pos_tag(tokenize(sent)) 296 | tree = next(parser.tagged_parse(pos_tagged)) 297 | tree1 = ParentedTree.convert(tree) 298 | # tree.draw() 299 | count = 0 300 | m = 0 301 | for t in tree1.subtrees(): 302 | if t.label() == 'PP': 303 | count = count + 1 304 | 305 | index = [] 306 | index1 = 0 307 | if count > 0 and (('to') not in tokenized_sent and ( 308 | 'washed') not in tokenized_sent) and ( 309 | tokenized_sent.count(",") < 2): 310 | for i in range(len(pos_tagged) - 3): 311 | if (pos_tagged[i][1] == 'VBD' or pos_tagged[i][1] == 'VBZ') and \ 312 | pos_tagged[i + 1][1] != 'VBG' and pos_tagged[i + 3][ 313 | 1] != 'CC' and pos_tagged[i + 1][1] != 'NNP' and \ 314 | pos_tagged[i - 1][1] != 'CC': 315 | pos_tagged.insert(i + 1, (',', ',')) 316 | 317 | for j in range(len(pos_tagged)): 318 | if pos_tagged[j][1] == 'CC': 319 | index.append(j) 320 | 321 | for t in tree1.subtrees(): 322 | if t.label() == 'SBAR': 323 | m = m + 1 324 | if len(index) > 0 and count > 0 and m == 0: 325 | c = 0 326 | for i in range(len(index)): 327 | pos_tagged.insert(index[i] + c, (',', ',')) 328 | c = c + 1 329 | if m > 0: 330 | for j in range(len(pos_tagged)): 331 | if pos_tagged[j][1] == 'CC': 332 | index1 = j 333 | 334 | if (index1 > 0 and m > 0) and count == 0: 335 | pos_tagged.insert(index1, (' ,', ',')) # ', 'is used 336 | pos_tagged.insert(index1 + 2, (', ', ',')) # ' ,' is used 337 | # print(pos_tagged) 338 | tree = next(parser.tagged_parse(pos_tagged)) 339 | p_tree = ParentedTree.convert(tree) 340 | 341 | leaf_values = p_tree.leaves() 342 | parts = [] 343 | ht_3_last_obj = [] 344 | 345 | if cc_tuple in pos_tagged: 346 | leaf_index = 
leaf_values.index(cc_tuple[0]) 347 | tree_location = p_tree.leaf_treeposition(leaf_index) 348 | parent = p_tree[tree_location[:-2]] 349 | # print(parent.height()) 350 | 351 | if parent.height() == 3: 352 | # find the noun being referred to 353 | for subtree in reversed(list(parent.subtrees())): 354 | if subtree.parent() == parent: 355 | if subtree.label() == 'NN' or subtree.label() == 'NNS': 356 | ht_3_last_obj = subtree.leaves() + ht_3_last_obj 357 | del p_tree[subtree.treeposition()] 358 | # print("ht 3 last obj -> ", ht_3_last_obj) 359 | part = [] 360 | for subtree in reversed(list(parent.subtrees())): 361 | if subtree.parent() == parent: 362 | # print(subtree) 363 | if subtree.label() != ',' and subtree.label() != 'CC': 364 | part = subtree.leaves() + part 365 | else: 366 | parts.append(part + ht_3_last_obj) 367 | part = [] 368 | del p_tree[subtree.treeposition()] 369 | parts.append(part + ht_3_last_obj) 370 | # print('parent', parent) 371 | # print('treeloc', tree_location) 372 | parent.append(ParentedTree('INSRT', ['*'])) 373 | 374 | else: 375 | for subtree in reversed(list(parent.subtrees())): 376 | if subtree.parent() == parent: 377 | # print(subtree) 378 | if subtree.label() != ',' and subtree.label() != 'CC': 379 | parts.append(subtree.leaves() + ht_3_last_obj) 380 | del p_tree[subtree.treeposition()] 381 | # print('parent', parent) 382 | # print('treeloc', tree_location) 383 | parent.append(ParentedTree('INSRT', ['*'])) 384 | 385 | # p_tree.draw() 386 | # print(parts) 387 | 388 | split = [] 389 | rem = p_tree.leaves() 390 | start_idx = rem.index('*') 391 | 392 | for part in reversed(parts): 393 | offset = start_idx 394 | r_clone = rem.copy() 395 | del r_clone[offset] 396 | for i, word in enumerate(part): 397 | r_clone.insert(offset + i, word) 398 | split.append(r_clone) 399 | 400 | # print("split", split) 401 | 402 | split = [" ".join(sent) for sent in split] 403 | 404 | return split 405 | 406 | def split_util(sent): 407 | cc_list = [('and', 'CC'), ('but', 'CC')] 408 | for cc_pair in cc_list: 409 | if cc_pair in pos_tag(tokenize(sent)): 410 | return do_split(sent, cc_pair) 411 | return sent 412 | 413 | def rem_dup(list): 414 | final = [] 415 | for item in list: 416 | if item not in final: 417 | final.append(item) 418 | return final 419 | 420 | def simplify(sent): 421 | initial = [sent] 422 | final = [] 423 | 424 | while (split_needed(initial)): 425 | final = [] 426 | while (initial): 427 | sent = initial.pop(0) 428 | if (split_needed([sent])): 429 | for split_sent in reversed(split_util(sent)): 430 | final.append(split_sent) 431 | else: 432 | final.append(sent) 433 | # print("final -> ", final) 434 | initial = final.copy() 435 | 436 | final = rem_dup(final) 437 | final = list(reversed(final)) 438 | # print(final) 439 | 440 | return final 441 | 442 | def tokenize(sent): 443 | tokenized_sent = nltk.word_tokenize(sent) 444 | if ('If') in tokenized_sent and ('then') in tokenized_sent: 445 | tokenized_sent.remove('If') 446 | tokenized_sent.insert(tokenized_sent.index('then'), 'and') 447 | tokenized_sent.remove('then') 448 | if ('because') in tokenized_sent: 449 | tokenized_sent.insert(tokenized_sent.index('because'), 450 | (',')) # ', 'is used 451 | tokenized_sent.insert(tokenized_sent.index('because') + 1, (',')) 452 | tokenized_sent.insert(tokenized_sent.index('because'), 'and') 453 | tokenized_sent.remove('because') 454 | if ('while') in tokenized_sent: 455 | tokenized_sent.insert(tokenized_sent.index('while'), 'and') 456 | tokenized_sent.remove('while') 457 | if ('which') in 
tokenized_sent: 458 | tokenized_sent.insert(tokenized_sent.index('which'), 'and') 459 | tokenized_sent.remove('which') 460 | if ('or') in tokenized_sent: 461 | tokenized_sent.insert(tokenized_sent.index('or'), 'and') 462 | tokenized_sent.remove('or') 463 | if ('who') in tokenized_sent: 464 | while (',') in tokenized_sent: 465 | tokenized_sent.insert(tokenized_sent.index(','), 'and') 466 | tokenized_sent.remove(',') 467 | tokenized_sent.insert(tokenized_sent.index('who'), 'and') 468 | tokenized_sent.remove('who') 469 | 470 | return tokenized_sent 471 | 472 | sentences = [sentence0.strip()] 473 | for sentence in sentences: 474 | if verbose: 475 | print("Complex Sentence: " + sentence) 476 | tokenized_sent = tokenize(sentence) 477 | # print(tokenized_sent) 478 | 479 | # parse_trees = parser1.tagged_parse(pos_tagged) 480 | 481 | pos_tagged = pos_tag(tokenized_sent) 482 | parse_trees = parser.tagged_parse(pos_tagged) 483 | tree = next(parse_trees) 484 | p_tree = ParentedTree.convert(tree) 485 | # p_tree.draw() 486 | 487 | leaf_values = p_tree.leaves() 488 | # print(leaf_values) 489 | for i in pos_tagged: 490 | if ('and') in i: 491 | n = n + 1 492 | 493 | if ('but') in i: 494 | but = but + 1 495 | tree1 = ParentedTree.convert(tree) 496 | # tree.draw() 497 | m = 0 498 | for t in tree1.subtrees(): 499 | if t.label() == 'SBAR': 500 | m = m + 1 501 | 502 | if (n + but) > 0: 503 | # tokenized_sent=nltk.word_tokenize(sent10) 504 | # pos_tagged=nltk.pos_tag(tokenized_sent) 505 | sent1 = sentence 506 | sent = " ".join(tokenize(sent1)) 507 | # print(sent) 508 | simplified = simplify(sent) 509 | for i in simplified: 510 | i = list(i) 511 | if ord(i[0]) >= 97 and ord(i[0]) <= 122: 512 | i[0] = chr(ord(i[0]) - 32) 513 | while i.count(",") > 0: 514 | # i.pop(i.index(",")) 515 | del (i[i.index(",")]) 516 | if (".") not in (i): 517 | if verbose: 518 | print("Simple sentence: " + "".join(i) + ".") 519 | simple_ztz_list.append("".join(i) + ".") 520 | success = True 521 | else: 522 | if verbose: 523 | print("Simple sentence: " + "".join(i)) 524 | simple_ztz_list.append("".join(i)) 525 | success = True 526 | n = 0 527 | but = 0 528 | # print("."), 529 | 530 | elif n == 0 and m > 0 and len(re.findall(r",", sentence)) == 0 and len( 531 | re.findall(r"While", sentence)) == 0: 532 | try: 533 | sent = sentence 534 | # print(sent) 535 | # print("Hello") 536 | tokenized_sent = tokenize(sent) 537 | pos_tagged = nltk.pos_tag(tokenized_sent) 538 | parse_trees = parser.tagged_parse(pos_tagged) 539 | sent_list = [s for s in sent.split()] 540 | tree = next(parse_trees)[0] 541 | # tree.draw() 542 | t = AnyNode(id='ROOT') 543 | make_tree_sbar(tree, t, sent_list) 544 | sbar = t 545 | vp_sbar = t 546 | vp = t 547 | np = t 548 | vbz = 'asvf' 549 | find_sbar(t) 550 | find_vp_in_sbar(sbar) 551 | f = True 552 | find_vp(t) 553 | f = True 554 | find_np(t) 555 | f = True 556 | find_vbz(t) 557 | simple_sentences = [] 558 | simple_sentences.append([]) 559 | make_sent(np) 560 | make_sent(vp) 561 | simple_sentences.append([]) 562 | make_sent(np) 563 | if vbz != 'asvf': 564 | simple_sentences[-1].append(vbz) 565 | make_sent(vp_sbar) 566 | for i in simple_sentences: 567 | i = list(i) 568 | 569 | # if ord(i[0])>=97 and ord(i[0])<=122: 570 | # i[0]=chr(ord(i[0])-32) 571 | 572 | while i.count(",") > 0: 573 | i.pop(i.index(",")) 574 | if (".") not in (i): 575 | if verbose: 576 | print("Simple sentence: " + " ".join(i) + ".") 577 | simple_ztz_list.append(" ".join(i) + ".") 578 | success = True 579 | else: 580 | if verbose: 581 | print("Simple 
sentence: " + " ".join(i)) 582 | simple_ztz_list.append(" ".join(i)) 583 | success = True 584 | # print("."), 585 | except: 586 | continue 587 | elif m > 0 and (len(re.findall(r",", sentence)) > 0 or len( 588 | re.findall(r"While", sentence)) > 0): 589 | try: 590 | # sent=re.sub(r",","",sentence) 591 | # print("Hello") 592 | tokenized_sent = tokenize(sentence) 593 | simple_sentences = SBAR_simplify(" ".join(tokenized_sent)) 594 | for i in simple_sentences: 595 | # i=list(i) 596 | 597 | # if ord(i[0])>=97 and ord(i[0])<=122: 598 | # i[0]=chr(ord(i[0])-32) 599 | 600 | # while i.count(",")>0: 601 | # i.pop(i.index(",")) 602 | if (".") not in (i): 603 | if verbose: 604 | print("Simple sentence: " + i) 605 | simple_ztz_list.append(i) 606 | success = True 607 | else: 608 | if verbose: 609 | print("Simple sentence: " + i) 610 | simple_ztz_list.append(i) 611 | success = True 612 | # print("."), 613 | except: 614 | continue 615 | if not success: 616 | simple_ztz_list.append(sentence0) 617 | return [ztz for ztz in simple_ztz_list if len(ztz) > 2] 618 | --------------------------------------------------------------------------------
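For orientation, here is a minimal, hypothetical driver for the deprecated simplifier above; it is a sketch, not part of the repository. It assumes the Stanford parser jars and models are installed and that CLASSPATH, STANFORD_MODELS, and JAVA_HOME are exported before the import, since simp_stanford.py reads those variables and instantiates StanfordParser at import time. It also assumes the script is run from the repository root so that simp_deprecated and globals.py are importable. The example sentence is made up.

    # Hypothetical usage sketch for the deprecated Stanford-based simplifier.
    # CLASSPATH, STANFORD_MODELS and JAVA_HOME must already be set, because
    # simp_stanford.py checks them and starts StanfordParser when imported.
    from simp_deprecated.simp_stanford import simplify_ztz

    ztz = "The robot collects trash and it befriends a small probe."

    # simplify_ztz() returns a list of simple sentences; if the sentence
    # cannot be split, it is returned unchanged as the only element.
    for simple in simplify_ztz(ztz, verbose=True):
        print(simple)

With verbose=True the module also prints the original "Complex Sentence" and each extracted "Simple sentence" as it goes, which is handy when comparing this implementation against the other simp_* variants.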