├── catch
│   ├── output
│   │   └── .gitkeep
│   ├── test
│   │   ├── wordcloud.png
│   │   ├── log.txt
│   │   ├── annotation_grep.txt
│   │   ├── annotation_wtags.txt
│   │   ├── annotation_invertedgrep.txt
│   │   ├── cyannotator.html
│   │   └── social_media_posts.txt
│   ├── params_catch.py
│   ├── README.md
│   └── catch.py
├── docs
│   ├── _config.yml
│   ├── workflow.png
│   ├── paper.bib
│   ├── paper.md
│   └── README.md
├── bandersnatch
│   ├── test
│   │   ├── ontology_tags.txt
│   │   ├── words_of_interest.txt
│   │   ├── snatch_output.txt
│   │   └── pocketmonsters.owl
│   ├── README.md
│   ├── params_snatch.py
│   └── bandersnatch.py
├── flame
│   └── README.md
├── bite
│   ├── test
│   │   ├── bite_output_plot.png
│   │   ├── bite_output_stats.txt
│   │   └── bite_output.tsv
│   ├── README.md
│   ├── params_bite.py
│   └── bite.py
├── eyes
│   ├── test
│   │   └── pocketmonsters_web.png
│   ├── README.md
│   ├── params_eyes.py
│   └── eyes.py
├── arise
│   ├── README.md
│   ├── test
│   │   ├── new_annotations.tsv
│   │   └── pocketmonsters_updated.owl
│   ├── params_arise.py
│   └── arise.py
├── Changelog.md
├── LICENSE
├── requirements.py
├── Contributing.md
├── README.md
├── .gitignore
└── highlevel.py
/catch/output/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-slate
--------------------------------------------------------------------------------
/docs/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sap218/jabberwocky/HEAD/docs/workflow.png
--------------------------------------------------------------------------------
/bandersnatch/test/ontology_tags.txt:
--------------------------------------------------------------------------------
1 | oboInOWL:hasExactSynonym
2 | oboInOWL:hasRelatedSynonym
3 |
--------------------------------------------------------------------------------
/catch/test/wordcloud.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sap218/jabberwocky/HEAD/catch/test/wordcloud.png
--------------------------------------------------------------------------------
/flame/README.md:
--------------------------------------------------------------------------------
1 | # README - `flame`
2 |
3 | *Come back soon...*
4 |
5 | ***
6 |
7 | End of page
8 |
--------------------------------------------------------------------------------
/bite/test/bite_output_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sap218/jabberwocky/HEAD/bite/test/bite_output_plot.png
--------------------------------------------------------------------------------
/eyes/test/pocketmonsters_web.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sap218/jabberwocky/HEAD/eyes/test/pocketmonsters_web.png
--------------------------------------------------------------------------------
/bite/test/bite_output_stats.txt:
--------------------------------------------------------------------------------
1 | time taken to run tf-idf: 0.001
2 | tf-idf raw df length: 137
3 | tf-idf adj. df length: 115
4 |
--------------------------------------------------------------------------------
/bandersnatch/test/words_of_interest.txt:
--------------------------------------------------------------------------------
1 | generation one
2 | dragon
3 | route
4 | water
5 | small
6 | large
7 | generation six
8 |
--------------------------------------------------------------------------------
/bandersnatch/test/snatch_output.txt:
--------------------------------------------------------------------------------
1 | generation one
2 | generation 1
3 | gen 1
4 | gen one
5 | dragon
6 | route
7 | water
8 | small
9 | large
10 | generation six
11 | generation 6
12 | gen 6
13 | gen six
14 |
--------------------------------------------------------------------------------
/arise/README.md:
--------------------------------------------------------------------------------
1 | # README - `arise`
2 |
3 | ## `ontology_name`
4 | - ontology file (+path)
5 |
6 | ## `annotation_file`
7 | - file of annotations
8 | - can be either `.tsv` or `.csv`
9 |
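A sketch of the expected annotation file layout, based on `test/new_annotations.tsv` (tab-separated with a header row):

```
annotation    class    tag
path          route    oboInOWL:hasExactSynonym
mega          large    oboInOWL:hasRelatedSynonym
```

Each row adds the `annotation` text to the ontology class named in `class`, under the metadata `tag`.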
10 | ***
11 |
12 | End of page
13 |
--------------------------------------------------------------------------------
/catch/test/log.txt:
--------------------------------------------------------------------------------
1 | is this a test: True
2 | stopword filter level: light
3 | concepts count: 13
4 | post count: 26
5 | average word count: 12.076923076923077
6 | time taken to annotate (seconds): 0.09
7 | time taken to run script (seconds): 4.55
8 |
--------------------------------------------------------------------------------
/arise/test/new_annotations.tsv:
--------------------------------------------------------------------------------
1 | annotation class tag
2 | path route oboInOWL:hasExactSynonym
3 | evolve generation oboInOWL:hasRelatedSynonym
4 | flew flying oboInOWL:hasExactSynonym
5 | mega large oboInOWL:hasRelatedSynonym
6 | breed type oboInOWL:hasRelatedSynonym
7 | air flying oboInOWL:hasRelatedSynonym
8 | https://pokemon.fandom.com/wiki/Types type oboInOWL:DbXref
--------------------------------------------------------------------------------
/bandersnatch/README.md:
--------------------------------------------------------------------------------
1 | # README - `bandersnatch`
2 |
3 | ## `ontology_name`
4 | - ontology file (+path)
5 |
6 | ## `ontology_tags`
7 | - newline delimited file
8 |
9 | ## `classes_of_interest`
10 | - newline delimited file with ontology classes of interest
11 | - users can leave blank to use all classes
12 |
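For a non-test run, users set `is_this_a_test = False` and fill in the empty fields of `params_snatch.py`; a minimal sketch (file names below are placeholders - the script adds the extensions itself):

```python
ontology_name = "myontology"        # reads myontology.owl
ontology_tags = "my_tags"           # reads my_tags.txt (one tag per line)
classes_of_interest = ""            # leave empty to use all classes
output_name = "snatch_output"       # writes snatch_output.txt
```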
13 | ***
14 |
15 | End of page
16 |
--------------------------------------------------------------------------------
/eyes/README.md:
--------------------------------------------------------------------------------
1 | # README - `eyes`
2 |
3 | ## `ontology`
4 | - ontology file (+path)
5 |
6 | ## `plot_type`
7 | - choose `web` or `tree`
8 |
9 | ## `*_colour`
10 | - colours for `superclass` (default orange) and `subclass` (default skyblue)
11 |
12 | ## `to_annotate_subclasses`
13 | - choose `True` or `False` to overlay text for subclasses in plots
14 | - recommended to use `False` if ontologies are large
15 |
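For a non-test run, users set `is_this_a_test = False` and fill in `params_eyes.py`; a minimal sketch (the ontology path is a placeholder - the `.owl` extension is added by the script):

```python
ontology = "path/to/myontology"     # reads path/to/myontology.owl
plot_type = "web"                   # or "tree"
superclass_colour = "orange"
subclass_colour = "skyblue"
to_annotate_subclasses = False      # recommended for large ontologies
```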
16 | ***
17 |
18 | End of page
19 |
--------------------------------------------------------------------------------
/catch/test/annotation_grep.txt:
--------------------------------------------------------------------------------
1 | Any small pokemon nearby? I need to catch a Metapod!
2 | I think only gen 6 pokemon are on this path - try route 2.
3 | I've checked that route twice already. I just want my Caterpie to evolve already.
4 | Anyone else dislike the new Pokedex? What happened to old fashioned gen 1?
5 | Anyone want a battle? I'm on route 13.
6 | I'll join but we should move to the meadow area near that route.
7 | Hey guys! Lake near route 7 I saw a Gyarados!
8 | What route is best for small normal pokemon? My Skitty needs a friend.
9 | Go to route 4, we're totally not on that path and we don't plan on catching your Skitty away from you.
10 |
--------------------------------------------------------------------------------
/catch/test/annotation_wtags.txt:
--------------------------------------------------------------------------------
1 | ['small'] # Any small pokemon nearby? I need to catch a Metapod!
2 | ['gen 6', 'route'] # I think only gen 6 pokemon are on this path - try route 2.
3 | ['route'] # I've checked that route twice already. I just want my Caterpie to evolve already.
4 | ['gen 1'] # Anyone else dislike the new Pokedex? What happened to old fashioned gen 1?
5 | ['route'] # Anyone want a battle? I'm on route 13.
6 | ['route'] # I'll join but we should move to the meadow area near that route.
7 | ['route'] # Hey guys! Lake near route 7 I saw a Gyarados!
8 | ['route', 'small'] # What route is best for small normal pokemon? My Skitty needs a friend.
9 | ['route'] # Go to route 4, we're totally not on that path and we don't plan on catching your Skitty away from you.
10 |
--------------------------------------------------------------------------------
/arise/params_arise.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @GitHub: github.com/sap218/jabberwocky
7 | """
8 |
9 | ####################################################
10 | #
11 | # PARAMETERS FOR ARISE
12 | #
13 | ####################################################
14 |
15 | is_this_a_test = True
16 |
17 | ####################################################
18 |
19 | if is_this_a_test:
20 |     ontology_name = "../bandersnatch/test/pocketmonsters"
21 |     annotation_file = "../arise/test/new_annotations"
22 |     output_name = "../arise/test/%s" % ontology_name.split("/")[-1]
23 | 
24 | else:
25 |     ontology_name = ""
26 |     annotation_file = ""
27 | 
28 |     output_name = "%s" % ontology_name.split("/")[-1]
29 |
30 | ####################################################
31 |
32 | # End of script
33 |
--------------------------------------------------------------------------------
/bandersnatch/params_snatch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @GitHub: github.com/sap218/jabberwocky
7 | """
8 |
9 | ####################################################
10 | #
11 | # PARAMETERS FOR SNATCH
12 | #
13 | ####################################################
14 |
15 | is_this_a_test = True
16 |
17 | ####################################################
18 |
19 | if is_this_a_test:
20 |     ontology_name = "test/pocketmonsters"
21 |     ontology_tags = "test/ontology_tags"
22 |     classes_of_interest = "test/words_of_interest" # if empty, extract all annotations of all classes
23 |     output_name = "test/snatch_output"
24 | 
25 | else:
26 |     ontology_name = ""
27 |     ontology_tags = ""
28 |     classes_of_interest = ""
29 | 
30 |     output_name = "snatch_output"
31 |
32 | ####################################################
33 |
34 | # End of script
35 |
--------------------------------------------------------------------------------
/bite/README.md:
--------------------------------------------------------------------------------
1 | # README - `bite`
2 |
3 | ## `corpus`
4 | - file with each post/sentence on a new line
5 | - can be `catch` output (grep)
6 |
7 | ## `concepts_to_remove`
8 | - file of concepts (one per line) to remove from the TF-IDF statistical rankings
9 | - can be `snatch` output
10 | - users can leave blank to use all terms in the corpus
11 | 
12 | ## `filter_level`
13 | - parameter for which stop words list to use
14 | - "light" is a smaller list consisting of only 179 stop words
15 | - "heavy" is a much larger list consisting of 1160 stop words
16 |
17 | ## `ngram_count`
18 | - a list of n-grams for TF-IDF
19 | - can modify for unigram only `[1]` or for bi-grams & tri-grams `[2,3]`
20 |
21 | ## `graph`
22 | - plot TF-IDF rankings
23 |
24 | ### `cm`
25 | - plotting colour for bars
26 | - recommended to use mediumseagreen, steelblue, or lightcoral
27 |
28 | ### `limit`
29 | - plot limit for top-N terms (default is 30)
30 |
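For a non-test run, users set `is_this_a_test = False` and fill in `params_bite.py`; a minimal sketch, with placeholder file names given without extensions (matching the test configuration):

```python
corpus = "my_posts"                      # corpus file, one post/sentence per line
concepts_to_remove = "snatch_output"     # e.g. bandersnatch output; leave "" to keep all terms
filter_level = "light"                   # or "none" or "heavy"
ngram_count = [1,3]                      # unigrams through tri-grams
graph = True
cm = "mediumseagreen"
limit = 30                               # plot the top 30 terms
output_name = "bite_output"
stats_output_name = "bite_output_stats"
plot_output_name = "bite_output_plot"
```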
31 | ***
32 |
33 | End of page
34 |
--------------------------------------------------------------------------------
/catch/params_catch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @GitHub: github.com/sap218/jabberwocky
7 | """
8 |
9 | ####################################################
10 | #
11 | # PARAMETERS FOR CATCH
12 | #
13 | # Any completed fields below are recommendations
14 | #
15 | ####################################################
16 |
17 | is_this_a_test = True
18 |
19 | ####################################################
20 |
21 | file_corpus = ""
22 | file_words_of_interest = ""
23 |
24 | filter_level = "light" # or "none" or "heavy"
25 | output_format = "wtags" # ["wtags","grep","invertedgrep"]
26 |
27 | output_name = "test"
28 |
29 | plotWORDCLOUD = True
30 | if plotWORDCLOUD:
31 |     colormapWC = "Set3" # default
32 |
33 | plotCYANNOTATOR = True
34 | if plotCYANNOTATOR:
35 |     highlightcolour = "#00bcd4" # default = cyan
36 |
37 | ####################################################
38 |
39 | # End of script
40 |
--------------------------------------------------------------------------------
/Changelog.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | * **v3.3.0** [31/10/2025]
4 | - now includes word contractions
5 | * **v3.2.0** [08/05/2025]
6 | - cleaning up code, no longer required to use a list of stop words, log files included
7 | * **v3.1.1** [20/02/2025]
8 | - highlighting script: HTML output of annotated posts with highlighted key terms
9 | * **v3.1.0** [19/06/2024]
10 | - plotting script for ontologies
11 | - users can plot wordcloud without the need for annotations
12 | - tf-idf can now be done via n-grams
13 | * **v3.0.0** [12/06/2024]
14 | - major version change due to code alterations and a redesigned repository
15 | - high-level script for functions & vars: text cleaning & stopwords
16 | - updated scripts for usability so users only need to edit a params file
17 | - plotting wordcloud
18 | * **v2.0.0** [10/05/2021]
19 | - includes `spacy PhraseMatcher()`
20 | - users can provide their own annotation tags
21 | - plotting tf-idf
22 | * **v1.0.0** [29/06/2020]
23 | - version presented in **JOSS** paper
24 |
25 | ***
26 |
27 | End of page
28 |
--------------------------------------------------------------------------------
/catch/test/annotation_invertedgrep.txt:
--------------------------------------------------------------------------------
1 | No thanks, I'm, trying to catch a flying type in the mountatins with the clear air.
2 | I'll be there soon. Need to heal first.
3 | Currently on the opposite path, training for Brock's gym.
4 | I'll join too - it'll be nice for Ekans to get some practice - I'll be flying by air.
5 | Really? Still with an Ekans? Any chance it'll evolve soon?.
6 | Not yet. Taking is slow - Team Rocket are after one.
7 | So where are you guys meeting again?
8 | Remember to be careful everyone! If you see any grunts of Team Rocket please notify us via your pokedex. We are always alert on the path if you need assistance.
9 | Feel free to pop in the centre if you want to rest.
10 | I need one for my pokedex - luckily I'm only a few minutes away, I won't be too long, just coming down the path now.
11 | Hurry it'll mega-evolve! The other one flew away and I was too late.
12 | What is mega-evolve?
13 | What?! You've not heard of it?!
14 | Drat! I didn't know it took a while for Magikarps to evolve.
15 | Not sure.
16 | Do you mean cat-breed pokemon?
17 | The path near the school is always a good choice.
18 |
--------------------------------------------------------------------------------
/catch/test/cyannotator.html:
--------------------------------------------------------------------------------
1 |
small pokemon nearby catch metapod
think gen 6 pokemon path try route 2
check route twice already want caterpie evolve already
anyone else dislike new pokedex happen old fashioned gen 1
anyone want battle route 13
join move meadow area near route
hey guy lake near route 7 see gyarado
route good small normal pokemon skitty friend
go route 4 totally path plan catch skitty away
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020-present Samantha Pendleton | Jabberwocky
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/eyes/params_eyes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @GitHub: github.com/sap218/jabberwocky
7 | """
8 |
9 | ####################################################
10 | #
11 | # PARAMETERS FOR EYES
12 | #
13 | ####################################################
14 |
15 | is_this_a_test = True
16 |
17 | ####################################################
18 |
19 | if is_this_a_test:
20 |     ontology = "../bandersnatch/test/pocketmonsters"
21 |     #ontology = "test/other_ontologies/space"
22 | 
23 |     ontology_name = ontology.split("/")[-1]
24 | 
25 |     plot_type = "tree"
26 |     plot_type = "web"
27 | 
28 |     superclass_colour = "orange"
29 |     subclass_colour = "skyblue"
30 | 
31 |     to_annotate_subclasses = True # False
32 | 
33 |     output_name = "test/%s" % ontology_name
34 | 
35 | else:
36 |     ontology = ""
37 |     ontology_name = ontology.split("/")[-1]
38 | 
39 |     plot_type = "web"
40 | 
41 |     superclass_colour = "orange"
42 |     subclass_colour = "skyblue"
43 | 
44 |     to_annotate_subclasses = False
45 | 
46 |     output_name = "%s" % ontology_name
47 |
48 | ####################################################
49 |
50 | # End of script
51 |
--------------------------------------------------------------------------------
/bite/params_bite.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @GitHub: github.com/sap218/jabberwocky
7 | """
8 |
9 | ####################################################
10 | #
11 | # PARAMETERS FOR BITE
12 | #
13 | ####################################################
14 |
15 | is_this_a_test = True
16 |
17 | ####################################################
18 |
19 | if is_this_a_test:
20 |     corpus = "../catch/test/catch_output_invert"
21 |     concepts_to_remove = "../bandersnatch/test/snatch_output"
22 |     filter_level = "light" # or "none" or "heavy"
23 |     ngram_count = [1,3]
24 |     graph = True
25 |     cm = "mediumseagreen"
26 |     limit = 30 # default is top 30 words
27 |     output_name = "../bite/test/bite_output"
28 |     stats_output_name = "../bite/test/bite_output_stats"
29 |     plot_output_name = "../bite/test/bite_output_plot"
30 | 
31 | else:
32 |     corpus = ""
33 |     concepts_to_remove = ""
34 |     filter_level = "light"
35 | 
36 |     ngram_count = [1,3]
37 | 
38 |     graph = True
39 |     cm = "mediumseagreen"
40 |     limit = 30
41 | 
42 |     output_name = "bite_output"
43 |     stats_output_name = "bite_output_stats"
44 |     plot_output_name = "bite_output_plot"
45 |
46 | ####################################################
47 |
48 | # End of script
49 |
--------------------------------------------------------------------------------
/catch/test/social_media_posts.txt:
--------------------------------------------------------------------------------
1 | Any small pokemon nearby? I need to catch a Metapod!
2 | I think only gen 6 pokemon are on this path - try route 2.
3 | I've checked that route twice already. I just want my Caterpie to evolve already.
4 |
5 | Anyone else dislike the new Pokedex? What happened to old fashioned gen 1?
6 |
7 | Anyone want a battle? I'm on route 13.
8 | No thanks, I'm, trying to catch a flying type in the mountatins with the clear air.
9 | I'll be there soon. Need to heal first.
10 | Currently on the opposite path, training for Brock's gym.
11 | I'll join too - it'll be nice for Ekans to get some practice - I'll be flying by air.
12 | Really? Still with an Ekans? Any chance it'll evolve soon?.
13 | Not yet. Taking is slow - Team Rocket are after one.
14 | I'll join but we should move to the meadow area near that route.
15 | So where are you guys meeting again?
16 |
17 | Remember to be careful everyone! If you see any grunts of Team Rocket please notify us via your pokedex. We are always alert on the path if you need assistance.
18 | Feel free to pop in the centre if you want to rest.
19 |
20 | Hey guys! Lake near route 7 I saw a Gyarados!
21 | I need one for my pokedex - luckily I'm only a few minutes away, I won't be too long, just coming down the path now.
22 | Hurry it'll mega-evolve! The other one flew away and I was too late.
23 | What is mega-evolve?
24 | What?! You've not heard of it?!
25 | Drat! I didn't know it took a while for Magikarps to evolve.
26 |
27 | What route is best for small normal pokemon? My Skitty needs a friend.
28 | Not sure.
29 | Do you mean cat-breed pokemon?
30 | The path near the school is always a good choice.
31 | Go to route 4, we're totally not on that path and we don't plan on catching your Skitty away from you.
32 |
--------------------------------------------------------------------------------
/catch/README.md:
--------------------------------------------------------------------------------
1 | # README - `catch`
2 |
3 | ## `test/` & `output/`
4 | - directories for results
5 |
6 | ***
7 |
8 | ## `is_this_a_test`
9 | - set to `True` to run the test, see `test/` for the results
10 |
11 | ***
12 |
13 | ## `file_corpus`
14 | - a `.txt` file with each post/sentence on a new line
15 |
16 | ## `file_words_of_interest`
17 | - a `.txt` file with concepts/words of interest, one per line
18 | - can be `snatch` output
19 | - the script will still run if this is left empty so users can use other features - please check the outputs
20 |
21 | ## `filter_level`
22 | - parameter for which list of stop words to use
23 | - `light` is a small list consisting of 179 stop words
24 | - `heavy` is a much larger list consisting of 1160 stop words
25 | - `none` to not remove stop words
26 |
27 | ## `output_format`
28 | - `wtags` = each annotated post **with** the terms that were annotated
29 | - `grep` = output in grep format (simply the annotated posts only)
30 | - `invertedgrep` = posts that were NOT annotated
31 |
32 | ## `output_name`
33 | - users should edit the output name; outputs will be stored in `output/`
34 | - all outputs are timestamped to avoid overwriting files
35 |
36 | ## `plotWORDCLOUD`
37 | - set `True` to plot a wordcloud of `file_corpus`
38 | - if you intend to plot, it is recommended to use a filter level for stop words
39 |
40 | ### `colormapWC`
41 | - colour scheme for the wordcloud - users can provide [any palette](https://matplotlib.org/stable/users/explain/colors/colormaps.html)
42 | - default is `Set3` (pastel) but a nice recommendation is `viridis` (purple -> green)
43 |
44 | ## `plotCYANNOTATOR`
45 | - set `True` to output an `HTML` of annotated posts with the highlighted concepts
46 |
47 | ### `highlightcolour`
48 | - colour of highlighting - default is cyan (`#00bcd4`)
49 |
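Putting these parameters together, a minimal sketch of a non-test `params_catch.py` (file names are placeholders - check how your copy of `catch.py` expects extensions to be given):

```python
is_this_a_test = False

file_corpus = "my_posts.txt"                 # one post/sentence per line
file_words_of_interest = "snatch_output.txt" # e.g. bandersnatch output

filter_level = "light"           # or "none" or "heavy"
output_format = "wtags"          # ["wtags","grep","invertedgrep"]
output_name = "my_annotations"   # results stored in output/

plotWORDCLOUD = True
colormapWC = "Set3"

plotCYANNOTATOR = True
highlightcolour = "#00bcd4"
```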
50 | ***
51 |
52 | End of page
53 |
--------------------------------------------------------------------------------
/requirements.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @description: versions
7 | @GitHub: github.com/sap218/jabberwocky
8 | """
9 |
10 | # Modules used in Jabberwocky
11 |
12 | import sys # this includes: import time
13 | print("Python \t v%s" % sys.version.split(" ")[0])
14 |
15 | import bs4
16 | print("BeautifulSoup4 \t v%s" % bs4.__version__)
17 |
18 | '''
19 | # Base modules
20 |
21 | import re
22 | print("re \t v%s" % re.__version__)
23 |
24 | import json
25 | print("json \t v%s" % json.__version__)
26 | '''
27 |
28 | import contractions
29 | import pkg_resources
30 | version = pkg_resources.get_distribution("contractions").version
31 | print("contractions \t v%s" % version)
32 | del version
33 |
34 | import pandas as pd
35 | print("pandas \t v%s" % pd.__version__)
36 |
37 | import matplotlib
38 | print("matplotlib \t v%s" % matplotlib.__version__)
39 |
40 | import sklearn
41 | print("scikit-learn \t v%s" % sklearn.__version__)
42 |
43 | import spacy
44 | print("spaCy \t v%s" % spacy.__version__)
45 |
46 | import wordcloud
47 | print("wordcloud \t v%s" % wordcloud.__version__)
48 |
49 | import nltk
50 | print("nltk \t v%s" % nltk.__version__)
51 |
52 | import networkx
53 | print("networkx \t v%s" % networkx.__version__)
54 |
55 | print("additional information: \t %s" % "".join(sys.version.split("|")[1:]))
56 |
57 | # When running Jabberwocky, users need these versions minimum
58 |
59 | '''
60 | Python v3.12.3
61 | BeautifulSoup4 v4.12.3
62 | contractions v0.1.73
63 | pandas v2.2.2
64 | matplotlib v3.9.2
65 | scikit-learn v1.5.1
66 | spaCy v3.7.2
67 | wordcloud v1.9.4
68 | nltk v3.9.1
69 | networkx v3.3
70 | additional information: packaged by conda-forge (main, Apr 15 2024, 18:20:11) [MSC v.1938 64 bit (AMD64)]
71 | '''
72 |
73 | ####################################################
74 |
75 | # End of script
76 |
--------------------------------------------------------------------------------
/Contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines / Issues for Jabberwocky :dragon_face:
2 |
3 | * Users are welcome to contribute to this project via pull requests or bug reporting.
4 | * In either circumstance, please ensure titles/descriptions have as much information as possible, e.g. if creating a Bug/Issue, try to trace your steps w/ details & error messages.
5 | * The primary maintainer(s) - currently [@sap218](https://github.com/sap218) - will address the request!
6 | * Maintainers will always try their best to meet the needs of the user while also considering what is best for **Jabberwocky**.
7 |
8 | ## Contributing Code
9 | * Users intending to contribute to this repository can open a **Pull request**.
10 | * Frequent contributors will be added to a contributors list for thanks and acknowledgement.
11 | * **Note**: please provide information (e.g. decisions) and plenty of comments (w/ username to acknowledge contribution), e.g.
12 |
13 | ```
14 | print(f"Error message: {errmsg}") # print error message (example comment) for reference - @yourusername
15 | ```
16 |
17 | ## Issues
18 | * Users are encouraged to create an [`Issue`](https://github.com/sap218/jabberwocky/issues).
19 | * Issues can relate to anything: bug/error reporting, feature requests, help questions, to improve documentation, etc.
20 | * Issues will be labelled accordingly - see below for [`label`](https://github.com/sap218/jabberwocky/labels) information:
21 |
22 | #### bug
23 | * if any errors arise
24 |
25 | #### documentation
26 | * if the guides need more information
27 |
28 | #### duplicate
29 | * if the ticket already exists
30 |
31 | #### help
32 | * if the user is asking for help (not relating to other tags)
33 |
34 | #### request
35 | * if a new feature is being requested
36 |
37 | #### wontfix
38 | * there may be circumstances in which an Issue *won't* or *shouldn't* be fixed
39 | * for example, some behaviours may be intentional, or a fix isn't in scope
40 | * the maintainer will help as much as possible and comment why this label is applied, giving users time to respond
41 |
42 | ***
43 |
44 | End of page
45 |
--------------------------------------------------------------------------------
/arise/arise.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @description: annotate ontology classes
7 | @GitHub: github.com/sap218/jabberwocky
8 |
9 | @useful links:
10 | # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#insert
11 | """
12 |
13 | from bs4 import BeautifulSoup
14 | import pandas as pd
15 |
16 | from params_arise import *
17 |
18 | ####################################################
19 |
20 | with open("%s.owl" % ontology_name, "rt") as o:
21 | ontology_file = o.read()
22 | ontology_soup = BeautifulSoup(ontology_file,'xml') # BEAUTIFUL SOUP really is beautiful
23 | del o, ontology_file
24 |
25 | ####################################################
26 |
27 | try:
28 |     annotations = pd.read_csv('%s.tsv' % annotation_file, sep='\t', header=0)
29 | except FileNotFoundError: # fall back to a .csv annotation file
30 |     annotations = pd.read_csv('%s.csv' % annotation_file, header=0)
31 |
32 | ####################################################
33 |
34 | finding = ontology_soup.find_all('owl:Class') # finding all owl classes
35 | for concept in finding:
36 |     label = concept.find("rdfs:label").get_text()#.lower() # getting labels
37 | 
38 |     for term_iteration in range(len(annotations)): # going through each row of the annotations dataframe
39 | 
40 |         class_match_label = list(annotations['class'])[term_iteration]
41 |         class_new_annotations = list(annotations['annotation'])[term_iteration]
42 |         new_annotation_tag = list(annotations['tag'])[term_iteration]
43 | 
44 |         if label == class_match_label:
45 |             tag = ontology_soup.new_tag(new_annotation_tag)
46 |             tag.string = class_new_annotations
47 |             concept.insert(1, tag) # insert after line one (line one is declaring the ontology concept)
48 |
49 | ####################################################
50 |
51 | updated_ont = str(ontology_soup).replace('<?xml version="1.0" encoding="utf-8"?>', '') # replacing the first line (the XML declaration BeautifulSoup prepends) - very important
52 |
53 | ####################################################
54 |
55 | with open("%s_updated.owl" % output_name, "w") as file: # exporting # encoding="utf-8"
56 | file.write(updated_ont)
57 |
58 | ####################################################
59 |
60 | # End of script
61 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Jabberwocky
2 |
3 | [](https://doi.org/10.21105/joss.02168)
4 |
5 | Jabberwocky is a toolkit for NLP and **ontologies**. Since we all know ontologies are *nonsense*.
6 |
7 | ## Functionality
8 |
9 | Read the [documentation](https://sap218.github.io/jabberwocky/) for more detail.
10 |
11 | script | description
12 | ------- | -----------
13 | `bandersnatch` | extract metadata from ontology classes
14 | `catch` | annotate corpus with key terms & generate wordcloud
15 | `bite` | rank terms in order of importance & bar plot
16 | `arise` | update ontology with new metadata
17 | `eyes` | plot an ontology via web or tree format
18 |
19 | When combining these Jabberwocky functions, users can create an NLP workflow.
20 |
21 | 
22 |
23 | ## Running
24 | Within each directory, there is a `params_*.py` file which users can edit,
25 | meaning users shouldn't need to edit the main/primary script.
26 |
27 | Check the individual directory `READMEs` for parameter information.
28 |
29 | #### Prerequisites
30 | Check [`requirements.py`](https://github.com/sap218/jabberwocky/blob/master/requirements.py) for a list of packages and versions.
31 |
32 | ## Changelog / Version
33 | See the [**Changelog**](https://github.com/sap218/jabberwocky/blob/master/Changelog.md) (ordered by newest first).
34 |
35 | ## Contributing / Issues
36 | Please read the [**Contributing Guidelines**](https://github.com/sap218/jabberwocky/blob/master/Contributing.md).
37 |
38 | - [@majensen](https://github.com/majensen) set up automated testing w/ `pytest` in v1.0 - see [pull request #13](https://github.com/sap218/jabberwocky/pull/13) for more details
39 |
40 | ## License
41 | The [license](https://github.com/sap218/jabberwocky/blob/master/LICENSE) is **MIT**, so users only need to cite the paper (below) when using Jabberwocky.
42 |
43 | ## Citing
44 |
45 | ```
46 | @article{Pendleton2020,
47 | doi = {10.21105/joss.02168},
48 | url = {https://doi.org/10.21105/joss.02168},
49 | year = {2020},
50 | publisher = {The Open Journal},
51 | volume = {5},
52 | number = {51},
53 | pages = {2168},
54 | author = {Samantha C. Pendleton and Georgios V. Gkoutos},
55 | title = {Jabberwocky: an ontology-aware toolkit for manipulating text},
56 | journal = {Journal of Open Source Software}
57 | }
58 | ```
59 |
60 | The poem Jabberwocky, written by Lewis Carroll, is described as a "nonsense" poem :dragon:
61 |
62 | ***
63 |
64 | End of page
65 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # gitignore
2 | # Files not needed to be uploaded
3 |
4 | eyes/test/other_ontologies/
5 |
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | pip-wheel-metadata/
30 | share/python-wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .nox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *.cover
56 | *.py,cover
57 | .hypothesis/
58 | .pytest_cache/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 | db.sqlite3-journal
69 |
70 | # Flask stuff:
71 | instance/
72 | .webassets-cache
73 |
74 | # Scrapy stuff:
75 | .scrapy
76 |
77 | # Sphinx documentation
78 | docs/_build/
79 |
80 | # PyBuilder
81 | target/
82 |
83 | # Jupyter Notebook
84 | .ipynb_checkpoints
85 |
86 | # IPython
87 | profile_default/
88 | ipython_config.py
89 |
90 | # pyenv
91 | .python-version
92 |
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 |
100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101 | __pypackages__/
102 |
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 |
107 | # SageMath parsed files
108 | *.sage.py
109 |
110 | # Environments
111 | .env
112 | .venv
113 | env/
114 | venv/
115 | ENV/
116 | env.bak/
117 | venv.bak/
118 |
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 |
123 | # Rope project settings
124 | .ropeproject
125 |
126 | # mkdocs documentation
127 | /site
128 |
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 |
134 | # Pyre type checker
135 | .pyre/
136 |
--------------------------------------------------------------------------------
/bite/test/bite_output.tsv:
--------------------------------------------------------------------------------
1 | Word Raw score Normalised score
2 | mega 0.071 1.0
3 | hear 0.056 0.74
4 | sure 0.056 0.74
5 | path 0.044 0.542
6 | one 0.043 0.536
7 | take 0.04 0.475
8 | guy 0.039 0.469
9 | meet 0.039 0.469
10 | fly 0.038 0.44
11 | soon 0.037 0.43
12 | need 0.037 0.424
13 | ekan 0.031 0.333
14 | away 0.027 0.268
15 | air 0.026 0.251
16 | always 0.026 0.243
17 | take_slow_one 0.024 0.213
18 | yet 0.024 0.213
19 | slow 0.024 0.213
20 | yet_take_slow 0.024 0.213
21 | breed 0.023 0.192
22 | cat 0.023 0.192
23 | mean 0.023 0.192
24 | mean_cat_breed 0.023 0.192
25 | cat_breed_pokemon 0.023 0.192
26 | pokemon 0.023 0.192
27 | i_soon_need 0.022 0.18
28 | soon_need_heal 0.022 0.18
29 | first 0.022 0.18
30 | heal 0.022 0.18
31 | need_heal_first 0.022 0.18
32 | pokedex 0.021 0.172
33 | know_take_magikarp 0.021 0.17
34 | know 0.021 0.17
35 | drat_i_know 0.021 0.17
36 | drat 0.021 0.17
37 | i_know_take 0.021 0.17
38 | magikarp 0.021 0.17
39 | really_still_ekan 0.02 0.151
40 | chance 0.02 0.151
41 | really 0.02 0.151
42 | still 0.02 0.151
43 | still_ekan_chance 0.02 0.151
44 | ekan_chance_soon 0.02 0.151
45 | school_always_good 0.018 0.118
46 | choice 0.018 0.118
47 | near 0.018 0.118
48 | near_school_always 0.018 0.118
49 | good 0.018 0.118
50 | path_near_school 0.018 0.118
51 | always_good_choice 0.018 0.118
52 | school 0.018 0.118
53 | currently 0.018 0.114
54 | currently_opposite_path 0.018 0.114
55 | training_brock_gym 0.018 0.114
56 | brock 0.018 0.114
57 | path_training_brock 0.018 0.114
58 | gym 0.018 0.114
59 | opposite 0.018 0.114
60 | opposite_path_training 0.018 0.114
61 | training 0.018 0.114
62 | hurry_mega_one 0.018 0.11
63 | hurry 0.018 0.11
64 | one_fly_away 0.018 0.11
65 | fly_away_i 0.018 0.11
66 | late 0.018 0.11
67 | away_i_late 0.018 0.11
68 | mega_one_fly 0.018 0.11
69 | rest 0.018 0.107
70 | pop_centre_want 0.018 0.107
71 | pop 0.018 0.107
72 | want 0.018 0.107
73 | centre 0.018 0.107
74 | centre_want_rest 0.018 0.107
75 | free_pop_centre 0.018 0.107
76 | feel 0.018 0.107
77 | feel_free_pop 0.018 0.107
78 | free 0.018 0.107
79 | ekan_get_practice 0.015 0.069
80 | nice 0.015 0.069
81 | get 0.015 0.069
82 | i_fly_air 0.015 0.069
83 | i_join_nice 0.015 0.069
84 | nice_ekan_get 0.015 0.069
85 | get_practice_i 0.015 0.069
86 | join 0.015 0.069
87 | join_nice_ekan 0.015 0.069
88 | practice 0.015 0.069
89 | practice_i_fly 0.015 0.069
90 | type 0.015 0.058
91 | type_mountatin_clear 0.015 0.058
92 | try_catch_fly 0.015 0.058
93 | thank 0.015 0.058
94 | thank_i_try 0.015 0.058
95 | try 0.015 0.058
96 | mountatin_clear_air 0.015 0.058
97 | clear 0.015 0.058
98 | catch_fly_type 0.015 0.058
99 | fly_type_mountatin 0.015 0.058
100 | catch 0.015 0.058
101 | mountatin 0.015 0.058
102 | i_try_catch 0.015 0.058
103 | away_i_long 0.013 0.036
104 | luckily 0.013 0.036
105 | come 0.013 0.036
106 | i_long_come 0.013 0.036
107 | i_minute_away 0.013 0.036
108 | i_need_one 0.013 0.036
109 | long 0.013 0.036
110 | one_pokedex_luckily 0.013 0.036
111 | long_come_path 0.013 0.036
112 | luckily_i_minute 0.013 0.036
113 | pokedex_luckily_i 0.013 0.036
114 | minute 0.013 0.036
115 | minute_away_i 0.013 0.036
116 | need_one_pokedex 0.013 0.036
117 |
--------------------------------------------------------------------------------
/bandersnatch/bandersnatch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @description: curate classes (& synonyms) from an ontology
7 | @GitHub: github.com/sap218/jabberwocky
8 |
9 | @useful links:
10 | # https://stackoverflow.com/questions/35898699/why-is-beautifulsoup-altering-the-format-of-my-xml
11 | """
12 |
13 | import sys
14 | from bs4 import BeautifulSoup
15 |
16 | from params_snatch import *
17 |
18 | ####################################################
19 |
20 | with open("%s.owl" % ontology_name, "rt") as o:
21 | ontology_file = o.read()
22 | ontology_soup = BeautifulSoup(ontology_file,'xml') # BEAUTIFUL SOUP really is beautiful
23 | del o, ontology_file
24 |
25 | annotation_tags = []
26 | with open("%s.txt" % ontology_tags, "r") as t:
27 | for tag in t:
28 | annotation_tags.append(tag.strip("\n"))
29 | del tag, t
30 |
31 | ####################################################
32 |
33 | find_all_concepts = ontology_soup.find_all('owl:Class') # this finds all concepts in the ontology
34 | classes_and_annotations = {}
35 | for concept in find_all_concepts:
36 |     label = concept.find("rdfs:label").get_text() # gets label for concept
37 |     list_annotations = []
38 |     for tag_format in annotation_tags:
39 |         finding_tags = concept.find_all(tag_format) # a concept could have multiple "exact synonyms"
40 |         flatten = [x.get_text() for x in finding_tags]
41 |         list_annotations.extend(flatten)
42 |     classes_and_annotations[label] = list_annotations
43 | del find_all_concepts, flatten, label, list_annotations, finding_tags, tag_format, annotation_tags
44 |
45 | ####################################################
46 |
47 | if len(classes_of_interest) > 0:
48 |     try:
49 |         words_of_interest = []
50 |         with open("%s.txt" % classes_of_interest, "r") as t:
51 |             for word in t:
52 |                 words_of_interest.append(word.strip("\n").strip(" ")) # words of interest
53 |         print("User has provided a list of ontology classes of interest - success")
54 |         del t, word
55 | 
56 |     except FileNotFoundError:
57 |         sys.exit("User attempted to provide a list with ontology classes of interest - unsuccessful")
58 | 
59 | else:
60 |     words_of_interest = None
61 |     print("User not providing a list of ontology classes of interest - using all classes for annotations")
62 |
63 | ####################################################
64 |
65 | if words_of_interest:
66 |     search_concepts = {key: classes_and_annotations[key] for key in words_of_interest}
67 | else:
68 |     search_concepts = classes_and_annotations.copy()
69 |
70 | ####################################################
71 |
72 | #with open('test/snatch_output.json', 'w') as j:
73 | # json.dump(search_concepts, j, indent=4)
74 | #del j
75 |
76 | ####################################################
77 |
78 | search_concepts = [key_val for key, value in search_concepts.items() for key_val in [key] + value]
79 |
80 | with open('%s.txt' % output_name, 'w') as t:
81 |     for word in search_concepts:
82 |         t.write(word + '\n')
83 | del t, word
84 |
85 | ####################################################
86 |
87 | # End of script
88 |
--------------------------------------------------------------------------------
/eyes/eyes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @description: plot an ontology
7 | @GitHub: github.com/sap218/jabberwocky
8 |
9 | @useful links:
10 | # https://stackoverflow.com/a/21990980
11 | """
12 |
13 | from bs4 import BeautifulSoup
14 | import networkx as nx
15 | import matplotlib.pyplot as plt
16 | from textwrap import wrap
17 |
18 | from params_eyes import *
19 |
20 | ####################################################
21 |
22 | with open("%s.owl" % ontology, "rt") as o:
23 | ontology_file = o.read()
24 | ontology_soup = BeautifulSoup(ontology_file,'xml') # BEAUTIFUL SOUP really is beautiful
25 | del o, ontology
26 |
27 | ####################################################
28 |
29 | G = nx.DiGraph() # graph
30 |
31 | ####################################################
32 |
33 | finding = ontology_soup.find_all('owl:Class') # finding all owl classes
34 | concepts = []
35 |
36 | for concept in finding:
37 |     label = concept.find("rdfs:label").get_text()
38 |     concepts.append(label)
39 |     iri = concept.get('rdf:about')
40 | 
41 |     G.add_node(label, id=iri) # node for each class
42 | 
43 |     # find superclass and add edges
44 |     subclasses = concept.find_all("rdfs:subClassOf")
45 |     for subclass in subclasses:
46 |         superclass = subclass.get('rdf:resource')
47 |         # now get label of superclass...
48 |         subclass_label = ontology_soup.find(attrs={"rdf:about": superclass}).find("rdfs:label").get_text()
49 |         G.add_edge(subclass_label, label) # add edge for relationship
50 | 
51 | del finding, iri, label, subclass_label, subclass, superclass, subclasses
52 |
53 | ####################################################
54 |
55 | # G.remove_node("Space Ontology (UFO)")
56 |
57 | high_level_classes = [node for node, degree in G.in_degree() if degree == 0]
58 | color_map = [superclass_colour if node in high_level_classes else subclass_colour for node in G.nodes()]
59 |
60 | ####################################################
61 |
62 | plt.figure(figsize=(18, 10))
63 |
64 | if plot_type == "tree":
65 | pos = nx.nx_agraph.graphviz_layout(G, prog='dot')
66 | elif plot_type == "web":
67 | pos = nx.nx_agraph.graphviz_layout(G, prog='sfdp')
68 |
69 | ####################################################
70 |
71 | node_degrees = dict(G.degree())
72 | node_sizes = [15 * node_degrees[node] for node in G.nodes()]
73 |
74 | if plot_type == "web":
75 | min_lim = int( sorted(node_sizes,reverse=True)[:11][-1] )
76 | node_sizes = [15 if n <= min_lim else n for n in node_sizes]
77 |
78 | ####################################################
79 |
80 | nx.draw_networkx_nodes(G, pos,
81 | node_size=node_sizes,
82 | node_color=color_map,
83 | alpha=0.8)
84 |
85 | nx.draw_networkx_edges(G, pos, edge_color="gray", alpha=0.5, width=1.0, arrows=True)
86 |
87 | ####################################################
88 |
89 | if to_annotate_subclasses:
90 |     highlevelfontsize = 8
91 |     lowlevelfontsize = 6
92 | else: highlevelfontsize = 14
93 |
94 | ####################################################
95 |
96 | labels = {node: '\n'.join(wrap(node, width=11)) if node in high_level_classes else node for node in G.nodes() if node in high_level_classes}
97 | nx.draw_networkx_labels(G, pos, font_size=highlevelfontsize, font_weight="bold",labels=labels)
98 |
99 | labels = {node: '\n'.join(wrap(node, width=15)) if node not in high_level_classes else node for node in G.nodes() if node not in high_level_classes}
100 | if to_annotate_subclasses: nx.draw_networkx_labels(G, pos, font_size=lowlevelfontsize, labels=labels)
101 |
102 | ####################################################
103 |
104 | #plt.title("Ontology")
105 | plt.axis('off')
106 | plt.savefig("%s_%s.png" % (output_name, plot_type), format="PNG", dpi=300, bbox_inches='tight')
107 | plt.show()
108 |
109 | ####################################################
110 |
111 | # End of script
112 |
--------------------------------------------------------------------------------
/docs/paper.bib:
--------------------------------------------------------------------------------
1 | @ARTICLE{Cejuela2014-lv,
2 | title = "tagtog: interactive and text-mining-assisted annotation of gene
3 | mentions in {PLOS} full-text articles",
4 | author = "Cejuela, Juan Miguel and McQuilton, Peter and Ponting, Laura and
5 | Marygold, Steven J and Stefancsik, Raymund and Millburn, Gillian
6 | H and Rost, Burkhard and {FlyBase Consortium}",
7 | journal = "Database",
8 | volume = 2014,
9 | number = 0,
10 | month = apr,
11 | year = 2014,
12 | language = "en",
13 | doi = {10.1093/database/bau033}
14 | }
15 |
16 | @MISC{Honnibal2017-dn,
17 | title = "{s}pa{C}y 2: Natural language understanding with {B}loom embeddings,
18 | convolutional neural networks and incremental parsing",
19 | author = "Honnibal, Matthew and Montani, Ines",
20 | url = {https://github.com/explosion/spaCy},
21 | year = 2017
22 | }
23 |
24 | @INPROCEEDINGS{Manning2014-rt,
25 | title = "The {S}tanford {CoreNLP} natural language processing toolkit",
26 | booktitle = "{P}roceedings of 52nd {A}nnual {M}eeting of the {A}ssociation for
27 | {C}omputational {L}inguistics: {S}ystem {D}emonstrations",
28 | author = "Manning, Christopher and Surdeanu, Mihai and Bauer, John and
29 | Finkel, Jenny and Bethard, Steven and McClosky, David",
30 | pages = "55--60",
31 | year = 2014,
32 | doi = {10.3115/v1/p14-5010}
33 | }
34 |
35 | @ARTICLE{Schriml2012-qp,
36 | title = "Disease Ontology: a backbone for disease semantic integration",
37 | author = "Schriml, Lynn Marie and Arze, Cesar and Nadendla, Suvarna and
38 | Chang, Yu-Wei Wayne and Mazaitis, Mark and Felix, Victor and
39 | Feng, Gang and Kibbe, Warren Alden",
40 | journal = "Nucleic Acids Res.",
41 | volume = 40,
42 | number = "Database issue",
43 | month = jan,
44 | year = 2012,
45 | language = "en",
46 | doi = {10.1093/nar/gkr972}
47 | }
48 |
49 | @ARTICLE{Robinson2008-jh,
50 | title = "The Human Phenotype Ontology: a tool for annotating and analyzing
51 | human hereditary disease",
52 | author = "Robinson, Peter N and K{\"o}hler, Sebastian and Bauer, Sebastian
53 | and Seelow, Dominik and Horn, Denise and Mundlos, Stefan",
54 | journal = "Am. J. Hum. Genet.",
55 | volume = 83,
56 | number = 5,
57 | pages = "610--615",
58 | month = nov,
59 | year = 2008,
60 | language = "en",
61 | doi = {10.1016/j.ajhg.2008.09.017}
62 | }
63 |
64 | @ARTICLE{Hoehndorf2015-qr,
65 | title = "The role of ontologies in biological and biomedical research: a
66 | functional perspective",
67 | author = "Hoehndorf, Robert and Schofield, Paul N and Gkoutos, Georgios V",
68 | journal = "Brief. Bioinform.",
69 | volume = 16,
70 | number = 6,
71 | pages = "1069--1080",
72 | month = nov,
73 | year = 2015,
74 | keywords = "Semantic Web; data integration; data mining; ontology",
75 | language = "en",
76 | doi = {10.1093/bib/bbv011}
77 | }
78 |
79 |
80 |
81 | @ARTICLE{Van_Rossum1995-ia,
82 | title = "Python tutorial, technical report {CS-R9526}",
83 | author = "van Rossum, G",
84 | journal = "Centrum voor Wiskunde en Informatica (CWI), Amsterdam",
85 | year = 1995
86 | }
87 |
88 | @ARTICLE{Richardson2007-ba,
89 | title = "Beautiful soup documentation",
90 | author = "Richardson, Leonard",
91 | journal = "April",
92 | publisher = "media.readthedocs.org",
93 | year = 2007,
94 | url = {https://beautiful-soup-4.readthedocs.io/en/latest/}
95 | }
96 |
97 | @ARTICLE{Pedregosa2011-st,
98 | title = "Scikit-learn: Machine Learning in {P}ython",
99 | author = "Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre
100 | and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and
101 | Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and
102 | Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and
103 | Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and
104 | Duchesnay, {\'E}douard",
105 | journal = "J. Mach. Learn. Res.",
106 | volume = 12,
107 | number = "Oct",
108 | pages = "2825--2830",
109 | year = 2011
110 | }
111 |
112 | @INPROCEEDINGS{McKinney2010-xf,
113 | title = "Data structures for statistical computing in {P}ython",
114 | booktitle = "Proceedings of the 9th {P}ython in {S}cience {C}onference",
115 | author = "McKinney, Wes and {Others}",
116 | volume = 445,
117 | pages = "51--56",
118 | institution = "Austin, TX",
119 | year = 2010,
120 | doi = {10.25080/majora-92bf1922-00a}
121 | }
122 |
--------------------------------------------------------------------------------
/docs/paper.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Jabberwocky: an ontology-aware toolkit for manipulating text'
3 | tags:
4 | - Python
5 | - Ontologies
6 | - Text
7 | authors:
8 | - name: Samantha C Pendleton
9 | orcid: 0000-0002-6169-0135
10 | affiliation: "1, 2" # (Multiple affiliations must be quoted)
11 | - name: Georgios V Gkoutos
12 | affiliation: "1, 2" # (Multiple affiliations must be quoted)
13 | affiliations:
14 | - name: Institute of Cancer and Genomic Sciences, University of Birmingham, UK
15 | index: 1
16 | - name: University Hospitals Birmingham NHS Foundation Trust, UK
17 | index: 2
18 | date: 25 February 2020
19 | bibliography: paper.bib
20 |
21 | ---
22 |
23 | # Summary
24 |
25 | Unstructured textual data is underused, as extracting the key textual elements is complicated by a lack of structured terms, e.g., collecting the sentences from a corpus that are discussing a particular topic. To extract valuable text about a topic from a corpus, a user will need to gather a set of related terms. For example, when analysing clinical documents we can extract sentences by using specific clinical terms. However, this can miss additional valuable sentences where synonyms are used instead (e.g., physician notes that use shorthand). By considering terms and their synonyms we can extract more sentences from a corpus, making more data available for analysis. One way to do this and represent our knowledge of terms associated with a domain is to create an ontology. Ontologies allow us to formalise our knowledge of a domain in a condensed manner by using controlled terms, called classes [@Hoehndorf2015-qr]. Classes can be annotated with metadata, including synonyms. Ontologies can include relationships between terms, and annotations such as cross-references to other ontologies [@Hoehndorf2015-qr].
26 |
27 | Clearly, ontologies are valuable for the analysis of textual data. Unfortunately, despite the existence of many well-established ontologies, such as the "Human Phenotype Ontology" [@Robinson2008-jh] and the "Disease Ontology" [@Schriml2012-qp], there remains a lack of tools that can take advantage of ontologies, especially for general text manipulation. Existing tools for annotating text, such as “spaCy” [@Honnibal2017-dn], “tagtog” [@Cejuela2014-lv], and “Stanford CoreNLP” [@Manning2014-rt] cannot interrogate text with an ontology directly, and require ontologies to be pre-processed into other formats (leaving the time-consuming task of extracting labels and tags from an ontology into a suitable intermediate format as an exercise for the end-user). These are specialist tools, returning all text in the document with every word tagged, as “noun”, “verb”, and other customised tags. There exists a niche for users who want to leverage an ontology to retrieve textual data from a corpus without having to perform any pre-processing, or parse away unwanted tags.
28 |
29 | We introduce Jabberwocky, a Python-based [@Van_Rossum1995-ia], open-source toolkit (accessible via https://github.com/sap218/jabberwocky) that allows users to query text in an ontology-aware fashion, and to modify those ontologies based on their findings. For example, with Jabberwocky’s ``catch`` command, a user provides textual data, their chosen ontology, and a set of classes from the ontology to use as search terms. Jabberwocky cleans the input text, collects the annotated synonyms for the user-specified target classes (using “Beautiful Soup” to read the ontology’s XML structure [@Richardson2007-ba]), and then returns the key elements (e.g., lines from a corpus) which match one of the target terms, or a synonym from the ontology. The ``catch`` command will help users retrieve more matches for their chosen terms from the corpus, without users having to explicitly define all the possible synonyms or alternative spellings beforehand.
30 |
31 | Jabberwocky also helps ontology developers to iteratively improve their ontology. The ``bite`` command allows a user to provide textual data and rank the important terms by using the term frequency–inverse document frequency (tf-idf) method from “scikit-learn” [@Pedregosa2011-st], which calculates an importance metric for a term based on the frequency of its occurrence and the document size. Providing an ontology will exclude terms already described in the ontology, meaning the result of ``bite`` will be a CSV of candidate terms to potentially be added to the ontology, exported by “pandas” [@McKinney2010-xf]. Once an expert has reviewed the terms and associated them to a class in the ontology, Jabberwocky’s third command, ``arise``, will annotate the classes in the ontology, adding the newly identified synonyms. Iteratively performing multiple rounds of ``bite`` and ``arise`` can help the development and maintenance of ontologies. A user could use the ``catch`` command to confirm the modified ontology now captures more of the corpus.
32 |
33 | Jabberwocky’s test repository (see Jabberwocky repo for further instructions) shows examples of each command separately. The ‘process’ directory shows an example that combines all three commands to demonstrate an example workflow. With 24 blog posts, the first use of ``catch`` returned 11 posts with the provided keywords. The example uses ``bite`` to review the CSV of ranked terms and curate new synonyms, simply by adding the corresponding class label from the ontology. It then uses ``arise`` to add the identified synonyms into the ontology. With the second round of ``catch``, the number of posts returned for the same keywords increased to 16. This is a basic and straightforward example, but a powerful one. With Jabberwocky, users can efficiently search their text and gain more instances, providing new insight.
34 |
35 | Jabberwocky leverages the strength of ontologies and text for a wide range of tasks. It will be useful to users who want to manipulate textual data using controlled vocabulary from ontologies.
36 |
37 | # Acknowledgements
38 |
39 | This project was funded by the Medical Research Council (MRC) (MR/S502431/1) & supported by Health Data Research (HDR) UK (HDRUK/CFC/01).
40 |
41 | # References
42 |
43 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | An ontology is a knowledge representation framework that is machine readable.
2 | It facilitates logical relationships between classes and allows us to standardise the formalised vocabulary within a domain.
3 | The metadata contained within an ontology is valuable - research has shown it can address the challenges presented by unstructured text.
4 |
5 | Unstructured text can be processed, mined, and empowered by NLP tools, yet the majority of tools are not designed to consider ontologies.
6 |
7 | Jabberwocky allows users to conduct various NLP tasks whilst easily manipulating ontologies.
8 | This page provides an explanation - with a working example - of the Jabberwocky toolkit.
9 |
10 | See the [Jabberwocky](https://github.com/sap218/jabberwocky) repository for code.
11 |
12 | ---
13 |
14 | ## Functionality
15 |
16 |
17 | ### bandersnatch
18 | Extract metadata from ontology classes based on a list of tags.
19 |
20 | Users should use ontologies that are in the `OWL` RDF/XML syntax.
21 | (if not in this format, users can open their ontology in [Protégé](https://protege.stanford.edu/) and export it in the correct format)
22 |
23 | Metadata in ontologies come in various formats; below is an example list of tags:
24 | ```
25 | oboInOWL:hasExactSynonym
26 | oboInOWL:hasRelatedSynonym
27 | ```
28 |
29 | Words of interest (recommended to match ontology class labels):
30 | ```
31 | dragon
32 | water
33 | large
34 | ```
35 |
36 | ##### Output
37 | `snatch_output.txt` will include the ontology classes and corresponding metadata based on chosen classes & tags.
38 |
39 | If users have no words of interest, then the output will include **all ontology classes** but users will still need to include a list of tags.
40 |
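As an illustration of the idea behind `bandersnatch`, the following is a minimal, hypothetical sketch that pulls the text of the chosen tags out of an OWL file. It uses the standard-library `ElementTree` parser and matches only on local tag names, so it is not the toolkit's actual implementation (which reads the XML via Beautiful Soup); the file path is an assumption.

```python
import xml.etree.ElementTree as ET

def snatch(owl_path, tags):
    """Collect the text of any element whose local name matches a chosen tag."""
    wanted = {t.split(":")[-1] for t in tags}        # e.g. {"hasExactSynonym", "hasRelatedSynonym"}
    tree = ET.parse(owl_path)
    found = []
    for elem in tree.iter():
        local_name = elem.tag.split("}")[-1]         # strip any "{namespace}" prefix
        if local_name in wanted and elem.text:
            found.append(elem.text.strip())
    return found

print(snatch("test/pocketmonsters.owl",
             ["oboInOWL:hasExactSynonym", "oboInOWL:hasRelatedSynonym"]))
```

This sketch returns every matching tag in the ontology, which mirrors the behaviour when no words of interest are given; the real tool can additionally restrict the output to the chosen classes.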
41 | ---
42 |
43 | ### catch
44 | Annotation of a corpus (unstructured text).
45 |
46 | Words of interest - the `bandersnatch` output can be used here:
47 | ```
48 | dragon
49 | water
50 | ocean
51 | large
52 | big
53 | ```
54 | It is **important to note**: multi-word phrases work in Jabberwocky.
55 |
56 | The corpus should be a `txt` file with sentences/posts separated by new lines:
57 | ```
58 | This is post 1 in a corpus
59 | This is post 2
60 |
61 | This is post 3 - as you can see there is a gap between post 2 and 3, this is fine
62 | This is post 4 > users also don't need to worry about formatting, Jabberwocky will handle this'
63 | ```
64 |
65 | ##### Output
66 | `catch_output.txt` will include the posts that were annotated.
67 | Users can choose between output types: only the annotated posts, or the annotated posts with their corresponding tags.
68 |
69 | Moreover, users can choose to export the posts that were **NOT** annotated.
70 |
71 | ##### Plotting
72 | Users can generate a wordcloud figure from the corpus.
73 |
74 | ##### Highlighting
75 | Inspired by an old project - [cyannotator](https://github.com/sap218/cyannotator) - users can request an HTML output of posts with the annotations highlighted.
76 |
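For intuition, here is a minimal sketch of the kind of phrase matching `catch` performs with spaCy's `PhraseMatcher` (shown with the spaCy v3 API). The term list and posts are invented, and the real script also lower-cases, lemmatises, and strips stopwords before matching, so treat this as illustration only.

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")   # case-insensitive matching
matcher.add("Concepts", [nlp.make_doc(t) for t in ["dragon", "water", "ocean", "large", "big"]])

posts = ["A large water dragon blocked the route", "This is post 2"]
for post in posts:
    doc = nlp.make_doc(post)
    matches = matcher(doc)
    if matches:                                    # "grep"-style: keep only annotated posts
        tags = sorted({doc[start:end].text.lower() for _, start, end in matches})
        print("%s # %s" % (tags, post))            # "wtags"-style output line
```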
77 | ---
78 |
79 | ### bite
80 | Rank all words in a corpus in terms of importance (via the TF-IDF statistical technique).
81 |
82 | One valuable parameter adjusts the input for TF-IDF so the technique measures multiple n-grams.
83 | Users can request more than unigrams: bigrams, trigrams, and more.
84 |
85 | Users can provide a list of words to remove from the corpus to avoid being weighted/measured - the `bandersnatch` output can be used here.
86 |
87 | ##### Output
88 | `bite_output.tsv` is a dataframe with a Word column and two score columns.
89 | Scores are the average TF-IDF values across posts, normalised for readability.
90 | Moreover, rows with a normalised score of 0 are dropped.
91 |
92 | Word | Raw score | Normalised score
93 | ------- | --------- | -----------
94 | mega | 0.078 | 1.0
95 | path | 0.06 | 0.719
96 |
97 | ##### Plotting
98 | Users can export a bar plot of the top N ranked terms (default 30).
99 |
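A minimal sketch of the ranking step, using scikit-learn's `TfidfVectorizer` and `MinMaxScaler` in the same spirit as `bite` (average the per-post scores, normalise, drop zeros). Here `ngram_range` stands in for the toolkit's own n-gram handling, and the three posts are invented.

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

posts = ["mega evolution on the path", "the path near the water", "a mega water battle"]

vec = TfidfVectorizer(ngram_range=(1, 2))              # unigrams and bigrams
scores = pd.DataFrame(vec.fit_transform(posts).toarray(),
                      columns=vec.get_feature_names_out()).mean(axis=0)

df = pd.DataFrame({"Word": scores.index, "Raw score": scores.values})
df["Normalised score"] = MinMaxScaler().fit_transform(df[["Raw score"]])
df = df[df["Normalised score"] != 0].sort_values("Normalised score", ascending=False)
df.round(3).to_csv("bite_output.tsv", sep="\t", index=False)
```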
100 | ---
101 |
102 | ### arise
103 | Updating ontology classes with new metadata.
104 |
105 | Users will provide a dataframe with three columns: the annotation, class (exact ontology match), and tag:
106 | ```
107 | annotation class tag
108 | sea water oboInOWL:hasExactSynonym
109 | mega large oboInOWL:hasRelatedSynonym
110 | https://pokemon.fandom.com/wiki/Types type oboInOWL:DbXref
111 | ```
112 | This can be derived from the `bite` output (e.g. synonyms).
113 |
114 | ##### Output
115 | `[ontology]_updated.owl` is the updated ontology.
116 |
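To illustrate the kind of update `arise` performs, below is a hypothetical sketch that appends one new synonym to a class using the standard-library `ElementTree` (the toolkit itself works through Beautiful Soup). The namespace URIs are the usual OWL/oboInOwl ones and may differ from your ontology; the file names and the `add_synonym` helper are illustrative.

```python
import xml.etree.ElementTree as ET

OWL  = "{http://www.w3.org/2002/07/owl#}"
RDFS = "{http://www.w3.org/2000/01/rdf-schema#}"
OBO  = "http://www.geneontology.org/formats/oboInOwl#"
ET.register_namespace("oboInOWL", OBO)                 # keep the prefix on output

def add_synonym(owl_in, owl_out, class_label, synonym, tag="hasExactSynonym"):
    tree = ET.parse(owl_in)
    for cls in tree.iter(OWL + "Class"):
        label = cls.find(RDFS + "label")
        if label is not None and label.text == class_label:   # exact ontology match
            new = ET.SubElement(cls, "{%s}%s" % (OBO, tag))
            new.text = synonym
    tree.write(owl_out, xml_declaration=True, encoding="utf-8")

add_synonym("pocketmonsters.owl", "pocketmonsters_updated.owl", "water", "sea")
```

Note that other namespace prefixes may be rewritten on output; the sketch is only meant to show the label-matching and tag-appending logic.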
117 | ---
118 |
119 | ### eyes
120 | Plot an ontology in web or tree style.
121 | By default, superclasses will have overlay text, but users can choose whether to include it for subclasses.
122 |
123 | ##### Output
124 | `[ontology]_[plottype].png` is the plotted ontology.
125 |
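`eyes` is only described at a high level here, so purely as an illustration of the "web" idea, the sketch below builds a subclass graph with `networkx` and draws it; this is an assumption about one way to do it, not necessarily how `eyes.py` works.

```python
import xml.etree.ElementTree as ET
import networkx as nx
import matplotlib.pyplot as plt

OWL  = "{http://www.w3.org/2002/07/owl#}"
RDF  = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}"
RDFS = "{http://www.w3.org/2000/01/rdf-schema#}"

tree = ET.parse("pocketmonsters.owl")
graph = nx.DiGraph()
for cls in tree.iter(OWL + "Class"):
    child = cls.get(RDF + "about")
    for parent in cls.findall(RDFS + "subClassOf"):
        ref = parent.get(RDF + "resource")
        if child and ref:
            graph.add_edge(ref.split("#")[-1], child.split("#")[-1])   # superclass -> subclass

nx.draw(graph, nx.spring_layout(graph, seed=1), with_labels=True, node_size=300, font_size=8)
plt.savefig("pocketmonsters_web.png", bbox_inches="tight")
```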
126 | ---
127 |
128 | ## Scenario
129 |
130 | You have curated unstructured text: blog posts from a social media platform (with permission of course, in this example I invented these fake conversations).
131 |
132 | Your aim is to text-mine the corpus and keep only the posts covering a particular topic (or set of topics).
133 | But you realise that, although you know some words for this topic, you may be missing related/broader synonyms.
134 |
135 | This is where **ontologies are useful**. Ontologies provide a controlled vocabulary with annotations.
136 |
137 | With your words of interest (ontology classes) you can run `bandersnatch` to extract all synonyms.
138 |
139 | With these classes and corresponding synonyms, you can annotate the corpus using `catch` - the `PhraseMatcher()` function[^spacy] tags each post in the corpus.
140 |
141 | You've chosen to have two outputs: one with the annotated posts for downstream analysis.
142 | With the other, you decided to investigate whether there is anything valuable in the posts that weren't annotated.
143 |
144 | You can proceed to use `bite` - investigating whether there are any "important" terms.
145 | The statistical TF-IDF method[^tfidf] is applied and all words are ranked in terms of importance.
146 | Here you can use the whole corpus, or perhaps use the `catch` output with the non-annotated posts.
147 |
148 | Whichever data you use, in the `bite` output you may notice new terms/synonyms...
149 | You can use `arise` to update your ontology classes with these new synonyms.
150 |
151 | Finally, you may want to rerun `bandersnatch` to extract an updated list of key terms.
152 | Then you can rerun `catch` for a more fruitful output for your investigations.
153 |
154 | This concludes the NLP workflow: the second round of `catch` provides more data and so a more fruitful downstream analysis.
155 |
156 | ---
157 |
158 | ## Conclusion
159 |
160 | This work was published in [JOSS](https://doi.org/10.21105/joss.02168); you can cite it as follows:
161 |
162 | ```
163 | @article{Pendleton2020,
164 | doi = {10.21105/joss.02168},
165 | url = {https://doi.org/10.21105/joss.02168},
166 | year = {2020},
167 | publisher = {The Open Journal},
168 | volume = {5},
169 | number = {51},
170 | pages = {2168},
171 | author = {Samantha C. Pendleton and Georgios V. Gkoutos},
172 | title = {Jabberwocky: an ontology-aware toolkit for manipulating text},
173 | journal = {Journal of Open Source Software}
174 | }
175 | ```
176 |
177 | This repository was inspired by (and the inspiration of) the [OcIMIDo](https://doi.org/10.1016/j.compbiomed.2021.104542) project.
178 |
179 | [^spacy]: using [spaCy](https://spacy.io/api/phrasematcher)
180 | [^tfidf]: Term frequency inverse document frequency (TF-IDF)
181 |
182 | ***
183 |
184 | End of page
185 |
--------------------------------------------------------------------------------
/bite/bite.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @description: conducts TF-IDF
7 | @GitHub: github.com/sap218/jabberwocky
8 |
9 | @useful links:
10 | # https://python-charts.com/matplotlib/styles/
11 | """
12 |
13 | import sys
14 | import re
15 | import time
16 | import pandas as pd
17 | import matplotlib.pyplot as plt
18 |
19 | from sklearn.feature_extraction.text import TfidfVectorizer
20 | from sklearn.preprocessing import MinMaxScaler
21 |
22 | import spacy
23 | nlp = spacy.load("en_core_web_sm")
24 |
25 | from nltk import ngrams
26 |
27 | from params_bite import *
28 |
29 | ####################################################
30 |
31 | from highlevel import *
32 |
33 | ''' stopWords '''
34 | if filter_level == "none": stopWords = stopWords[0]
35 | elif filter_level == "light": stopWords = stopWords[1]
36 | elif filter_level == "heavy": stopWords = stopWords[2]
37 |
38 | #stopWords = [cleantext(x.lower()) for x in stopWords]
39 |
40 | stopWords_lemma = []
41 | for word in stopWords:
42 | word = cleantext(word.lower())
43 | doc = nlp(word)
44 | doc_lemma = " ".join([token.lemma_ for token in doc])
45 | stopWords_lemma.append(doc_lemma)
46 | stopWords_lemma_filt = list(filter(None, stopWords_lemma))
47 | stopWords_lemma_filt_flat = [word for phrase in stopWords_lemma_filt for word in phrase.split()]
48 |
49 | stopWords = list(set(stopWords_lemma_filt_flat))
50 | del word, doc, doc_lemma, stopWords_lemma, stopWords_lemma_filt, stopWords_lemma_filt_flat
51 |
52 | ####################################################
53 | ####################################################
54 |
55 | if len(concepts_to_remove) > 0:
56 | try:
57 | words_of_interest = []
58 | with open("%s.txt" % concepts_to_remove, "r") as t:
59 | for word in t:
60 | words_of_interest.append(word.strip("\n").strip(" "))
61 | del t, word
62 | except FileNotFoundError:
63 | sys.exit("User attempted to provide a list of concepts to remove from TF-IDF - unsuccessful")
64 | else: words_of_interest = ["nowordstofilter"]
65 |
66 | words_of_interest = list(filter(None, words_of_interest))
67 |
68 | ####################################################
69 |
70 | words_of_interest_clean_lemma_stpwrd = []
71 |
72 | # preprocess concepts: Lemmatize & stopWords
73 | for concept in words_of_interest:
74 | concept = cleantext(concept.lower())
75 |
76 | doc = nlp(concept)
77 |
78 | ## lemma
79 | doc_lemma = [token.lemma_ for token in doc]
80 | ## stopwords
81 | doc_lemma_stpwrd = [remove_stop_words(text, stopWords) for text in doc_lemma]
82 | doc_lemma_stpwrd = list(filter(None, doc_lemma_stpwrd))
83 |
84 | if doc_lemma_stpwrd:
85 | words_of_interest_clean_lemma_stpwrd.append(" ".join(doc_lemma_stpwrd).lower())
86 |
87 | del concept, doc, doc_lemma, doc_lemma_stpwrd
88 |
89 | ####################################################
90 | ####################################################
91 |
92 | list_of_posts = []
93 |
94 | with open("%s.txt" % corpus, "r") as t:
95 | for post in t:
96 | list_of_posts.append(post.strip("\n").strip(" "))
97 | del t, post
98 | list_of_posts = list(filter(None, list_of_posts))
99 |
100 | ####################################################
101 |
102 | list_of_posts_clean_lemma_stpwrd = []
103 |
104 | for post in list_of_posts:
105 | post = cleantext(post.lower())
106 |
107 | doc = nlp(post)
108 |
109 | ## lemma
110 | doc_lemma = [token.lemma_ for token in doc]
111 | ## stopwords
112 | doc_lemma_stpwrd = [remove_stop_words(text, stopWords) for text in doc_lemma]
113 | doc_lemma_stpwrd = list(filter(None, doc_lemma_stpwrd))
114 |
115 | list_of_posts_clean_lemma_stpwrd.append(" ".join(doc_lemma_stpwrd).lower())
116 |
117 | del post,doc,doc_lemma,doc_lemma_stpwrd
118 |
119 | ####################################################
120 | ####################################################
121 |
122 | words_of_interest_clean_lemma_stpwrd.append("evolve")
123 | words_of_interest_clean_lemma_stpwrd.append("team rocket")
124 |
125 | list_of_posts_clean_lemma_stpwrd.append("evolve")
126 | list_of_posts.append("evolve")
127 |
128 | ####################################################
129 | ####################################################
130 |
131 | def remove_phrases(sentences, phrases):
132 | cleaned_sentences = []
133 | for sentence in sentences:
134 | for phrase in phrases:
135 | sentence = sentence.replace(phrase, '')
136 | sentence = re.sub(' +', ' ', sentence).strip() # remove double whitespace
137 | cleaned_sentences.append(sentence)
138 | return cleaned_sentences
139 |
140 | list_of_posts_clean_lemma_stpwrd_filtered = remove_phrases(list_of_posts_clean_lemma_stpwrd, words_of_interest_clean_lemma_stpwrd)
141 |
142 | ####################################################
143 | ####################################################
144 |
145 | gram_limit = ngram_count.copy()
146 | #gram_limit = [x+1 for x in range(ngram_count)]
147 |
148 | posts_cln_lmm_stpwrd_flt_ngrm = {}
149 |
150 | x = 0
151 | for post in list_of_posts_clean_lemma_stpwrd_filtered:
152 | ngram_list = []
153 | for n in gram_limit:
154 | ngrammed = ngrams(post.split(), n)
155 | for gram in ngrammed:
156 | ngram_list.append( "_".join(gram) )
157 | posts_cln_lmm_stpwrd_flt_ngrm[x] = [ngram_list, post, list_of_posts[x]]
158 | x = x + 1
159 | del x, post, ngram_list, ngrammed
160 |
161 | first_index_values = [" ".join(values[0]) for values in posts_cln_lmm_stpwrd_flt_ngrm.values()]
162 |
163 | ####################################################
164 |
165 | start_time = time.time()
166 |
167 | tfidf_vectorizer = TfidfVectorizer()
168 | tfidf_matrix = tfidf_vectorizer.fit_transform(first_index_values)
169 | tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
170 |
171 | end_time = time.time() - start_time
172 | end_time = str(round(end_time, 3))
173 | print( "Seconds taken to run tf-idf: %s" % end_time)
174 | del start_time, tfidf_matrix, tfidf_vectorizer, first_index_values
175 |
176 | ####################################################
177 |
178 | tfidf_df['Sentence'] = list_of_posts_clean_lemma_stpwrd_filtered # col to show original sentences
179 | tfidf_df = tfidf_df[['Sentence'] + [col for col in tfidf_df.columns if col != 'Sentence']] # sentence first col
180 |
181 | ####################################################
182 |
183 | summary_scores = tfidf_df.drop(columns=['Sentence']).agg('mean', axis=0)
184 | tfidf_df_sum = pd.DataFrame({'Word': summary_scores.index, 'Raw score': summary_scores.values})
185 | del summary_scores, tfidf_df
186 |
187 | ####################################################
188 |
189 | scaler = MinMaxScaler()
190 | tfidf_df_sum['Normalised score'] = scaler.fit_transform(tfidf_df_sum[['Raw score']])
191 | tfidf_df_sum = tfidf_df_sum.sort_values("Normalised score", ascending=False)
192 | del scaler
193 |
194 | ####################################################
195 |
196 | df = tfidf_df_sum.copy()
197 | df = df[df['Normalised score'] != 0]
198 |
199 | df['Raw score'] = df['Raw score'].round(decimals=3)
200 | df['Normalised score'] = df['Normalised score'].round(decimals=3)
201 |
202 | ####################################################
203 |
204 | # IDEA add post for users to extrapolate/add context
205 |
206 | #df['Post'] = df['Word'].apply(lambda x: [v[2] for v in posts_cln_lmm_stpwrd_flt_ngrm.values() if x in v[0]])
207 | #df['Post'] = [list(set(x)) for x in df['Post'] ]
208 | #dfexplode = df.explode('Post')
209 |
210 | ####################################################
211 |
212 | df.to_csv('%s.tsv' % output_name, index=False, sep="\t")
213 |
214 | ####################################################
215 | ####################################################
216 |
217 | statistics = [
218 | "time taken to run tf-idf: %s" % end_time,
219 | "tf-idf raw df length: %s" % str(len(tfidf_df_sum)),
220 | "tf-idf adj. df length: %s" % str(len(df))
221 | ]
222 | del end_time, tfidf_df_sum
223 |
224 | with open('%s.txt' % stats_output_name, 'w') as t:
225 | for word in statistics:
226 | t.write(word + '\n')
227 | del t,word
228 |
229 | ####################################################
230 |
231 | if graph:
232 | plt.style.use("seaborn-poster")
233 | fig = plt.figure()
234 | ax = fig.add_axes([0,0,1,1])
235 | ax.bar(df["Word"][:limit],df["Normalised score"][:limit], color=cm)
236 | plt.xticks(rotation=90)
237 | ax.set_ylabel('Average score (normalised)')
238 | ax.set_xlabel('Terms')
239 | ax.set_title("Bar plot of top %s TF-IDF ranked terms" % limit)
240 | plt.savefig('%s.png' % plot_output_name, bbox_inches='tight')
241 | del ax, fig
242 |
243 | ####################################################
244 |
245 | # End of script
246 |
--------------------------------------------------------------------------------
/catch/catch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @description: with words of interest, grep a text file
7 | @GitHub: github.com/sap218/jabberwocky
8 |
9 | @useful links:
10 | # https://matplotlib.org/stable/users/explain/colors/colormaps.html
11 | """
12 |
13 | import time
14 | start_script = time.time()
15 |
16 | from datetime import datetime
17 | now = datetime.now()
18 | output_timestamp = now.strftime("%Y%m%d-%H%M")
19 | del now
20 |
21 | import sys
22 |
23 | import spacy
24 | from spacy.matcher import PhraseMatcher
25 | nlp = spacy.load("en_core_web_sm")
26 |
27 | from wordcloud import WordCloud
28 | import matplotlib.pyplot as plt
29 |
30 | from params_catch import *
31 |
32 | ####################################################
33 |
34 | if is_this_a_test:
35 | file_corpus = "../catch/test/social_media_posts.txt"
36 | file_words_of_interest = "../bandersnatch/test/snatch_output.txt"
37 |
38 | dirloc = "test/"
39 | output_name_catch = "%sannotation_%s.txt" % (dirloc, output_format)
40 | output_name_log = "%slog.txt" % (dirloc)
41 | if plotWORDCLOUD: output_name_wordcloud = "%swordcloud.png" % (dirloc)
42 | if plotCYANNOTATOR: output_name_cyannotator = "%scyannotator.html" % (dirloc)
43 |
44 | else:
45 | dirloc = "output/"
46 | output_name_catch = "%sannotation_%s_%s_%s.txt" % (dirloc, output_format, output_name, output_timestamp)
47 | output_name_log = "%slog_%s_%s.txt" % (dirloc, output_name, output_timestamp)
48 | if plotWORDCLOUD: output_name_wordcloud = "%swordcloud_%s_%s.png" % (dirloc, output_name, output_timestamp)
49 | if plotCYANNOTATOR: output_name_cyannotator = "%scyannotator_%s_%s.html" % (dirloc, output_name, output_timestamp)
50 |
51 | del dirloc, output_name
52 |
53 | ####################################################
54 |
55 | from highlevel import *
56 |
57 | ''' stopWords '''
58 | if filter_level == "none": stopWords = stopWords[0]
59 | elif filter_level == "light": stopWords = stopWords[1]
60 | elif filter_level == "heavy": stopWords = stopWords[2]
61 |
62 | stopWords_lemma = []
63 | stopWordsList = []
64 | for word in stopWords:
65 | stopWords_lemma.append(clean_lower_lemma(word, "stopwords", stopWordsList))
66 |
67 | stopWords_lemma_flat = [word for phrase in stopWords_lemma for word in phrase.split()]
68 | stopWordsList = list(set(filter(None, stopWords_lemma_flat)))
69 |
70 | del word, stopWords, stopWords_lemma, stopWords_lemma_flat#, doc
71 |
72 | ####################################################
73 | ####################################################
74 |
75 | try:
76 | list_of_posts = []
77 | with open("%s" % file_corpus, "r") as t:
78 | for line in t:
79 | list_of_posts.append(line.strip("\n").strip(" "))
80 | del file_corpus, t, line
81 | except FileNotFoundError:
82 | sys.exit("Cannot find [corpus] text file")
83 |
84 | list_of_posts = list(filter(None, list_of_posts)) # remove empty lines
85 |
86 | post_stats = [len(x.split()) for x in list_of_posts] # word count per line
87 |
88 | ####################################################
89 | ####################################################
90 |
91 | if len(file_words_of_interest) > 0:
92 | try:
93 | words_of_interest = []
94 | with open("%s" % file_words_of_interest, "r") as t:
95 | for line in t:
96 | words_of_interest.append(line.strip("\n").strip(" "))
97 | del t, line
98 | except FileNotFoundError:
99 | #sys.exit("User attempted to provide a list of terms for annotation - unsuccessful")
100 | sys.exit("Cannot find [words of interest] file")
101 | else: words_of_interest = [] #["nowordstofilter"]
102 |
103 | words_of_interest = list(filter(None, words_of_interest)) # remove empty lines
104 |
105 | ####################################################
106 |
107 | statistics = [
108 | "is this a test: %s" % str(is_this_a_test),
109 | "stopword filter level: %s" % filter_level,
110 | "concepts count: %s" % len(words_of_interest),
111 | "post count: %s" % len(list_of_posts),
112 | "average word count: %s" % (sum(post_stats)/len(post_stats)),
113 | ]
114 | del post_stats, filter_level
115 |
116 | if not words_of_interest: words_of_interest = ["PlaceholderAsThereAreNoWordsToFilter"]
117 |
118 | ####################################################
119 | ####################################################
120 |
121 | words_of_interest_formatted = []
122 | concept_patterns = [] # for matcher
123 |
124 | # preprocess concepts: Lemmatize & stopWords
125 | for concept in words_of_interest:
126 | doc_lemma_stpwrd_filter = clean_lower_lemma(concept, "text", stopWordsList)
127 |
128 | if doc_lemma_stpwrd_filter:
129 | concept_patterns.append(nlp(" ".join(doc_lemma_stpwrd_filter).lower()))
130 | words_of_interest_formatted.append(" ".join(doc_lemma_stpwrd_filter).lower())
131 | del concept
132 |
133 | matcher = PhraseMatcher(nlp.vocab) # initialize phrase matcher
134 | matcher.add("Concepts", concept_patterns) # convert concepts into patterns (pass the list of pattern Docs, per the spaCy v3 API)
135 | del concept_patterns
136 |
137 | ####################################################
138 | ####################################################
139 |
140 | doc_lemma_stpwrd_filter_output = []
141 | list_of_posts_formatted = []
142 |
143 | for post in list_of_posts:
144 | doc_lemma_stpwrd_filter = clean_lower_lemma(post, "text", stopWordsList)
145 |
146 | list_of_posts_formatted.append(" ".join(doc_lemma_stpwrd_filter).lower())
147 |
148 | doc_lemma_stpwrd_filter_output.append(doc_lemma_stpwrd_filter)
149 |
150 | del post, doc_lemma_stpwrd_filter
151 |
152 | ####################################################
153 |
154 | if plotWORDCLOUD:
155 | if not colormapWC: colormapWC = "Set3"
156 |
157 | wc = WordCloud(
158 | width = 2048, height = 1080,
159 |
160 | background_color='white',
161 | colormap = colormapWC,
162 | contour_color='black', contour_width=10,
163 |
164 | max_words=30, min_font_size=10,
165 | #stopwords = ['word'], # words don't want to plot
166 | collocations = True, # words joined together
167 | normalize_plurals=False,
168 |
169 | prefer_horizontal=0.8,scale=2,
170 | random_state=123
171 | ).generate(" ".join(list_of_posts_formatted))
172 |
173 | plt.figure(figsize=(10, 5))
174 | plt.axis("off")
175 | plt.tight_layout(pad = 0)
176 | plt.imshow(wc, interpolation="bilinear")
177 | plt.savefig('%s' % output_name_wordcloud)
178 |
179 | del plotWORDCLOUD, colormapWC, wc, output_name_wordcloud
180 |
181 | ####################################################
182 |
183 | # cyan with soft glow for highlighting, can use :cyan for original
184 | if plotCYANNOTATOR:
185 | if not highlightcolour: highlightcolour = "#00bcd4"
186 | cyancolour = ["<span style='background-color:%s'>" % highlightcolour,
187 | "</span>"] # opening/closing highlight tags; plain close
188 | cyannotator_text = []
189 |
190 | ####################################################
191 |
192 | start_annotation = time.time()
193 |
194 | matched_output_list = []
195 |
196 | y = 0
197 | for post in doc_lemma_stpwrd_filter_output:
198 | print("Sentence iteration ", y+1, " out of ", len(list_of_posts))
199 |
200 | post = " ".join(post)
201 |
202 | doc = nlp(post)
203 | matches = matcher(doc)
204 |
205 | if matches:
206 | matched_concepts = set()
207 | #if cyannotator: highlighting = " ".join(doc_lemma_stpwrd_filter).lower()
208 |
209 | cyaned = []
210 | for match_id, start, end in matches:
211 | matched_span = doc[start:end]
212 | matched_concepts.add(matched_span.text)
213 |
214 | if plotCYANNOTATOR:
215 | highlighting = re.sub(r'\b%s\b' % re.escape(matched_span.text),
216 | (cyancolour[0] + matched_span.text + cyancolour[1]), post)
217 | cyaned.append(highlighting)
218 | if plotCYANNOTATOR: cyannotator_text.append(cyaned[-1])
219 |
220 | matched_output_list.append([ list(matched_concepts), list_of_posts[y] ])
221 |
222 | del matched_concepts, match_id, start, end, matched_span, cyaned
223 |
224 | else:
225 | matched_output_list.append([ "NO ANNOTATION", list_of_posts[y] ])
226 |
227 | y = y + 1
228 |
229 | del y, post, doc, matches
230 |
231 | ####################################################
232 |
233 | end_annotation = time.time() - start_annotation
234 | end_annotation = str(round(end_annotation, 2))
235 | statistics.append("time taken to annotate (seconds): %s" % end_annotation)
236 | del start_annotation, end_annotation
237 |
238 | ####################################################
239 | ####################################################
240 |
241 | matched_output_list_output = []
242 |
243 | for x,content in enumerate(matched_output_list):
244 |
245 | if output_format == "wtags":
246 | if content[0] != "NO ANNOTATION": matched_output_list_output.append( "%s # %s" % (sorted(content[0]),content[1]) )
247 | elif output_format == "grep":
248 | if content[0] != "NO ANNOTATION": matched_output_list_output.append(content[1])
249 | elif output_format == "invertedgrep":
250 | if content[0] == "NO ANNOTATION": matched_output_list_output.append(content[1])
251 |
252 | del x, content, output_format
253 |
254 | if not matched_output_list_output:
255 | matched_output_list_output.append("NO ANNOTATIONS")
256 | if not file_words_of_interest:
257 | statistics.append("NO ANNOTATIONS - this is due to an empty [words of interest] file")
258 | else: statistics.append("NO ANNOTATIONS")
259 |
260 | ####################################################
261 |
262 | with open('%s' % output_name_catch, 'w') as t:
263 | for word in matched_output_list_output:
264 | t.write(word + '\n')
265 | del t, word, output_name_catch
266 |
267 | ####################################################
268 |
269 | if plotCYANNOTATOR:
270 | if not cyannotator_text:
271 | cyannotator_text = ["NO ANNOTATIONS"]
272 |
273 | html_content = "<html><body>"
274 | html_content += "<br>".join(cyannotator_text)
275 | html_content += "</body></html>"
276 |
277 | with open('%s' % output_name_cyannotator, 'w') as f:
278 | f.write(html_content)
279 | del f, output_name_cyannotator
280 |
281 | ####################################################
282 |
283 | end_script = time.time() - start_script
284 | end_script = str(round(end_script, 2))
285 | statistics.append("time taken to run script (seconds): %s" % end_script)
286 | del start_script, end_script
287 |
288 | with open(output_name_log, 'w') as t:
289 | for word in statistics:
290 | t.write(word + '\n')
291 | del t, word, output_name_log
292 |
293 | ####################################################
294 |
295 | # End of script
296 |
--------------------------------------------------------------------------------
/highlevel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @date: 2024
5 | @author: Samantha C Pendleton
6 | @description: high-level variables
7 | @GitHub: github.com/sap218/jabberwocky
8 |
9 | @useful links:
10 | # https://gist.github.com/sebleier/554280
11 | """
12 |
13 | import re
14 | import contractions
15 |
16 | import spacy
17 | nlp = spacy.load("en_core_web_sm")
18 |
19 | def cleantext(post):
20 | post = contractions.fix(post)
21 | post = re.sub(' +', ' ', post) # double spaces
22 | post = re.sub("[^A-Za-z0-9']+", " ", post).replace("'", " ").strip() # keep alphanumerics; all other characters (including apostrophes) become spaces
23 | return post
24 |
25 | ####################################################
26 |
27 | stopWords = [
28 | [''],
29 | ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
30 | ["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", 
"hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", 
"sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz"]
31 | ]
32 |
33 | def remove_stop_words(text, stopWordsList):
34 | #return ' '.join(word for word in text.split() if word.lower() not in stopWordsList)
35 | return ' '.join(word for word in text.split() if word not in stopWordsList)
36 |
37 | ####################################################
38 |
39 | def clean_lower_lemma(iteration, itype, stopWordsList):
40 | iteration = cleantext(iteration.lower())
41 | doc = nlp(iteration)
42 |
43 | if itype == "stopwords":
44 | doc_lemma = " ".join([token.lemma_.lower() for token in doc])
45 | elif itype == "text":
46 | doc_lemma = [token.lemma_.lower() for token in doc]
47 | doc_lemma_stopwords = [remove_stop_words(text, stopWordsList) for text in doc_lemma]
48 | doc_lemma_stopwords_filter = list(filter(None, doc_lemma_stopwords))
49 | doc_lemma = doc_lemma_stopwords_filter.copy()
50 |
51 | return doc_lemma
52 |
53 | ####################################################
54 |
55 | # End of script
56 |
--------------------------------------------------------------------------------
/arise/test/pocketmonsters_updated.owl:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Brief Pokemon Ontology
5 |
6 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
26 |
27 |
28 | evolvegeneration
29 |
30 |
31 |
32 | https://pokemon.fandom.com/wiki/Typesbreedtype
33 |
34 |
35 |
36 | size
37 |
38 |
39 |
40 |
41 | small
42 |
43 |
44 |
45 |
46 | medium
47 |
48 |
49 |
50 | mega
51 | large
52 |
53 |
54 |
55 | pathroute
56 |
57 |
58 |
59 |
60 | generation 1
61 | gen 1
62 | gen one
63 | generation one
64 |
65 |
66 |
67 |
68 | generation 2
69 | gen 2
70 | gen two
71 | generation two
72 |
73 |
74 |
75 |
76 | generation 3
77 | gen 3
78 | gen three
79 | generation three
80 |
81 |
82 |
83 |
84 | generation 4
85 | gen 4
86 | gen four
87 | generation four
88 |
89 |
90 |
91 |
92 | generation 5
93 | gen 5
94 | gen five
95 | generation five
96 |
97 |
98 |
99 |
100 | generation 6
101 | gen 6
102 | gen six
103 | generation six
104 |
105 |
106 |
107 |
108 | normal
109 |
110 |
111 |
112 |
113 | grass
114 |
115 |
116 |
117 |
118 | water
119 |
120 |
121 |
122 |
123 | fire
124 |
125 |
126 |
127 |
128 | electric
129 |
130 |
131 |
132 |
133 | ground
134 |
135 |
136 |
137 |
138 | rock
139 |
140 |
141 |
142 | airflew
143 | flying
144 |
145 |
146 |
147 |
148 | bug
149 |
150 |
151 |
152 |
153 | poison
154 |
155 |
156 |
157 |
158 | fighting
159 |
160 |
161 |
162 |
163 | psychic
164 |
165 |
166 |
167 |
168 | ghost
169 |
170 |
171 |
172 |
173 | dark
174 |
175 |
176 |
177 |
178 | ice
179 |
180 |
181 |
182 |
183 | steel
184 |
185 |
186 |
187 |
188 | dragon
189 |
190 |
191 |
192 |
193 | fairy
194 |
195 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
--------------------------------------------------------------------------------
/bandersnatch/test/pocketmonsters.owl:
--------------------------------------------------------------------------------
1 |
2 |
11 |
12 | Brief Pokemon Ontology
13 |
14 |
15 |
16 |
17 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 | generation
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 | type
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 | size
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 | small
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 | medium
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 | large
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 | route
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 | generation 1
121 | gen 1
122 | gen one
123 | generation one
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 | generation 2
133 | gen 2
134 | gen two
135 | generation two
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 | generation 3
145 | gen 3
146 | gen three
147 | generation three
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 | generation 4
157 | gen 4
158 | gen four
159 | generation four
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 | generation 5
169 | gen 5
170 | gen five
171 | generation five
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 | generation 6
181 | gen 6
182 | gen six
183 | generation six
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 | normal
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 | grass
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 | water
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 | fire
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 | electric
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 | ground
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 | rock
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 | flying
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 | bug
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 | poison
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 | fighting
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 | psychic
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 | ghost
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 | dark
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 | ice
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 | steel
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 | dragon
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 | fairy
346 |
347 |
348 |
349 |
350 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
--------------------------------------------------------------------------------