├── .gitignore ├── LICENSE ├── README.md ├── combine_extracted_idioms.py ├── config.py ├── data └── input_sample.txt ├── detect_pies.py ├── evaluate_extraction.py ├── oxford.py ├── pos2morpha.py ├── process_corpus.py ├── using_english.py ├── utils.py └── wiktionary.py /.gitignore: -------------------------------------------------------------------------------- 1 | working/ 2 | ext/ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 
64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Automatic Detection of Potentially Idiomatic Expressions 2 | This is the source code for a system to automatically detect potentially idiomatic expressions (PIEs, for short) in text. It has four different methods of doing so: exact string match, fuzzy string match, inflectional string match, and parse-based matching. It relies on a set of digitally available idiom dictionaries to get an inventory of expressions, and extracts all instances of those expressions (with context) from the input corpus. 3 | 4 | ## Requirements 5 | To run this code, you'll need the following Python setup: 6 | * Python 2.7.6 7 | * beautifulsoup4 4.5.1 8 | * requests 2.17.3 9 | * nltk 3.2.4 10 | * spacy 2.0.6 + en_core_web_sm 2.0.0 11 | * lxml 3.3.3 12 | 13 | Different versions might work just as well, but cannot be guaranteed. 14 | 15 | You might also need: 16 | * [morph](http://users.sussex.ac.uk/~johnca/morph.html), if you want to run inflectional string matching. 17 | * [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/), if you want to run the parse-based method with the Stanford parser. 18 | * the Python library [stanfordcorenlp](https://github.com/Lynten/stanford-corenlp) 3.7.0.2 19 | * the [British National Corpus](http://www.natcorp.ox.ac.uk/), if you want to extract PIEs from that. 
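A quick way to sanity-check the setup (a minimal sketch, assuming the packages listed above are installed and the `en_core_web_sm` model has been downloaded; exact versions may differ):

```python
# Verify that the core dependencies and the spaCy model can be loaded.
import bs4, lxml, nltk, requests  # scraping and NLP dependencies
import spacy

print(spacy.__version__)            # expected to be in the 2.0.x range
nlp = spacy.load('en_core_web_sm')  # fails if the model is missing
print([t.lemma_ for t in nlp(u'shoot the breeze')])
```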
20 | 21 | ## Getting Started 22 | - Clone the repository 23 | - Create subdirectories called `working` and `ext` 24 | - If necessary: 25 | - create a symlink `ext/morph` to the main directory of the morph tools 26 | - create a symlink `ext/stanford` to the main directory of your Stanford CoreNLP installation 27 | - create a symlink `ext/BNC` to the `Texts` directory of your copy of the BNC 28 | - Try running the system with `python detect_pies.py data/input_sample.txt -d wiktionary -t plain -m exact`. This should extract a list of idioms from Wiktionary and use the exact string match method to extract PIEs from the input sample file. 29 | - Get an overview of all options by simply running `python detect_pies.py --help` 30 | 31 | ## Contact 32 | For any questions about (running) the system, feel free to contact me. 33 | -------------------------------------------------------------------------------- /combine_extracted_idioms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | Combine the output of two runs of the PIE extraction system, removing duplicates. 6 | ''' 7 | 8 | import json, argparse, csv, copy 9 | 10 | from utils import u8 11 | 12 | # Read in arguments 13 | parser = argparse.ArgumentParser(description = 'Parameters for combining extracted PIEs') 14 | parser.add_argument('extracted_1', metavar = 'extracted_idioms_1.csv', type = str, help = "Specify the location of the first file containing the extracted PIEs.") 15 | parser.add_argument('extracted_2', metavar = 'extracted_idioms_2.csv', type = str, help = "Specify the location of the second file containing the extracted PIEs.") 16 | parser.add_argument('combined', metavar = 'combined_idioms.csv', type = str, help = "Specify the output location of the combined set of extracted PIEs.") 17 | args = parser.parse_args() 18 | 19 | # Read input data 20 | extracted_idioms_1 = [] 21 | with open(args.extracted_1, 'r') as csvfile: 22 | csvreader = csv.reader(csvfile, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"') 23 | for csvrow in csvreader: 24 | extracted_idioms_1.append({'document_id': csvrow[4], 'sentence_number': csvrow[5], 'idiom': csvrow[0], 'context': unicode(csvrow[3], 'utf-8'), 'start': csvrow[1], 'end': csvrow[2], 'bnc_start': csvrow[6], 'bnc_end': csvrow[7]}) 25 | extracted_idioms_2 = [] 26 | with open(args.extracted_2, 'r') as csvfile: 27 | csvreader = csv.reader(csvfile, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"') 28 | for csvrow in csvreader: 29 | extracted_idioms_2.append({'document_id': csvrow[4], 'sentence_number': csvrow[5], 'idiom': csvrow[0], 'context': unicode(csvrow[3], 'utf-8'), 'start': csvrow[1], 'end': csvrow[2], 'bnc_start': csvrow[6], 'bnc_end': csvrow[7]}) 30 | 31 | # Combine two sets of extractions 32 | combined_idioms = copy.deepcopy(extracted_idioms_1) 33 | for extracted_idiom_2 in extracted_idioms_2: 34 | matched = False 35 | for extracted_idiom_1 in extracted_idioms_1: 36 | if extracted_idiom_2['idiom'].lower() == extracted_idiom_1['idiom'].lower() and extracted_idiom_2['document_id'] == extracted_idiom_1['document_id'] and extracted_idiom_2['sentence_number'] == extracted_idiom_1['sentence_number']: 37 | matched = True 38 | break 39 | if not matched: 40 | combined_idioms.append(extracted_idiom_2) 41 | 42 | # Output to file 43 | with open(args.combined, 'w') as of: 44 | writer = csv.writer(of, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"') 45 | for
extracted_idiom in combined_idioms: 46 | output_row = [u8(extracted_idiom['idiom']), extracted_idiom['start'], extracted_idiom['end'], 47 | u8(extracted_idiom['context']), u8(extracted_idiom['document_id']), u8(extracted_idiom['sentence_number']), 48 | extracted_idiom['bnc_start'], extracted_idiom['bnc_end']] 49 | writer.writerow(output_row) 50 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Set parameters, parse and validate command-line arguments''' 5 | 6 | import argparse, os, datetime, re 7 | 8 | # Non-argument parameters 9 | WORK_DIR = './working' 10 | EXT_DIR = './ext' 11 | MORPH_DIR = os.path.join(EXT_DIR, 'morph') 12 | TIME = '{:%Y-%m-%d-%H-%M-%S}'.format(datetime.datetime.now()) 13 | UE_URL = 'https://www.usingenglish.com' 14 | UE_IDOMS_URL = UE_URL + '/reference/idioms' 15 | OX_URL = 'http://www.oxfordreference.com' 16 | OX_LANDING_URL = OX_URL + '/view/10.1093/acref/9780199543793.001.0001/acref-9780199543793?pageSize=100' # Requires access through e.g. a library 17 | 18 | # Read in arguments 19 | parser = argparse.ArgumentParser(description = 'Parameters for PIE detection') 20 | parser.add_argument('-d', '--dict', metavar = 'wiktionary|ue|oxford|intersection|2of3|union', type = str, default = 'wiktionary', help = "Specify which dictionary to use, default is 'wiktionary'. Other options are 'ue' for UsingEnglish.com, 'oxford' for Oxford Dictionary of English Idioms, 'intersection' for idioms occurring in all three dictionaries, '2of3' for idioms occurring in at least two of the three dictionaries, and 'union' for all idioms occurring in at least one of the three dictionaries. To get the intersection of a pair of dictionaries, enter two dictionary names, separated by a comma, e.g. 'wiktionary,oxford'.") 21 | parser.add_argument('corpus', metavar = 'CORPUS', type = str, help = "Specify the location of the corpus to extract PIEs from.") 22 | parser.add_argument('-t', '--corpus-type', metavar = 'plain|bnc|bnc-dev|bnc-test', type = str, default = 'plain', help = "Specify the type of corpus used. Plain text or BNC (all and dev/test sets).") 23 | parser.add_argument('-m', '--method', metavar = 'exact|fuzzy|inflect|parse', type = str, default = 'exact', help = "Specify the extraction method to use. 'exact' for exact string matching, 'fuzzy' for fuzzy/ string matching, 'inflect' for inflectional string matching, 'parse' for parse-based extraction.") 24 | parser.add_argument('-p', '--parser', metavar = 'spacy|stanford', type = str, default = 'spacy', help = "Specify whether to use the Spacy or Stanford parser for parse-based extraction") 25 | parser.add_argument('-ex', '--example-sentences', metavar = 'CORPUS', type = str, help = "With the 'parse' method, specify this option to retrieve example sentences for in-context parsing. Specify a path to a corpus or to the file containing the cached output of this method.") 26 | parser.add_argument('-iw', '--intervening-words', metavar = 'N', type = int, default = 0, help = "Number of intervening words allowed between words of an idiom in the string match methods. Default is 0.") 27 | parser.add_argument('-c', '--context', metavar = '{0-9}+{ws}', type = str, default = '0s', help = "Amount of context to extract around the idiom. Can be a number of words or sentences. '0w' will yield only the idiom, '1w' one word of context on both sides of the idiom, etc. 
Word-contexts never exceed sentence boundaries. '0s' will yield only the sentence containing the idiom.") 28 | parser.add_argument('-o', '--output', metavar = 'OUTFILE', type = str, help = "Specify where to output the extracted idioms. Default is WORK_DIR/extracted_idioms_from_CORPUS_NAME_TIMESTAMP.") 29 | parser.add_argument('-nc', '--no-cache', action = 'store_true', help = "Do not use a cached idiom list.") 30 | parser.add_argument('-ns', '--no-split', action = 'store_true', help = "In case of a one-sentence-per-line corpus, do not apply automatic sentence splitting. Does not affect parser-based extraction.") 31 | parser.add_argument('-cs', '--case-sensitive', action = 'store_true', help = "Make string-matching methods case sensitive.") 32 | parser.add_argument('-nl', '--no-labels', action = 'store_true', help = "Ignore dependency relation labels during parse-based extraction") 33 | parser.add_argument('-nld', '--no-labels-or-directionality', action = 'store_true', help = "Ignore dependency relation labels AND dependency relation direction during parse-based extraction.") 34 | args = parser.parse_args() 35 | 36 | # Store arguments as parameters and do validation 37 | DICT = args.dict.split(',') 38 | if len(DICT) == 1 and DICT[0] not in ['wiktionary', 'ue', 'oxford', 'intersection', '2of3', 'union']: 39 | raise ValueError("No valid dictionary option specified.") 40 | elif len(DICT) == 2 and (DICT[0] not in ['wiktionary', 'ue', 'oxford'] or DICT[1] not in ['wiktionary', 'ue', 'oxford']): 41 | raise ValueError("No valid dictionary option specified.") 42 | elif len(DICT) < 1 or len(DICT) > 2: 43 | raise ValueError("No valid dictionary option specified.") 44 | 45 | CORPUS = os.path.abspath(args.corpus) 46 | if not os.path.exists(CORPUS): 47 | raise ValueError("Corpus not found.") 48 | 49 | if args.corpus_type in ['plain', 'bnc', 'bnc-dev', 'bnc-test']: 50 | CORPUS_TYPE = args.corpus_type 51 | else: 52 | raise ValueError("No valid corpus type specified.") 53 | 54 | if args.method in ['exact', 'fuzzy', 'inflect', 'parse']: 55 | METHOD = args.method 56 | else: 57 | raise ValueError("No valid extraction method specified.") 58 | 59 | if args.parser.lower() in ['spacy', 'stanford']: 60 | PARSER = args.parser.lower() 61 | else: 62 | raise ValueError("No valid parser specified.") 63 | 64 | INT_WORDS = args.intervening_words 65 | 66 | SENTENCES = args.example_sentences 67 | if SENTENCES: 68 | SENTENCES = os.path.abspath(args.example_sentences) 69 | 70 | if re.match('[0-9]+[ws]', args.context): 71 | CONTEXT_NUMBER = int(args.context[:-1]) 72 | CONTEXT_TYPE = args.context[-1] 73 | else: 74 | raise ValueError("No valid context window argument provided. Should be of the format [0-9]+[ws].") 75 | 76 | if not args.output: # Set default 77 | OUTFILE = os.path.abspath(os.path.join(WORK_DIR, 'extracted_idioms_from_{0}_{1}.csv'.format(CORPUS.split('/')[-1],TIME))) 78 | else: 79 | OUTFILE = os.path.abspath(args.output) 80 | 81 | NO_CACHE = args.no_cache 82 | NO_SPLIT = args.no_split 83 | CASE_SENSITIVE = args.case_sensitive 84 | NO_LABELS = args.no_labels or args.no_labels_or_directionality 85 | NO_DIRECTION = args.no_labels_or_directionality 86 | -------------------------------------------------------------------------------- /data/input_sample.txt: -------------------------------------------------------------------------------- 1 | I often shoot the breeze while waiting at the bus stop. 2 | I like shooting the breeze at the bus stop. 3 | Shooting the breeze at the bus stop is fun. 
4 | Yesterday I shot the breeze with an old lady waiting there. 5 | She shoots the breeze as well as I do. 6 | Sometimes, I shoot the breezes, which is different. 7 | It means that I shoot multiple breezes at once. 8 | The breeze was shot. 9 | That is, multiple breezes were shot. 10 | Freddie likes to bite the dust. 11 | The dust was bitten by Freddie. 12 | The English press has been very good to me, touch wood. 13 | If it′s ever me (Jesus Christ, touch wood!), I don′t want you bringing me flowers. 14 | I know I have my faults and one of them is my impatience and I also cannot tolerate people who are ill, mainly because I am so very rarely ill—“Touch wood,” I said out loud and touched my head at the same time. 15 | The announcement of the political endorsement was timed to a T. 16 | It was an awful scragly tear, and it fitted to a T. 17 | Even if you like peanut butter sandwiches, eating the same sandwiches day in, day out will get old. 18 | She was sick and tired of her daughter pestering her to help her with her homework. 19 | The boy was sick and tired of doing his lengthy homework assignment. 20 | I played gooseberry with Romeo and Juliet. 21 | Stay tuned for local weather info in your neck of the woods. 22 | What time is it in your neck of the woods? 23 | She has been a widow these six or eight years, and has lived, I imagine, in rather a hand to mouth fashion. 24 | I'm living hand to mouth. 25 | She took everything but the kitchen sink. 26 | He went bananas. 27 | If Washington Mutual needs to raise capital quickly, it will very likely find itself between a rock and a hard place, because credit markets have all but closed their doors to troubled banks. 28 | -------------------------------------------------------------------------------- /detect_pies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Extract potential idiomatic expressions from a corpus, based on idioms from a dictionary.''' 5 | 6 | import config 7 | import process_corpus 8 | import wiktionary 9 | import using_english 10 | import oxford 11 | import utils 12 | from utils import u8 13 | 14 | import re, os, json, random, time 15 | 16 | def combine_sets(combination_type, a, b, c = []): 17 | '''Combines 2/3 sets of idioms in different ways''' 18 | if combination_type == 'intersection': 19 | if c: 20 | return list(set(a) & set(b) & set(c)) 21 | else: 22 | return list(set(a) & set(b)) 23 | elif combination_type == '2of3': 24 | return list((set(a) & set(b)) | (set(b) & set(c)) | (set(a) & set(c))) 25 | elif combination_type == 'union': 26 | if c: 27 | return list(set(a) | set(b) | set(c)) 28 | else: 29 | return list(set(a) | set(b)) 30 | 31 | def get_idiom_list(dictionary_type = config.DICT, case_sensitive = False): 32 | '''Gets idiom list, either from file or via API''' 33 | 34 | # Read in dictionary type 35 | if len(dictionary_type) == 1: 36 | dictionary_type = dictionary_type[0] 37 | elif len(dictionary_type) != 2: 38 | raise ValueError('No valid dictionary specified!') 39 | 40 | # Single dictionaries 41 | if dictionary_type in ['wiktionary', 'ue', 'oxford']: 42 | # Try to find the most recent cached idiom list 43 | ifn = '' 44 | ifn_pattern = 'idiom_list_{0}_[0-9\-]+\.json$'.format(dictionary_type) 45 | for candidate_ifn in sorted(os.listdir(config.WORK_DIR), reverse = True): 46 | if re.match(ifn_pattern, candidate_ifn): 47 | ifn = os.path.join(config.WORK_DIR, candidate_ifn) 48 | break 49 | # Don't use the cached list, but 
scrape a new one 50 | if not os.path.isfile(ifn) or config.NO_CACHE: 51 | if dictionary_type == 'wiktionary': 52 | idioms = wiktionary.get_category_members(category = 'English idioms') 53 | if dictionary_type == 'ue': 54 | idioms = using_english.get_idioms(config.UE_URL, config.UE_IDOMS_URL) 55 | if dictionary_type == 'oxford': 56 | idioms = oxford.get_idioms(config.OX_URL, config.OX_LANDING_URL) 57 | # Cache idiom list 58 | ofn = '{0}/idiom_list_{1}_{2}.json'.format(config.WORK_DIR, dictionary_type, config.TIME) 59 | with open(ofn, 'w') as of: 60 | json.dump(idioms, of) 61 | # Read idiom list from file 62 | else: 63 | print 'Reading idiom list from {0}'.format(ifn) 64 | with open(ifn, 'r') as f: 65 | idioms = json.load(f) 66 | # Refine Oxford idiom list 67 | if dictionary_type == 'oxford': 68 | idioms = oxford.refine_idioms(idioms) 69 | # Lower-case everything if we ignore case 70 | if not case_sensitive: 71 | idioms = [idiom.lower() for idiom in idioms] 72 | 73 | # Combinations of all dictionaries 74 | elif dictionary_type in ['intersection', 'union', '2of3']: 75 | # Get single dictionaries first 76 | wiktionary_idioms = get_idiom_list(dictionary_type = ['wiktionary'], case_sensitive = case_sensitive) 77 | ue_idioms = get_idiom_list(dictionary_type = ['ue'], case_sensitive = case_sensitive) 78 | oxford_idioms = get_idiom_list(dictionary_type = ['oxford'], case_sensitive = case_sensitive) 79 | # Combine dictionaries 80 | if not case_sensitive: 81 | idioms = combine_sets(dictionary_type, wiktionary_idioms, ue_idioms, oxford_idioms) 82 | # Keep case where possible, lower-case where dictionaries conflict 83 | else: 84 | idioms = combine_sets(dictionary_type, wiktionary_idioms, ue_idioms, oxford_idioms) 85 | idioms_lower = [idiom.lower() for idiom in idioms] 86 | # Lower-case first letter which is always upper-case in UE 87 | ue_fixed = [idiom[0].lower() + idiom[1:] for idiom in ue_idioms] 88 | additional_idioms = combine_sets(dictionary_type, wiktionary_idioms, ue_fixed, oxford_idioms) 89 | for additional_idiom in additional_idioms: 90 | if additional_idiom.lower() not in idioms_lower: 91 | idioms.append(additional_idiom) 92 | idioms_lower.append(additional_idiom.lower()) 93 | # Add all idioms which have case differences in other places 94 | wiktionary_lower = [idiom.lower() for idiom in wiktionary_idioms] 95 | ue_lower = [idiom.lower() for idiom in ue_idioms] 96 | oxford_lower = [idiom.lower() for idiom in oxford_idioms] 97 | additional_idioms = combine_sets(dictionary_type, wiktionary_lower, ue_lower, oxford_lower) 98 | for additional_idiom in additional_idioms: 99 | if additional_idiom.lower() not in idioms_lower: 100 | idioms.append(additional_idiom) 101 | 102 | # Combination of a pair of dictionaries 103 | elif len(dictionary_type) == 2: 104 | print 'Taking the intersection of a pair of dictionaries' 105 | dictionary_idioms_1 = get_idiom_list(dictionary_type = dictionary_type[0:1], case_sensitive = case_sensitive) 106 | dictionary_idioms_2 = get_idiom_list(dictionary_type = dictionary_type[1:2], case_sensitive = case_sensitive) 107 | # Combine dictionaries 108 | if not case_sensitive: 109 | idioms = combine_sets('intersection', dictionary_idioms_1, dictionary_idioms_2) 110 | # Keep case where possible, lower-case where dictionaries conflict 111 | else: 112 | idioms = combine_sets('intersection', dictionary_idioms_1, dictionary_idioms_2) 113 | idioms_lower = [idiom.lower() for idiom in idioms] 114 | # Lower-case first letter which is always upper-case in UE 115 | additional_idioms = [] 
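# Illustrative example (hypothetical entries): UsingEnglish capitalises its entries, so it may list 'Shoot the breeze' where the other dictionary has 'shoot the breeze'; lower-casing the first letter lets such pairs survive the case-sensitive intersection below.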
116 | if dictionary_type[0] == 'ue': 117 | ue_fixed = [idiom[0].lower() + idiom[1:] for idiom in dictionary_idioms_1] 118 | additional_idioms = combine_sets('intersection', dictionary_idioms_2, ue_fixed) 119 | elif dictionary_type[1] == 'ue': 120 | ue_fixed = [idiom[0].lower() + idiom[1:] for idiom in dictionary_idioms_2] 121 | additional_idioms = combine_sets('intersection', dictionary_idioms_1, ue_fixed) 122 | if additional_idioms: 123 | for additional_idiom in additional_idioms: 124 | if additional_idiom.lower() not in idioms_lower: 125 | idioms.append(additional_idiom) 126 | idioms_lower.append(additional_idiom.lower()) 127 | # Add all idioms which have case differences in other places 128 | dictionary_idioms_1_lower = [idiom.lower() for idiom in dictionary_idioms_1] 129 | dictionary_idioms_2_lower = [idiom.lower() for idiom in dictionary_idioms_2] 130 | additional_idioms = combine_sets('intersection', dictionary_idioms_1_lower, dictionary_idioms_2_lower) 131 | for additional_idiom in additional_idioms: 132 | if additional_idiom.lower() not in idioms_lower: 133 | idioms.append(additional_idiom) 134 | 135 | return idioms 136 | 137 | def string_match(idioms, documents, case_sensitive = False, expand_pronouns = True, fuzzy = False, inflect = False): 138 | ''' 139 | Extracts idioms by exact, fuzzy, or inflectional string matching. 140 | Expands idioms containing indefinite pronouns and deals with idioms 141 | containing em-dash wildcards. Maps all matched idioms back to their 142 | dictionary form and extracts context around the idiom. 143 | ''' 144 | 145 | # Set flags 146 | if case_sensitive: 147 | flags = 0 148 | else: 149 | flags = re.I 150 | 151 | # Inter-word separator for regex: word boundaries + optional intervening words 152 | separator = r'\b\W+(?:\w+\W+){0,' + str(config.INT_WORDS) + r'}\b' 153 | 154 | # Expand indefinite pronouns in idioms (e.g. 'someone') 155 | if expand_pronouns: 156 | idioms, expanded_form_map = utils.expand_indefinite_pronouns(idioms) 157 | 158 | # Get all inflectional variants of idioms 159 | if inflect: 160 | idioms, inflected_form_map = utils.inflect_idioms(idioms, config.MORPH_DIR) 161 | 162 | extracted_idioms = [] # List of dicts, format: {'snippet': "", 'idiom': "", 'start': 0, 'end': 0, 'bnc_doc_id': "", 'bnc_sent': "", 'bnc_char_start': 0, 'bnc_char_end': 0} 163 | 164 | # Generate regular expression matching all idioms 165 | idiom_regex = '' 166 | for idiom in idioms: 167 | idiom_words = idiom.split(' ') 168 | # Fuzzy matching: add optional 1/2/3-character suffix to each idiom word 169 | if fuzzy: 170 | idiom_words = [re.escape(iw) + '\w?'
* 3 for iw in idiom_words] # Escape special chars, add fuzzy suffix, add boundaries 171 | # Regular string matching 172 | else: 173 | idiom_words = [re.escape(iw) for iw in idiom_words] # Escape special chars 174 | idiom_regex = idiom_regex + r'\b' + separator.join(idiom_words) + r'\b' 175 | if idiom != idioms[-1]: 176 | idiom_regex += '|' 177 | 178 | # Replace all em-dashes by a wildcard (\w+) 179 | idiom_regex = re.sub(u'\\\—', r'\w+', idiom_regex) 180 | 181 | # Do actual extraction 182 | tokenizer = utils.load_tokenizer() 183 | for sentences in documents: 184 | # Get sentence strings from BNC data 185 | if config.CORPUS_TYPE[0:3] == 'bnc': 186 | sentences_with_metadata = sentences 187 | sentences = [sentence_with_metadata['sentence'] for sentence_with_metadata in sentences_with_metadata] 188 | # Cycle through sentences in document 189 | for idx, sentence in enumerate(sentences): 190 | matches = re.finditer(idiom_regex, sentence, flags = flags) 191 | tokenized_sentence = '' 192 | for match in matches: 193 | # Only tokenize once, and only when a match is found 194 | if not tokenized_sentence: 195 | tokenized_sentence = utils.tokenize(tokenizer, sentence) 196 | # Get token offsets from match offsets 197 | for token in tokenized_sentence: 198 | if token.idx == match.start(): 199 | first_idiom_token_i = token.i 200 | if token.idx + len(token.text) == match.end(): 201 | last_idiom_token_i = token.i 202 | break 203 | # Get BNC metadata/set dummy values 204 | if config.CORPUS_TYPE[0:3] == 'bnc': 205 | bnc_document_id = sentences_with_metadata[idx]['document_id'] 206 | bnc_sentence = sentences_with_metadata[idx]['sentence_number'] 207 | bnc_char_start = match.start() 208 | bnc_char_end = match.end() 209 | else: 210 | bnc_document_id = '-' 211 | bnc_sentence = '-' 212 | bnc_char_start = 0 213 | bnc_char_end = 0 214 | # Get n-word context 215 | if config.CONTEXT_TYPE == 'w': 216 | # Get snippet 217 | snippet_start = max(0, first_idiom_token_i - config.CONTEXT_NUMBER) 218 | snippet_end = min(len(tokenized_sentence), last_idiom_token_i + 1 + config.CONTEXT_NUMBER) 219 | snippet = tokenized_sentence[snippet_start:snippet_end].text 220 | # Get idiom character offsets in snippet 221 | char_offset_span = tokenized_sentence[snippet_start].idx 222 | char_offset_start = match.start() - char_offset_span 223 | char_offset_end = match.end() - char_offset_span 224 | # Get n-sentence context 225 | elif config.CONTEXT_TYPE == 's': 226 | if config.CONTEXT_NUMBER == 0: 227 | snippet = sentence 228 | char_offset_start = match.start() 229 | char_offset_end = match.end() 230 | else: 231 | # Get surrounding sentences to form snippet 232 | first_snippet_sentence_idx = max(0, idx - config.CONTEXT_NUMBER) 233 | last_snippet_sentence_idx = min(len(sentences), idx + 1 + config.CONTEXT_NUMBER) 234 | snippet_sentences = sentences[first_snippet_sentence_idx:last_snippet_sentence_idx] 235 | snippet = ' '.join(snippet_sentences) 236 | # Adjust offset for length of preceding sentences and joining space to the current sentence 237 | num_preceding_sentences = idx - first_snippet_sentence_idx 238 | char_offset_span = len(' '.join(snippet_sentences[:num_preceding_sentences])) 239 | char_offset_start = match.start() + char_offset_span + 1 240 | char_offset_end = match.end() + char_offset_span + 1 241 | 242 | # Get dictionary form of idiom 243 | matched_string = sentence[match.start():match.end()] 244 | if not case_sensitive: 245 | matched_string = matched_string.lower() 246 | dictionary_form = '' 247 | # Deal with em-dash wildcard 
idiom, and idioms matched with non-spaces 248 | if matched_string not in idioms: 249 | for idiom in idioms: 250 | idiom_words = idiom.split(' ') 251 | if fuzzy: 252 | idiom_words = [re.escape(idiom_word) + '\w?' * 3 for idiom_word in idiom_words] 253 | else: 254 | idiom_words = [re.escape(idiom_word) for idiom_word in idiom_words] 255 | single_idiom_regex = r'\b' + separator.join(idiom_words) + r'\b' 256 | if u'\u2014' in idiom: 257 | single_idiom_regex = re.sub(ur'\\\u2014', r'\w+', single_idiom_regex) 258 | if re.match(single_idiom_regex, matched_string): 259 | dictionary_form = idiom 260 | break 261 | # Occurs exactly in idiom list, so already is dictionary form 262 | else: 263 | dictionary_form = matched_string 264 | # Map expanded and/or inflected idioms back to base form 265 | if inflect: 266 | dictionary_form = inflected_form_map[dictionary_form] 267 | if expand_pronouns: 268 | dictionary_form = expanded_form_map[dictionary_form] 269 | 270 | extracted_idioms.append({'snippet': snippet, 'idiom': dictionary_form, 'start': char_offset_start, 271 | 'end': char_offset_end, 'bnc_document_id': bnc_document_id, 'bnc_sentence': bnc_sentence, 272 | 'bnc_char_start': bnc_char_start, 'bnc_char_end': bnc_char_end}) 273 | 274 | return extracted_idioms 275 | 276 | def parse_extract(idioms, documents): 277 | ''' 278 | Extracts idioms based on the dependency parse of the idiom and sentence. 279 | Parse all idioms, optionally in context, get their parse trees and top node 280 | lemmata. Then, parse each sentence, check if the top node lemma is present, 281 | and match the idiom parse tree to a subtree of the sentence parse. Deal 282 | with idioms containing indefinite pronouns and em-dashes properly. 283 | ''' 284 | 285 | parser = utils.load_parser(config.PARSER) 286 | extracted_idioms = [] # List of dicts, format: {'snippet': "", 'idiom': "", 'start': 0, 'end': 0, 'bnc_doc_id': "", 'bnc_sent': "", 'bnc_char_start': 0, 'bnc_char_end': 0} 287 | # Use a PoS-ambiguous word to parse idioms containing em-dash wildcards 288 | ambiguous_word = 'fine' 289 | 290 | # Parse idioms in context 291 | if config.SENTENCES: 292 | cache_file = '{0}/example_sentences_{1}_{2}_{3}.json'.format(config.WORK_DIR, '_'.join(config.DICT), config.SENTENCES.split('/')[-1][:-4], config.TIME) 293 | idioms_with_sentences = utils.get_example_sentences(idioms, config.SENTENCES, cache_file) 294 | parsed_idioms = utils.parse_example_sentences(idioms_with_sentences, ambiguous_word, parser) 295 | # Parse idioms without context 296 | else: 297 | parsed_idioms = [] 298 | for idiom in idioms: 299 | parsed_idioms.append(utils.parse_idiom(idiom, ambiguous_word, parser)) 300 | 301 | # Extract idiom instances by matching parse trees 302 | for sentences in documents: 303 | time_0 = time.time() 304 | print 'Parsing document...' 305 | # Get sentence strings from BNC data and parse 306 | if config.CORPUS_TYPE[0:3] == 'bnc': 307 | sentences_with_metadata = sentences 308 | sentences = [sentence_with_metadata['sentence'] for sentence_with_metadata in sentences_with_metadata] 309 | # Parse sentence, and turn resulting Doc into Span object 310 | parsed_sentences = [utils.parse(parser, sentence)[:] for sentence in sentences] 311 | # Parse corpus as a whole, let Spacy do the sentence splitting 312 | else: 313 | parsed_corpus = utils.parse(parser, ' '.join(sentences)) 314 | parsed_sentences = parsed_corpus.sents 315 | 316 | print 'Done!
Parsing document took {0:.2f} seconds'.format(time.time() - time_0) 317 | # Cycle through sentences, attempt to match parse trees 318 | for sentence_idx, parsed_sentence in enumerate(parsed_sentences): 319 | for parsed_idiom in parsed_idioms: 320 | 321 | # Get idiom information 322 | idiom_top_lemma = parsed_idiom[0] 323 | idiom_top_token = parsed_idiom[1] 324 | idiom_subtree = parsed_idiom[2] 325 | # If not parsed in context, there is no stored list, so get generator 326 | if not idiom_subtree: 327 | idiom_subtree = idiom_top_token.subtree 328 | # Use list, rather than generator 329 | idiom_subtree = [x for x in idiom_subtree] 330 | has_em_dash = parsed_idiom[3] 331 | # Save previously matched indices to check for overlapping spans 332 | previously_matched_indices = [] 333 | 334 | # When idiom top lemma is em-dash, check if other lemma-tokens occur in sentence, only then try matching the parse trees 335 | consider_this_em_dash_idiom = False 336 | if has_em_dash and idiom_top_lemma == ambiguous_word: 337 | idiom_content_tokens = [token for token in idiom_subtree if token.tag_ not in ['DT'] and token != idiom_top_token] 338 | sentence_lemmata = [token.lemma_ for token in parsed_sentence] 339 | if all([idiom_content_token.lemma_ in sentence_lemmata for idiom_content_token in idiom_content_tokens]): 340 | consider_this_em_dash_idiom = True 341 | 342 | # Cycle through sentence parse, match top lemma to sentence lemma and idiom parse tree to sentence parse tree 343 | for sentence_token in parsed_sentence: 344 | # Match top lemma or em-dash heuristic or match any idiom token as possible top token in case of no directionality 345 | if sentence_token.lemma_ == idiom_top_token.lemma_ or consider_this_em_dash_idiom or (config.NO_DIRECTION and sentence_token.lemma_ in [x.lemma_ for x in idiom_subtree]): 346 | sentence_top_token = sentence_token 347 | # Keep track of indices of matching tokens for later span extraction 348 | matched_indices = [sentence_top_token.i] 349 | # Match parse trees, account for many special cases 350 | for idiom_subtree_token in idiom_subtree: 351 | # Skip top token and articles 352 | if idiom_subtree_token != idiom_top_token and idiom_subtree_token.lower_ not in ['a', 'the', 'an']: 353 | matched_subtree_token = False 354 | for sentence_subtree_token in sentence_token.subtree: 355 | # Match condition components 356 | # Spacy gives same lemma for all pronouns, so match on lower-cased form 357 | matching_lemma = (idiom_subtree_token.lemma_ == sentence_subtree_token.lemma_ and idiom_subtree_token.lemma_ != u'-PRON-') or (idiom_subtree_token.lemma_ == u'-PRON-' and idiom_subtree_token.lower_ == sentence_subtree_token.lower_) 358 | # Optionally, ignore dependency labels 359 | matching_dep = idiom_subtree_token.dep_ == sentence_subtree_token.dep_ or config.NO_LABELS 360 | matching_head_lemma = (idiom_subtree_token.head.lemma_ == sentence_subtree_token.head.lemma_ and idiom_subtree_token.head.lemma_ != u'-PRON-') or (idiom_subtree_token.head.lemma_ == u'-PRON-' and idiom_subtree_token.head.lower_ == sentence_subtree_token.head.lower_) 361 | # Optionally, allow for direction reversal 362 | if config.NO_DIRECTION: 363 | if idiom_subtree_token.head.lemma_ == u'-PRON-': 364 | matched_children = [x for x in sentence_subtree_token.children if x.lower_ == idiom_subtree_token.head.lower_] 365 | else: 366 | matched_children = [x for x in sentence_subtree_token.children if x.lemma_ == idiom_subtree_token.head.lemma_] 367 | matching_child_lemma = matched_children != [] 368 | 
matching_head_lemma = matching_head_lemma or matching_child_lemma 369 | em_dash_lemma = has_em_dash and idiom_subtree_token.lemma_ == ambiguous_word 370 | em_dash_head_lemma = has_em_dash and idiom_subtree_token.head.lemma_ == ambiguous_word 371 | inverted_dep = idiom_subtree_token.dep_ == 'dobj' and sentence_subtree_token.dep_ == 'nsubjpass' or config.NO_LABELS 372 | # Default case: lemma, dep-rel and head lemma have to match. 373 | # In case of em-dash, match lemma or head lemma, and the other one to the ambiguous word 374 | if (matching_lemma and matching_dep and matching_head_lemma or 375 | em_dash_lemma and matching_head_lemma or 376 | matching_lemma and em_dash_head_lemma): 377 | matched_subtree_token = True 378 | # Passivization: match lemma, head lemma and inverted dep-rels 379 | elif matching_lemma and inverted_dep and matching_head_lemma: 380 | matched_subtree_token = True 381 | # Deal with someone and someone's 382 | elif idiom_subtree_token.lemma_ == 'someone': 383 | idiom_right_children = [right for right in idiom_subtree_token.rights] 384 | # Deal with someone's - match any other PRP$ or NN(P)(S) + POS for lemma 385 | if idiom_right_children and idiom_right_children[0].lemma_ == "'s": 386 | sentence_right_children = [right for right in sentence_subtree_token.rights] 387 | if (matching_dep and matching_head_lemma and (sentence_subtree_token.tag_ == 'PRP$' or 388 | sentence_subtree_token.tag_ in ['NN', 'NNS', 'NNP', 'NNPS'] and 389 | sentence_right_children and sentence_right_children[0].lemma_ == "'s")): 390 | matched_subtree_token = True 391 | # Deal with someone - match any other PRP or NN(P)(S) for lemma 392 | else: 393 | if ((matching_dep or inverted_dep) and matching_head_lemma and 394 | sentence_subtree_token.tag_ in ['PRP', 'NN', 'NNS', 'NNP', 'NNPS']): 395 | matched_subtree_token = True 396 | # Deal with one's - match any PRP$ for lemma 397 | elif idiom_subtree_token.lemma_ == 'one': 398 | idiom_right_children = [right for right in idiom_subtree_token.rights] 399 | if idiom_right_children and idiom_right_children[0].lemma_ == "'s": 400 | if matching_dep and matching_head_lemma and sentence_subtree_token.tag_ == 'PRP$': 401 | matched_subtree_token = True 402 | # Deal with something and something's 403 | elif idiom_subtree_token.lemma_ == 'something': 404 | idiom_right_children = [right for right in idiom_subtree_token.rights] 405 | # Deal with something's - match any other PRP$ or NN(P)(S) + POS for lemma 406 | if idiom_right_children and idiom_right_children[0].lemma_ == "'s": 407 | sentence_right_children = [right for right in sentence_subtree_token.rights] 408 | if (matching_dep and matching_head_lemma and (sentence_subtree_token.tag_ == 'PRP$' or 409 | sentence_subtree_token.tag_ in ['NN', 'NNS', 'NNP', 'NNPS'] and 410 | sentence_right_children and sentence_right_children[0].lemma_ == "'s")): 411 | matched_subtree_token = True 412 | # Deal with something - match any other PRP or NN(P)(S) or this/that/these/those for lemma 413 | else: 414 | if ((matching_dep or inverted_dep) and matching_head_lemma and 415 | (sentence_subtree_token.tag_ in ['PRP', 'NN', 'NNS', 'NNP', 'NNPS'] or 416 | sentence_subtree_token.lemma_ in ['this', 'that', 'these', 'those'])): 417 | matched_subtree_token = True 418 | # Deal with 's of someone's, one's and something's by ignoring it 419 | elif idiom_subtree_token.lemma_ == "'s" and idiom_subtree_token.head.lemma_ in ['someone', 'one', 'something']: 420 | matched_subtree_token = True 421 | break 422 | 423 | if matched_subtree_token: # Match, 
go to next idiom subtree token 424 | # Add child in case of no-directionality child match 425 | if config.NO_DIRECTION and matching_child_lemma: 426 | matched_indices.append(matched_children[0].i) 427 | else: 428 | matched_indices.append(sentence_subtree_token.i) 429 | break 430 | if not matched_subtree_token: # No match, go to next sentence token 431 | break 432 | 433 | # If everything matches, extract snippet 434 | if matched_subtree_token: 435 | # Text of idiom subtree is dictionary form 436 | dictionary_form = ''.join([idiom_subtree_token.text_with_ws for idiom_subtree_token in idiom_subtree]).strip() 437 | # Deal with em-dash wildcard idiom, substitute em-dash back in for ambiguous word 438 | if has_em_dash: 439 | dictionary_form = re.sub(ambiguous_word, u'\u2014', dictionary_form) 440 | # Get idiom token span 441 | first_idiom_token_i = min(matched_indices) - parsed_sentence.start 442 | last_idiom_token_i = max(matched_indices) - parsed_sentence.start 443 | first_idiom_token = parsed_sentence[first_idiom_token_i] 444 | last_idiom_token = parsed_sentence[last_idiom_token_i] 445 | # Extract n-word context 446 | if config.CONTEXT_TYPE == 'w': 447 | span_start = max(0, first_idiom_token_i - config.CONTEXT_NUMBER) 448 | span_end = min(len(parsed_sentence), last_idiom_token_i + 1 + config.CONTEXT_NUMBER) 449 | snippet = parsed_sentence[span_start:span_end].text 450 | # Store character offset of snippet start 451 | char_offset_span = parsed_sentence[span_start].idx 452 | # Extract n-sentence context 453 | elif config.CONTEXT_TYPE == 's': 454 | if config.CONTEXT_NUMBER == 0: 455 | snippet = parsed_sentence.text 456 | # Store character offset of sentence (==snippet) start 457 | char_offset_span = parsed_sentence.start_char 458 | else: 459 | snippet = "" 460 | # Get snippet sentences 461 | first_sentence_idx = sentence_idx - config.CONTEXT_NUMBER 462 | last_sentence_idx = sentence_idx + config.CONTEXT_NUMBER 463 | # Re-iterate over sentences to extract the sentence contents 464 | for sentence_idx_2, parsed_sentence_2 in enumerate(parsed_corpus.sents): 465 | if sentence_idx_2 >= first_sentence_idx and sentence_idx_2 <= last_sentence_idx: 466 | # Store character offset of snippet start 467 | if sentence_idx_2 == first_sentence_idx: 468 | char_offset_span = parsed_sentence_2.start_char 469 | # Add space between sentences 470 | if snippet: 471 | snippet += ' ' 472 | snippet += parsed_sentence_2.text 473 | # Get idiom character offsets in snippet 474 | char_offset_start = first_idiom_token.idx - char_offset_span 475 | char_offset_end = last_idiom_token.idx + len(last_idiom_token.text) - char_offset_span 476 | # Get BNC metadata/set dummy values 477 | if config.CORPUS_TYPE[0:3] == 'bnc': 478 | bnc_document_id = sentences_with_metadata[sentence_idx]['document_id'] 479 | bnc_sentence = sentences_with_metadata[sentence_idx]['sentence_number'] 480 | bnc_char_start = first_idiom_token.idx 481 | bnc_char_end = last_idiom_token.idx + len(last_idiom_token.text) 482 | else: 483 | bnc_document_id = '-' 484 | bnc_sentence = '-' 485 | bnc_char_start = 0 486 | bnc_char_end = 0 487 | 488 | extracted_idiom = {'snippet': snippet, 'idiom': dictionary_form, 'start': char_offset_start, 489 | 'end': char_offset_end, 'bnc_document_id': bnc_document_id, 'bnc_sentence': bnc_sentence, 490 | 'bnc_char_start': bnc_char_start, 'bnc_char_end': bnc_char_end} 491 | 492 | # Check whether the instance has already been added, with a larger span (this can happen with em-dash idioms). Don't do this for NLD matches. 
493 | if previously_matched_indices: 494 | # Remove most recent entry if it has a larger span than the current entry 495 | if min(previously_matched_indices) <= min(matched_indices) and max(previously_matched_indices) >= max(matched_indices) and (sentence_token.lemma_ == idiom_top_token.lemma_ or consider_this_em_dash_idiom): 496 | del extracted_idioms[-1] 497 | # Only add current entry if it doesn't have a larger span than the most recent entry 498 | if not (min(previously_matched_indices) >= min(matched_indices) and max(previously_matched_indices) <= max(matched_indices)) and (sentence_token.lemma_ == idiom_top_token.lemma_ or consider_this_em_dash_idiom): 499 | extracted_idioms.append(extracted_idiom) 500 | previously_matched_indices = matched_indices 501 | else: 502 | extracted_idioms.append(extracted_idiom) 503 | previously_matched_indices = matched_indices 504 | 505 | return extracted_idioms 506 | 507 | if __name__ == '__main__': 508 | print 'Hello! Time is {0}'.format(config.TIME) 509 | 510 | # Create working directory if it doesn't exist 511 | if not os.path.isdir(config.WORK_DIR): 512 | os.mkdir(config.WORK_DIR) 513 | 514 | # Read in corpus as list of documents 515 | if config.CORPUS_TYPE == 'plain': 516 | documents = process_corpus.plain_text(config.CORPUS, config.NO_SPLIT) 517 | print 'First sentence of corpus: {0}\nLast sentence of corpus: {1}'.format(u8(documents[0][0]), u8(documents[-1][-1])) 518 | elif config.CORPUS_TYPE[0:3] == 'bnc': 519 | cache_path = os.path.join(config.WORK_DIR, '{0}_parsed_xml.json'.format(config.CORPUS_TYPE)) 520 | documents = process_corpus.bnc(config.CORPUS, config.CORPUS_TYPE, cache_path) 521 | print 'First sentence of corpus: {0}\nLast sentence of corpus: {1}'.format(u8(documents[0][0]['sentence']), u8(documents[-1][-1]['sentence'])) 522 | 523 | # Get idioms from dictionary 524 | idioms = get_idiom_list(case_sensitive = config.CASE_SENSITIVE) 525 | print "Found {4} idioms ranging from '{0}', '{1}' to '{2}', '{3}'".format(u8(idioms[0]), u8(idioms[1]), u8(idioms[-2]), u8(idioms[-1]), len(idioms)) 526 | 527 | # Extract idioms 528 | extraction_start = time.time() 529 | if config.METHOD == 'exact': 530 | extracted_idioms = string_match(idioms, documents, fuzzy = False, inflect = False, case_sensitive = config.CASE_SENSITIVE) 531 | elif config.METHOD == 'fuzzy': 532 | extracted_idioms = string_match(idioms, documents, fuzzy = True, inflect = False, case_sensitive = config.CASE_SENSITIVE) 533 | elif config.METHOD == 'inflect': 534 | extracted_idioms = string_match(idioms, documents, fuzzy = False, inflect = True, case_sensitive = config.CASE_SENSITIVE) 535 | elif config.METHOD == 'parse': 536 | extracted_idioms = parse_extract(idioms, documents) 537 | 538 | # Print information about extracted idioms 539 | print 'Extracted {0} idioms in {1:.2f} seconds'.format(len(extracted_idioms), time.time() - extraction_start) 540 | idiom_set = set([extracted_idiom['idiom'] for extracted_idiom in extracted_idioms]) 541 | if len(idiom_set) >= 5: 542 | idiom_sample = random.sample(idiom_set, 5) 543 | print 'Extracted these idioms, among others: {0}, {1}, {2}, {3}, {4}'.format(u8(idiom_sample[0]), u8(idiom_sample[1]), u8(idiom_sample[2]), u8(idiom_sample[3]), u8(idiom_sample[4])) 544 | 545 | # Output extracted idioms to file 546 | utils.write_csv(extracted_idioms, config.OUTFILE) 547 | -------------------------------------------------------------------------------- /evaluate_extraction.py: -------------------------------------------------------------------------------- 1 
| #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | Evaluate PIE extraction performance against an exhaustively PIE annotated corpus, output recall, precision and F1-score. 6 | ''' 7 | 8 | import json, argparse, csv, random 9 | from collections import Counter 10 | 11 | # Read in arguments 12 | parser = argparse.ArgumentParser(description = 'Parameters for PIE detection evaluation') 13 | parser.add_argument('extracted', metavar = 'extracted_idioms.csv', type = str, help = "Specify the file containing the extracted PIEs.") 14 | parser.add_argument('annotated', metavar = 'annotated_idioms.json', type = str, help = "Specify the file containing the annotated PIEs") 15 | args = parser.parse_args() 16 | 17 | # Read input data 18 | extracted_idioms = [] 19 | with open(args.extracted, 'r') as csvfile: 20 | csvreader = csv.reader(csvfile, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"') 21 | for csvrow in csvreader: 22 | extracted_idioms.append({'document_id': csvrow[4], 'sentence_number': csvrow[5], 'idiom': csvrow[0], 'context': unicode(csvrow[3], 'utf-8'), 'start': csvrow[1], 'end': csvrow[2]}) 23 | 24 | annotated_idioms = json.load(open(args.annotated, 'r')) 25 | 26 | # Check if datasets cover same documents 27 | assert set([idiom['document_id'] for idiom in extracted_idioms]) <= set([idiom['document_id'] for idiom in annotated_idioms]) 28 | 29 | # Select only the PIEs from the set of annotated PIE candidates 30 | annotated_idioms = [annotated_idiom for annotated_idiom in annotated_idioms if annotated_idiom['PIE_label'] == 'y'] 31 | 32 | # Keep track of false negatives 33 | for annotated_idiom in annotated_idioms: 34 | annotated_idiom['evaluation'] = 'fn' 35 | 36 | # Count true/false positives/negatives 37 | # We do not have true negatives 38 | tp = 0. 39 | fp = 0. 40 | fn = 0. 
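# Worked example of the metrics computed below (purely illustrative numbers):
# with tp = 80, fp = 20 and fn = 40:
#   precision = 80 / (80 + 20) = 0.80
#   recall    = 80 / (80 + 40) ≈ 0.67
#   F1        = 2 * (0.80 * 0.67) / (0.80 + 0.67) ≈ 0.73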
41 | for extracted_idiom in extracted_idioms: 42 | for annotated_idiom in annotated_idioms: 43 | # Lower case PIEs for comparison, as they are annotated as lower-case, but not necessarily extracted so 44 | if extracted_idiom['document_id'] == annotated_idiom['document_id'] and extracted_idiom['sentence_number'] == annotated_idiom['sentence_number'] and extracted_idiom['idiom'].lower() == annotated_idiom['idiom'].lower(): 45 | tp += 1 46 | extracted_idiom['evaluation'] = 'tp' 47 | annotated_idiom['evaluation'] = 'tp' 48 | break 49 | else: # No break 50 | fp += 1 51 | extracted_idiom['evaluation'] = 'fp' 52 | 53 | fn = len(annotated_idioms) - tp # False negatives = all missed PIEs = # annotated PIEs - # correctly found PIEs 54 | 55 | # Get precision, recall, F1-score 56 | precision = tp / (tp + fp) 57 | recall = tp / (tp + fn) 58 | f1 = 2 * (precision * recall) / (precision + recall) 59 | 60 | # Print results 61 | print '### RESULTS ###' 62 | print 'Total number of annotated PIEs: {0}'.format(len(annotated_idioms)) 63 | print 'Total number of extracted PIEs: {0}\n'.format(len(extracted_idioms)) 64 | print 'True Positives: {0}\nFalse Positives: {1}\nFalse Negatives: {2}\n'.format(tp, fp, fn) 65 | print 'Precision: {0}%'.format(precision*100) 66 | print 'Recall: {0}%'.format(recall*100) 67 | print 'F1-score: {0}%'.format(f1*100) 68 | 69 | # Print examples of classifications 70 | def show_examples(idioms, evaluation): 71 | # Define colours 72 | stop = '\x1b[0m' 73 | red = '\x1b[1;31;1m' 74 | # Count number of examples shown 75 | count = 0 76 | for idiom in idioms: 77 | if idiom['evaluation'] == evaluation: 78 | # Highlight idiom in context 79 | try: 80 | context = idiom['context'] 81 | start = int(idiom['start']) 82 | end = int(idiom['end']) 83 | except KeyError: 84 | context = idiom['sentence'] 85 | start = idiom['offsets'][0][0] 86 | end = idiom['offsets'][-1][-1] 87 | highlighted_context = context[:start] 88 | highlighted_context += red 89 | highlighted_context += context[start:end] 90 | highlighted_context += stop 91 | highlighted_context += context[end:] 92 | print highlighted_context, 93 | print '({2} - doc. {0} - sent. {1})'.format(idiom['document_id'], idiom['sentence_number'], idiom['idiom']) 94 | count += 1 95 | if count % 10 == 0: 96 | user_input = unicode(raw_input("Show 10 more examples? (y/n): "), 'utf-8') 97 | if user_input.lower() != 'y': 98 | break 99 | else: # No break 100 | print 'No more examples!' 101 | 102 | # Prompt and show examples for different classes 103 | # Shuffle idiom lists to avoid seeing same examples again and again 104 | random.shuffle(extracted_idioms) 105 | random.shuffle(annotated_idioms) 106 | user_input = unicode(raw_input("Show examples of classifications? (y/n): "), 'utf-8') 107 | if user_input.lower() == 'y': 108 | user_input = unicode(raw_input("Show examples of true positives? (y/n): "), 'utf-8') 109 | if user_input.lower() == 'y': 110 | show_examples(extracted_idioms, 'tp') 111 | user_input = unicode(raw_input("Show examples of false positives? (y/n): "), 'utf-8') 112 | if user_input.lower() == 'y': 113 | show_examples(extracted_idioms, 'fp') 114 | user_input = unicode(raw_input("Show examples of false negatives? 
(y/n): "), 'utf-8') 115 | if user_input.lower() == 'y': 116 | show_examples(annotated_idioms, 'fn') 117 | 118 | # Split performance for most frequent PIE types in corpus 119 | def performance_per_type(annotated_idioms, extracted_idioms, n): 120 | most_frequent_types = Counter([x['idiom'] for x in annotated_idioms]).most_common() 121 | print 'PIE Type' + 17*' ' + 'Count\tPrecision\tRecall\tF1-score' 122 | for pie_type in most_frequent_types[:n]: 123 | pie = pie_type[0] 124 | count = pie_type[1] 125 | tp = float(len([x['evaluation'] for x in extracted_idioms if x['idiom'] == pie and x['evaluation'] == 'tp'])) 126 | fp = float(len([x['evaluation'] for x in extracted_idioms if x['idiom'] == pie and x['evaluation'] == 'fp'])) 127 | fn = float(len([x['evaluation'] for x in annotated_idioms if x['idiom'] == pie and x['evaluation'] == 'fn'])) 128 | try: 129 | precision = tp / (tp + fp) 130 | recall = tp / (tp + fn) 131 | f1 = 2 * (precision * recall) / (precision + recall) 132 | except ZeroDivisionError: 133 | precision = 0. 134 | recall = 0. 135 | f1 = 0. 136 | pie += (25 - len(pie)) * ' ' 137 | print '{0}{1}\t{2:.2f}\t\t{3:.2f}\t{4:.2f}'.format(pie, count, precision * 100, recall * 100, f1 * 100) 138 | 139 | user_input = unicode(raw_input("Show performance for 25 most frequent PIE types? (y/n): "), 'utf-8') 140 | if user_input.lower() == 'y': 141 | performance_per_type(annotated_idioms, extracted_idioms, 25) 142 | -------------------------------------------------------------------------------- /oxford.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | Get idioms from the online Oxford Dictionary of English Idioms, by scraping the pages at www.oxfordreference.com. 6 | Refines the idioms by removing duplicates, and expanding things in parentheses, dealing with special cases. 
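For example, an entry like '(all) at sea' is expanded to both 'all at sea' and 'at sea' (see the inline EXAMPLE comments in refine_idioms below).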
7 | ''' 8 | 9 | import re, requests, itertools 10 | from bs4 import BeautifulSoup 11 | 12 | def get_idioms(url, landing_url, use_socks_proxy = False): 13 | ''' 14 | Scrapes idioms from the ODEI website, gets 100 entries per page, 15 | navigates to entry page, gets idiom, cycles through pages 16 | ''' 17 | idioms = [] 18 | # Set proxy, if applicable, requires pysocks to be installed 19 | if use_socks_proxy: 20 | proxies = {'http': "socks5://127.0.0.1:8080"} 21 | else: 22 | proxies = {} 23 | # Get and parse first page 24 | page = requests.get(landing_url) 25 | soup = BeautifulSoup(page.content, 'html.parser') 26 | # Scrape pagination information 27 | links = soup.find_all('a') 28 | for link in links: 29 | if link.parent.name == 'div': 30 | try: 31 | if link.parent['class'][0] == 't-data-grid-pager': 32 | last_page = link.text # Number of pages to cycle through 33 | url_template = link['href'] 34 | except KeyError: 35 | pass # Sometimes parent has no class 36 | # Cycle through pages, get actual idioms 37 | for i in range(1, int(last_page) + 1): 38 | print 'Scraping page {0} of {1}'.format(i, last_page) # Very slow, so give progress updates 39 | # Get next page url 40 | if i < int(last_page): 41 | next_page = url + re.sub('gridpager/{0}'.format(last_page), 'gridpager/{0}'.format(i + 1), url_template) 42 | # Find links to pages containing idioms 43 | links = soup.find_all('a') 44 | for link in links: 45 | if link.parent.name == 'h2': 46 | try: 47 | if link.parent['class'][0] == 'itemTitle': 48 | # Get page with idiom entries 49 | entry_page = requests.get(url + link['href'], proxies=proxies) 50 | entry_soup = BeautifulSoup(entry_page.content, 'html.parser') 51 | # Extract idiom 52 | for idiom in entry_soup.find_all('em'): 53 | try: 54 | if idiom.parent.parent['class'][0] == 'div1': 55 | if ' ' in idiom.text: # Filter out single word 'idioms' 56 | idioms.append(idiom.text) # Store the actual idiom 57 | except KeyError: 58 | pass # Sometimes grandparent has no class 59 | except KeyError: 60 | pass # Sometimes parent has no class 61 | # Get and parse next page 62 | page = requests.get(next_page) 63 | soup = BeautifulSoup(page.content, 'html.parser') 64 | 65 | return sorted(list(set(idioms))) 66 | 67 | def refine_idioms(idioms): 68 | ''' 69 | Oxford scraping output is messy. Removes duplicates containing ':'. 70 | Expands optionals in parentheses. Deals with some exceptional cases individually 71 | ''' 72 | 73 | refined_idioms = [] 74 | for idiom in idioms: 75 | # Fix scraping errors 76 | if idiom == 'like (or as if) it is going out of fashion (or style': 77 | idiom += ')' 78 | if idiom == 'cog in the wheel (or machine': 79 | idiom += ')' 80 | if idiom == 'get you (him, her': 81 | idiom += ', etc.)!' 82 | has_parentheses = False 83 | if idiom[-1] != ':': # All duplicates end in ':' 84 | # Get all parenthesis pairs + content 85 | if re.findall('\(.*\)', idiom): 86 | pairs_of_parentheses = re.finditer('\(.*?\)', idiom) 87 | # Cycle through pairs of parentheses, collect parts of idiom, and their expansions/variations 88 | idiom_parts = [] 89 | previous_end = 0 90 | for pair_of_parentheses in pairs_of_parentheses: 91 | starts_with_also = False # e.g. (also sure as fate) 92 | starts_with_or = False # e.g. (or get something off the ground) 93 | or_in_middle = False # e.g. (final or last) 94 | contains_etc = False # e.g. (me, him, etc.) 
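# The flags above select one of the expansion strategies below: each pair of parentheses
# contributes a list of alternatives to idiom_parts, and itertools.product later combines
# those lists into every surface variant of the idiom,
# e.g. 'a bad (or bitter or nasty) taste' -> 'a bad taste', 'a bitter taste', 'a nasty taste'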
95 | # Get indices 96 | start = pair_of_parentheses.start() 97 | end = pair_of_parentheses.end() 98 | # Examine content between parentheses - set conditions 99 | content_between_parentheses = pair_of_parentheses.group(0)[1:-1] # Get content without () 100 | if re.match('also\\b', content_between_parentheses): 101 | starts_with_also = True 102 | if re.match('or\\b', content_between_parentheses): 103 | starts_with_or = True 104 | if re.search('.\\bor\\b', content_between_parentheses): 105 | or_in_middle = True 106 | if re.search('etc\.', content_between_parentheses): 107 | contains_etc = True 108 | # Add the non-parenthesized bit before the current pair of parentheses (if it exists) 109 | idiom_part_before = idiom[previous_end:start] 110 | if idiom_part_before: 111 | idiom_parts.append([idiom_part_before]) 112 | ## Deal with different types of content between parentheses 113 | # Deal with the case with the '/', which occurs in exactly 1 idiom entry 114 | if content_between_parentheses == 'or get your fingers burned/burnt': 115 | content_between_parentheses = 'or get your fingers burned or get your fingers burnt' 116 | or_in_middle = True 117 | # Deal with some especially difficult parentheses cases first, individually 118 | if '(' in content_between_parentheses: 119 | if content_between_parentheses == 'or bring someone back (down': 120 | refined_idioms.append(u'bring someone back to earth') 121 | refined_idioms.append(u'bring someone back down to earth') 122 | end = len(idiom) 123 | if content_between_parentheses == 'or give someone pause (for thought': 124 | refined_idioms.append(u'give someone pause') 125 | refined_idioms.append(u'give someone pause for thought') 126 | end = len(idiom) 127 | if content_between_parentheses == 'or herein (or therein': 128 | idiom_parts[-1].append(u'herein lies') 129 | idiom_parts[-1].append(u'therein lies') 130 | idiom_parts.append([u'a tale']) 131 | end = len(idiom) 132 | # Simplest case, just generate idiom with parentheses removed, keeping content in parentheses 133 | # EXAMPLE: (all) at sea -> all at sea, at sea 134 | elif not starts_with_also and not starts_with_or and not or_in_middle and not contains_etc: 135 | idiom_part_between_parentheses = ['', content_between_parentheses] 136 | idiom_parts.append(idiom_part_between_parentheses) 137 | # Simplest'case starting with 'or'. Generate idiom with n words before parentheses replaced by the n words in the parentheses 138 | # EXAMPLE: I should cocoa (or coco) -> I should cocoa, I should coco 139 | elif not starts_with_also and starts_with_or and not or_in_middle and not contains_etc: 140 | num_words_to_replace = len(content_between_parentheses.split(' ')) - 1 # -1 because of or 141 | content_between_parentheses_without_or = ' '.join(content_between_parentheses.split(' ')[1:]) 142 | idiom_part_before_split = idiom_part_before.strip().split(' ') 143 | idiom_part_before_trimmed = ' '.join(idiom_part_before_split[:-num_words_to_replace]) 144 | idiom_part_before_variant = idiom_part_before_trimmed + ' ' + content_between_parentheses_without_or 145 | if idiom_part_before[0] == ' ': # Add initial space if it got removed incidentally 146 | idiom_part_before_variant = ' ' + idiom_part_before_variant 147 | idiom_parts[-1].append(idiom_part_before_variant) # Add as variant to previous part 148 | # Simplest case with or in the middle. Generate idioms for each part separated by 'or'. 
149 | # EXAMPLE: a (final or last) turn of the screw -> a final turn of the screw, a last turn of the screw 150 | elif not starts_with_also and not starts_with_or and or_in_middle and not contains_etc: 151 | content_parts = content_between_parentheses.split(' or ') 152 | idiom_parts.append(content_parts) 153 | # Case with both or at the start and in the middle. Generate idioms with replacement for each part separated by 'or' 154 | # EXAMPLE: a bad (or bitter or nasty) taste -> a bad taste, a bitter taste, a nasty taste 155 | elif not starts_with_also and starts_with_or and or_in_middle and not contains_etc: 156 | content_parts = content_between_parentheses[3:].split(' or ') # Strip initial 'or' and split in parts 157 | idiom_part_before_split = idiom_part_before.strip().split(' ') 158 | for content_part in content_parts: 159 | num_words_to_replace = len(content_part.split(' ')) 160 | idiom_part_before_trimmed = ' '.join(idiom_part_before_split[:-num_words_to_replace]) 161 | idiom_part_before_variant = idiom_part_before_trimmed + ' ' + content_part 162 | idiom_parts[-1].append(idiom_part_before_variant) 163 | # Case with 'also' at the start, signals full replacement, only two cases, one also with 'or' 164 | # 1. sure as eggs is eggs (also sure as fate) 2. left, right, and centre (also left and right or right and left) 165 | elif starts_with_also and not contains_etc: 166 | if not or_in_middle: 167 | idiom_part_before_variant = content_between_parentheses[5:] # Remove 'also' 168 | idiom_parts[-1].append(idiom_part_before_variant) 169 | else: 170 | idiom_part_before_variants = content_between_parentheses[5:].split(' or ') 171 | idiom_parts[-1] += idiom_part_before_variants 172 | # Cases with etc. are rare, and require individual treatment 173 | elif contains_etc: 174 | if content_between_parentheses in ['me, him, etc.', 'him, her, etc.']: 175 | expanded_series = ['me', 'you', 'him', 'her', 'us', 'them', 'it'] 176 | idiom_parts.append(expanded_series) 177 | elif content_between_parentheses == 'or tell, etc.': 178 | idiom_part_before_variant = 'tell' 179 | idiom_parts[-1].append(idiom_part_before_variant) 180 | elif content_between_parentheses == 'or herself, etc.': 181 | idiom_part_before_trimmed = ' '.join(idiom_part_before.split()[:-1]) 182 | variant_series = ['myself', 'yourself', 'herself', 'itself', 'ourselves', 'yourselves', 'themselves'] 183 | for variant in variant_series: 184 | idiom_part_before_variant = idiom_part_before_trimmed + ' ' + variant 185 | idiom_parts[-1].append(idiom_part_before_variant) 186 | elif content_between_parentheses == 'or bore etc.': 187 | idiom_part_before_variant = 'bore' 188 | idiom_parts[-1].append(idiom_part_before_variant) 189 | elif content_between_parentheses == 'or your etc.': 190 | idiom_part_before_trimmed = ' '.join(idiom_part_before.split()[:-1]) 191 | variant_series = ['my', 'your', 'his', 'her', 'its', 'our', 'your', 'their'] 192 | for variant in variant_series: 193 | idiom_part_before_variant = idiom_part_before_trimmed + ' ' + variant 194 | idiom_parts[-1].append(idiom_part_before_variant) 195 | elif content_between_parentheses in ['or you or him, etc.', 'or her, him, etc.']: 196 | idiom_part_before_trimmed = ' '.join(idiom_part_before.split()[:-1]) 197 | variant_series = ['you', 'him', 'her', 'us', 'them', 'it'] 198 | for variant in variant_series: 199 | idiom_part_before_variant = idiom_part_before_trimmed + ' ' + variant 200 | idiom_parts[-1].append(idiom_part_before_variant) 201 | elif content_between_parentheses == 'or forty-something, 
etc.': 202 | idiom_parts = [] # Single-word idiom, ignore 203 | previous_end = end 204 | # Add remaining part of idiom after final pair of parentheses 205 | idiom_parts.append([idiom[end:]]) 206 | # From the collected idiom parts and variations, generate all idiom variations and add them to the list 207 | for refined_idiom in itertools.product(*idiom_parts): 208 | refined_idiom = ''.join(refined_idiom) 209 | refined_idiom = re.sub(' +', ' ', refined_idiom) # Remove double spaces 210 | refined_idiom = re.sub('(^ )|( $)', '', refined_idiom) # Remove initial spaces and final spaces 211 | if len(refined_idiom.split(' ')) > 1: # Remove single-word idioms, e.g. 'forty-something' (or thirty-something') 212 | refined_idioms.append(refined_idiom) 213 | else: 214 | refined_idioms.append(idiom) 215 | return refined_idioms 216 | -------------------------------------------------------------------------------- /pos2morpha.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Based on a similar script by Kilian Evang 4 | 5 | ''' 6 | Converts tokenized and POS-tagged texts in C&C format (pipe-separated PTB tags) 7 | to morpha's input format (underscore-separated CLAWS tags). 8 | ''' 9 | 10 | import re, sys 11 | 12 | def ptb2claws(token, tag): 13 | if tag == 'NNP': 14 | return 'NP' 15 | if tag == 'NNPS': 16 | return 'NP2' 17 | if tag == 'NNS': 18 | return 'NN2' 19 | if token in ('ca', 'sha', 'wo', '\'d') and tag == 'MD': 20 | return 'VM' 21 | if token == 'n\'t' and tag == 'RB': 22 | return 'XX' 23 | if token == '\'d' and tag == 'VBD': 24 | return 'VH' 25 | return tag 26 | 27 | def convert_token(token): 28 | pipeindex = token.rfind('|') 29 | prefix = token[:pipeindex] 30 | suffix = token[pipeindex + 1:] 31 | if len(prefix) > 1 and prefix.endswith('-'): 32 | prefix = prefix[:-1] 33 | return prefix + '_' + ptb2claws(prefix, suffix) 34 | 35 | -------------------------------------------------------------------------------- /process_corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Load and preprocess a corpus for idiom extraction''' 5 | 6 | import os, time, json 7 | import nltk.data 8 | from bs4 import BeautifulSoup 9 | 10 | def plain_text(corpus_file, no_split): 11 | '''Read in a plain text corpus, return a single document containing a list of unicode sentences.''' 12 | 13 | splitter = nltk.data.load('tokenizers/punkt/english.pickle') 14 | # Read in corpus 15 | documents = [] 16 | sentences = [] 17 | with open(corpus_file, 'r') as f: 18 | for line in f: 19 | if line.strip(): 20 | if no_split: 21 | sentences.append(unicode(line.strip(), 'utf-8')) 22 | else: 23 | sentences += splitter.tokenize(unicode(line.strip(), 'utf-8')) 24 | documents.append(sentences) 25 | 26 | return documents 27 | 28 | def bnc(corpus_file, corpus_type, cache_path): 29 | ''' 30 | Read in the British National Corpus (BNC) XML version, returns a list of documents. 31 | Documents are lists of dictionaries. Dictionaries contain unicode sentences and metadata 32 | for offset annotation. 
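Each dictionary has the keys 'document_id', 'sentence_number', and 'sentence'.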
33 | ''' 34 | 35 | documents = [] 36 | # Read parsed XML from cached file, for bnc/bnc-dev/bnc-test, if available 37 | if os.path.exists(cache_path): 38 | print 'Reading BNC from {0}'.format(cache_path) 39 | documents = json.load(open(cache_path, 'r')) 40 | return documents 41 | 42 | # Read BNC from file and parse, if no cached version available 43 | time_0 = time.time() 44 | print 'Processing BNC...' 45 | # Cycle through subdirectories 46 | subdirectories = sorted(os.listdir(corpus_file)) 47 | for subdirectory in subdirectories: 48 | subdirectory_path = os.path.join(corpus_file, subdirectory) 49 | subsubdirectories = sorted(os.listdir(subdirectory_path)) 50 | for subsubdirectory in subsubdirectories: 51 | subsubdirectory_path = os.path.join(subdirectory_path, subsubdirectory) 52 | document_ids = sorted(os.listdir(subsubdirectory_path)) 53 | # Cycle through documents 54 | for document_id in document_ids: 55 | # Select only documents in development or test set of evaluation corpus 56 | if corpus_type in ['bnc-dev', 'bnc-test']: 57 | if corpus_type == 'bnc-dev': 58 | subset_documents = [u'CBC', u'CH1', u'A61', u'A18', u'ABC', u'ABV', u'A12', u'CBD', u'A1N', u'A19', u'A69', u'A75', u'AML', u'K2A', u'FU4', u'HD8', u'A60', u'AL7', u'A1F', u'A1D', u'A1L', u'A1H'] 59 | else: 60 | subset_documents = [u'CBG', u'J1C', u'B03', u'A16', u'A6J', u'A15', u'A11', u'J1M', u'AP1', u'A5Y', u'G3H', u'B2M', u'B0X', u'A6S', u'B1C', u'A10', u'H8W', u'A1E', u'A1G', u'GXL', u'A1M', u'K29', u'A63'] 61 | if document_id[0:3] not in subset_documents: 62 | continue 63 | sentences_with_metadata = [] # Format: {'sentence': 'I win.', 'document_number': 'A00', 'sentence_number': '1'} 64 | document_path = os.path.join(subsubdirectory_path, document_id) 65 | parsed_xml = BeautifulSoup(open(document_path), 'lxml-xml') 66 | # Get metadata 67 | for idno in parsed_xml.find_all('idno'): 68 | if idno['type'] == 'bnc': 69 | document_idno = unicode(idno.string ) 70 | for class_code in parsed_xml.find_all('classCode'): 71 | if class_code['scheme'] == 'DLEE': 72 | class_code = unicode(class_code.string) 73 | break 74 | # Cycle through sentences, extract unicode string 75 | for sentence in parsed_xml.find_all('s'): 76 | # Skip sentences containing gap elements 77 | if sentence.gap: 78 | continue 79 | sentence_number = unicode(sentence['n']) 80 | sentence_string = '' 81 | for descendant in sentence.descendants: 82 | if descendant.name in ['c', 'w']: 83 | sentence_string += unicode(descendant.string) 84 | # Store sentence with metadata 85 | sentence_with_metadata = {'document_id': document_idno, 'sentence_number': sentence_number, 'sentence': sentence_string} 86 | sentences_with_metadata.append(sentence_with_metadata) 87 | documents.append(sentences_with_metadata) 88 | print 'Done! 
Processing BNC took {0:.2f} seconds'.format(time.time() - time_0) 89 | 90 | # Cache parsed XML 91 | json.dump(documents, open(cache_path, 'w')) 92 | 93 | return documents 94 | -------------------------------------------------------------------------------- /using_english.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Get idioms from UsingEnglish.com, by scraping the a-z pages at www.usingenglish.com/reference/idioms/''' 5 | 6 | import re, string 7 | import requests 8 | from bs4 import BeautifulSoup 9 | 10 | def get_idioms(url, idioms_url): 11 | '''Scrape the idioms from the usingEnglish.com pages.''' 12 | 13 | idioms = [] 14 | for letter in string.lowercase: # Cycle through categories 15 | next_page = '{0}/{1}.html'.format(idioms_url, letter) # Page 1 of the category 16 | while next_page: 17 | # Get and parse page 18 | page = requests.get(next_page) 19 | soup = BeautifulSoup(page.content, 'html.parser') 20 | next_page = None 21 | for link in soup.find_all('a'): 22 | # Extract idiom from html 23 | if link.parent.name == 'dt': 24 | if ' ' in link.string: # Exclude single word 'idioms' 25 | idioms.append(link.string) 26 | # Get link to next page in the category 27 | elif link.parent.name == 'div': 28 | try: 29 | if link.parent['class'][0] == 'pagination': 30 | if re.match('next', link.string): 31 | next_page = url + link['href'] 32 | except KeyError: # Sometimes parent has no class 33 | pass 34 | 35 | return sorted(list(set(idioms))) 36 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Utility functions to work with morpha, PoS-tagging, parsing, and other things.''' 5 | 6 | import pos2morpha 7 | 8 | import subprocess, shlex, time, json, re, itertools, csv 9 | import spacy 10 | import en_core_web_sm as spacy_model 11 | from stanfordcorenlp import StanfordCoreNLP 12 | import nltk.data 13 | 14 | ###### STANFORD TO SPACY ###### 15 | class StanfordDoc: 16 | '''Spacy-Doc-like container for Stanford output''' 17 | 18 | def __init__(self): 19 | self.sents = [] 20 | 21 | def __iter__(self): 22 | return iter(self.tokens) 23 | 24 | def __getitem__(self, i): 25 | if isinstance(i, slice): 26 | return StanfordSpan(self.tokens[i.start:i.stop]) 27 | else: 28 | return self.tokens[i] 29 | 30 | # Generate list of tokens from sentences 31 | def set_tokens(self): 32 | self.tokens = [token for sent in self.sents for token in sent] 33 | 34 | class StanfordSpan: 35 | '''Spacy-Span-like container for Stanford output''' 36 | 37 | def __init__(self, tokens): 38 | self.tokens = tokens 39 | self.start = self.tokens[0].i # Starting token index in document 40 | self.start_char = self.tokens[0].idx # Starting character index in document 41 | self.text_with_ws = ''.join([token.text_with_ws for token in self.tokens]) 42 | self.text = ''.join([token.text_with_ws for token in self.tokens[:-1]]) + self.tokens[-1].text 43 | 44 | def __iter__(self): 45 | return iter(self.tokens) 46 | 47 | def __getitem__(self, i): 48 | return self.tokens[i] 49 | 50 | class StanfordToken: 51 | '''Spacy-Token-like container for Stanford output''' 52 | 53 | def __init__(self, i, idx, lemma, tag, text, ws, word, doc): 54 | self.i = i # Token index in document 55 | self.idx = idx # Starting character index in document 56 | self.lemma_ = lemma 57 | self.tag_ = tag # 
PoS-tag inventory might differ slightly, but should not cause problems 58 | self.text = text 59 | self.text_with_ws = text + ws 60 | self.lower_ = word.lower() 61 | self.children = [] 62 | self.doc = doc 63 | 64 | def __str__(self): 65 | return self.text 66 | 67 | # Recursively gets all the syntactic descendants of a token, including self 68 | def get_descendants(self): 69 | descendants = [self] 70 | for child in self.children: 71 | descendants += child.get_descendants() 72 | return descendants 73 | 74 | # Sets the subtree attribute, which is an ordered generator for all descendants of a token 75 | def get_subtree(self): 76 | return sorted(self.get_descendants(), key=lambda x: x.i) 77 | 78 | # Sets the rights attribute, which is an ordered generator for all children to the right of a token 79 | def get_rights(self): 80 | return [child for child in self.children if child.i > self.i] 81 | 82 | def __repr__(self): 83 | return self.text 84 | 85 | def stanford_to_spacy(parse): 86 | '''Turn Stanford CoreNLP output into a Spacy-like object''' 87 | 88 | # Convert into Spacy-like objects 89 | doc = StanfordDoc() 90 | doc_i = 0 91 | for sentence in parse['sentences']: 92 | span = [] 93 | # Get token information 94 | tokens = sentence['tokens'] 95 | dependencies = sentence['basicDependencies'] 96 | # Make tokens into StanfordTokens 97 | for token in tokens: 98 | new_token = StanfordToken(doc_i, token['characterOffsetBegin'], token['lemma'], token['pos'], token['originalText'], token['after'], token['word'], doc) 99 | doc_i += 1 100 | span.append(new_token) 101 | # Add dependency relation and head index to tokens 102 | for dependency in dependencies: 103 | span[dependency['dependent'] - 1].head_idx = dependency['governor'] - 1 104 | span[dependency['dependent'] - 1].dep_ = dependency['dep'] 105 | # Add pointer to head of each token 106 | for new_token in span: 107 | # ROOT has itself as head 108 | try: 109 | if new_token.head_idx == -1: 110 | new_token.head = new_token 111 | else: 112 | new_token.head = span[new_token.head_idx] 113 | new_token.head.children.append(new_token) 114 | # Occasionally, a misformed parse yields a token without a head, default to ROOT, and show problematic sentence 115 | except AttributeError: 116 | new_token.head_idx = -1 117 | new_token.dep_ = u'ROOT' 118 | new_token.head = new_token 119 | print 'Headless word \'{0}\' in sentence "{1}"'.format(new_token.text.encode('utf-8'), ''.join([x.text_with_ws.encode('utf-8') for x in span])) 120 | # Add subtree to each token 121 | for new_token in span: 122 | new_token.subtree = new_token.get_subtree() 123 | new_token.rights = new_token.get_rights() 124 | doc.sents.append(StanfordSpan(span)) 125 | # Generate token list 126 | doc.set_tokens() 127 | 128 | return doc 129 | 130 | ###### PARSING ###### 131 | def load_parser(parser_type): 132 | '''Loads Spacy or Stanford CoreNLP''' 133 | 134 | time_0 = time.time() 135 | print 'Loading parser...' 136 | if parser_type == 'spacy': 137 | parser = spacy_model.load() 138 | elif parser_type == 'stanford': 139 | parser = StanfordCoreNLP('ext/stanford', memory='6g') 140 | parse((parser_type, parser), 'The cat sat on the mat.') # Annotate dummy sentence to force loading of annotation modules 141 | print 'Done! 
Loading parser took {0:.2f} seconds'.format(time.time() - time_0) 142 | 143 | return (parser_type, parser) 144 | 145 | def parse(parser, text): 146 | '''Parses a (unicode) string and returns the parse.''' 147 | 148 | if parser[0] == 'spacy': 149 | # Convert to unicode if necessary 150 | try: 151 | text = unicode(text, 'utf-8') 152 | except TypeError: 153 | pass 154 | # Normalize quotes, ‘ ’ ❛ ❜ to ', and “ ” ❝ ❞ to ", Spacy doesn't process them well 155 | text = re.sub(u'‘|’|❛|❜', u"'", text) 156 | text = re.sub(u'“|”|❝|❞', u'"', text) 157 | # Insert a space between punctuation and a dash, Spacy doesn't process that well either 158 | text = re.sub(ur'([^\w\s])([-—])', r'\1 \2', text) 159 | return parser[1](text) 160 | 161 | if parser[0] == 'stanford': 162 | # Convert from unicode if necessary 163 | try: 164 | text = text.encode('utf-8') 165 | except UnicodeDecodeError: 166 | pass 167 | properties={'annotators': 'tokenize,ssplit,pos,lemma,depparse','pipelineLanguage':'en','outputFormat':'json'} 168 | parsed_text = parser[1].annotate(text, properties=properties) 169 | parsed_text = json.loads(parsed_text) 170 | return stanford_to_spacy(parsed_text) 171 | 172 | ###### POS-TAGGING ###### 173 | def load_pos_tagger(): 174 | '''Loads Spacy PoS-tagger which takes pre-tokenized text.''' 175 | 176 | time_0 = time.time() 177 | print 'Loading PoS-tagger...' 178 | pos_tagger = spacy_model.load(disable = ['ner', 'parser']) 179 | print 'Done! Loading PoS-tagger took {0:.2f} seconds'.format(time.time() - time_0) 180 | 181 | return pos_tagger 182 | 183 | def pos_tag(pos_tagger, text): 184 | '''Takes pos_tagger and tokenized utf-8 idiom/sentence, returns list of word|POS strings.''' 185 | 186 | # Normalize quotes, ‘ ’ ❛ ❜ to ', and “ ” ❝ ❞ to ", Spacy doesn't process them well 187 | text = re.sub(u'‘|’|❛|❜', u"'", text) 188 | text = re.sub(u'“|”|❝|❞', u'"', text) 189 | # Make Doc 190 | doc = spacy.tokens.Doc(pos_tagger.vocab, text.split()) 191 | # Set sentence boundary 192 | for token in doc: 193 | if token.i == 0: 194 | token.is_sent_start = True 195 | else: 196 | token.is_sent_start = False 197 | # Do actual tagging 198 | doc = pos_tagger.tagger(doc) 199 | # Convert into list of words and tags 200 | words_and_tags = [] 201 | for token in doc: 202 | words_and_tags.append(token.text + u'|' + token.tag_) 203 | 204 | return words_and_tags 205 | 206 | ###### MORPHA ###### 207 | def morpha(morph_dir, tokens, keep_case = True, keep_pos = False): 208 | '''Interface to morpha and its options, takes list of tokens as input, returns list of uninflected tokens.''' 209 | 210 | # Set flags 211 | if keep_case: 212 | case_flag = 'c' 213 | else: 214 | case_flag = '' 215 | if keep_pos: 216 | pos_flag = 't' 217 | else: 218 | pos_flag = '' 219 | flags = '-{0}{1}f'.format(case_flag, pos_flag) 220 | 221 | # Call morpha via subprocess 222 | call = shlex.split('{0}/morpha {1} {0}/verbstem.list'.format(morph_dir, flags)) 223 | process = subprocess.Popen(call, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 224 | output = process.communicate(input=' '.join(tokens)) 225 | base_tokens = output[0].split(' ') 226 | 227 | return base_tokens 228 | 229 | def morphg(morph_dir, tokens, keep_case = True, keep_pos = False): 230 | '''Interface to morphg and its options, takes list of token+inflection_POS strings as input, returns tuple of inflected tokens.''' 231 | 232 | # Set flags 233 | if keep_case: 234 | case_flag = 'c' 235 | else: 236 | case_flag = '' 237 | if keep_pos: 238 | pos_flag = 't' 239 | else: 240 | 
pos_flag = '' 241 | flags = '-{0}{1}f'.format(case_flag, pos_flag) 242 | 243 | # Call morphg 244 | call = shlex.split('{0}/morphg {1} {0}/verbstem.list'.format(morph_dir, flags)) 245 | process = subprocess.Popen(call, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 246 | output = process.communicate(input=' '.join(tokens)) 247 | inflected_tokens = output[0].split(' ') 248 | 249 | # Filter out failed inflections, which will still contain '+' 250 | cleaned_inflected_tokens = [i_t for i_t in inflected_tokens if not '+' in i_t] 251 | 252 | return tuple(cleaned_inflected_tokens) 253 | 254 | ###### TOKENIZATION ###### 255 | def load_tokenizer(): 256 | '''Loads Spacy tokenizer''' 257 | 258 | time_0 = time.time() 259 | print 'Loading tokenizer...' 260 | tokenizer = spacy_model.load(disable = ['tagger', 'ner', 'parser']) 261 | print 'Done! Loading tokenizer took {0:.2f} seconds'.format(time.time() - time_0) 262 | 263 | return tokenizer 264 | 265 | def tokenize(tokenizer, sentence): 266 | '''Parses a (unicode) sentence, returns list of Spacy Tokens''' 267 | try: 268 | return tokenizer(unicode(sentence, 'utf-8')) 269 | except TypeError: 270 | return tokenizer(sentence) 271 | 272 | ###### EXAMPLE SENTENCES ###### 273 | def get_example_sentences(idioms, sentences_file, cache_file): 274 | ''' 275 | Takes a list of idioms, searches a large corpus for example sentences, 276 | extracts shortest example sentence, returns dict of format {idiom: sentence}. 277 | Saves extracted sentences and idioms to file, for fast re-use in subsequent runs. 278 | ''' 279 | 280 | time_0 = time.time() 281 | idioms_with_sentences = {} 282 | 283 | # If file is cached example sentences, load those, else extract sentences from corpus 284 | if re.search('.json$', sentences_file): 285 | idioms_with_sentences = json.load(open(sentences_file, 'r')) 286 | if set(idioms) <= set(idioms_with_sentences.keys()): 287 | print 'Using cached example sentences from {0}'.format(sentences_file) 288 | # Select only the idioms part of the idiom dictionary 289 | if set(idioms) < set(idioms_with_sentences.keys()): 290 | idioms_with_sentences = {key: idioms_with_sentences[key] for key in idioms_with_sentences if key in idioms} 291 | return idioms_with_sentences 292 | else: 293 | raise Exception('{0} does not contain entries for all the idioms specified in the dictionary argument, quitting.'.format(sentences_file)) 294 | else: 295 | print '{0} is not a cached json-file, extracting sentences containing idioms...'.format(sentences_file) 296 | 297 | # Add fallback option: no example sentence 298 | for idiom in idioms: 299 | idioms_with_sentences[idiom] = '' 300 | # Compile idiom regexes for efficiency and ignore meta-linguistic uses in quotes 301 | idiom_regexes = [re.compile('[^"\'] ' + idiom + ' [^"\']') for idiom in idioms] 302 | # Find shortest (in tokens) sentence containing idiom in corpus 303 | splitter = nltk.data.load('tokenizers/punkt/english.pickle') 304 | # Extract first 1000 lines containing the idiom with grep, then split and find sentences 305 | for idx, idiom in enumerate(idioms): 306 | if idx%100 == 0 and idx > 0: 307 | print '\tGetting example sentences for {0} of {1} idioms took {2} seconds'.format(idx, len(idioms), time.time()-time_0) 308 | call = shlex.split('grep -m 1000 "{0}" {1}'.format(u8(idiom), sentences_file)) 309 | process = subprocess.Popen(call, stdin=subprocess.PIPE, stdout = subprocess.PIPE, stderr=subprocess.PIPE) 310 | output = process.communicate() 311 | output = output[0].strip() 312 | sentences 
= splitter.tokenize(unicode(output, 'utf-8')) 313 | for sentence in sentences: 314 | if idiom_regexes[idx].search(sentence): 315 | # Should have at least 3 extra words in the 'sentence' 316 | if len(sentence.split(' ')) > len(idiom.split(' ')) + 3: 317 | if idioms_with_sentences[idiom]: 318 | # Replace old sentence if new sentence one is shorter 319 | if len(sentence.split(' ')) < len(idioms_with_sentences[idiom].split(' ')): 320 | idioms_with_sentences[idiom] = sentence 321 | else: 322 | idioms_with_sentences[idiom] = sentence 323 | 324 | # Caching extracted example sentences 325 | ofn = cache_file 326 | with open(ofn, 'w') as of: 327 | json.dump(idioms_with_sentences, of) 328 | print 'Caching idioms and example sentences in {0}'.format(ofn) 329 | 330 | print 'Done! took {0:.2f} seconds'.format(time.time() - time_0) 331 | 332 | return idioms_with_sentences 333 | 334 | def parse_example_sentences(idioms_with_sentences, ambiguous_word, parser): 335 | '''Parses an example sentence containing an idiom, returns the part of the parse tree spanning the idiom.''' 336 | 337 | parsed_idioms = [] 338 | 339 | # Cycle through idioms, parse sentence if available, extract idiom-spanning subtree 340 | for idiom in idioms_with_sentences: 341 | sentence = idioms_with_sentences[idiom] 342 | if sentence: 343 | parsed_sentence = parse(parser, sentence) 344 | # Find indices of idiom in sentence 345 | start = re.search(idiom, sentence).start() 346 | end = re.search(idiom, sentence).end() 347 | # Extract idiom subtree from parsed example sentence based on character offsets 348 | has_em_dash = u'\u2014' in idiom 349 | idiom_tokens = [] 350 | subtree_start = None 351 | for token in parsed_sentence: 352 | if token.idx >= end: 353 | subtree_end = token.i 354 | break 355 | if token.idx >= start: 356 | if not subtree_start: 357 | subtree_start = token.i 358 | idiom_tokens.append(token) 359 | # Extract top token and lemma 360 | extracted = False 361 | for idiom_token in idiom_tokens: 362 | # If the head of current token is not part of the idiom, it is the top token of the idiom phrase 363 | if idiom_token.head.text not in idiom: 364 | idiom_top_token = idiom_token 365 | idiom_top_lemma = idiom_token.lemma_ 366 | # Detect parses where the idiom does not form a single subtree, parse those idioms w/o contenxt 367 | if extracted: 368 | del parsed_idioms[-1] 369 | parsed_idioms.append(parse_idiom(idiom, ambiguous_word, parser)) 370 | break 371 | else: 372 | parsed_idioms.append((idiom_top_lemma, idiom_top_token, idiom_top_token.doc[subtree_start:subtree_end], has_em_dash)) 373 | extracted = True 374 | 375 | # Parse the idiom if no sentence is available 376 | else: 377 | parsed_idioms.append(parse_idiom(idiom, ambiguous_word, parser)) 378 | 379 | return parsed_idioms 380 | 381 | ###### IDIOM PROCESSING ###### 382 | def parse_idiom(idiom, ambiguous_word, parser): 383 | '''Parse idioms without context, extract top node, lemma and subtree.''' 384 | 385 | parsed_idiom = None # Format: (top_lemma, top_token, idiom subtree, has_em_dash) 386 | 387 | # Deal with em-dash wildcards, e.g. 'too - for words'. Replace wildcard with POS-ambiguous word (e.g. 
'fine') and parse 388 | if u'\u2014' in idiom: 389 | has_em_dash = True 390 | parsed_idiom = parse(parser, re.sub(u'\u2014', ambiguous_word, idiom)) 391 | else: 392 | has_em_dash = False 393 | parsed_idiom = parse(parser, idiom) 394 | 395 | # Extract top token and lemma 396 | for token in parsed_idiom: 397 | if token.dep_ == 'ROOT': 398 | idiom_top_lemma = token.lemma_ 399 | idiom_top_token = token 400 | idiom_subtree = [] 401 | parsed_idiom = (idiom_top_lemma, idiom_top_token, idiom_subtree, has_em_dash) 402 | 403 | return parsed_idiom 404 | 405 | def inflect_idioms(idioms, morph_dir): 406 | ''' 407 | Generate inflectional variants of idioms using the Spacy PoS-tagger, 408 | morpha and morphg. Takes a list of idioms, returns a list of inflected 409 | idioms and a mapping between inflectional variants and the base form. 410 | ''' 411 | 412 | pos_tagger = load_pos_tagger() 413 | inflected_idioms = [] 414 | base_form_map = {} # Maps inflectional variants to base form, format: {'inflectional variant': 'base form'} 415 | print 'Inflecting idioms...' 416 | time_0 = time.time() 417 | 418 | for idiom in idioms: 419 | # Add original form to base form map 420 | base_form_map[idiom] = idiom 421 | # Tag tokens, convert to Morpha tags 422 | pos_tokens = pos_tag(pos_tagger, idiom) 423 | if pos_tokens: 424 | morpha_tokens = [pos2morpha.convert_token(pos_token).encode('utf-8') for pos_token in pos_tokens] 425 | # Run morpha 426 | base_tokens = morpha(morph_dir, morpha_tokens, keep_case = True, keep_pos = True) 427 | # Generate inflections for verbs and nouns 428 | base_tuples = [] 429 | for base_token in base_tokens: 430 | # Look for NN, not N, because we don't want NP, proper names 431 | # Differentiate noun and verb inflections 432 | # Morphg doesn't handle 'be' well, define manually 433 | if base_token[0:4] == 'be_V': 434 | base_tuples.append(('be', 'being', 'been', 'am', 'are', 'is', 'was', 'were')) 435 | elif '_V' in base_token or '_NN' in base_token: 436 | if '_V' in base_token: 437 | morphg_tokens = (re.sub('_', '+s_', base_token), re.sub('_', '+ing_', base_token), 438 | re.sub('_', '+ed_', base_token), re.sub('_', '+en_', base_token)) 439 | else: 440 | morphg_tokens = (re.sub('_', '+s_', base_token),) 441 | base_tuples.append((base_token.split('_')[0],) + morphg(morph_dir, morphg_tokens, keep_case = True, keep_pos = False)) 442 | else: 443 | base_tuples.append((base_token.split('_')[0],)) 444 | # Generate combinations of inflected tokens and store base form mapping 445 | for inflected_tokens in itertools.product(*base_tuples): 446 | inflected_idiom = unicode(' '.join(inflected_tokens), 'utf-8') 447 | inflected_idioms.append(inflected_idiom) 448 | base_form_map[inflected_idiom] = idiom 449 | 450 | # Join to original list, and filter out duplicates 451 | inflected_idioms = list(set(idioms + inflected_idioms)) 452 | 453 | print 'Done! Inflecting idioms took {0:.2f} seconds'.format(time.time() - time_0) 454 | print 'With inflections, we have {0} idioms'.format(len(inflected_idioms)) 455 | 456 | return inflected_idioms, base_form_map 457 | 458 | def expand_indefinite_pronouns(idioms): 459 | ''' 460 | When one's or someone's or someone occurs in an idiom, remove it, 461 | and add idioms with personal pronouns added in. Don't expand 'one', 462 | because it is too ambiguous. 
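For example, an idiom containing someone's is expanded with the possessive pronouns my/your/his/her/its/our/their plus an em-dash wildcard.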
463 | ''' 464 | 465 | expanded_idioms = [] 466 | base_form_map = {} # Maps expanded variants to base form, format: {'expanded idiom': 'base form'} 467 | possessive_pronouns = ['my', 'your', 'his', 'her', 'its', 'our', 'their'] 468 | objective_pronouns = ['me', 'you', 'him', 'her', 'us', 'them', 'it'] 469 | 470 | for idiom in idioms: 471 | # Add possessive pronouns only 472 | if re.search("\\bone's\\b", idiom): 473 | for possessive_pronoun in possessive_pronouns: 474 | expanded_idiom = re.sub("\\bone's\\b", possessive_pronoun, idiom) 475 | expanded_idioms.append(expanded_idiom) 476 | base_form_map[expanded_idiom] = idiom 477 | # Add possessive pronouns and a wildcard for other words 478 | elif re.search("\\bsomeone's\\b", idiom): 479 | for possessive_pronoun in possessive_pronouns + [unicode("—'s", 'utf-8')]: 480 | expanded_idiom = re.sub("\\bsomeone's\\b", possessive_pronoun, idiom) 481 | expanded_idioms.append(expanded_idiom) 482 | base_form_map[expanded_idiom] = idiom 483 | # Add objective pronouns and a wildcard for other words 484 | elif re.search("\\bsomeone\\b", idiom): 485 | for objective_pronoun in objective_pronouns + [unicode("—", 'utf-8')]: 486 | expanded_idiom = re.sub("\\bsomeone\\b", objective_pronoun, idiom) 487 | expanded_idioms.append(expanded_idiom) 488 | base_form_map[expanded_idiom] = idiom 489 | else: 490 | expanded_idioms.append(idiom) 491 | base_form_map[idiom] = idiom 492 | 493 | return expanded_idioms, base_form_map 494 | 495 | ###### OUTPUT ###### 496 | def u8(u): 497 | '''Encode unicode string in utf-8.''' 498 | 499 | return u.encode('utf-8') 500 | 501 | def write_csv(extracted_idioms, outfile): 502 | '''Writes extracted idioms to file in csv-format''' 503 | 504 | with open(outfile, 'w') as of: 505 | writer = csv.writer(of, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"') 506 | for extracted_idiom in extracted_idioms: 507 | output_row = [u8(extracted_idiom['idiom']), extracted_idiom['start'], extracted_idiom['end'], 508 | u8(extracted_idiom['snippet']), u8(extracted_idiom['bnc_document_id']), u8(extracted_idiom['bnc_sentence']), 509 | extracted_idiom['bnc_char_start'], extracted_idiom['bnc_char_end']] 510 | writer.writerow(output_row) 511 | -------------------------------------------------------------------------------- /wiktionary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Get information from Wiktionary using the MediaWiki API and process returned content.''' 5 | 6 | import re 7 | import requests 8 | import lxml.html 9 | 10 | def get_category_members(category): 11 | ''' 12 | Use the MediaWiki API to get all category members of a Wiktionary category. 13 | Takes a category name. Returns a list of pagetitles. 14 | ''' 15 | 16 | titles = [] 17 | cont = True 18 | cmcontinue = '' # Continuation point for query 19 | # Get titles until no members left 20 | while(cont): 21 | # Construct query 22 | endpoint = 'https://en.wiktionary.org/w/api.php?' 
# Wiktionary API endpoint 23 | action = 'action=' + 'query' # Which action to take (query, naturally) 24 | format = 'format=' + 'json' # Output format 25 | lists = 'list=' + 'categorymembers' 26 | cmtitle = 'cmtitle=Category:' + category 27 | cmtitle = re.sub(' ', '%20', cmtitle) 28 | cmlimit = 'cmlimit=' + '500' # Query result limit 29 | cmprop = 'cmprop=' + 'title' # Get page titles only 30 | 31 | query = endpoint + '&'.join([action, format, lists, cmtitle, cmprop, cmlimit]) 32 | if cmcontinue: # Adding cmcontinue to query makes sure it continues from end of previous query 33 | query += '&cmcontinue=' + cmcontinue 34 | 35 | # Get and process results 36 | res_raw = requests.get(query) 37 | res_json = res_raw.json() 38 | # Collect page titles, i.e. idioms 39 | category_members = res_json['query']['categorymembers'] 40 | for category_member in category_members: 41 | title = category_member['title'] 42 | if not re.search('(^Appendix:)|(^Category:)|(^Special:)|(^Wiktionary:)|(^Category_talk:)|(^Citations:)', title): # Filter out special pages 43 | if ' ' in title: # Exclude single-word 'idioms' 44 | titles.append(title.strip()) 45 | # Check for more members in category 46 | try: 47 | cmcontinue = res_json['continue']['cmcontinue'] 48 | cont = True 49 | except KeyError: 50 | cont = False 51 | 52 | return sorted(list(set(titles))) 53 | 54 | def get_page(title): 55 | ''' 56 | Use the MediaWiki API to get the content of a Wiktionary page. 57 | Takes a page title. Returns the page content parsed into an lxml HTML document, or None if it cannot be retrieved. 58 | ''' 59 | 60 | # Construct query 61 | endpoint = 'http://en.wiktionary.org/w/api.php?' # Wiktionary API endpoint 62 | action = 'action=' + 'query' # Which action to take (query, naturally) 63 | format = 'format=' + 'json' # Output format 64 | prop = 'prop=' + 'revisions' # What info to get 65 | rvprop = 'rvprop=' + 'content' 66 | rvparse = 'rvparse' # Parse content into html 67 | titles = 'titles=' + title 68 | titles = re.sub(' ', '%20', titles) 69 | query = endpoint + '&'.join([action, format, prop, rvprop, rvparse, titles]) 70 | 71 | # Process result, get html only 72 | try: 73 | res_raw = requests.get(query) 74 | res_json = res_raw.json() 75 | temp_1 = res_json['query']['pages'] # Dig through first two layers 76 | res_html = temp_1[temp_1.keys()[0]]['revisions'][0]['*'] # Dig through remaining four layers 77 | parsed_html = lxml.html.document_fromstring(res_html) 78 | except KeyError: 79 | return 80 | 81 | return parsed_html 82 | --------------------------------------------------------------------------------
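A minimal usage sketch for the wiktionary.py helpers above (illustrative only; the category name 'English idioms' is an assumption, as the category actually queried is configured elsewhere in the repository):

import wiktionary

# Collect multi-word idiom page titles from the (assumed) category
idiom_titles = wiktionary.get_category_members('English idioms')
print 'Found {0} idiom pages'.format(len(idiom_titles))

# get_page() returns the page parsed as an lxml HTML document, or None on failure
parsed_page = wiktionary.get_page(idiom_titles[0])
if parsed_page is not None:
    print parsed_page.text_content()[:200].encode('utf-8')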