├── .gitignore ├── LICENSE ├── README.md ├── combine_extracted_idioms.py ├── config.py ├── data └── input_sample.txt ├── detect_pies.py ├── evaluate_extraction.py ├── oxford.py ├── pos2morpha.py ├── process_corpus.py ├── using_english.py ├── utils.py └── wiktionary.py /.gitignore: -------------------------------------------------------------------------------- 1 | working/ 2 | ext/ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 
64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Automatic Detection of Potentially Idiomatic Expressions 2 | This is the source code for a system to automatically detect potentially idiomatic expressions (PIEs, for short) in text. It has four different methods of doing so: exact string match, fuzzy string match, inflectional string match, and parse-based matching. It relies on a set of digitally available idiom dictionaries to get an inventory of expressions, and extracts all instances of those expressions (with context) from the input corpus. 3 | 4 | ## Requirements 5 | To run this code, you'll need the following Python setup: 6 | * Python 2.7.6 7 | * beautifulsoup4 4.5.1 8 | * requests 2.17.3 9 | * nltk 3.2.4 10 | * spacy 2.0.6 + en_core_web_sm 2.0.0 11 | * lxml 3.3.3 12 | 13 | Different versions might work just as well, but cannot be guaranteed. 14 | 15 | You might also need: 16 | * [morph](http://users.sussex.ac.uk/~johnca/morph.html), if you want to run inflectional string matching. 17 | * [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/), if you want to run the parse-based method with the Stanford parser. 18 | * the Python library [stanfordcorenlp](https://github.com/Lynten/stanford-corenlp) 3.7.0.2 19 | * the [British National Corpus](http://www.natcorp.ox.ac.uk/), if you want to extract PIEs from that. 
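A quick way to sanity-check the setup (a minimal sketch, assuming the packages listed above are installed and the `en_core_web_sm` model has been downloaded; exact versions may differ):

```python
# Verify that the core dependencies and the spaCy model can be loaded.
import bs4, lxml, nltk, requests  # scraping and NLP dependencies
import spacy

print(spacy.__version__)            # expected to be in the 2.0.x range
nlp = spacy.load('en_core_web_sm')  # fails if the model is missing
print([t.lemma_ for t in nlp(u'shoot the breeze')])
```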
20 | 21 | ## Getting Started 22 | - Clone the repository 23 | - Create subdirectories called `working` and `ext` 24 | - If necessary: 25 | - create a symlink `ext/morph` to the main directory of the morph tools 26 | - create a symlink `ext/stanford` to the main directory of your Stanford CoreNLP installation 27 | - create a symlink `ext/BNC` to the `Texts` directory of your copy of the BNC 28 | - Try running the system with `python detect_pies.py data/input_sample.txt -d wiktionary -t plain -m exact`. This should extract a list of idioms from Wiktionary and use the exact string match method to extract PIEs from the input sample file. 29 | - Get an overview of all options by simply running `python detect_pies.py --help` 30 | 31 | ## Contact 32 | For any questions about (running) the system, feel free to contact me. 33 | -------------------------------------------------------------------------------- /combine_extracted_idioms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | Combine the output of two runs of the PIE extraction system, removing duplicates. 6 | ''' 7 | 8 | import json, argparse, csv, copy 9 | 10 | from utils import u8 11 | 12 | # Read in arguments 13 | parser = argparse.ArgumentParser(description = 'Parameters for combining extracted PIEs') 14 | parser.add_argument('extracted_1', metavar = 'extracted_idioms_1.csv', type = str, help = "Specify the location of the first file containing the extracted PIEs.") 15 | parser.add_argument('extracted_2', metavar = 'extracted_idioms_2.csv', type = str, help = "Specify the location of the second file containing the extracted PIEs.") 16 | parser.add_argument('combined', metavar = 'combined_idioms.csv', type = str, help = "Specify the output location of the combined set of extracted PIEs.") 17 | args = parser.parse_args() 18 | 19 | # Read input data 20 | extracted_idioms_1 = [] 21 | with open(args.extracted_1, 'r') as csvfile: 22 | csvreader = csv.reader(csvfile, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"') 23 | for csvrow in csvreader: 24 | extracted_idioms_1.append({'document_id': csvrow[4], 'sentence_number': csvrow[5], 'idiom': csvrow[0], 'context': unicode(csvrow[3], 'utf-8'), 'start': csvrow[1], 'end': csvrow[2], 'bnc_start': csvrow[6], 'bnc_end': csvrow[7]}) 25 | extracted_idioms_2 = [] 26 | with open(args.extracted_2, 'r') as csvfile: 27 | csvreader = csv.reader(csvfile, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"') 28 | for csvrow in csvreader: 29 | extracted_idioms_2.append({'document_id': csvrow[4], 'sentence_number': csvrow[5], 'idiom': csvrow[0], 'context': unicode(csvrow[3], 'utf-8'), 'start': csvrow[1], 'end': csvrow[2], 'bnc_start': csvrow[6], 'bnc_end': csvrow[7]}) 30 | 31 | # Combine two sets of extractions 32 | combined_idioms = copy.deepcopy(extracted_idioms_1) 33 | for extracted_idiom_2 in extracted_idioms_2: 34 | matched = False 35 | for extracted_idiom_1 in extracted_idioms_1: 36 | if extracted_idiom_2['idiom'].lower() == extracted_idiom_1['idiom'].lower() and extracted_idiom_2['document_id'] == extracted_idiom_1['document_id'] and extracted_idiom_2['sentence_number'] == extracted_idiom_1['sentence_number']: 37 | matched = True 38 | break 39 | if not matched: 40 | combined_idioms.append(extracted_idiom_2) 41 | 42 | # Output to file 43 | with open(args.combined, 'w') as of: 44 | writer = csv.writer(of, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"') 45 | for
extracted_idiom in combined_idioms: 46 | output_row = [u8(extracted_idiom['idiom']), extracted_idiom['start'], extracted_idiom['end'], 47 | u8(extracted_idiom['context']), u8(extracted_idiom['document_id']), u8(extracted_idiom['sentence_number']), 48 | extracted_idiom['bnc_start'], extracted_idiom['bnc_end']] 49 | writer.writerow(output_row) 50 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Set parameters, parse and validate command-line arguments''' 5 | 6 | import argparse, os, datetime, re 7 | 8 | # Non-argument parameters 9 | WORK_DIR = './working' 10 | EXT_DIR = './ext' 11 | MORPH_DIR = os.path.join(EXT_DIR, 'morph') 12 | TIME = '{:%Y-%m-%d-%H-%M-%S}'.format(datetime.datetime.now()) 13 | UE_URL = 'https://www.usingenglish.com' 14 | UE_IDOMS_URL = UE_URL + '/reference/idioms' 15 | OX_URL = 'http://www.oxfordreference.com' 16 | OX_LANDING_URL = OX_URL + '/view/10.1093/acref/9780199543793.001.0001/acref-9780199543793?pageSize=100' # Requires access through e.g. a library 17 | 18 | # Read in arguments 19 | parser = argparse.ArgumentParser(description = 'Parameters for PIE detection') 20 | parser.add_argument('-d', '--dict', metavar = 'wiktionary|ue|oxford|intersection|2of3|union', type = str, default = 'wiktionary', help = "Specify which dictionary to use, default is 'wiktionary'. Other options are 'ue' for UsingEnglish.com, 'oxford' for Oxford Dictionary of English Idioms, 'intersection' for idioms occurring in all three dictionaries, '2of3' for idioms occurring in at least two of the three dictionaries, and 'union' for all idioms occurring in at least one of the three dictionaries. To get the intersection of a pair of dictionaries, enter two dictionary names, separated by a comma, e.g. 'wiktionary,oxford'.") 21 | parser.add_argument('corpus', metavar = 'CORPUS', type = str, help = "Specify the location of the corpus to extract PIEs from.") 22 | parser.add_argument('-t', '--corpus-type', metavar = 'plain|bnc|bnc-dev|bnc-test', type = str, default = 'plain', help = "Specify the type of corpus used. Plain text or BNC (all and dev/test sets).") 23 | parser.add_argument('-m', '--method', metavar = 'exact|fuzzy|inflect|parse', type = str, default = 'exact', help = "Specify the extraction method to use. 'exact' for exact string matching, 'fuzzy' for fuzzy/ string matching, 'inflect' for inflectional string matching, 'parse' for parse-based extraction.") 24 | parser.add_argument('-p', '--parser', metavar = 'spacy|stanford', type = str, default = 'spacy', help = "Specify whether to use the Spacy or Stanford parser for parse-based extraction") 25 | parser.add_argument('-ex', '--example-sentences', metavar = 'CORPUS', type = str, help = "With the 'parse' method, specify this option to retrieve example sentences for in-context parsing. Specify a path to a corpus or to the file containing the cached output of this method.") 26 | parser.add_argument('-iw', '--intervening-words', metavar = 'N', type = int, default = 0, help = "Number of intervening words allowed between words of an idiom in the string match methods. Default is 0.") 27 | parser.add_argument('-c', '--context', metavar = '{0-9}+{ws}', type = str, default = '0s', help = "Amount of context to extract around the idiom. Can be a number of words or sentences. '0w' will yield only the idiom, '1w' one word of context on both sides of the idiom, etc. 
Word-contexts never exceed sentence boundaries. '0s' will yield only the sentence containing the idiom.") 28 | parser.add_argument('-o', '--output', metavar = 'OUTFILE', type = str, help = "Specify where to output the extracted idioms. Default is WORK_DIR/extracted_idioms_from_CORPUS_NAME_TIMESTAMP.") 29 | parser.add_argument('-nc', '--no-cache', action = 'store_true', help = "Do not use a cached idiom list.") 30 | parser.add_argument('-ns', '--no-split', action = 'store_true', help = "In case of a one-sentence-per-line corpus, do not apply automatic sentence splitting. Does not affect parser-based extraction.") 31 | parser.add_argument('-cs', '--case-sensitive', action = 'store_true', help = "Make string-matching methods case sensitive.") 32 | parser.add_argument('-nl', '--no-labels', action = 'store_true', help = "Ignore dependency relation labels during parse-based extraction") 33 | parser.add_argument('-nld', '--no-labels-or-directionality', action = 'store_true', help = "Ignore dependency relation labels AND dependency relation direction during parse-based extraction.") 34 | args = parser.parse_args() 35 | 36 | # Store arguments as parameters and do validation 37 | DICT = args.dict.split(',') 38 | if len(DICT) == 1 and DICT[0] not in ['wiktionary', 'ue', 'oxford', 'intersection', '2of3', 'union']: 39 | raise ValueError("No valid dictionary option specified.") 40 | elif len(DICT) == 2 and (DICT[0] not in ['wiktionary', 'ue', 'oxford'] or DICT[1] not in ['wiktionary', 'ue', 'oxford']): 41 | raise ValueError("No valid dictionary option specified.") 42 | elif len(DICT) < 1 or len(DICT) > 2: 43 | raise ValueError("No valid dictionary option specified.") 44 | 45 | CORPUS = os.path.abspath(args.corpus) 46 | if not os.path.exists(CORPUS): 47 | raise ValueError("Corpus not found.") 48 | 49 | if args.corpus_type in ['plain', 'bnc', 'bnc-dev', 'bnc-test']: 50 | CORPUS_TYPE = args.corpus_type 51 | else: 52 | raise ValueError("No valid corpus type specified.") 53 | 54 | if args.method in ['exact', 'fuzzy', 'inflect', 'parse']: 55 | METHOD = args.method 56 | else: 57 | raise ValueError("No valid extraction method specified.") 58 | 59 | if args.parser.lower() in ['spacy', 'stanford']: 60 | PARSER = args.parser.lower() 61 | else: 62 | raise ValueError("No valid parser specified.") 63 | 64 | INT_WORDS = args.intervening_words 65 | 66 | SENTENCES = args.example_sentences 67 | if SENTENCES: 68 | SENTENCES = os.path.abspath(args.example_sentences) 69 | 70 | if re.match('[0-9]+[ws]', args.context): 71 | CONTEXT_NUMBER = int(args.context[:-1]) 72 | CONTEXT_TYPE = args.context[-1] 73 | else: 74 | raise ValueError("No valid context window argument provided. Should be of the format [0-9]+[ws].") 75 | 76 | if not args.output: # Set default 77 | OUTFILE = os.path.abspath(os.path.join(WORK_DIR, 'extracted_idioms_from_{0}_{1}.csv'.format(CORPUS.split('/')[-1],TIME))) 78 | else: 79 | OUTFILE = os.path.abspath(args.output) 80 | 81 | NO_CACHE = args.no_cache 82 | NO_SPLIT = args.no_split 83 | CASE_SENSITIVE = args.case_sensitive 84 | NO_LABELS = args.no_labels or args.no_labels_or_directionality 85 | NO_DIRECTION = args.no_labels_or_directionality 86 | -------------------------------------------------------------------------------- /data/input_sample.txt: -------------------------------------------------------------------------------- 1 | I often shoot the breeze while waiting at the bus stop. 2 | I like shooting the breeze at the bus stop. 3 | Shooting the breeze at the bus stop is fun. 
4 | Yesterday I shot the breeze with an old lady waiting there. 5 | She shoots the breeze as well as I do. 6 | Sometimes, I shoot the breezes, which is different. 7 | It means that I shoot multiple breezes at once. 8 | The breeze was shot. 9 | That is, multiple breezes were shot. 10 | Freddie likes to bite the dust. 11 | The dust was bitten by Freddie. 12 | The English press has been very good to me, touch wood. 13 | If it′s ever me (Jesus Christ, touch wood!), I don′t want you bringing me flowers. 14 | I know I have my faults and one of them is my impatience and I also cannot tolerate people who are ill, mainly because I am so very rarely ill—“Touch wood,” I said out loud and touched my head at the same time. 15 | The announcement of the political endorsement was timed to a T. 16 | It was an awful scragly tear, and it fitted to a T. 17 | Even if you like peanut butter sandwiches, eating the same sandwiches day in, day out will get old. 18 | She was sick and tired of her daughter pestering her to help her with her homework. 19 | The boy was sick and tired of doing his lengthy homework assignment. 20 | I played gooseberry with Romeo and Juliet. 21 | Stay tuned for local weather info in your neck of the woods. 22 | What time is it in your neck of the woods? 23 | She has been a widow these six or eight years, and has lived, I imagine, in rather a hand to mouth fashion. 24 | I'm living hand to mouth. 25 | She took everything but the kitchen sink. 26 | He went bananas. 27 | If Washington Mutual needs to raise capital quickly, it will very likely find itself between a rock and a hard place, because credit markets have all but closed their doors to troubled banks. 28 | -------------------------------------------------------------------------------- /detect_pies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Extract potential idiomatic expressions from a corpus, based on idioms from a dictionary.''' 5 | 6 | import config 7 | import process_corpus 8 | import wiktionary 9 | import using_english 10 | import oxford 11 | import utils 12 | from utils import u8 13 | 14 | import re, os, json, random, time 15 | 16 | def combine_sets(combination_type, a, b, c = []): 17 | '''Combines 2/3 sets of idioms in different ways''' 18 | if combination_type == 'intersection': 19 | if c: 20 | return list(set(a) & set(b) & set(c)) 21 | else: 22 | return list(set(a) & set(b)) 23 | elif combination_type == '2of3': 24 | return list((set(a) & set(b)) | (set(b) & set(c)) | (set(a) & set(c))) 25 | elif combination_type == 'union': 26 | if c: 27 | return list(set(a) | set(b) | set(c)) 28 | else: 29 | return list(set(a) | set(b)) 30 | 31 | def get_idiom_list(dictionary_type = config.DICT, case_sensitive = False): 32 | '''Gets idiom list, either from file or via API''' 33 | 34 | # Read in dictionary type 35 | if len(dictionary_type) == 1: 36 | dictionary_type = dictionary_type[0] 37 | elif len(dictionary_type) != 2: 38 | raise ValueError('No valid dictionary specified!') 39 | 40 | # Single dictionaries 41 | if dictionary_type in ['wiktionary', 'ue', 'oxford']: 42 | # Try to find the most recent cached idiom list 43 | ifn = '' 44 | ifn_pattern = 'idiom_list_{0}_[0-9\-]+\.json$'.format(dictionary_type) 45 | for candidate_ifn in sorted(os.listdir(config.WORK_DIR), reverse = True): 46 | if re.match(ifn_pattern, candidate_ifn): 47 | ifn = os.path.join(config.WORK_DIR, candidate_ifn) 48 | break 49 | # Don't use the cached list, but 
scrape a new one 50 | if not os.path.isfile(ifn) or config.NO_CACHE: 51 | if dictionary_type == 'wiktionary': 52 | idioms = wiktionary.get_category_members(category = 'English idioms') 53 | if dictionary_type == 'ue': 54 | idioms = using_english.get_idioms(config.UE_URL, config.UE_IDOMS_URL) 55 | if dictionary_type == 'oxford': 56 | idioms = oxford.get_idioms(config.OX_URL, config.OX_LANDING_URL) 57 | # Cache idiom list 58 | ofn = '{0}/idiom_list_{1}_{2}.json'.format(config.WORK_DIR, dictionary_type, config.TIME) 59 | with open(ofn, 'w') as of: 60 | json.dump(idioms, of) 61 | # Read idiom list from file 62 | else: 63 | print 'Reading idiom list from {0}'.format(ifn) 64 | with open(ifn, 'r') as f: 65 | idioms = json.load(f) 66 | # Refine Oxford idiom list 67 | if dictionary_type == 'oxford': 68 | idioms = oxford.refine_idioms(idioms) 69 | # Lower-case everything if we ignore case 70 | if not case_sensitive: 71 | idioms = [idiom.lower() for idiom in idioms] 72 | 73 | # Combinations of all dictionaries 74 | elif dictionary_type in ['intersection', 'union', '2of3']: 75 | # Get single dictionaries first 76 | wiktionary_idioms = get_idiom_list(dictionary_type = ['wiktionary'], case_sensitive = case_sensitive) 77 | ue_idioms = get_idiom_list(dictionary_type = ['ue'], case_sensitive = case_sensitive) 78 | oxford_idioms = get_idiom_list(dictionary_type = ['oxford'], case_sensitive = case_sensitive) 79 | # Combine dictionaries 80 | if not case_sensitive: 81 | idioms = combine_sets(dictionary_type, wiktionary_idioms, ue_idioms, oxford_idioms) 82 | # Keep case where possible, lower-case where dictionaries conflict 83 | else: 84 | idioms = combine_sets(dictionary_type, wiktionary_idioms, ue_idioms, oxford_idioms) 85 | idioms_lower = [idiom.lower() for idiom in idioms] 86 | # Lower-case first letter which is always upper-case in UE 87 | ue_fixed = [idiom[0].lower() + idiom[1:] for idiom in ue_idioms] 88 | additional_idioms = combine_sets(dictionary_type, wiktionary_idioms, ue_fixed, oxford_idioms) 89 | for additional_idiom in additional_idioms: 90 | if additional_idiom.lower() not in idioms_lower: 91 | idioms.append(additional_idiom) 92 | idioms_lower.append(additional_idiom.lower()) 93 | # Add all idioms which have case differences in other places 94 | wiktionary_lower = [idiom.lower() for idiom in wiktionary_idioms] 95 | ue_lower = [idiom.lower() for idiom in ue_idioms] 96 | oxford_lower = [idiom.lower() for idiom in oxford_idioms] 97 | additional_idioms = combine_sets(dictionary_type, wiktionary_lower, ue_lower, oxford_lower) 98 | for additional_idiom in additional_idioms: 99 | if additional_idiom.lower() not in idioms_lower: 100 | idioms.append(additional_idiom) 101 | 102 | # Combination of a pair of dictionaries 103 | elif len(dictionary_type) == 2: 104 | print 'Taking the intersection of a pair of dictionaries' 105 | dictionary_idioms_1 = get_idiom_list(dictionary_type = dictionary_type[0:1], case_sensitive = case_sensitive) 106 | dictionary_idioms_2 = get_idiom_list(dictionary_type = dictionary_type[1:2], case_sensitive = case_sensitive) 107 | # Combine dictionaries 108 | if not case_sensitive: 109 | idioms = combine_sets('intersection', dictionary_idioms_1, dictionary_idioms_2) 110 | # Keep case where possible, lower-case where dictionaries conflict 111 | else: 112 | idioms = combine_sets('intersection', dictionary_idioms_1, dictionary_idioms_2) 113 | idioms_lower = [idiom.lower() for idiom in idioms] 114 | # Lower-case first letter which is always upper-case in UE 115 | additional_idioms = [] 
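# Illustrative example (hypothetical entries): UsingEnglish capitalises its entries, so it may list 'Shoot the breeze' where the other dictionary has 'shoot the breeze'; lower-casing the first letter lets such pairs survive the case-sensitive intersection below.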
116 | if dictionary_type[0] == 'ue': 117 | ue_fixed = [idiom[0].lower() + idiom[1:] for idiom in dictionary_idioms_1] 118 | additional_idioms = combine_sets('intersection', dictionary_idioms_2, ue_fixed) 119 | elif dictionary_type[1] == 'ue': 120 | ue_fixed = [idiom[0].lower() + idiom[1:] for idiom in dictionary_idioms_2] 121 | additional_idioms = combine_sets('intersection', dictionary_idioms_1, ue_fixed) 122 | if additional_idioms: 123 | for additional_idiom in additional_idioms: 124 | if additional_idiom.lower() not in idioms_lower: 125 | idioms.append(additional_idiom) 126 | idioms_lower.append(additional_idiom.lower()) 127 | # Add all idioms which have case differences in other places 128 | dictionary_idioms_1_lower = [idiom.lower() for idiom in dictionary_idioms_1] 129 | dictionary_idioms_2_lower = [idiom.lower() for idiom in dictionary_idioms_2] 130 | additional_idioms = combine_sets('intersection', dictionary_idioms_1_lower, dictionary_idioms_2_lower) 131 | for additional_idiom in additional_idioms: 132 | if additional_idiom.lower() not in idioms_lower: 133 | idioms.append(additional_idiom) 134 | 135 | return idioms 136 | 137 | def string_match(idioms, documents, case_sensitive = False, expand_pronouns = True, fuzzy = False, inflect = False): 138 | ''' 139 | Extracts idioms by exact, fuzzy, or inflectional string matching. 140 | Expands idioms containing indefinite pronouns and deals with idioms 141 | containing em-dash wildcards. Maps all matched idioms back to their 142 | dictionary form and extracts context around the idiom. 143 | ''' 144 | 145 | # Set flags 146 | if case_sensitive: 147 | flags = 0 148 | else: 149 | flags = re.I 150 | 151 | # Inter-word separator for regex: word boundaries + optional intervening words 152 | separator = r'\b\W+(?:\w+\W+){0,' + str(config.INT_WORDS) + r'}\b' 153 | 154 | # Expand indefinite pronouns in idioms (e.g. 'someone') 155 | if expand_pronouns: 156 | idioms, expanded_form_map = utils.expand_indefinite_pronouns(idioms) 157 | 158 | # Get all inflectional variants of idioms 159 | if inflect: 160 | idioms, inflected_form_map = utils.inflect_idioms(idioms, config.MORPH_DIR) 161 | 162 | extracted_idioms = [] # List of dicts, format: {'snippet': "", 'idiom': "", 'start': 0, 'end': 0, 'bnc_doc_id': "", 'bnc_sent': "", 'bnc_char_start': 0, 'bnc_char_end': 0} 163 | 164 | # Generate regular expression matching all idioms 165 | idiom_regex = '' 166 | for idiom in idioms: 167 | idiom_words = idiom.split(' ') 168 | # Fuzzy matching: add optional 1/2/3-character suffix to each idiom word 169 | if fuzzy: 170 | idiom_words = [re.escape(iw) + '\w?'
* 3 for iw in idiom_words] # Escape special chars, add fuzzy suffix, add boundaries 171 | # Regular string matching 172 | else: 173 | idiom_words = [re.escape(iw) for iw in idiom_words] # Escape special chars 174 | idiom_regex = idiom_regex + r'\b' + separator.join(idiom_words) + r'\b' 175 | if idiom != idioms[-1]: 176 | idiom_regex += '|' 177 | 178 | # Replace all em-dashes by a wildcard (\w+) 179 | idiom_regex = re.sub(u'\\\—', r'\w+', idiom_regex) 180 | 181 | # Do actual extraction 182 | tokenizer = utils.load_tokenizer() 183 | for sentences in documents: 184 | # Get sentence strings from BNC data 185 | if config.CORPUS_TYPE[0:3] == 'bnc': 186 | sentences_with_metadata = sentences 187 | sentences = [sentence_with_metadata['sentence'] for sentence_with_metadata in sentences_with_metadata] 188 | # Cycle through sentences in document 189 | for idx, sentence in enumerate(sentences): 190 | matches = re.finditer(idiom_regex, sentence, flags = flags) 191 | tokenized_sentence = '' 192 | for match in matches: 193 | # Only tokenize once, and only when a match is found 194 | if not tokenized_sentence: 195 | tokenized_sentence = utils.tokenize(tokenizer, sentence) 196 | # Get token offsets from match offsets 197 | for token in tokenized_sentence: 198 | if token.idx == match.start(): 199 | first_idiom_token_i = token.i 200 | if token.idx + len(token.text) == match.end(): 201 | last_idiom_token_i = token.i 202 | break 203 | # Get BNC metadata/set dummy values 204 | if config.CORPUS_TYPE[0:3] == 'bnc': 205 | bnc_document_id = sentences_with_metadata[idx]['document_id'] 206 | bnc_sentence = sentences_with_metadata[idx]['sentence_number'] 207 | bnc_char_start = match.start() 208 | bnc_char_end = match.end() 209 | else: 210 | bnc_document_id = '-' 211 | bnc_sentence = '-' 212 | bnc_char_start = 0 213 | bnc_char_end = 0 214 | # Get n-word context 215 | if config.CONTEXT_TYPE == 'w': 216 | # Get snippet 217 | snippet_start = max(0, first_idiom_token_i - config.CONTEXT_NUMBER) 218 | snippet_end = min(len(tokenized_sentence), last_idiom_token_i + 1 + config.CONTEXT_NUMBER) 219 | snippet = tokenized_sentence[snippet_start:snippet_end].text 220 | # Get idiom character offsets in snippet 221 | char_offset_span = tokenized_sentence[snippet_start].idx 222 | char_offset_start = match.start() - char_offset_span 223 | char_offset_end = match.end() - char_offset_span 224 | # Get n-sentence context 225 | elif config.CONTEXT_TYPE == 's': 226 | if config.CONTEXT_NUMBER == 0: 227 | snippet = sentence 228 | char_offset_start = match.start() 229 | char_offset_end = match.end() 230 | else: 231 | # Get surrounding sentences to form snippet 232 | first_snippet_sentence_idx = max(0, idx - config.CONTEXT_NUMBER) 233 | last_snippet_sentence_idx = min(len(sentences), idx + 1 + config.CONTEXT_NUMBER) 234 | snippet_sentences = sentences[first_snippet_sentence_idx:last_snippet_sentence_idx] 235 | snippet = ' '.join(snippet_sentences) 236 | # Adjust offset for length of preceding sentences and joining space to the current sentence 237 | num_preceding_sentences = idx - first_snippet_sentence_idx 238 | char_offset_span = len(' '.join(snippet_sentences[:num_preceding_sentences])) 239 | char_offset_start = match.start() + char_offset_span + 1 240 | char_offset_end = match.end() + char_offset_span + 1 241 | 242 | # Get dictionary form of idiom 243 | matched_string = sentence[match.start():match.end()] 244 | if not case_sensitive: 245 | matched_string = matched_string.lower() 246 | dictionary_form = '' 247 | # Deal with em-dash wildcard 
idiom, and idioms matched with non-spaces 248 | if matched_string not in idioms: 249 | for idiom in idioms: 250 | idiom_words = idiom.split(' ') 251 | if fuzzy: 252 | idiom_words = [re.escape(idiom_word) + '\w?' * 3 for idiom_word in idiom_words] 253 | else: 254 | idiom_words = [re.escape(idiom_word) for idiom_word in idiom_words] 255 | single_idiom_regex = r'\b' + separator.join(idiom_words) + r'\b' 256 | if u'\u2014' in idiom: 257 | single_idiom_regex = re.sub(ur'\\\u2014', r'\w+', single_idiom_regex) 258 | if re.match(single_idiom_regex, matched_string): 259 | dictionary_form = idiom 260 | break 261 | # Occurs exactly in idiom list, so already is dictionary form 262 | else: 263 | dictionary_form = matched_string 264 | # Map expanded and/or inflected idioms back to base form 265 | if inflect: 266 | dictionary_form = inflected_form_map[dictionary_form] 267 | if expand_pronouns: 268 | dictionary_form = expanded_form_map[dictionary_form] 269 | 270 | extracted_idioms.append({'snippet': snippet, 'idiom': dictionary_form, 'start': char_offset_start, 271 | 'end': char_offset_end, 'bnc_document_id': bnc_document_id, 'bnc_sentence': bnc_sentence, 272 | 'bnc_char_start': bnc_char_start, 'bnc_char_end': bnc_char_end}) 273 | 274 | return extracted_idioms 275 | 276 | def parse_extract(idioms, documents): 277 | ''' 278 | Extracts idioms based on the dependency parse of the idiom and sentence. 279 | Parse all idioms, optionally in context, get their parse trees and top node 280 | lemmata. Then, parse each sentence, check if the top node lemma is present, 281 | and match the idiom parse tree to a subtree of the sentence parse. Deal 282 | with idioms containing indefinite pronouns and em-dashes properly. 283 | ''' 284 | 285 | parser = utils.load_parser(config.PARSER) 286 | extracted_idioms = [] # List of dicts, format: {'snippet': "", 'idiom': "", 'start': 0, 'end': 0, 'bnc_doc_id': "", 'bnc_sent': "", 'bnc_char_start': 0, 'bnc_char_end': 0} 287 | # Use a PoS-ambiguous word to parse idioms containing em-dash wildcards 288 | ambiguous_word = 'fine' 289 | 290 | # Parse idioms in context 291 | if config.SENTENCES: 292 | cache_file = '{0}/example_sentences_{1}_{2}_{3}.json'.format(config.WORK_DIR, '_'.join(config.DICT), config.SENTENCES.split('/')[-1][:-4], config.TIME) 293 | idioms_with_sentences = utils.get_example_sentences(idioms, config.SENTENCES, cache_file) 294 | parsed_idioms = utils.parse_example_sentences(idioms_with_sentences, ambiguous_word, parser) 295 | # Parse idioms without context 296 | else: 297 | parsed_idioms = [] 298 | for idiom in idioms: 299 | parsed_idioms.append(utils.parse_idiom(idiom, ambiguous_word, parser)) 300 | 301 | # Extract idiom instances by matching parse trees 302 | for sentences in documents: 303 | time_0 = time.time() 304 | print 'Parsing document...' 305 | # Get sentence strings from BNC data and parse 306 | if config.CORPUS_TYPE[0:3] == 'bnc': 307 | sentences_with_metadata = sentences 308 | sentences = [sentence_with_metadata['sentence'] for sentence_with_metadata in sentences_with_metadata] 309 | # Parse sentence, and turn resulting Doc into Span object 310 | parsed_sentences = [utils.parse(parser, sentence)[:] for sentence in sentences] 311 | # Parse corpus as a whole, let Spacy do the sentence splitting 312 | else: 313 | parsed_corpus = utils.parse(parser, ' '.join(sentences)) 314 | parsed_sentences = parsed_corpus.sents 315 | 316 | print 'Done!
Parsing document took {0:.2f} seconds'.format(time.time() - time_0) 317 | # Cycle through sentences, attempt to match parse trees 318 | for sentence_idx, parsed_sentence in enumerate(parsed_sentences): 319 | for parsed_idiom in parsed_idioms: 320 | 321 | # Get idiom information 322 | idiom_top_lemma = parsed_idiom[0] 323 | idiom_top_token = parsed_idiom[1] 324 | idiom_subtree = parsed_idiom[2] 325 | # If not parsed in context, there is no stored list, so get generator 326 | if not idiom_subtree: 327 | idiom_subtree = idiom_top_token.subtree 328 | # Use list, rather than generator 329 | idiom_subtree = [x for x in idiom_subtree] 330 | has_em_dash = parsed_idiom[3] 331 | # Save previously matched indices to check for overlapping spans 332 | previously_matched_indices = [] 333 | 334 | # When idiom top lemma is em-dash, check if other lemma-tokens occur in sentence, only then try matching the parse trees 335 | consider_this_em_dash_idiom = False 336 | if has_em_dash and idiom_top_lemma == ambiguous_word: 337 | idiom_content_tokens = [token for token in idiom_subtree if token.tag_ not in ['DT'] and token != idiom_top_token] 338 | sentence_lemmata = [token.lemma_ for token in parsed_sentence] 339 | if all([idiom_content_token.lemma_ in sentence_lemmata for idiom_content_token in idiom_content_tokens]): 340 | consider_this_em_dash_idiom = True 341 | 342 | # Cycle through sentence parse, match top lemma to sentence lemma and idiom parse tree to sentence parse tree 343 | for sentence_token in parsed_sentence: 344 | # Match top lemma or em-dash heuristic or match any idiom token as possible top token in case of no directionality 345 | if sentence_token.lemma_ == idiom_top_token.lemma_ or consider_this_em_dash_idiom or (config.NO_DIRECTION and sentence_token.lemma_ in [x.lemma_ for x in idiom_subtree]): 346 | sentence_top_token = sentence_token 347 | # Keep track of indices of matching tokens for later span extraction 348 | matched_indices = [sentence_top_token.i] 349 | # Match parse trees, account for many special cases 350 | for idiom_subtree_token in idiom_subtree: 351 | # Skip top token and articles 352 | if idiom_subtree_token != idiom_top_token and idiom_subtree_token.lower_ not in ['a', 'the', 'an']: 353 | matched_subtree_token = False 354 | for sentence_subtree_token in sentence_token.subtree: 355 | # Match condition components 356 | # Spacy gives same lemma for all pronouns, so match on lower-cased form 357 | matching_lemma = (idiom_subtree_token.lemma_ == sentence_subtree_token.lemma_ and idiom_subtree_token.lemma_ != u'-PRON-') or (idiom_subtree_token.lemma_ == u'-PRON-' and idiom_subtree_token.lower_ == sentence_subtree_token.lower_) 358 | # Optionally, ignore dependency labels 359 | matching_dep = idiom_subtree_token.dep_ == sentence_subtree_token.dep_ or config.NO_LABELS 360 | matching_head_lemma = (idiom_subtree_token.head.lemma_ == sentence_subtree_token.head.lemma_ and idiom_subtree_token.head.lemma_ != u'-PRON-') or (idiom_subtree_token.head.lemma_ == u'-PRON-' and idiom_subtree_token.head.lower_ == sentence_subtree_token.head.lower_) 361 | # Optionally, allow for direction reversal 362 | if config.NO_DIRECTION: 363 | if idiom_subtree_token.head.lemma_ == u'-PRON-': 364 | matched_children = [x for x in sentence_subtree_token.children if x.lower_ == idiom_subtree_token.head.lower_] 365 | else: 366 | matched_children = [x for x in sentence_subtree_token.children if x.lemma_ == idiom_subtree_token.head.lemma_] 367 | matching_child_lemma = matched_children != [] 368 | 
matching_head_lemma = matching_head_lemma or matching_child_lemma 369 | em_dash_lemma = has_em_dash and idiom_subtree_token.lemma_ == ambiguous_word 370 | em_dash_head_lemma = has_em_dash and idiom_subtree_token.head.lemma_ == ambiguous_word 371 | inverted_dep = idiom_subtree_token.dep_ == 'dobj' and sentence_subtree_token.dep_ == 'nsubjpass' or config.NO_LABELS 372 | # Default case: lemma, dep-rel and head lemma have to match. 373 | # In case of em-dash, match lemma or head lemma, and the other one to the ambiguous word 374 | if (matching_lemma and matching_dep and matching_head_lemma or 375 | em_dash_lemma and matching_head_lemma or 376 | matching_lemma and em_dash_head_lemma): 377 | matched_subtree_token = True 378 | # Passivization: match lemma, head lemma and inverted dep-rels 379 | elif matching_lemma and inverted_dep and matching_head_lemma: 380 | matched_subtree_token = True 381 | # Deal with someone and someone's 382 | elif idiom_subtree_token.lemma_ == 'someone': 383 | idiom_right_children = [right for right in idiom_subtree_token.rights] 384 | # Deal with someone's - match any other PRP$ or NN(P)(S) + POS for lemma 385 | if idiom_right_children and idiom_right_children[0].lemma_ == "'s": 386 | sentence_right_children = [right for right in sentence_subtree_token.rights] 387 | if (matching_dep and matching_head_lemma and (sentence_subtree_token.tag_ == 'PRP$' or 388 | sentence_subtree_token.tag_ in ['NN', 'NNS', 'NNP', 'NNPS'] and 389 | sentence_right_children and sentence_right_children[0].lemma_ == "'s")): 390 | matched_subtree_token = True 391 | # Deal with someone - match any other PRP or NN(P)(S) for lemma 392 | else: 393 | if ((matching_dep or inverted_dep) and matching_head_lemma and 394 | sentence_subtree_token.tag_ in ['PRP', 'NN', 'NNS', 'NNP', 'NNPS']): 395 | matched_subtree_token = True 396 | # Deal with one's - match any PRP$ for lemma 397 | elif idiom_subtree_token.lemma_ == 'one': 398 | idiom_right_children = [right for right in idiom_subtree_token.rights] 399 | if idiom_right_children and idiom_right_children[0].lemma_ == "'s": 400 | if matching_dep and matching_head_lemma and sentence_subtree_token.tag_ == 'PRP$': 401 | matched_subtree_token = True 402 | # Deal with something and something's 403 | elif idiom_subtree_token.lemma_ == 'something': 404 | idiom_right_children = [right for right in idiom_subtree_token.rights] 405 | # Deal with something's - match any other PRP$ or NN(P)(S) + POS for lemma 406 | if idiom_right_children and idiom_right_children[0].lemma_ == "'s": 407 | sentence_right_children = [right for right in sentence_subtree_token.rights] 408 | if (matching_dep and matching_head_lemma and (sentence_subtree_token.tag_ == 'PRP$' or 409 | sentence_subtree_token.tag_ in ['NN', 'NNS', 'NNP', 'NNPS'] and 410 | sentence_right_children and sentence_right_children[0].lemma_ == "'s")): 411 | matched_subtree_token = True 412 | # Deal with something - match any other PRP or NN(P)(S) or this/that/these/those for lemma 413 | else: 414 | if ((matching_dep or inverted_dep) and matching_head_lemma and 415 | (sentence_subtree_token.tag_ in ['PRP', 'NN', 'NNS', 'NNP', 'NNPS'] or 416 | sentence_subtree_token.lemma_ in ['this', 'that', 'these', 'those'])): 417 | matched_subtree_token = True 418 | # Deal with 's of someone's, one's and something's by ignoring it 419 | elif idiom_subtree_token.lemma_ == "'s" and idiom_subtree_token.head.lemma_ in ['someone', 'one', 'something']: 420 | matched_subtree_token = True 421 | break 422 | 423 | if matched_subtree_token: # Match, 
go to next idiom subtree token 424 | # Add child in case of no-directionality child match 425 | if config.NO_DIRECTION and matching_child_lemma: 426 | matched_indices.append(matched_children[0].i) 427 | else: 428 | matched_indices.append(sentence_subtree_token.i) 429 | break 430 | if not matched_subtree_token: # No match, go to next sentence token 431 | break 432 | 433 | # If everything matches, extract snippet 434 | if matched_subtree_token: 435 | # Text of idiom subtree is dictionary form 436 | dictionary_form = ''.join([idiom_subtree_token.text_with_ws for idiom_subtree_token in idiom_subtree]).strip() 437 | # Deal with em-dash wildcard idiom, substitute em-dash back in for ambiguous word 438 | if has_em_dash: 439 | dictionary_form = re.sub(ambiguous_word, u'\u2014', dictionary_form) 440 | # Get idiom token span 441 | first_idiom_token_i = min(matched_indices) - parsed_sentence.start 442 | last_idiom_token_i = max(matched_indices) - parsed_sentence.start 443 | first_idiom_token = parsed_sentence[first_idiom_token_i] 444 | last_idiom_token = parsed_sentence[last_idiom_token_i] 445 | # Extract n-word context 446 | if config.CONTEXT_TYPE == 'w': 447 | span_start = max(0, first_idiom_token_i - config.CONTEXT_NUMBER) 448 | span_end = min(len(parsed_sentence), last_idiom_token_i + 1 + config.CONTEXT_NUMBER) 449 | snippet = parsed_sentence[span_start:span_end].text 450 | # Store character offset of snippet start 451 | char_offset_span = parsed_sentence[span_start].idx 452 | # Extract n-sentence context 453 | elif config.CONTEXT_TYPE == 's': 454 | if config.CONTEXT_NUMBER == 0: 455 | snippet = parsed_sentence.text 456 | # Store character offset of sentence (==snippet) start 457 | char_offset_span = parsed_sentence.start_char 458 | else: 459 | snippet = "" 460 | # Get snippet sentences 461 | first_sentence_idx = sentence_idx - config.CONTEXT_NUMBER 462 | last_sentence_idx = sentence_idx + config.CONTEXT_NUMBER 463 | # Re-iterate over sentences to extract the sentence contents 464 | for sentence_idx_2, parsed_sentence_2 in enumerate(parsed_corpus.sents): 465 | if sentence_idx_2 >= first_sentence_idx and sentence_idx_2 <= last_sentence_idx: 466 | # Store character offset of snippet start 467 | if sentence_idx_2 == first_sentence_idx: 468 | char_offset_span = parsed_sentence_2.start_char 469 | # Add space between sentences 470 | if snippet: 471 | snippet += ' ' 472 | snippet += parsed_sentence_2.text 473 | # Get idiom character offsets in snippet 474 | char_offset_start = first_idiom_token.idx - char_offset_span 475 | char_offset_end = last_idiom_token.idx + len(last_idiom_token.text) - char_offset_span 476 | # Get BNC metadata/set dummy values 477 | if config.CORPUS_TYPE[0:3] == 'bnc': 478 | bnc_document_id = sentences_with_metadata[sentence_idx]['document_id'] 479 | bnc_sentence = sentences_with_metadata[sentence_idx]['sentence_number'] 480 | bnc_char_start = first_idiom_token.idx 481 | bnc_char_end = last_idiom_token.idx + len(last_idiom_token.text) 482 | else: 483 | bnc_document_id = '-' 484 | bnc_sentence = '-' 485 | bnc_char_start = 0 486 | bnc_char_end = 0 487 | 488 | extracted_idiom = {'snippet': snippet, 'idiom': dictionary_form, 'start': char_offset_start, 489 | 'end': char_offset_end, 'bnc_document_id': bnc_document_id, 'bnc_sentence': bnc_sentence, 490 | 'bnc_char_start': bnc_char_start, 'bnc_char_end': bnc_char_end} 491 | 492 | # Check whether the instance has already been added, with a larger span (this can happen with em-dash idioms). Don't do this for NLD matches. 
493 | if previously_matched_indices: 494 | # Remove most recent entry if it has a larger span than the current entry 495 | if min(previously_matched_indices) <= min(matched_indices) and max(previously_matched_indices) >= max(matched_indices) and (sentence_token.lemma_ == idiom_top_token.lemma_ or consider_this_em_dash_idiom): 496 | del extracted_idioms[-1] 497 | # Only add current entry if it doesn't have a larger span than the most recent entry 498 | if not (min(previously_matched_indices) >= min(matched_indices) and max(previously_matched_indices) <= max(matched_indices)) and (sentence_token.lemma_ == idiom_top_token.lemma_ or consider_this_em_dash_idiom): 499 | extracted_idioms.append(extracted_idiom) 500 | previously_matched_indices = matched_indices 501 | else: 502 | extracted_idioms.append(extracted_idiom) 503 | previously_matched_indices = matched_indices 504 | 505 | return extracted_idioms 506 | 507 | if __name__ == '__main__': 508 | print 'Hello! Time is {0}'.format(config.TIME) 509 | 510 | # Create working directory if it doesn't exist 511 | if not os.path.isdir(config.WORK_DIR): 512 | os.mkdir(config.WORK_DIR) 513 | 514 | # Read in corpus as list of documents 515 | if config.CORPUS_TYPE == 'plain': 516 | documents = process_corpus.plain_text(config.CORPUS, config.NO_SPLIT) 517 | print 'First sentence of corpus: {0}\nLast sentence of corpus: {1}'.format(u8(documents[0][0]), u8(documents[-1][-1])) 518 | elif config.CORPUS_TYPE[0:3] == 'bnc': 519 | cache_path = os.path.join(config.WORK_DIR, '{0}_parsed_xml.json'.format(config.CORPUS_TYPE)) 520 | documents = process_corpus.bnc(config.CORPUS, config.CORPUS_TYPE, cache_path) 521 | print 'First sentence of corpus: {0}\nLast sentence of corpus: {1}'.format(u8(documents[0][0]['sentence']), u8(documents[-1][-1]['sentence'])) 522 | 523 | # Get idioms from dictionary 524 | idioms = get_idiom_list(case_sensitive = config.CASE_SENSITIVE) 525 | print "Found {4} idioms ranging from '{0}', '{1}' to '{2}', '{3}'".format(u8(idioms[0]), u8(idioms[1]), u8(idioms[-2]), u8(idioms[-1]), len(idioms)) 526 | 527 | # Extract idioms 528 | extraction_start = time.time() 529 | if config.METHOD == 'exact': 530 | extracted_idioms = string_match(idioms, documents, fuzzy = False, inflect = False, case_sensitive = config.CASE_SENSITIVE) 531 | elif config.METHOD == 'fuzzy': 532 | extracted_idioms = string_match(idioms, documents, fuzzy = True, inflect = False, case_sensitive = config.CASE_SENSITIVE) 533 | elif config.METHOD == 'inflect': 534 | extracted_idioms = string_match(idioms, documents, fuzzy = False, inflect = True, case_sensitive = config.CASE_SENSITIVE) 535 | elif config.METHOD == 'parse': 536 | extracted_idioms = parse_extract(idioms, documents) 537 | 538 | # Print information about extracted idioms 539 | print 'Extracted {0} idioms in {1:.2f} seconds'.format(len(extracted_idioms), time.time() - extraction_start) 540 | idiom_set = set([extracted_idiom['idiom'] for extracted_idiom in extracted_idioms]) 541 | if len(idiom_set) >= 5: 542 | idiom_sample = random.sample(idiom_set, 5) 543 | print 'Extracted these idioms, among others: {0}, {1}, {2}, {3}, {4}'.format(u8(idiom_sample[0]), u8(idiom_sample[1]), u8(idiom_sample[2]), u8(idiom_sample[3]), u8(idiom_sample[4])) 544 | 545 | # Output extracted idioms to file 546 | utils.write_csv(extracted_idioms, config.OUTFILE) 547 | -------------------------------------------------------------------------------- /evaluate_extraction.py: -------------------------------------------------------------------------------- 1 
| #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | Evaluate PIE extraction performance against an exhaustively PIE annotated corpus, output recall, precision and F1-score. 6 | ''' 7 | 8 | import json, argparse, csv, random 9 | from collections import Counter 10 | 11 | # Read in arguments 12 | parser = argparse.ArgumentParser(description = 'Parameters for PIE detection evaluation') 13 | parser.add_argument('extracted', metavar = 'extracted_idioms.csv', type = str, help = "Specify the file containing the extracted PIEs.") 14 | parser.add_argument('annotated', metavar = 'annotated_idioms.json', type = str, help = "Specify the file containing the annotated PIEs") 15 | args = parser.parse_args() 16 | 17 | # Read input data 18 | extracted_idioms = [] 19 | with open(args.extracted, 'r') as csvfile: 20 | csvreader = csv.reader(csvfile, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"') 21 | for csvrow in csvreader: 22 | extracted_idioms.append({'document_id': csvrow[4], 'sentence_number': csvrow[5], 'idiom': csvrow[0], 'context': unicode(csvrow[3], 'utf-8'), 'start': csvrow[1], 'end': csvrow[2]}) 23 | 24 | annotated_idioms = json.load(open(args.annotated, 'r')) 25 | 26 | # Check if datasets cover same documents 27 | assert set([idiom['document_id'] for idiom in extracted_idioms]) <= set([idiom['document_id'] for idiom in annotated_idioms]) 28 | 29 | # Select only the PIEs from the set of annotated PIE candidates 30 | annotated_idioms = [annotated_idiom for annotated_idiom in annotated_idioms if annotated_idiom['PIE_label'] == 'y'] 31 | 32 | # Keep track of false negatives 33 | for annotated_idiom in annotated_idioms: 34 | annotated_idiom['evaluation'] = 'fn' 35 | 36 | # Count true/false positives/negatives 37 | # We do not have true negatives 38 | tp = 0. 39 | fp = 0. 40 | fn = 0. 
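# Worked example of the metrics computed below (purely illustrative numbers):
# with tp = 80, fp = 20 and fn = 40:
#   precision = 80 / (80 + 20) = 0.80
#   recall    = 80 / (80 + 40) ≈ 0.67
#   F1        = 2 * (0.80 * 0.67) / (0.80 + 0.67) ≈ 0.73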
41 | for extracted_idiom in extracted_idioms: 42 | for annotated_idiom in annotated_idioms: 43 | # Lower case PIEs for comparison, as they are annotated as lower-case, but not necessarily extracted so 44 | if extracted_idiom['document_id'] == annotated_idiom['document_id'] and extracted_idiom['sentence_number'] == annotated_idiom['sentence_number'] and extracted_idiom['idiom'].lower() == annotated_idiom['idiom'].lower(): 45 | tp += 1 46 | extracted_idiom['evaluation'] = 'tp' 47 | annotated_idiom['evaluation'] = 'tp' 48 | break 49 | else: # No break 50 | fp += 1 51 | extracted_idiom['evaluation'] = 'fp' 52 | 53 | fn = len(annotated_idioms) - tp # False negatives = all missed PIEs = # annotated PIEs - # correctly found PIEs 54 | 55 | # Get precision, recall, F1-score 56 | precision = tp / (tp + fp) 57 | recall = tp / (tp + fn) 58 | f1 = 2 * (precision * recall) / (precision + recall) 59 | 60 | # Print results 61 | print '### RESULTS ###' 62 | print 'Total number of annotated PIEs: {0}'.format(len(annotated_idioms)) 63 | print 'Total number of extracted PIEs: {0}\n'.format(len(extracted_idioms)) 64 | print 'True Positives: {0}\nFalse Positives: {1}\nFalse Negatives: {2}\n'.format(tp, fp, fn) 65 | print 'Precision: {0}%'.format(precision*100) 66 | print 'Recall: {0}%'.format(recall*100) 67 | print 'F1-score: {0}%'.format(f1*100) 68 | 69 | # Print examples of classifications 70 | def show_examples(idioms, evaluation): 71 | # Define colours 72 | stop = '\x1b[0m' 73 | red = '\x1b[1;31;1m' 74 | # Count number of examples shown 75 | count = 0 76 | for idiom in idioms: 77 | if idiom['evaluation'] == evaluation: 78 | # Highlight idiom in context 79 | try: 80 | context = idiom['context'] 81 | start = int(idiom['start']) 82 | end = int(idiom['end']) 83 | except KeyError: 84 | context = idiom['sentence'] 85 | start = idiom['offsets'][0][0] 86 | end = idiom['offsets'][-1][-1] 87 | highlighted_context = context[:start] 88 | highlighted_context += red 89 | highlighted_context += context[start:end] 90 | highlighted_context += stop 91 | highlighted_context += context[end:] 92 | print highlighted_context, 93 | print '({2} - doc. {0} - sent. {1})'.format(idiom['document_id'], idiom['sentence_number'], idiom['idiom']) 94 | count += 1 95 | if count % 10 == 0: 96 | user_input = unicode(raw_input("Show 10 more examples? (y/n): "), 'utf-8') 97 | if user_input.lower() != 'y': 98 | break 99 | else: # No break 100 | print 'No more examples!' 101 | 102 | # Prompt and show examples for different classes 103 | # Shuffle idiom lists to avoid seeing same examples again and again 104 | random.shuffle(extracted_idioms) 105 | random.shuffle(annotated_idioms) 106 | user_input = unicode(raw_input("Show examples of classifications? (y/n): "), 'utf-8') 107 | if user_input.lower() == 'y': 108 | user_input = unicode(raw_input("Show examples of true positives? (y/n): "), 'utf-8') 109 | if user_input.lower() == 'y': 110 | show_examples(extracted_idioms, 'tp') 111 | user_input = unicode(raw_input("Show examples of false positives? (y/n): "), 'utf-8') 112 | if user_input.lower() == 'y': 113 | show_examples(extracted_idioms, 'fp') 114 | user_input = unicode(raw_input("Show examples of false negatives? 
(y/n): "), 'utf-8') 115 | if user_input.lower() == 'y': 116 | show_examples(annotated_idioms, 'fn') 117 | 118 | # Split performance for most frequent PIE types in corpus 119 | def performance_per_type(annotated_idioms, extracted_idioms, n): 120 | most_frequent_types = Counter([x['idiom'] for x in annotated_idioms]).most_common() 121 | print 'PIE Type' + 17*' ' + 'Count\tPrecision\tRecall\tF1-score' 122 | for pie_type in most_frequent_types[:n]: 123 | pie = pie_type[0] 124 | count = pie_type[1] 125 | tp = float(len([x['evaluation'] for x in extracted_idioms if x['idiom'] == pie and x['evaluation'] == 'tp'])) 126 | fp = float(len([x['evaluation'] for x in extracted_idioms if x['idiom'] == pie and x['evaluation'] == 'fp'])) 127 | fn = float(len([x['evaluation'] for x in annotated_idioms if x['idiom'] == pie and x['evaluation'] == 'fn'])) 128 | try: 129 | precision = tp / (tp + fp) 130 | recall = tp / (tp + fn) 131 | f1 = 2 * (precision * recall) / (precision + recall) 132 | except ZeroDivisionError: 133 | precision = 0. 134 | recall = 0. 135 | f1 = 0. 136 | pie += (25 - len(pie)) * ' ' 137 | print '{0}{1}\t{2:.2f}\t\t{3:.2f}\t{4:.2f}'.format(pie, count, precision * 100, recall * 100, f1 * 100) 138 | 139 | user_input = unicode(raw_input("Show performance for 25 most frequent PIE types? (y/n): "), 'utf-8') 140 | if user_input.lower() == 'y': 141 | performance_per_type(annotated_idioms, extracted_idioms, 25) 142 | -------------------------------------------------------------------------------- /oxford.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | Get idioms from the online Oxford Dictionary of English Idioms, by scraping the pages at www.oxfordreference.com. 6 | Refines the idioms by removing duplicates, and expanding things in parentheses, dealing with special cases. 
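For example, an entry like '(all) at sea' is expanded to both 'all at sea' and 'at sea' (see the inline EXAMPLE comments in refine_idioms below).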
7 | ''' 8 | 9 | import re, requests, itertools 10 | from bs4 import BeautifulSoup 11 | 12 | def get_idioms(url, landing_url, use_socks_proxy = False): 13 | ''' 14 | Scrapes idioms from the ODEI website, gets 100 entries per page, 15 | navigates to entry page, gets idiom, cycles through pages 16 | ''' 17 | idioms = [] 18 | # Set proxy, if applicable, requires pysocks to be installed 19 | if use_socks_proxy: 20 | proxies = {'http': "socks5://127.0.0.1:8080"} 21 | else: 22 | proxies = {} 23 | # Get and parse first page 24 | page = requests.get(landing_url) 25 | soup = BeautifulSoup(page.content, 'html.parser') 26 | # Scrape pagination information 27 | links = soup.find_all('a') 28 | for link in links: 29 | if link.parent.name == 'div': 30 | try: 31 | if link.parent['class'][0] == 't-data-grid-pager': 32 | last_page = link.text # Number of pages to cycle through 33 | url_template = link['href'] 34 | except KeyError: 35 | pass # Sometimes parent has no class 36 | # Cycle through pages, get actual idioms 37 | for i in range(1, int(last_page) + 1): 38 | print 'Scraping page {0} of {1}'.format(i, last_page) # Very slow, so give progress updates 39 | # Get next page url 40 | if i < int(last_page): 41 | next_page = url + re.sub('gridpager/{0}'.format(last_page), 'gridpager/{0}'.format(i + 1), url_template) 42 | # Find links to pages containing idioms 43 | links = soup.find_all('a') 44 | for link in links: 45 | if link.parent.name == 'h2': 46 | try: 47 | if link.parent['class'][0] == 'itemTitle': 48 | # Get page with idiom entries 49 | entry_page = requests.get(url + link['href'], proxies=proxies) 50 | entry_soup = BeautifulSoup(entry_page.content, 'html.parser') 51 | # Extract idiom 52 | for idiom in entry_soup.find_all('em'): 53 | try: 54 | if idiom.parent.parent['class'][0] == 'div1': 55 | if ' ' in idiom.text: # Filter out single word 'idioms' 56 | idioms.append(idiom.text) # Store the actual idiom 57 | except KeyError: 58 | pass # Sometimes grandparent has no class 59 | except KeyError: 60 | pass # Sometimes parent has no class 61 | # Get and parse next page 62 | page = requests.get(next_page) 63 | soup = BeautifulSoup(page.content, 'html.parser') 64 | 65 | return sorted(list(set(idioms))) 66 | 67 | def refine_idioms(idioms): 68 | ''' 69 | Oxford scraping output is messy. Removes duplicates containing ':'. 70 | Expands optionals in parentheses. Deals with some exceptional cases individually 71 | ''' 72 | 73 | refined_idioms = [] 74 | for idiom in idioms: 75 | # Fix scraping errors 76 | if idiom == 'like (or as if) it is going out of fashion (or style': 77 | idiom += ')' 78 | if idiom == 'cog in the wheel (or machine': 79 | idiom += ')' 80 | if idiom == 'get you (him, her': 81 | idiom += ', etc.)!' 82 | has_parentheses = False 83 | if idiom[-1] != ':': # All duplicates end in ':' 84 | # Get all parenthesis pairs + content 85 | if re.findall('\(.*\)', idiom): 86 | pairs_of_parentheses = re.finditer('\(.*?\)', idiom) 87 | # Cycle through pairs of parentheses, collect parts of idiom, and their expansions/variations 88 | idiom_parts = [] 89 | previous_end = 0 90 | for pair_of_parentheses in pairs_of_parentheses: 91 | starts_with_also = False # e.g. (also sure as fate) 92 | starts_with_or = False # e.g. (or get something off the ground) 93 | or_in_middle = False # e.g. (final or last) 94 | contains_etc = False # e.g. (me, him, etc.) 
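# The flags above select one of the expansion strategies below: each pair of parentheses
# contributes a list of alternatives to idiom_parts, and itertools.product later combines
# those lists into every surface variant of the idiom,
# e.g. 'a bad (or bitter or nasty) taste' -> 'a bad taste', 'a bitter taste', 'a nasty taste'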
95 | # Get indices 96 | start = pair_of_parentheses.start() 97 | end = pair_of_parentheses.end() 98 | # Examine content between parentheses - set conditions 99 | content_between_parentheses = pair_of_parentheses.group(0)[1:-1] # Get content without () 100 | if re.match('also\\b', content_between_parentheses): 101 | starts_with_also = True 102 | if re.match('or\\b', content_between_parentheses): 103 | starts_with_or = True 104 | if re.search('.\\bor\\b', content_between_parentheses): 105 | or_in_middle = True 106 | if re.search('etc\.', content_between_parentheses): 107 | contains_etc = True 108 | # Add the non-parenthesized bit before the current pair of parentheses (if it exists) 109 | idiom_part_before = idiom[previous_end:start] 110 | if idiom_part_before: 111 | idiom_parts.append([idiom_part_before]) 112 | ## Deal with different types of content between parentheses 113 | # Deal with the case with the '/', which occurs in exactly 1 idiom entry 114 | if content_between_parentheses == 'or get your fingers burned/burnt': 115 | content_between_parentheses = 'or get your fingers burned or get your fingers burnt' 116 | or_in_middle = True 117 | # Deal with some especially difficult parentheses cases first, individually 118 | if '(' in content_between_parentheses: 119 | if content_between_parentheses == 'or bring someone back (down': 120 | refined_idioms.append(u'bring someone back to earth') 121 | refined_idioms.append(u'bring someone back down to earth') 122 | end = len(idiom) 123 | if content_between_parentheses == 'or give someone pause (for thought': 124 | refined_idioms.append(u'give someone pause') 125 | refined_idioms.append(u'give someone pause for thought') 126 | end = len(idiom) 127 | if content_between_parentheses == 'or herein (or therein': 128 | idiom_parts[-1].append(u'herein lies') 129 | idiom_parts[-1].append(u'therein lies') 130 | idiom_parts.append([u'a tale']) 131 | end = len(idiom) 132 | # Simplest case, just generate idiom with parentheses removed, keeping content in parentheses 133 | # EXAMPLE: (all) at sea -> all at sea, at sea 134 | elif not starts_with_also and not starts_with_or and not or_in_middle and not contains_etc: 135 | idiom_part_between_parentheses = ['', content_between_parentheses] 136 | idiom_parts.append(idiom_part_between_parentheses) 137 | # Simplest'case starting with 'or'. Generate idiom with n words before parentheses replaced by the n words in the parentheses 138 | # EXAMPLE: I should cocoa (or coco) -> I should cocoa, I should coco 139 | elif not starts_with_also and starts_with_or and not or_in_middle and not contains_etc: 140 | num_words_to_replace = len(content_between_parentheses.split(' ')) - 1 # -1 because of or 141 | content_between_parentheses_without_or = ' '.join(content_between_parentheses.split(' ')[1:]) 142 | idiom_part_before_split = idiom_part_before.strip().split(' ') 143 | idiom_part_before_trimmed = ' '.join(idiom_part_before_split[:-num_words_to_replace]) 144 | idiom_part_before_variant = idiom_part_before_trimmed + ' ' + content_between_parentheses_without_or 145 | if idiom_part_before[0] == ' ': # Add initial space if it got removed incidentally 146 | idiom_part_before_variant = ' ' + idiom_part_before_variant 147 | idiom_parts[-1].append(idiom_part_before_variant) # Add as variant to previous part 148 | # Simplest case with or in the middle. Generate idioms for each part separated by 'or'. 
149 | # EXAMPLE: a (final or last) turn of the screw -> a final turn of the screw, a last turn of the screw 150 | elif not starts_with_also and not starts_with_or and or_in_middle and not contains_etc: 151 | content_parts = content_between_parentheses.split(' or ') 152 | idiom_parts.append(content_parts) 153 | # Case with both or at the start and in the middle. Generate idioms with replacement for each part separated by 'or' 154 | # EXAMPLE: a bad (or bitter or nasty) taste -> a bad taste, a bitter taste, a nasty taste 155 | elif not starts_with_also and starts_with_or and or_in_middle and not contains_etc: 156 | content_parts = content_between_parentheses[3:].split(' or ') # Strip initial 'or' and split in parts 157 | idiom_part_before_split = idiom_part_before.strip().split(' ') 158 | for content_part in content_parts: 159 | num_words_to_replace = len(content_part.split(' ')) 160 | idiom_part_before_trimmed = ' '.join(idiom_part_before_split[:-num_words_to_replace]) 161 | idiom_part_before_variant = idiom_part_before_trimmed + ' ' + content_part 162 | idiom_parts[-1].append(idiom_part_before_variant) 163 | # Case with 'also' at the start, signals full replacement, only two cases, one also with 'or' 164 | # 1. sure as eggs is eggs (also sure as fate) 2. left, right, and centre (also left and right or right and left) 165 | elif starts_with_also and not contains_etc: 166 | if not or_in_middle: 167 | idiom_part_before_variant = content_between_parentheses[5:] # Remove 'also' 168 | idiom_parts[-1].append(idiom_part_before_variant) 169 | else: 170 | idiom_part_before_variants = content_between_parentheses[5:].split(' or ') 171 | idiom_parts[-1] += idiom_part_before_variants 172 | # Cases with etc. are rare, and require individual treatment 173 | elif contains_etc: 174 | if content_between_parentheses in ['me, him, etc.', 'him, her, etc.']: 175 | expanded_series = ['me', 'you', 'him', 'her', 'us', 'them', 'it'] 176 | idiom_parts.append(expanded_series) 177 | elif content_between_parentheses == 'or tell, etc.': 178 | idiom_part_before_variant = 'tell' 179 | idiom_parts[-1].append(idiom_part_before_variant) 180 | elif content_between_parentheses == 'or herself, etc.': 181 | idiom_part_before_trimmed = ' '.join(idiom_part_before.split()[:-1]) 182 | variant_series = ['myself', 'yourself', 'herself', 'itself', 'ourselves', 'yourselves', 'themselves'] 183 | for variant in variant_series: 184 | idiom_part_before_variant = idiom_part_before_trimmed + ' ' + variant 185 | idiom_parts[-1].append(idiom_part_before_variant) 186 | elif content_between_parentheses == 'or bore etc.': 187 | idiom_part_before_variant = 'bore' 188 | idiom_parts[-1].append(idiom_part_before_variant) 189 | elif content_between_parentheses == 'or your etc.': 190 | idiom_part_before_trimmed = ' '.join(idiom_part_before.split()[:-1]) 191 | variant_series = ['my', 'your', 'his', 'her', 'its', 'our', 'your', 'their'] 192 | for variant in variant_series: 193 | idiom_part_before_variant = idiom_part_before_trimmed + ' ' + variant 194 | idiom_parts[-1].append(idiom_part_before_variant) 195 | elif content_between_parentheses in ['or you or him, etc.', 'or her, him, etc.']: 196 | idiom_part_before_trimmed = ' '.join(idiom_part_before.split()[:-1]) 197 | variant_series = ['you', 'him', 'her', 'us', 'them', 'it'] 198 | for variant in variant_series: 199 | idiom_part_before_variant = idiom_part_before_trimmed + ' ' + variant 200 | idiom_parts[-1].append(idiom_part_before_variant) 201 | elif content_between_parentheses == 'or forty-something, 
etc.': 202 | idiom_parts = [] # Single-word idiom, ignore 203 | previous_end = end 204 | # Add remaining part of idiom after final pair of parentheses 205 | idiom_parts.append([idiom[end:]]) 206 | # From the collected idiom parts and variations, generate all idiom variations and add them to the list 207 | for refined_idiom in itertools.product(*idiom_parts): 208 | refined_idiom = ''.join(refined_idiom) 209 | refined_idiom = re.sub(' +', ' ', refined_idiom) # Remove double spaces 210 | refined_idiom = re.sub('(^ )|( $)', '', refined_idiom) # Remove initial spaces and final spaces 211 | if len(refined_idiom.split(' ')) > 1: # Remove single-word idioms, e.g. 'forty-something' (or thirty-something') 212 | refined_idioms.append(refined_idiom) 213 | else: 214 | refined_idioms.append(idiom) 215 | return refined_idioms 216 | -------------------------------------------------------------------------------- /pos2morpha.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Based on a similar script by Kilian Evang 4 | 5 | ''' 6 | Converts tokenized and POS-tagged texts in C&C format (pipe-separated PTB tags) 7 | to morpha's input format (underscore-separated CLAWS tags). 8 | ''' 9 | 10 | import re, sys 11 | 12 | def ptb2claws(token, tag): 13 | if tag == 'NNP': 14 | return 'NP' 15 | if tag == 'NNPS': 16 | return 'NP2' 17 | if tag == 'NNS': 18 | return 'NN2' 19 | if token in ('ca', 'sha', 'wo', '\'d') and tag == 'MD': 20 | return 'VM' 21 | if token == 'n\'t' and tag == 'RB': 22 | return 'XX' 23 | if token == '\'d' and tag == 'VBD': 24 | return 'VH' 25 | return tag 26 | 27 | def convert_token(token): 28 | pipeindex = token.rfind('|') 29 | prefix = token[:pipeindex] 30 | suffix = token[pipeindex + 1:] 31 | if len(prefix) > 1 and prefix.endswith('-'): 32 | prefix = prefix[:-1] 33 | return prefix + '_' + ptb2claws(prefix, suffix) 34 | 35 | -------------------------------------------------------------------------------- /process_corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Load and preprocess a corpus for idiom extraction''' 5 | 6 | import os, time, json 7 | import nltk.data 8 | from bs4 import BeautifulSoup 9 | 10 | def plain_text(corpus_file, no_split): 11 | '''Read in a plain text corpus, return a single document containing a list of unicode sentences.''' 12 | 13 | splitter = nltk.data.load('tokenizers/punkt/english.pickle') 14 | # Read in corpus 15 | documents = [] 16 | sentences = [] 17 | with open(corpus_file, 'r') as f: 18 | for line in f: 19 | if line.strip(): 20 | if no_split: 21 | sentences.append(unicode(line.strip(), 'utf-8')) 22 | else: 23 | sentences += splitter.tokenize(unicode(line.strip(), 'utf-8')) 24 | documents.append(sentences) 25 | 26 | return documents 27 | 28 | def bnc(corpus_file, corpus_type, cache_path): 29 | ''' 30 | Read in the British National Corpus (BNC) XML version, returns a list of documents. 31 | Documents are lists of dictionaries. Dictionaries contain unicode sentences and metadata 32 | for offset annotation. 
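Each dictionary has the keys 'document_id', 'sentence_number', and 'sentence'.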
33 | ''' 34 | 35 | documents = [] 36 | # Read parsed XML from cached file, for bnc/bnc-dev/bnc-test, if available 37 | if os.path.exists(cache_path): 38 | print 'Reading BNC from {0}'.format(cache_path) 39 | documents = json.load(open(cache_path, 'r')) 40 | return documents 41 | 42 | # Read BNC from file and parse, if no cached version available 43 | time_0 = time.time() 44 | print 'Processing BNC...' 45 | # Cycle through subdirectories 46 | subdirectories = sorted(os.listdir(corpus_file)) 47 | for subdirectory in subdirectories: 48 | subdirectory_path = os.path.join(corpus_file, subdirectory) 49 | subsubdirectories = sorted(os.listdir(subdirectory_path)) 50 | for subsubdirectory in subsubdirectories: 51 | subsubdirectory_path = os.path.join(subdirectory_path, subsubdirectory) 52 | document_ids = sorted(os.listdir(subsubdirectory_path)) 53 | # Cycle through documents 54 | for document_id in document_ids: 55 | # Select only documents in development or test set of evaluation corpus 56 | if corpus_type in ['bnc-dev', 'bnc-test']: 57 | if corpus_type == 'bnc-dev': 58 | subset_documents = [u'CBC', u'CH1', u'A61', u'A18', u'ABC', u'ABV', u'A12', u'CBD', u'A1N', u'A19', u'A69', u'A75', u'AML', u'K2A', u'FU4', u'HD8', u'A60', u'AL7', u'A1F', u'A1D', u'A1L', u'A1H'] 59 | else: 60 | subset_documents = [u'CBG', u'J1C', u'B03', u'A16', u'A6J', u'A15', u'A11', u'J1M', u'AP1', u'A5Y', u'G3H', u'B2M', u'B0X', u'A6S', u'B1C', u'A10', u'H8W', u'A1E', u'A1G', u'GXL', u'A1M', u'K29', u'A63'] 61 | if document_id[0:3] not in subset_documents: 62 | continue 63 | sentences_with_metadata = [] # Format: {'sentence': 'I win.', 'document_number': 'A00', 'sentence_number': '1'} 64 | document_path = os.path.join(subsubdirectory_path, document_id) 65 | parsed_xml = BeautifulSoup(open(document_path), 'lxml-xml') 66 | # Get metadata 67 | for idno in parsed_xml.find_all('idno'): 68 | if idno['type'] == 'bnc': 69 | document_idno = unicode(idno.string ) 70 | for class_code in parsed_xml.find_all('classCode'): 71 | if class_code['scheme'] == 'DLEE': 72 | class_code = unicode(class_code.string) 73 | break 74 | # Cycle through sentences, extract unicode string 75 | for sentence in parsed_xml.find_all('s'): 76 | # Skip sentences containing gap elements 77 | if sentence.gap: 78 | continue 79 | sentence_number = unicode(sentence['n']) 80 | sentence_string = '' 81 | for descendant in sentence.descendants: 82 | if descendant.name in ['c', 'w']: 83 | sentence_string += unicode(descendant.string) 84 | # Store sentence with metadata 85 | sentence_with_metadata = {'document_id': document_idno, 'sentence_number': sentence_number, 'sentence': sentence_string} 86 | sentences_with_metadata.append(sentence_with_metadata) 87 | documents.append(sentences_with_metadata) 88 | print 'Done! 
Processing BNC took {0:.2f} seconds'.format(time.time() - time_0) 89 | 90 | # Cache parsed XML 91 | json.dump(documents, open(cache_path, 'w')) 92 | 93 | return documents 94 | -------------------------------------------------------------------------------- /using_english.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Get idioms from UsingEnglish.com, by scraping the a-z pages at www.usingenglish.com/reference/idioms/''' 5 | 6 | import re, string 7 | import requests 8 | from bs4 import BeautifulSoup 9 | 10 | def get_idioms(url, idioms_url): 11 | '''Scrape the idioms from the usingEnglish.com pages.''' 12 | 13 | idioms = [] 14 | for letter in string.lowercase: # Cycle through categories 15 | next_page = '{0}/{1}.html'.format(idioms_url, letter) # Page 1 of the category 16 | while next_page: 17 | # Get and parse page 18 | page = requests.get(next_page) 19 | soup = BeautifulSoup(page.content, 'html.parser') 20 | next_page = None 21 | for link in soup.find_all('a'): 22 | # Extract idiom from html 23 | if link.parent.name == 'dt': 24 | if ' ' in link.string: # Exclude single word 'idioms' 25 | idioms.append(link.string) 26 | # Get link to next page in the category 27 | elif link.parent.name == 'div': 28 | try: 29 | if link.parent['class'][0] == 'pagination': 30 | if re.match('next', link.string): 31 | next_page = url + link['href'] 32 | except KeyError: # Sometimes parent has no class 33 | pass 34 | 35 | return sorted(list(set(idioms))) 36 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Utility functions to work with morpha, PoS-tagging, parsing, and other things.''' 5 | 6 | import pos2morpha 7 | 8 | import subprocess, shlex, time, json, re, itertools, csv 9 | import spacy 10 | import en_core_web_sm as spacy_model 11 | from stanfordcorenlp import StanfordCoreNLP 12 | import nltk.data 13 | 14 | ###### STANFORD TO SPACY ###### 15 | class StanfordDoc: 16 | '''Spacy-Doc-like container for Stanford output''' 17 | 18 | def __init__(self): 19 | self.sents = [] 20 | 21 | def __iter__(self): 22 | return iter(self.tokens) 23 | 24 | def __getitem__(self, i): 25 | if isinstance(i, slice): 26 | return StanfordSpan(self.tokens[i.start:i.stop]) 27 | else: 28 | return self.tokens[i] 29 | 30 | # Generate list of tokens from sentences 31 | def set_tokens(self): 32 | self.tokens = [token for sent in self.sents for token in sent] 33 | 34 | class StanfordSpan: 35 | '''Spacy-Span-like container for Stanford output''' 36 | 37 | def __init__(self, tokens): 38 | self.tokens = tokens 39 | self.start = self.tokens[0].i # Starting token index in document 40 | self.start_char = self.tokens[0].idx # Starting character index in document 41 | self.text_with_ws = ''.join([token.text_with_ws for token in self.tokens]) 42 | self.text = ''.join([token.text_with_ws for token in self.tokens[:-1]]) + self.tokens[-1].text 43 | 44 | def __iter__(self): 45 | return iter(self.tokens) 46 | 47 | def __getitem__(self, i): 48 | return self.tokens[i] 49 | 50 | class StanfordToken: 51 | '''Spacy-Token-like container for Stanford output''' 52 | 53 | def __init__(self, i, idx, lemma, tag, text, ws, word, doc): 54 | self.i = i # Token index in document 55 | self.idx = idx # Starting character index in document 56 | self.lemma_ = lemma 57 | self.tag_ = tag # 
PoS-tag inventory might differ slightly, but should not cause problems 58 | self.text = text 59 | self.text_with_ws = text + ws 60 | self.lower_ = word.lower() 61 | self.children = [] 62 | self.doc = doc 63 | 64 | def __str__(self): 65 | return self.text 66 | 67 | # Recursively gets all the syntactic descendants of a token, including self 68 | def get_descendants(self): 69 | descendants = [self] 70 | for child in self.children: 71 | descendants += child.get_descendants() 72 | return descendants 73 | 74 | # Sets the subtree attribute, which is an ordered generator for all descendants of a token 75 | def get_subtree(self): 76 | return sorted(self.get_descendants(), key=lambda x: x.i) 77 | 78 | # Sets the rights attribute, which is an ordered generator for all children to the right of a token 79 | def get_rights(self): 80 | return [child for child in self.children if child.i > self.i] 81 | 82 | def __repr__(self): 83 | return self.text 84 | 85 | def stanford_to_spacy(parse): 86 | '''Turn Stanford CoreNLP output into a Spacy-like object''' 87 | 88 | # Convert into Spacy-like objects 89 | doc = StanfordDoc() 90 | doc_i = 0 91 | for sentence in parse['sentences']: 92 | span = [] 93 | # Get token information 94 | tokens = sentence['tokens'] 95 | dependencies = sentence['basicDependencies'] 96 | # Make tokens into StanfordTokens 97 | for token in tokens: 98 | new_token = StanfordToken(doc_i, token['characterOffsetBegin'], token['lemma'], token['pos'], token['originalText'], token['after'], token['word'], doc) 99 | doc_i += 1 100 | span.append(new_token) 101 | # Add dependency relation and head index to tokens 102 | for dependency in dependencies: 103 | span[dependency['dependent'] - 1].head_idx = dependency['governor'] - 1 104 | span[dependency['dependent'] - 1].dep_ = dependency['dep'] 105 | # Add pointer to head of each token 106 | for new_token in span: 107 | # ROOT has itself as head 108 | try: 109 | if new_token.head_idx == -1: 110 | new_token.head = new_token 111 | else: 112 | new_token.head = span[new_token.head_idx] 113 | new_token.head.children.append(new_token) 114 | # Occasionally, a misformed parse yields a token without a head, default to ROOT, and show problematic sentence 115 | except AttributeError: 116 | new_token.head_idx = -1 117 | new_token.dep_ = u'ROOT' 118 | new_token.head = new_token 119 | print 'Headless word \'{0}\' in sentence "{1}"'.format(new_token.text.encode('utf-8'), ''.join([x.text_with_ws.encode('utf-8') for x in span])) 120 | # Add subtree to each token 121 | for new_token in span: 122 | new_token.subtree = new_token.get_subtree() 123 | new_token.rights = new_token.get_rights() 124 | doc.sents.append(StanfordSpan(span)) 125 | # Generate token list 126 | doc.set_tokens() 127 | 128 | return doc 129 | 130 | ###### PARSING ###### 131 | def load_parser(parser_type): 132 | '''Loads Spacy or Stanford CoreNLP''' 133 | 134 | time_0 = time.time() 135 | print 'Loading parser...' 136 | if parser_type == 'spacy': 137 | parser = spacy_model.load() 138 | elif parser_type == 'stanford': 139 | parser = StanfordCoreNLP('ext/stanford', memory='6g') 140 | parse((parser_type, parser), 'The cat sat on the mat.') # Annotate dummy sentence to force loading of annotation modules 141 | print 'Done! 
Loading parser took {0:.2f} seconds'.format(time.time() - time_0) 142 | 143 | return (parser_type, parser) 144 | 145 | def parse(parser, text): 146 | '''Parses a (unicode) string and returns the parse.''' 147 | 148 | if parser[0] == 'spacy': 149 | # Convert to unicode if necessary 150 | try: 151 | text = unicode(text, 'utf-8') 152 | except TypeError: 153 | pass 154 | # Normalize quotes, ‘ ’ ❛ ❜ to ', and “ ” ❝ ❞ to ", Spacy doesn't process them well 155 | text = re.sub(u'‘|’|❛|❜', u"'", text) 156 | text = re.sub(u'“|”|❝|❞', u'"', text) 157 | # Insert a space between punctuation and a dash, Spacy doesn't process that well either 158 | text = re.sub(ur'([^\w\s])([-—])', r'\1 \2', text) 159 | return parser[1](text) 160 | 161 | if parser[0] == 'stanford': 162 | # Convert from unicode if necessary 163 | try: 164 | text = text.encode('utf-8') 165 | except UnicodeDecodeError: 166 | pass 167 | properties={'annotators': 'tokenize,ssplit,pos,lemma,depparse','pipelineLanguage':'en','outputFormat':'json'} 168 | parsed_text = parser[1].annotate(text, properties=properties) 169 | parsed_text = json.loads(parsed_text) 170 | return stanford_to_spacy(parsed_text) 171 | 172 | ###### POS-TAGGING ###### 173 | def load_pos_tagger(): 174 | '''Loads Spacy PoS-tagger which takes pre-tokenized text.''' 175 | 176 | time_0 = time.time() 177 | print 'Loading PoS-tagger...' 178 | pos_tagger = spacy_model.load(disable = ['ner', 'parser']) 179 | print 'Done! Loading PoS-tagger took {0:.2f} seconds'.format(time.time() - time_0) 180 | 181 | return pos_tagger 182 | 183 | def pos_tag(pos_tagger, text): 184 | '''Takes pos_tagger and tokenized utf-8 idiom/sentence, returns list of word|POS strings.''' 185 | 186 | # Normalize quotes, ‘ ’ ❛ ❜ to ', and “ ” ❝ ❞ to ", Spacy doesn't process them well 187 | text = re.sub(u'‘|’|❛|❜', u"'", text) 188 | text = re.sub(u'“|”|❝|❞', u'"', text) 189 | # Make Doc 190 | doc = spacy.tokens.Doc(pos_tagger.vocab, text.split()) 191 | # Set sentence boundary 192 | for token in doc: 193 | if token.i == 0: 194 | token.is_sent_start = True 195 | else: 196 | token.is_sent_start = False 197 | # Do actual tagging 198 | doc = pos_tagger.tagger(doc) 199 | # Convert into list of words and tags 200 | words_and_tags = [] 201 | for token in doc: 202 | words_and_tags.append(token.text + u'|' + token.tag_) 203 | 204 | return words_and_tags 205 | 206 | ###### MORPHA ###### 207 | def morpha(morph_dir, tokens, keep_case = True, keep_pos = False): 208 | '''Interface to morpha and its options, takes list of tokens as input, returns list of uninflected tokens.''' 209 | 210 | # Set flags 211 | if keep_case: 212 | case_flag = 'c' 213 | else: 214 | case_flag = '' 215 | if keep_pos: 216 | pos_flag = 't' 217 | else: 218 | pos_flag = '' 219 | flags = '-{0}{1}f'.format(case_flag, pos_flag) 220 | 221 | # Call morpha via subprocess 222 | call = shlex.split('{0}/morpha {1} {0}/verbstem.list'.format(morph_dir, flags)) 223 | process = subprocess.Popen(call, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 224 | output = process.communicate(input=' '.join(tokens)) 225 | base_tokens = output[0].split(' ') 226 | 227 | return base_tokens 228 | 229 | def morphg(morph_dir, tokens, keep_case = True, keep_pos = False): 230 | '''Interface to morphg and its options, takes list of token+inflection_POS strings as input, returns tuple of inflected tokens.''' 231 | 232 | # Set flags 233 | if keep_case: 234 | case_flag = 'c' 235 | else: 236 | case_flag = '' 237 | if keep_pos: 238 | pos_flag = 't' 239 | else: 240 | 
pos_flag = '' 241 | flags = '-{0}{1}f'.format(case_flag, pos_flag) 242 | 243 | # Call morphg 244 | call = shlex.split('{0}/morphg {1} {0}/verbstem.list'.format(morph_dir, flags)) 245 | process = subprocess.Popen(call, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 246 | output = process.communicate(input=' '.join(tokens)) 247 | inflected_tokens = output[0].split(' ') 248 | 249 | # Filter out failed inflections, which will still contain '+' 250 | cleaned_inflected_tokens = [i_t for i_t in inflected_tokens if not '+' in i_t] 251 | 252 | return tuple(cleaned_inflected_tokens) 253 | 254 | ###### TOKENIZATION ###### 255 | def load_tokenizer(): 256 | '''Loads Spacy tokenizer''' 257 | 258 | time_0 = time.time() 259 | print 'Loading tokenizer...' 260 | tokenizer = spacy_model.load(disable = ['tagger', 'ner', 'parser']) 261 | print 'Done! Loading tokenizer took {0:.2f} seconds'.format(time.time() - time_0) 262 | 263 | return tokenizer 264 | 265 | def tokenize(tokenizer, sentence): 266 | '''Parses a (unicode) sentence, returns list of Spacy Tokens''' 267 | try: 268 | return tokenizer(unicode(sentence, 'utf-8')) 269 | except TypeError: 270 | return tokenizer(sentence) 271 | 272 | ###### EXAMPLE SENTENCES ###### 273 | def get_example_sentences(idioms, sentences_file, cache_file): 274 | ''' 275 | Takes a list of idioms, searches a large corpus for example sentences, 276 | extracts shortest example sentence, returns dict of format {idiom: sentence}. 277 | Saves extracted sentences and idioms to file, for fast re-use in subsequent runs. 278 | ''' 279 | 280 | time_0 = time.time() 281 | idioms_with_sentences = {} 282 | 283 | # If file is cached example sentences, load those, else extract sentences from corpus 284 | if re.search('.json$', sentences_file): 285 | idioms_with_sentences = json.load(open(sentences_file, 'r')) 286 | if set(idioms) <= set(idioms_with_sentences.keys()): 287 | print 'Using cached example sentences from {0}'.format(sentences_file) 288 | # Select only the idioms part of the idiom dictionary 289 | if set(idioms) < set(idioms_with_sentences.keys()): 290 | idioms_with_sentences = {key: idioms_with_sentences[key] for key in idioms_with_sentences if key in idioms} 291 | return idioms_with_sentences 292 | else: 293 | raise Exception('{0} does not contain entries for all the idioms specified in the dictionary argument, quitting.'.format(sentences_file)) 294 | else: 295 | print '{0} is not a cached json-file, extracting sentences containing idioms...'.format(sentences_file) 296 | 297 | # Add fallback option: no example sentence 298 | for idiom in idioms: 299 | idioms_with_sentences[idiom] = '' 300 | # Compile idiom regexes for efficiency and ignore meta-linguistic uses in quotes 301 | idiom_regexes = [re.compile('[^"\'] ' + idiom + ' [^"\']') for idiom in idioms] 302 | # Find shortest (in tokens) sentence containing idiom in corpus 303 | splitter = nltk.data.load('tokenizers/punkt/english.pickle') 304 | # Extract first 1000 lines containing the idiom with grep, then split and find sentences 305 | for idx, idiom in enumerate(idioms): 306 | if idx%100 == 0 and idx > 0: 307 | print '\tGetting example sentences for {0} of {1} idioms took {2} seconds'.format(idx, len(idioms), time.time()-time_0) 308 | call = shlex.split('grep -m 1000 "{0}" {1}'.format(u8(idiom), sentences_file)) 309 | process = subprocess.Popen(call, stdin=subprocess.PIPE, stdout = subprocess.PIPE, stderr=subprocess.PIPE) 310 | output = process.communicate() 311 | output = output[0].strip() 312 | sentences 
= splitter.tokenize(unicode(output, 'utf-8')) 313 | for sentence in sentences: 314 | if idiom_regexes[idx].search(sentence): 315 | # Should have at least 3 extra words in the 'sentence' 316 | if len(sentence.split(' ')) > len(idiom.split(' ')) + 3: 317 | if idioms_with_sentences[idiom]: 318 | # Replace old sentence if new sentence one is shorter 319 | if len(sentence.split(' ')) < len(idioms_with_sentences[idiom].split(' ')): 320 | idioms_with_sentences[idiom] = sentence 321 | else: 322 | idioms_with_sentences[idiom] = sentence 323 | 324 | # Caching extracted example sentences 325 | ofn = cache_file 326 | with open(ofn, 'w') as of: 327 | json.dump(idioms_with_sentences, of) 328 | print 'Caching idioms and example sentences in {0}'.format(ofn) 329 | 330 | print 'Done! took {0:.2f} seconds'.format(time.time() - time_0) 331 | 332 | return idioms_with_sentences 333 | 334 | def parse_example_sentences(idioms_with_sentences, ambiguous_word, parser): 335 | '''Parses an example sentence containing an idiom, returns the part of the parse tree spanning the idiom.''' 336 | 337 | parsed_idioms = [] 338 | 339 | # Cycle through idioms, parse sentence if available, extract idiom-spanning subtree 340 | for idiom in idioms_with_sentences: 341 | sentence = idioms_with_sentences[idiom] 342 | if sentence: 343 | parsed_sentence = parse(parser, sentence) 344 | # Find indices of idiom in sentence 345 | start = re.search(idiom, sentence).start() 346 | end = re.search(idiom, sentence).end() 347 | # Extract idiom subtree from parsed example sentence based on character offsets 348 | has_em_dash = u'\u2014' in idiom 349 | idiom_tokens = [] 350 | subtree_start = None 351 | for token in parsed_sentence: 352 | if token.idx >= end: 353 | subtree_end = token.i 354 | break 355 | if token.idx >= start: 356 | if not subtree_start: 357 | subtree_start = token.i 358 | idiom_tokens.append(token) 359 | # Extract top token and lemma 360 | extracted = False 361 | for idiom_token in idiom_tokens: 362 | # If the head of current token is not part of the idiom, it is the top token of the idiom phrase 363 | if idiom_token.head.text not in idiom: 364 | idiom_top_token = idiom_token 365 | idiom_top_lemma = idiom_token.lemma_ 366 | # Detect parses where the idiom does not form a single subtree, parse those idioms w/o contenxt 367 | if extracted: 368 | del parsed_idioms[-1] 369 | parsed_idioms.append(parse_idiom(idiom, ambiguous_word, parser)) 370 | break 371 | else: 372 | parsed_idioms.append((idiom_top_lemma, idiom_top_token, idiom_top_token.doc[subtree_start:subtree_end], has_em_dash)) 373 | extracted = True 374 | 375 | # Parse the idiom if no sentence is available 376 | else: 377 | parsed_idioms.append(parse_idiom(idiom, ambiguous_word, parser)) 378 | 379 | return parsed_idioms 380 | 381 | ###### IDIOM PROCESSING ###### 382 | def parse_idiom(idiom, ambiguous_word, parser): 383 | '''Parse idioms without context, extract top node, lemma and subtree.''' 384 | 385 | parsed_idiom = None # Format: (top_lemma, top_token, idiom subtree, has_em_dash) 386 | 387 | # Deal with em-dash wildcards, e.g. 'too - for words'. Replace wildcard with POS-ambiguous word (e.g. 
'fine') and parse 388 | if u'\u2014' in idiom: 389 | has_em_dash = True 390 | parsed_idiom = parse(parser, re.sub(u'\u2014', ambiguous_word, idiom)) 391 | else: 392 | has_em_dash = False 393 | parsed_idiom = parse(parser, idiom) 394 | 395 | # Extract top token and lemma 396 | for token in parsed_idiom: 397 | if token.dep_ == 'ROOT': 398 | idiom_top_lemma = token.lemma_ 399 | idiom_top_token = token 400 | idiom_subtree = [] 401 | parsed_idiom = (idiom_top_lemma, idiom_top_token, idiom_subtree, has_em_dash) 402 | 403 | return parsed_idiom 404 | 405 | def inflect_idioms(idioms, morph_dir): 406 | ''' 407 | Generate inflectional variants of idioms using the Spacy PoS-tagger, 408 | morpha and morphg. Takes a list of idioms, returns a list of inflected 409 | idioms and a mapping between inflectional variants and the base form. 410 | ''' 411 | 412 | pos_tagger = load_pos_tagger() 413 | inflected_idioms = [] 414 | base_form_map = {} # Maps inflectional variants to base form, format: {'inflectional variant': 'base form'} 415 | print 'Inflecting idioms...' 416 | time_0 = time.time() 417 | 418 | for idiom in idioms: 419 | # Add original form to base form map 420 | base_form_map[idiom] = idiom 421 | # Tag tokens, convert to Morpha tags 422 | pos_tokens = pos_tag(pos_tagger, idiom) 423 | if pos_tokens: 424 | morpha_tokens = [pos2morpha.convert_token(pos_token).encode('utf-8') for pos_token in pos_tokens] 425 | # Run morpha 426 | base_tokens = morpha(morph_dir, morpha_tokens, keep_case = True, keep_pos = True) 427 | # Generate inflections for verbs and nouns 428 | base_tuples = [] 429 | for base_token in base_tokens: 430 | # Look for NN, not N, because we don't want NP, proper names 431 | # Differentiate noun and verb inflections 432 | # Morphg doesn't handle 'be' well, define manually 433 | if base_token[0:4] == 'be_V': 434 | base_tuples.append(('be', 'being', 'been', 'am', 'are', 'is', 'was', 'were')) 435 | elif '_V' in base_token or '_NN' in base_token: 436 | if '_V' in base_token: 437 | morphg_tokens = (re.sub('_', '+s_', base_token), re.sub('_', '+ing_', base_token), 438 | re.sub('_', '+ed_', base_token), re.sub('_', '+en_', base_token)) 439 | else: 440 | morphg_tokens = (re.sub('_', '+s_', base_token),) 441 | base_tuples.append((base_token.split('_')[0],) + morphg(morph_dir, morphg_tokens, keep_case = True, keep_pos = False)) 442 | else: 443 | base_tuples.append((base_token.split('_')[0],)) 444 | # Generate combinations of inflected tokens and store base form mapping 445 | for inflected_tokens in itertools.product(*base_tuples): 446 | inflected_idiom = unicode(' '.join(inflected_tokens), 'utf-8') 447 | inflected_idioms.append(inflected_idiom) 448 | base_form_map[inflected_idiom] = idiom 449 | 450 | # Join to original list, and filter out duplicates 451 | inflected_idioms = list(set(idioms + inflected_idioms)) 452 | 453 | print 'Done! Inflecting idioms took {0:.2f} seconds'.format(time.time() - time_0) 454 | print 'With inflections, we have {0} idioms'.format(len(inflected_idioms)) 455 | 456 | return inflected_idioms, base_form_map 457 | 458 | def expand_indefinite_pronouns(idioms): 459 | ''' 460 | When one's or someone's or someone occurs in an idiom, remove it, 461 | and add idioms with personal pronouns added in. Don't expand 'one', 462 | because it is too ambiguous. 
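For example, an idiom containing someone's is expanded with the possessive pronouns my/your/his/her/its/our/their plus an em-dash wildcard.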
463 | ''' 464 | 465 | expanded_idioms = [] 466 | base_form_map = {} # Maps expanded variants to base form, format: {'expanded idiom': 'base form'} 467 | possessive_pronouns = ['my', 'your', 'his', 'her', 'its', 'our', 'their'] 468 | objective_pronouns = ['me', 'you', 'him', 'her', 'us', 'them', 'it'] 469 | 470 | for idiom in idioms: 471 | # Add possessive pronouns only 472 | if re.search("\\bone's\\b", idiom): 473 | for possessive_pronoun in possessive_pronouns: 474 | expanded_idiom = re.sub("\\bone's\\b", possessive_pronoun, idiom) 475 | expanded_idioms.append(expanded_idiom) 476 | base_form_map[expanded_idiom] = idiom 477 | # Add possessive pronouns and a wildcard for other words 478 | elif re.search("\\bsomeone's\\b", idiom): 479 | for possessive_pronoun in possessive_pronouns + [unicode("—'s", 'utf-8')]: 480 | expanded_idiom = re.sub("\\bsomeone's\\b", possessive_pronoun, idiom) 481 | expanded_idioms.append(expanded_idiom) 482 | base_form_map[expanded_idiom] = idiom 483 | # Add objective pronouns and a wildcard for other words 484 | elif re.search("\\bsomeone\\b", idiom): 485 | for objective_pronoun in objective_pronouns + [unicode("—", 'utf-8')]: 486 | expanded_idiom = re.sub("\\bsomeone\\b", objective_pronoun, idiom) 487 | expanded_idioms.append(expanded_idiom) 488 | base_form_map[expanded_idiom] = idiom 489 | else: 490 | expanded_idioms.append(idiom) 491 | base_form_map[idiom] = idiom 492 | 493 | return expanded_idioms, base_form_map 494 | 495 | ###### OUTPUT ###### 496 | def u8(u): 497 | '''Encode unicode string in utf-8.''' 498 | 499 | return u.encode('utf-8') 500 | 501 | def write_csv(extracted_idioms, outfile): 502 | '''Writes extracted idioms to file in csv-format''' 503 | 504 | with open(outfile, 'w') as of: 505 | writer = csv.writer(of, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"') 506 | for extracted_idiom in extracted_idioms: 507 | output_row = [u8(extracted_idiom['idiom']), extracted_idiom['start'], extracted_idiom['end'], 508 | u8(extracted_idiom['snippet']), u8(extracted_idiom['bnc_document_id']), u8(extracted_idiom['bnc_sentence']), 509 | extracted_idiom['bnc_char_start'], extracted_idiom['bnc_char_end']] 510 | writer.writerow(output_row) 511 | -------------------------------------------------------------------------------- /wiktionary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | '''Get information from Wiktionary using the MediaWiki API and process returned content.''' 5 | 6 | import re 7 | import requests 8 | import lxml.html 9 | 10 | def get_category_members(category): 11 | ''' 12 | Use the MediaWiki API to get all category members of a Wiktionary category. 13 | Takes a category name. Returns a list of pagetitles. 14 | ''' 15 | 16 | titles = [] 17 | cont = True 18 | cmcontinue = '' # Continuation point for query 19 | # Get titles until no members left 20 | while(cont): 21 | # Construct query 22 | endpoint = 'https://en.wiktionary.org/w/api.php?' 
# Wiktionary API endpoint 23 | action = 'action=' + 'query' # Which action to take (query, naturally) 24 | format = 'format=' + 'json' # Output format 25 | lists = 'list=' + 'categorymembers' 26 | cmtitle = 'cmtitle=Category:' + category 27 | cmtitle = re.sub(' ', '%20', cmtitle) 28 | cmlimit = 'cmlimit=' + '500' # Query result limit 29 | cmprop = 'cmprop=' + 'title' # Get page titles only 30 | 31 | query = endpoint + '&'.join([action, format, lists, cmtitle, cmprop, cmlimit]) 32 | if cmcontinue: # Adding cmcontinue to query makes sure it continues from end of previous query 33 | query += '&cmcontinue=' + cmcontinue 34 | 35 | # Get and process results 36 | res_raw = requests.get(query) 37 | res_json = res_raw.json() 38 | # Collect page titles, i.e. idioms 39 | category_members = res_json['query']['categorymembers'] 40 | for category_member in category_members: 41 | title = category_member['title'] 42 | if not re.search('(^Appendix:)|(^Category:)|(^Special:)|(^Wiktionary:)|(^Category_talk:)|(^Citations:)', title): # Filter out special pages 43 | if ' ' in title: # Exclude single-word 'idioms' 44 | titles.append(title.strip()) 45 | # Check for more members in category 46 | try: 47 | cmcontinue = res_json['continue']['cmcontinue'] 48 | cont = True 49 | except KeyError: 50 | cont = False 51 | 52 | return sorted(list(set(titles))) 53 | 54 | def get_page(title): 55 | ''' 56 | Use the MediaWiki API to get the content of a Wiktionary page. 57 | Takes a page title. Returns the page content parsed into an lxml HTML document, or None if it cannot be retrieved. 58 | ''' 59 | 60 | # Construct query 61 | endpoint = 'http://en.wiktionary.org/w/api.php?' # Wiktionary API endpoint 62 | action = 'action=' + 'query' # Which action to take (query, naturally) 63 | format = 'format=' + 'json' # Output format 64 | prop = 'prop=' + 'revisions' # What info to get 65 | rvprop = 'rvprop=' + 'content' 66 | rvparse = 'rvparse' # Parse content into html 67 | titles = 'titles=' + title 68 | titles = re.sub(' ', '%20', titles) 69 | query = endpoint + '&'.join([action, format, prop, rvprop, rvparse, titles]) 70 | 71 | # Process result, get html only 72 | try: 73 | res_raw = requests.get(query) 74 | res_json = res_raw.json() 75 | temp_1 = res_json['query']['pages'] # Dig through first two layers 76 | res_html = temp_1[temp_1.keys()[0]]['revisions'][0]['*'] # Dig through remaining four layers 77 | parsed_html = lxml.html.document_fromstring(res_html) 78 | except KeyError: 79 | return 80 | 81 | return parsed_html 82 | --------------------------------------------------------------------------------
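A minimal usage sketch for the wiktionary.py helpers above (illustrative only; the category name 'English idioms' is an assumption, as the category actually queried is configured elsewhere in the repository):

import wiktionary

# Collect multi-word idiom page titles from the (assumed) category
idiom_titles = wiktionary.get_category_members('English idioms')
print 'Found {0} idiom pages'.format(len(idiom_titles))

# get_page() returns the page parsed as an lxml HTML document, or None on failure
parsed_page = wiktionary.get_page(idiom_titles[0])
if parsed_page is not None:
    print parsed_page.text_content()[:200].encode('utf-8')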