├── .gitmodules
├── README.md
├── analyze_chunk.py
├── cache
│   └── .gitignore
├── chunk_sents.py
├── data
│   ├── .gitignore
│   └── README.md
├── models
│   ├── .gitignore
│   └── README.md
├── parse_atts.py
├── parse_sents.py
├── pyutils
│   ├── .gitignore
│   ├── __init__.py
│   ├── attparser
│   │   ├── __init__.py
│   │   ├── baseParser.py
│   │   ├── clefParser.py
│   │   ├── cocoParser.py
│   │   ├── cocoParser_punct.py
│   │   ├── config.py
│   │   ├── head.py
│   │   └── simpleParser.py
│   └── corenlp
│       ├── .gitignore
│       ├── __init__.py
│       ├── __main__.py
│       ├── client.py
│       ├── corenlp.py
│       ├── default.properties
│       └── progressbar.py
├── senna_sents.py
├── write_atts_html.py
└── write_chunk_html.py

/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "pyutils/refer"]
 2 | path = pyutils/refer
 3 | url = https://github.com/lichengunc/refer
 4 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # refer-parser2
 2 | Referring Expression Parser
 3 | 
 4 | 
 5 | ## Introduction
 6 | Our parser provides the following functions:
 7 | * parse sentences in multi-thread mode using Stanford CoreNLP and SENNA
 8 | * find the head noun word of each sentence
 9 | * find the 7 attribute words defined in the ReferitGame paper
10 | * chunk sentences into phrase structures
11 | * write HTML visualizations
12 | 
13 | ## Requirements
14 | This code is written in Python and requires the following libraries:
15 | ```bash
16 | practnlptools
17 | nltk
18 | corenlp
19 | unidecode
20 | ```
21 | We keep only a pruned core of corenlp-python in this repository; the original repository can be found [here](https://bitbucket.org/jeremybmerrill/corenlp-python.git).
22 | Note that this (our) corenlp is able to load [v3.5.1](http://nlp.stanford.edu/software/stanford-corenlp-full-2015-01-29.zip) and [v3.5.2](http://nlp.stanford.edu/software/stanford-corenlp-full-2015-04-20.zip), but not v3.6.0.
23 | Also note that the Stanford NLP group switched to the Universal Dependencies standard as of v3.5.2.
24 | We also use [SENNA](http://ronan.collobert.com/senna/)'s Python wrapper, [practnlptools](https://pypi.python.org/pypi/practnlptools/1.0), to chunk each sentence into phrase structures.
25 | 
26 | ## How to use
27 | 1a) Parse expressions using the Stanford parser:
28 | ```bash
29 | python parse_sents.py --dataset refcoco --splitBy unc --num_workers 4
30 | ```
31 | 1b) Parse expressions into [Vicente's R1-R7 attributes](http://tamaraberg.com/papers/referit.pdf):
32 | ```bash
33 | python parse_atts.py --dataset refcoco --splitBy unc
34 | ```
35 | 1c) Visualize the decomposed attributes:
36 | ```bash
37 | python write_atts_html.py --dataset refcoco --splitBy unc
38 | ```
39 | 
40 | 2a) Parse expressions using the SENNA parser:
41 | ```bash
42 | python senna_sents.py --dataset refcoco --splitBy unc --num_workers 4
43 | ```
44 | 2b) Chunk expressions into phrase structures:
45 | ```bash
46 | python chunk_sents.py --dataset refcoco --splitBy unc
47 | ```
48 | 2c) Analyze the phrase structures from the chunking results:
49 | ```bash
50 | python analyze_chunk.py --dataset refcoco --splitBy unc
51 | ```
52 | 2d) Visualize the phrase structures:
53 | ```bash
54 | python write_chunk_html.py --dataset refcoco --splitBy unc
55 | ```
56 | 
57 | ## Download
58 | * [**Parsed expressions**](http://bvision.cs.unc.edu/licheng/MattNet/refer-parser2/cache/parsed_atts.zip) using [Vicente's R1-R7 attributes](http://tamaraberg.com/papers/referit.pdf)
59 | 
60 | ### License
61 | BSD License.
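### Example: running the full pipeline
As a quick reference, the steps above can be chained as below. This is a sketch assuming the default `refcoco`/`unc` split and that the data and models have been downloaded as described in `data/README.md` and `models/README.md` (the `pyutils/refer` submodule also needs to be initialized).
```bash
# Stanford branch: dependency parsing -> R1-R7 attribute decomposition -> HTML visualization
python parse_sents.py --dataset refcoco --splitBy unc --num_workers 4
python parse_atts.py --dataset refcoco --splitBy unc
python write_atts_html.py --dataset refcoco --splitBy unc

# SENNA branch: SENNA parsing -> chunking -> analysis -> HTML visualization
python senna_sents.py --dataset refcoco --splitBy unc --num_workers 4
python chunk_sents.py --dataset refcoco --splitBy unc
python analyze_chunk.py --dataset refcoco --splitBy unc
python write_chunk_html.py --dataset refcoco --splitBy unc
```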
62 | 63 | -------------------------------------------------------------------------------- /analyze_chunk.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path as osp 4 | from pprint import pprint 5 | import time 6 | import argparse 7 | import json 8 | import operator 9 | import random 10 | 11 | def analyze_structure(sents): 12 | """ 13 | The input sents = [{sent_id, sent, chunk, NPs, senna, tokens}] 14 | where chunk is list of [(phrase, phrase_type)], and NPs is list of noun phrases 15 | We analyze phrase structure 16 | """ 17 | struct_to_num = {} 18 | struct_to_examples = {} 19 | for sent in sents: 20 | chunk = sent['chunk'] 21 | struct = ' '.join([ck[1] for ck in chunk]) 22 | struct_to_num[struct] = struct_to_num.get(struct, 0) + 1 23 | if struct not in struct_to_examples: 24 | struct_to_examples[struct] = [] 25 | struct_to_examples[struct] += [sent['sent']] 26 | sorted_structs = sorted(struct_to_num.items(), key=operator.itemgetter(1)) 27 | sorted_structs.reverse() 28 | 29 | print('%25s: %10s %6s %8s' % ('structure', 'number', 'perc.', 'acc.')) 30 | total_num = sum(struct_to_num.values()) 31 | acc = 0 32 | for struct, num in sorted_structs[:20]: 33 | acc += num 34 | print('%25s: %10d %6.3f%% %4.3f%%, e.g., %s' % (struct, num, num*100.0/total_num, acc*100.0/total_num, random.choice(struct_to_examples[struct]))) 35 | 36 | def analyze_NP(sents): 37 | # NP usage in the raw chunks 38 | NP_usage = 0 39 | for sent in sents: 40 | chunk = sent['chunk'] 41 | NPs = [ck for ck in chunk if ck[1] == 'NP'] 42 | if len(NPs) > 0: 43 | NP_usage += 1 44 | print('%.2f%% (%s/%s) expressions have NPs.' % (NP_usage*100.0/len(sents), NP_usage, len(sents))) 45 | 46 | # NP usage in the filtered NPs 47 | cleaned_NP_usage = 0 48 | for sent in sents: 49 | if len(sent['NPs']) > 0: 50 | cleaned_NP_usage += 1 51 | print('%.2f%% (%s/%s) expressions have cleaned NPs.' % (cleaned_NP_usage*100.0/len(sents), cleaned_NP_usage, len(sents))) 52 | 53 | # average #NP in each expression 54 | total_NPs, total_cleaned_NPs, total_PPs, total_VPs, total_ADVPs, total_ADJPs = 0, 0, 0, 0, 0, 0 55 | total_wds = 0 56 | total_NP_wds = 0 57 | total_cleaned_NP_wds = 0 58 | for sent in sents: 59 | for ck in sent['chunk']: 60 | if ck[1] == 'NP': 61 | total_NPs += 1 62 | total_NP_wds += len(ck[0].split()) 63 | if ck[1] == 'PP': 64 | total_PPs += 1 65 | if ck[1] == 'ADVP': 66 | total_ADVPs += 1 67 | if ck[1] == 'ADJP': 68 | total_ADJPs += 1 69 | total_wds += len(sent['tokens']) 70 | # check cleaned NPs 71 | total_cleaned_NPs += len(sent['NPs']) 72 | total_cleaned_NP_wds += sum([len(phrase.split()) for phrase in sent['NPs']]) 73 | 74 | print('Each expression and has %.2f NPs (%.2f cleaned NPs), %.2f PPs, %.2f ADVPs, %.2f ADJPs,' % (total_NPs*1.0/len(sents), 75 | total_cleaned_NPs*1.0 / len(sents), total_PPs*1.0/len(sents), total_ADVPs*1.0/len(sents), total_ADJPs*1.0/len(sents))) 76 | print('Each expression has %.2f words, among which are %.2f NP words.' % (total_wds/len(sents), total_NP_wds*1.0 / len(sents) )) 77 | print('Each NP has %.2f words.' % (total_NP_wds*1.0/total_NPs)) 78 | print('Each cleaned NP has %.2f words.' 
% (total_cleaned_NP_wds*1.0 / total_cleaned_NPs)) 79 | 80 | 81 | def main(params): 82 | 83 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 84 | if not osp.isdir('cache/chunk_html/' + dataset_splitBy): 85 | os.makedirs('cache/chunk_html/' + dataset_splitBy) 86 | 87 | # load chunked sents = [{sent_id, sent, chunk, NPs, senna, tokens}] 88 | # where chunk is list of [(phrase, phrase_type)] 89 | # and NPs is list of noun phrases 90 | path_to_chunked_sents = osp.join('cache/chunked_sents', dataset_splitBy, 'sents.json') 91 | sents = json.load(open(path_to_chunked_sents)) 92 | 93 | # analyze phrase structure 94 | analyze_structure(sents) 95 | 96 | # analyze the usage of NPs 97 | analyze_NP(sents) 98 | 99 | 100 | if __name__ == '__main__': 101 | 102 | # input 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 105 | parser.add_argument('--splitBy', default='unc', help='split By') 106 | parser.add_argument('--num_per_page', type=int, default=10000, help='number of pages to be written') 107 | args = parser.parse_args() 108 | params = vars(args) 109 | 110 | # main 111 | main(params) -------------------------------------------------------------------------------- /cache/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /chunk_sents.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code will convert senna's chunk 3 | [('Biplab', 'S-NP'), ('is', 'S-VP'), ('a', 'B-NP'), ('good', 'I-NP'), ('boy', 'E-NP'), ('.', 'O')] 4 | into [(NP, Biplab), (VP, is), (NP, good boy), (O, .)] 5 | 6 | We will also do cleaning on the chunked phrase, by excluding all location words like 'left', 'right', etc. 
7 | For example, (NP, right white dog) -> (NP, white dog) 8 | 9 | We read cache/senna_sents/dataset_splitBy/sents.json and save the chucking redsults into 10 | cache/chunked_sents/dataset_splitBy/sents.json 11 | """ 12 | import sys 13 | import os 14 | import os.path as osp 15 | from pprint import pprint 16 | import time 17 | import argparse 18 | import json 19 | 20 | # nltk's stopping words 21 | import nltk 22 | # nltk.data.path.append('/Users/liyu/Documents/nltk_data') 23 | nltk.data.path.append('/mnt/ilcompf6d0/user/liyu/Developments/nltk_data') 24 | from nltk.corpus import stopwords 25 | stop_words = stopwords.words("english") + ['.', ',', ':', '(', ')', '"', "'s", '!', "'", 26 | 'between', 'against', 'above', 'below', 'up', 'down', 'out', 'off', 'over'] 27 | stop_words.remove('and') # we may need 'and' token, e.g., black and white 28 | 29 | # location words 30 | location_words = ['right', 'left', 'top', 'bottom', 'middle', 'mid', 'second', '2nd', 'first', '1st', 'front', 31 | 'closest', 'nearest', 'center', 'central', 'third', '3rd', 'corner', 'upper', 'back', 'behind', 'far', 'anywhere', 32 | 'leftmost', 'lower', 'rightmost', 'farthest', 'furthest', 'next', 'last', 'fourth', '4th', 'up', 'above', 'below', 33 | 'down', 'side'] 34 | 35 | # color words 36 | color_words = ['white', 'green', 'blue', 'red', 'yellow', 'black', 'brown', 'pink', 'dark', 'darker', 'orange', 37 | 'gray', 'grey', 'purple', 'beige', 'bright'] 38 | 39 | # size words 40 | size_words = ['big', 'bigger', 'biggest', 'small', 'smaller', 'smallest', 'tall', 'taller', 'tallest', 'large', 41 | 'larger', 'largest', 'little', 'short', 'shorter', 'tiny', 'long', 'longer', 'longest', 'huge'] 42 | 43 | def extract_chunk(senna): 44 | """ 45 | senna = {chunk, pos, srl, syntax_tree, verbs, words, ner} 46 | where chunk = [(the, B-NP), (lady, E-NP), ...], there are B, I, E, S, O prefix in total. 47 | We extract the chunk in to [(phrase, phrase_type)], e.g., 48 | [('the lady', 'NP'), ('with', 'PP'), 'the blue shirt', 'NP'] 49 | 50 | Besides, we specifically deal with such case: 51 | sent = 'boy', senna's chunk = [('boy', 'O')], senna's pos = [('boy', 'NN')] 52 | We also consider this single word to be NP 53 | """ 54 | raw_chunk = senna['chunk'] 55 | chunk = [] 56 | phrase, pix = '', 0 57 | for c in raw_chunk: 58 | if pix > 0: 59 | phrase += ' ' 60 | phrase += c[0] 61 | pix += 1 62 | if 'E-' in c[1] or 'S-' in c[1]: 63 | ptype = c[1][2:] 64 | chunk += [(phrase, ptype)] 65 | phrase, pix = '', 0 66 | if c[1] == 'O': 67 | if len(raw_chunk) == 1: 68 | if senna['pos'][0][1] == 'NN': # when sentence = 'boy', senna ouputs 'O' but we take it as 'NP' 69 | chunk += [(phrase, 'NP')] 70 | else: 71 | chunk += [(phrase, 'O')] 72 | else: 73 | chunk += [(phrase, 'O')] 74 | phrase, pix = '', 0 75 | # in case the last phrase has no "-E" to finish 76 | if phrase != '': 77 | chunk += [(phrase, c[1][2:])] 78 | return chunk 79 | 80 | def extract_NPs(chunk): 81 | """ 82 | Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), 'the blue shirt', 'NP'], 83 | we extract the NPs with stopping and location words filtered out, and return list of noun phrases. 
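    For example (an illustrative case, not taken from the data), the chunk
    [('the left lady', 'NP'), ('in', 'PP'), ('the blue shirt', 'NP')]
    would give NPs = ['lady', 'blue shirt'], since 'the' is a stop word and
    'left' is a location word; color words are kept here and only dropped
    later in extract_NNs.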
84 | """ 85 | forbid_wds = stop_words + location_words 86 | NPs = [] 87 | for phrase, ptype in chunk: 88 | if ptype == 'NP': 89 | filtered_wds = [] 90 | for wd in phrase.split(): 91 | if wd not in forbid_wds: 92 | filtered_wds += [wd] 93 | if len(' '.join(filtered_wds)) > 0: 94 | NPs += [' '.join(filtered_wds)] 95 | return NPs 96 | 97 | def extract_NNs(chunk, pos): 98 | """ 99 | Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), 'the blue shirt', 'NP'], 100 | and pos [(word, pos)], e.g., [('man', 'NN')] 101 | we extract from NPs with stopping, location, color, size words filtered out, 102 | and return list of NN words only. 103 | """ 104 | forbid_wds = stop_words + location_words + color_words + size_words 105 | NNs = [] 106 | for phrase, ptype in chunk: 107 | if ptype == 'NP': 108 | filtered_wds = [] 109 | for wd in phrase.split(): 110 | wd_pos = [p[1] for p in pos if p[0] == wd][0] 111 | if wd not in forbid_wds and wd_pos != 'JJ' and wd_pos != 'CD': # we don't need JJ nor CD words neither. 112 | filtered_wds += [wd] 113 | if len(' '.join(filtered_wds)) > 0: 114 | NNs += [' '.join(filtered_wds)] 115 | return NNs 116 | 117 | def main(params): 118 | 119 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 120 | if not osp.isdir('cache/chunked_sents/'+dataset_splitBy): 121 | os.makedirs('cache/chunked_sents/'+dataset_splitBy) 122 | 123 | # load senna_sents = [{sent_id, tokens, sent, senna}] 124 | # where senna = {chunk, pos, srl, syntax_tree, verbs, words, ner} 125 | path_to_senna_sents = osp.join('cache/senna_sents', dataset_splitBy, 'sents.json') 126 | sents = json.load(open(path_to_senna_sents)) 127 | 128 | # chunk convert 129 | for i, sent in enumerate(sents): 130 | senna = sent['senna'] 131 | chunk = extract_chunk(senna) 132 | NPs = extract_NPs(chunk) 133 | NNs = extract_NNs(chunk, senna['pos']) 134 | # deal with special case: chunk failed 135 | # won't extract NPs nor NNs for this faked ones. 136 | if ' '.join([ck[0] for ck in chunk]) == 'none': 137 | print('raise chunk error!') 138 | chunk = [(sent['sent'], 'NP')] 139 | sent['chunk'] = chunk 140 | sent['NPs'] = NPs 141 | sent['NNs'] = NNs 142 | if i % 1000 == 0: 143 | print('%s/%s done.' % (i+1, len(sents))) 144 | 145 | # save 146 | cur_folder = os.path.abspath('.') 147 | output_path = osp.join(cur_folder, 'cache/chunked_sents/'+dataset_splitBy, 'sents.json') 148 | with open(output_path, 'w') as io: 149 | json.dump(sents, io) 150 | print('chunked_sents saved in %s.' % output_path) 151 | 152 | 153 | if __name__ == '__main__': 154 | 155 | # input 156 | parser = argparse.ArgumentParser() 157 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 158 | parser.add_argument('--splitBy', default='unc', help='dataset name') 159 | args = parser.parse_args() 160 | params = vars(args) 161 | 162 | # main 163 | main(params) -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !README.md 3 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Download 2 | Download my cleaned data and extract them into this folder. 
 3 | - 1) http://tlberg.cs.unc.edu/licheng/referit/data/refclef.zip
 4 | - 2) http://tlberg.cs.unc.edu/licheng/referit/data/refcoco.zip
 5 | - 3) http://tlberg.cs.unc.edu/licheng/referit/data/refcoco+.zip
 6 | - 4) http://tlberg.cs.unc.edu/licheng/referit/data/refcocog.zip
 7 | 
 8 | Besides, make a folder named "images".
 9 | Add "mscoco" into "images/".
10 | Download the MSCOCO images from [mscoco](http://mscoco.org/dataset/#overview).
11 | 
12 | Add "saiapr_tc-12" into "images/". I only extracted the related images, a 19,997-image subset of the original [imageCLEF](http://imageclef.org/SIAPRdata) collection. Please download the subset [here](http://tlberg.cs.unc.edu/licheng/referit/data/images/saiapr_tc-12.zip).
13 | 
--------------------------------------------------------------------------------
/models/.gitignore:
--------------------------------------------------------------------------------
 1 | *
 2 | !README.md
 3 | 
--------------------------------------------------------------------------------
/models/README.md:
--------------------------------------------------------------------------------
 1 | This folder should contain two pre-trained models:
 2 | 1. corenlp v3.5.2
 3 | 2. googlenews-vectors-negative300.bin
 4 | Check the README.md in the main repository and download these two models into this folder.
 5 | 
--------------------------------------------------------------------------------
/parse_atts.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This code will call pyutils/attparser to parse each sentence into 7 attributes.
 3 | The parsing rules follow Vicente's paper "ReferitGame" (EMNLP 2014).
 4 | Specifically, r1 = entry-level name, r2 = color, r3 = size, r4 = abs. location,
 5 | r5 = rel. location, r6 = rel. object, r7 = generic, r8 = the leftover words
 6 | 
 7 | Before running this code, make sure you have already run parse_sents.py, whose output is
 8 | sents = [{sent_id, sent, parse, raw, tokens}]
 9 | The attparser will fetch the parse of each sent, then decompose it into the above categories.
10 | 
11 | The output will be saved in 'cache/parsed_atts/dataset_splitBy/sents.json', where
12 | sents = [{sent_id, sent, parse, raw, tokens, atts, left}]
13 | """
14 | import sys
15 | import os
16 | import os.path as osp
17 | from pprint import pprint
18 | import time
19 | import argparse
20 | import json
21 | from pyutils.attparser import cocoParser, clefParser
22 | # set nltk data path
23 | import nltk
24 | # nltk.data.path.append('/Users/liyu/Documents/nltk_data')
25 | nltk.data.path.append('/mnt/ilcompf6d0/user/liyu/Developments/nltk_data')
26 | 
27 | def analyze(sents):
28 |     # do some statistics
29 |     usage = {'r1': 0, 'r2': 0, 'r3': 0, 'r4': 0, 'r5': 0, 'r6': 0, 'r7': 0, 'r8': 0}
30 |     for sent in sents:
31 |         for r in usage:
32 |             usage[r] = usage[r] + 1 if sent['atts'][r] != ['none'] else usage[r]
33 |     for r in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8']:
34 |         usage[r] /= float(len(sents))
35 |         print('Usage of %s is %.2f%%.'
% (r, usage[r] * 100)) 36 | 37 | def main(params): 38 | 39 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 40 | if not osp.isdir('cache/parsed_atts/' + dataset_splitBy): 41 | os.makedirs('cache/parsed_atts/' + dataset_splitBy) 42 | 43 | # load parsed sents, where sents.json = 44 | # [{sent_id, sent, parse, raw, tokens}], where parse = {dependencies, parsetree, text, workds} 45 | path_to_parsed_sents = osp.join('cache/parsed_sents', dataset_splitBy, 'sents.json') 46 | sents = json.load(open(path_to_parsed_sents)) 47 | 48 | # parse attributes for each sent 49 | if 'refcoco' in params['dataset']: 50 | attparser = cocoParser.CocoParser() 51 | elif 'refclef' in params['dataset']: 52 | attparser = clefParser.ClefParser() 53 | 54 | for i, sent in enumerate(sents): 55 | parse = sent['parse'] 56 | try: 57 | attparser.reset(parse) 58 | sent['atts'] = attparser.decompose() # return list of atts, i.e., {r1: [man], r2: [blue], r3: [], ...} 59 | sent['left'] = attparser.leftWords() # return list of (wd, pos), excluding stopping words 60 | except: 61 | sent['atts'] = {'r1': ['none'], 'r2': ['none'], 'r3': ['none'], 'r4': ['none'], 'r5': ['none'], 62 | 'r6': ['none'], 'r7': ['none'], 'r8': ['none']} 63 | sent['left'] = attparser.leftWords() 64 | if i % 100 == 0: 65 | print('%s/%s has been decomposed into attributes r1-r8.' % (i+1, len(sents))) 66 | 67 | # analyze 68 | analyze(sents) 69 | 70 | # save 71 | with open(osp.join('cache/parsed_atts/', dataset_splitBy, 'sents.json'), 'w') as io: 72 | json.dump(sents, io) 73 | 74 | 75 | if __name__ == '__main__': 76 | 77 | # input 78 | parser = argparse.ArgumentParser() 79 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 80 | parser.add_argument('--splitBy', default='unc', help='split By') 81 | args = parser.parse_args() 82 | params = vars(args) 83 | 84 | # main 85 | main(params) 86 | 87 | -------------------------------------------------------------------------------- /parse_sents.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code parse sentences into dependencies, parsetree, text and workds using Stanford-CoreNLP-Parser, 3 | but current corenlp is only able to load v3.5.1 and v3.5.2. 4 | 5 | The parsed sentences are saved in cache/parsed_sents/dataset_splitBy/sents.json 6 | The sents.json = [{sent_id, sent, parse, raw, tokens}], where parse = {dependencies, parsetree, text, workds} 7 | """ 8 | import sys 9 | import os 10 | import os.path as osp 11 | from pprint import pprint 12 | from nltk.tree import * 13 | from Queue import Queue 14 | from threading import Thread, Lock 15 | import time 16 | import argparse 17 | import json 18 | from pyutils.corenlp import StanfordCoreNLP 19 | 20 | def load_corenlp(params): 21 | # load corenlp 22 | b = time.time() 23 | core = StanfordCoreNLP(params['corenlp_model']) 24 | print('corenlp model loaded in %.2f seconds.' % (time.time() - b)) 25 | return core 26 | 27 | def parse_sents(sents, params): 28 | """ 29 | The input sents is list of [{sent_id, sent, raw, tokens}] 30 | The parse results if {dependencies: [(det, dog, the), (root, ROOT, dog)...] 
31 | parsetree: u'(ROOT (NP (NP (DT the) (JJ left) (NN dog)) (PP (IN on) (NP (DT the) (NN tree)))))' 32 | text: u'the left dog on the tree' 33 | words: [(u'the', 34 | {u'CharacterOffsetBegin': u'0', 35 | u'CharacterOffsetEnd': u'3', 36 | u'Lemma': u'the', 37 | u'NamedEntityTag': u'O', 38 | u'PartOfSpeech': u'DT'}), ...]} 39 | Return sents = [{sent_id, sent, parse, raw, tokens}] 40 | """ 41 | num_sents = len(sents) 42 | 43 | # enqueue 44 | q = Queue() 45 | for i in range(num_sents): 46 | q.put((i, sents[i])) 47 | 48 | # work: dequeue and do job 49 | def worker(): 50 | core = load_corenlp(params) 51 | while True: 52 | i, sent = q.get() 53 | try: 54 | output = core.raw_parse(sent['sent'])['sentences'][0] 55 | except: 56 | output = core.raw_parse('none')['sentences'][0] 57 | if i % 100 == 0: 58 | print('%s/%s done.' % (i, num_sents)) 59 | sents[i]['parse'] = output 60 | q.task_done() 61 | 62 | # workers 63 | for w in range(params['num_workers']): 64 | t = Thread(target=worker) 65 | t.daemon = True 66 | t.start() 67 | q.join() 68 | 69 | 70 | def main(params): 71 | 72 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 73 | if not osp.isdir('cache/parsed_sents/'+dataset_splitBy): 74 | os.makedirs('cache/parsed_sents/'+dataset_splitBy) 75 | 76 | # load refer 77 | sys.path.insert(0, 'pyutils/refer') 78 | from refer import REFER 79 | refer = REFER(params['data_root'], params['dataset'], params['splitBy']) 80 | 81 | # parse sents 82 | sents = refer.Sents.values() 83 | parse_sents(sents, params) 84 | 85 | # save 86 | with open(osp.join('cache/parsed_sents/'+dataset_splitBy, 'sents.json'), 'w') as io: 87 | json.dump(sents, io) 88 | 89 | 90 | if __name__ == '__main__': 91 | 92 | # input 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('--data_root', default='data', help='dataset root directory') 95 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 96 | parser.add_argument('--splitBy', default='unc', help='split By') 97 | parser.add_argument('--corenlp_model', default='models/stanford-corenlp-full-2015-01-29') 98 | parser.add_argument('--num_workers', type=int, default=2, help='number of workers') 99 | args = parser.parse_args() 100 | params = vars(args) 101 | 102 | # main 103 | main(params) 104 | 105 | 106 | -------------------------------------------------------------------------------- /pyutils/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /pyutils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lichengunc/refer-parser2/a5214d0c4b086e1da5ccd92fd105d7c95a6f6fc3/pyutils/__init__.py -------------------------------------------------------------------------------- /pyutils/attparser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lichengunc/refer-parser2/a5214d0c4b086e1da5ccd92fd105d7c95a6f6fc3/pyutils/attparser/__init__.py -------------------------------------------------------------------------------- /pyutils/attparser/baseParser.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | """ 4 | BaseParser defines: 5 | reset: initialize parse, head word, rels and Deps. 
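leftWords: return the (word, POS) pairs left over after removing the r1-r7 attribute words and stop words.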
6 | """ 7 | 8 | from nltk.tree import * 9 | import sys 10 | from nltk.corpus import stopwords 11 | import os.path as osp 12 | import config 13 | import head 14 | 15 | class BaseParser(): 16 | def __init__(self, dataset): 17 | if dataset == 'refclef': 18 | self.config = config.configCLEF() 19 | self._headMode = 'vicente' 20 | elif dataset == 'refcoco' or dataset == 'refcoco+': 21 | self.config = config.configCOCO() 22 | self._headMode = 'licheng' 23 | else: 24 | print 'No configuration set yet.' 25 | sys.exit() 26 | 27 | def reset(self, parse): 28 | # load parse 29 | self._tree = Tree.fromstring(parse['parsetree']) 30 | self._dependencies = parse['dependencies'] 31 | self._words = parse['words'] 32 | self._text = parse['text'] 33 | 34 | # reset seven attributes 35 | self.r1, self.r2, self.r3, self.r4, self.r5, self.r6, self.r7 = [], [], [], [], [], [], [] 36 | 37 | # find head word 38 | self.head_word, _ = head.findHead(self._tree, mode = self._headMode) 39 | if self.head_word != '' and self.head_word != None: 40 | self.r1 = [wd[1]['Lemma'] for wd in self._words if wd[0] == self.head_word] 41 | self.r1 = [self.r1[0]] # we only need one 42 | else: 43 | self.r1 = ['none'] 44 | 45 | # dependency's relations that have 'prep' 46 | rels_prep = [dep for dep in self._dependencies if 'prep' in dep[0]] 47 | rels_prep_in = [dep for dep in self._dependencies if 'prep_in' in dep[0]] 48 | rels_prep_on = [dep for dep in self._dependencies if 'prep_on' in dep[0]] 49 | rels_prep_at = [dep for dep in self._dependencies if 'prep_at' in dep[0]] 50 | rels_prep_to = [dep for dep in self._dependencies if 'prep_to' in dep[0]] 51 | rels_prep_from = [dep for dep in self._dependencies if 'prep_from' in dep[0] or 'prepc_from' in dep[0]] 52 | rels_prep_of = [dep for dep in self._dependencies if 'prep_of' in dep[0]] 53 | rels_det = [dep for dep in self._dependencies if 'det' in dep[0]] 54 | 55 | # dependency's sources equal to head_word 56 | rels_direct = [dep for dep in self._dependencies if dep[1] == self.head_word] if self.r1[0]!='none' else [] 57 | direct_att_dep = [dep for dep in rels_direct if dep not in rels_prep + rels_det] 58 | prep_dep = [dep for dep in rels_direct if dep in rels_prep] 59 | prep_in_dep = [dep for dep in rels_direct if dep in rels_prep_in] 60 | prep_on_dep = [dep for dep in rels_direct if dep in rels_prep_on] 61 | prep_of_dep = [dep for dep in rels_direct if dep in rels_prep_of] 62 | prep_from_dep = [dep for dep in rels_direct if dep in rels_prep_from] 63 | prep_at_dep = [dep for dep in rels_direct if dep in rels_prep_at] 64 | prep_to_dep = [dep for dep in rels_direct if dep in rels_prep_to] 65 | 66 | # initialize types of dependencies 67 | self.rels = {} 68 | self.rels['prep'] = rels_prep 69 | self.rels['prep_in'] = rels_prep_in 70 | self.rels['prep_on'] = rels_prep_on 71 | self.rels['prep_at'] = rels_prep_at 72 | self.rels['prep_to'] = rels_prep_to 73 | self.rels['prep_from'] = rels_prep_from 74 | self.rels['prep_of'] = rels_prep_of 75 | 76 | # initialize types of dependencies whose source is head word 77 | # Deps denots Direct dependencies 78 | self.Deps = {} 79 | self.Deps['att'] = direct_att_dep 80 | self.Deps['prep'] = prep_dep 81 | self.Deps['prep_in'] = prep_in_dep 82 | self.Deps['prep_on'] = prep_on_dep 83 | self.Deps['prep_of'] = prep_of_dep 84 | self.Deps['prep_from'] = prep_from_dep 85 | self.Deps['prep_at'] = prep_at_dep 86 | self.Deps['prep_to'] = prep_to_dep 87 | 88 | def leftWords(self): 89 | all_wds = [word[0] for word in self._words] 90 | att_wds = [self.head_word] + self.r2 + 
self.r3 + self.r4 + self.r7 91 | # we then add r5, r6 to att_wds, need some tricks 92 | for wd in self.r5: 93 | if 'prep' in wd: 94 | wd = wd[5:] # prep_on_left -> on_left 95 | idx = wd.find('_') 96 | if idx >= 0: 97 | att_wds += [wd[:idx], wd[idx+1:]] 98 | else: 99 | att_wds += [wd] # prep_from -> from 100 | else: # ordinary_position, e.g., second_left 101 | idx = wd.find('_') 102 | att_wds += [wd[:idx], wd[idx+1:]] 103 | for wd in self.r6: 104 | att_wds = att_wds + [wd] if wd != 'self' else att_wds 105 | # the left word set 106 | left_wds = list(set(all_wds).difference(set(att_wds))) 107 | # word to POS dictionary 108 | wdToPOSs = {word[0]: [] for word in self._words} 109 | for word in self._words: 110 | wdToPOSs[word[0]] += [word[1]] 111 | # return left words 112 | # stopwds = ['the', 'of', 'a', 'an', ',', '.', 'on', 'in', 'from', 'at', 'of', 'to', 'and', 'or', '(', ')', 'that', 'this', 'it'] 113 | stopwds = stopwords.words("english") + ['.', ',', ':', '(', ')', '"', "'s", '!', 'between', 'against', 'above', 114 | 'below', 'up', 'down', 'out', 'off', 'over'] 115 | left_words = [(wd, wdToPOSs[wd][0]['PartOfSpeech']) for wd in left_wds if wd not in stopwds] 116 | return left_words 117 | 118 | if __name__ == '__main__': 119 | from pprint import pprint 120 | 121 | ROOT_DIR = osp.abspath('/playpen/licheng/Documents/referit') 122 | sys.path.insert(0, osp.join(ROOT_DIR, 'lib', 'utils')) 123 | from corenlp.corenlp import StanfordCoreNLP 124 | parser_path = osp.join(ROOT_DIR, 'lib', 'utils', 'corenlp', 'stanford-corenlp-full-2015-01-30') 125 | stanfordParser = StanfordCoreNLP(parser_path) 126 | 127 | sent = 'players close to us in dark uniform' 128 | parse = stanfordParser.raw_parse(sent)['sentences'][0] 129 | pprint(parse) 130 | 131 | attParser = BaseParser('refclef') 132 | attParser.reset(parse) 133 | 134 | 135 | -------------------------------------------------------------------------------- /pyutils/attparser/clefParser.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | """ 4 | r1: [lemma of head word] 5 | r2: [color word describing r1] 6 | r3: [size word describing r1] 7 | r4: [location word describing r1], e.g., upper dog, dog on the left (of the picture) 8 | r5: [relative location and object], e.g., person under the door, dog on the table, dog on the left of the cat 9 | r6: [generic attribute describing r1], i.e., other JJ attributes describing head word 10 | """ 11 | 12 | from baseParser import BaseParser 13 | 14 | class ClefParser(BaseParser): 15 | 16 | def __init__(self): 17 | BaseParser.__init__(self, 'refclef') 18 | 19 | def decompose(self): 20 | # r2: color 21 | color_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.color_table['words']] 22 | color_wds += [dep[3] for dep in self.Deps['prep_in'] if dep[3] in self.config.color_table['words']] 23 | for wd in color_wds: 24 | ix = self.config.color_table['wordtoix'][wd] 25 | self.r2 += [self.config.color_table['ixtoword'][ix]] 26 | 27 | # r3: size 28 | size_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.size_table['words']] 29 | for wd in size_wds: 30 | ix = self.config.size_table['wordtoix'][wd] 31 | self.r3 += [self.config.size_table['ixtoword'][ix]] 32 | 33 | # r4: absolute location 34 | # 1) left sth 35 | location_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.location_table['words']] 36 | # 2) sth in/on/at/to the left 37 | commonDeps = 
self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 38 | position_wds = [dep[3] for dep in commonDeps if dep[3] in self.config.position_table['words']] 39 | for wd in position_wds: 40 | of_exist = [dep[3] for dep in self.rels['prep_of'] if dep[1] == wd] 41 | if len(of_exist) == 0: 42 | location_wds += [wd] 43 | # 2) sth in/on/at/to the left of the image 44 | AllowWds = ['image', 'picture', 'im', 'pic'] 45 | location_wds += [dep[1] for dep in self.rels['prep_of'] if dep[1] in position_wds and dep[3] in AllowWds] 46 | # add to r4 47 | for wd in location_wds: 48 | ix = self.config.location_table['wordtoix'][wd] 49 | self.r4 += [self.config.location_table['ixtoword'][ix]] 50 | 51 | # r5, r6: relative location and object 52 | ''' 53 | e.g., sent = 'players at the door' 54 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 55 | ('det', 'door', '4', 'the', '3'), 56 | ('prep_at', 'players', '1', 'door', '4')] 57 | 58 | sent = 'players on the left of the dog' 59 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 60 | ('det', 'left', '4', 'the', '3'), 61 | ('prep_on', 'players', '1', 'left', '4'), 62 | ('det', 'dog', '7', 'the', '6'), 63 | ('prep_of', 'left', '4', 'dog', '7')] 64 | 65 | Note, in vicente's matlab, the parsing differs at adding punctuation in the end. 66 | ''' 67 | # 1) the dog from the river 68 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 69 | rel_pairs = [(dep[0], dep[3]) for dep in self.Deps['prep'] if dep[0] in self.config.relative_preps_table['words'] 70 | if dep[3] not in ForbiddenWds] 71 | for pair in rel_pairs: 72 | self.r5 += [pair[0]] 73 | self.r6 += [pair[1]] 74 | # 2) the dog on/in/at the table 75 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 76 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 77 | rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] not in ForbiddenWds] 78 | for pair in rel_pairs: 79 | self.r5 += [pair[0]] 80 | self.r6 += [pair[1]] 81 | # 3) the dog on/in/at/to the left of table 82 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 83 | rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] in self.config.position_table['words']] 84 | ForbiddenWds = ['image', 'picture', 'im', 'pic'] 85 | for rel, position_wd in rel_pairs: 86 | of_exist = [dep[3] for dep in self.rels['prep_of'] if dep[1] == position_wd] 87 | if len(of_exist) > 0: 88 | for of_object in of_exist: 89 | if of_object not in ForbiddenWds: 90 | self.r5 += [rel+'_'+position_wd] 91 | self.r6 += [of_object] 92 | 93 | # r7: generic attribute 94 | ForbiddenWds = self.config.size_table['words'] + self.config.color_table['words'] \ 95 | + self.config.location_table['words'] 96 | generic_wds = [dep[3] for dep in self.Deps['att'] if dep[3] not in ForbiddenWds] 97 | for gwd in generic_wds: 98 | gpos = [wd[1]['PartOfSpeech'] for wd in self._words if wd[0] == gwd][0] 99 | if gpos[:2] == 'JJ': 100 | self.r7 += [gwd] 101 | 102 | self.r2 = ['none'] if len(self.r2) == 0 else self.r2 103 | self.r3 = ['none'] if len(self.r3) == 0 else self.r3 104 | self.r4 = ['none'] if len(self.r4) == 0 else self.r4 105 | self.r5 = ['none'] if len(self.r5) == 0 else self.r5 106 | self.r6 = ['none'] if len(self.r6) == 0 else self.r6 107 | self.r7 = ['none'] if len(self.r7) == 0 else self.r7 108 | 109 | # left words -> r8 110 | left_wds = [word[0] for word in self.leftWords()] 111 | self.r8 = ['none'] if len(left_wds) 
== 0 else left_wds 112 | 113 | return {'r1': self.r1, 'r2': self.r2, 'r3': self.r3, 'r4': self.r4, 'r5': self.r5, 'r6': self.r6, 'r7': self.r7, 'r8': self.r8} 114 | 115 | 116 | # def decompose(self): 117 | # # r2: color 118 | # color_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.color_table['wordtoix']] 119 | # color_wds += [dep[3] for dep in self.Deps['prep_in'] if dep[3] in self.config.color_table['wordtoix']] 120 | # for wd in color_wds: 121 | # ix = self.config.color_table['wordtoix'][wd] 122 | # self.r2 += [self.config.color_table['ixtoword'][ix]] 123 | # 124 | # # r3: size 125 | # size_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.size_table['words']] 126 | # for wd in size_wds: 127 | # ix = self.config.size_table['wordtoix'][wd] 128 | # self.r3 += [self.config.size_table['ixtoword'][ix]] 129 | # 130 | # # r4: absolute location 131 | # # 1) left sth 132 | # location_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.location_table['words']] 133 | # # 2) sth in/on/at the left. 134 | # commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 135 | # position_deps = [dep for dep in commonDeps if dep[3] in self.config.position_table['words']] 136 | # if len(self.Deps['prep_of']) == 0: 137 | # location_wds += [dep[3] for dep in position_deps] 138 | # else: 139 | # # 3) sth in/on/at the left of the picture. 140 | # # 4) sth of sth in/on/at the left. Note we allow 'of' appear before 'left/right/...' 141 | # ForbiddenWds = ['image', 'picture', 'im', 'pic'] 142 | # position_id = min([dep[4] for dep in position_deps]) if len(position_deps) > 0 else 0 143 | # position_of_objects = [dep[3] for dep in self.Deps['prep_of'] if dep[3] not in ForbiddenWds and dep[4] > position_id] 144 | # if len(position_of_objects) == 0: 145 | # location_wds += [dep[3] for dep in position_deps] 146 | # 147 | # # add to r4 148 | # for wd in location_wds: 149 | # ix = self.config.location_table['wordtoix'][wd] 150 | # self.r4 += [self.config.location_table['ixtoword'][ix]] 151 | # 152 | # # r5, r6: relative location and object 153 | # ''' 154 | # e.g., sent = 'players at the door.' 155 | # dependencies = [('root', 'ROOT', '0', 'players', '1'), 156 | # ('det', 'door', '4', 'the', '3'), 157 | # ('prep_at', 'players', '1', 'door', '4')] 158 | # 159 | # sent = 'players on the left of the dog.' 160 | # dependencies = [('root', 'ROOT', '0', 'players', '1'), 161 | # ('det', 'left', '4', 'the', '3'), 162 | # ('prep_on', 'players', '1', 'left', '4'), 163 | # ('det', 'dog', '7', 'the', '6'), 164 | # ('prep_of', 'players', '1', 'dog', '7')] 165 | # 166 | # Following vicente's matlab, this parsing is for sentence with punctuation at the end. 167 | # ''' 168 | # # 1) the dog from the river. 169 | # ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 170 | # rel_pairs = [(dep[0], dep[3]) for dep in self.Deps['prep'] if dep[0] in self.config.relative_preps_table['words'] 171 | # if dep[3] not in ForbiddenWds] 172 | # for pair in rel_pairs: 173 | # self.r5 += [pair[0]] 174 | # self.r6 += [pair[1]] 175 | # # 2) the dog on/in/at the table. 
176 | # commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 177 | # ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 178 | # rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] not in ForbiddenWds] 179 | # for pair in rel_pairs: 180 | # self.r5 += [pair[0]] 181 | # self.r6 += [pair[1]] 182 | # # 3) the dog on/in/at/to the left of table. 183 | # # 4) the face of woman on the left of the window. Note we only detect position_of_objects 184 | # commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 185 | # ForbiddenWds = ['image', 'picture', 'im', 'pic'] 186 | # position_deps = [dep for dep in commonDeps if dep[3] in self.config.position_table['words']] 187 | # position_id = min([dep[4] for dep in position_deps]) if len(position_deps) > 0 else 0 # find the earliest position for 'left', 'right', 'top', ... 188 | # position_of_objects = [dep[3] for dep in self.Deps['prep_of'] if dep[3] not in ForbiddenWds and dep[4] > position_id] # 'of' must appear after position_id 189 | # for dep in position_deps: 190 | # if len(position_of_objects) > 0: 191 | # for of_object in position_of_objects: 192 | # self.r5 += [dep[0]+'_'+dep[3]] 193 | # self.r6 += [of_object] 194 | # 195 | # # r7: generic attribute 196 | # ForbiddenWds = self.config.size_table['words'] + self.config.color_table['words'] \ 197 | # + self.config.location_table['words'] 198 | # generic_wds = [dep[3] for dep in self.Deps['att'] if dep[3] not in ForbiddenWds] 199 | # for gwd in generic_wds: 200 | # gpos = [wd[1]['PartOfSpeech'] for wd in self._words if wd[0] == gwd][0] 201 | # if gpos[:2] == 'JJ': 202 | # self.r7 += [gwd] 203 | # 204 | # self.r2 = ['none'] if len(self.r2) == 0 else self.r2 205 | # self.r3 = ['none'] if len(self.r3) == 0 else self.r3 206 | # self.r4 = ['none'] if len(self.r4) == 0 else self.r4 207 | # self.r5 = ['none'] if len(self.r5) == 0 else self.r5 208 | # self.r6 = ['none'] if len(self.r6) == 0 else self.r6 209 | # self.r7 = ['none'] if len(self.r7) == 0 else self.r7 210 | # 211 | # # left words -> r8 212 | # left_wds = [word[0] for word in self.leftWords()] 213 | # self.r8 = ['none'] if len(left_wds) == 0 else left_wds 214 | # 215 | # return {'r1': self.r1, 'r2': self.r2, 'r3': self.r3, 'r4': self.r4, 'r5': self.r5, 'r6': self.r6, 'r7': self.r7, 'r8': self.r8} 216 | 217 | 218 | if __name__ == '__main__': 219 | import sys 220 | from pprint import pprint 221 | import os.path as osp 222 | ROOT_DIR = osp.abspath('/playpen/licheng/Documents/referit') 223 | sys.path.insert(0, osp.join(ROOT_DIR, 'lib', 'utils')) 224 | from corenlp.corenlp import StanfordCoreNLP 225 | parser_path = osp.join(ROOT_DIR, 'lib', 'utils', 'corenlp', 'stanford-corenlp-full-2015-01-30') 226 | stanfordParser = StanfordCoreNLP(parser_path) 227 | 228 | sent = 'woman in red shirt' 229 | parse = stanfordParser.raw_parse(sent)['sentences'][0] 230 | pprint(parse['dependencies']) 231 | 232 | attParser = ClefParser() 233 | attParser.reset(parse) 234 | pprint(attParser.decompose()) 235 | pprint(attParser.leftWords()) 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | -------------------------------------------------------------------------------- /pyutils/attparser/cocoParser.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | """ 4 | r1: [lemma of head word] 5 | r2: [color word describing r1] 6 | r3: [size word describing r1] 7 | r4: [location word describing r1], 
e.g., upper dog, dog on the left (of the picture) 8 | r5: [relative location and object], e.g., person under the door, dog on the table, dog on the left of the cat, 9 | Note, we also take "second cat from left", i.e., [r5 = second_left, r6 = self] 10 | r6: [generic attribute describing r1], i.e., other JJ and dep attributes describing head word 11 | """ 12 | 13 | from baseParser import BaseParser 14 | 15 | class CocoParser(BaseParser): 16 | 17 | def __init__(self): 18 | BaseParser.__init__(self, 'refcoco') 19 | 20 | def reset(self, parse): 21 | BaseParser.reset(self, parse) 22 | # Now, let's extract dependencies related to ordinary words 23 | self.Deps['ord'] = [dep for dep in self._dependencies if dep[1] == self.r1[0] and dep[3] in 24 | self.config.ordinal_table['words']] if self.r1[0]!='none' else [] 25 | self.rels['ord_prep'] = [dep for dep in self.rels['prep'] if dep[1] in self.config.ordinal_table['words']] 26 | 27 | def decompose(self): 28 | # r2: color 29 | color_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.color_table['wordtoix']] 30 | color_wds += [dep[3] for dep in self.Deps['prep_in'] if dep[3] in self.config.color_table['wordtoix']] 31 | for wd in color_wds: 32 | ix = self.config.color_table['wordtoix'][wd] 33 | self.r2 += [self.config.color_table['ixtoword'][ix]] 34 | 35 | # r3: size 36 | size_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.size_table['words']] 37 | for wd in size_wds: 38 | ix = self.config.size_table['wordtoix'][wd] 39 | self.r3 += [self.config.size_table['ixtoword'][ix]] 40 | 41 | # r4: absolute location 42 | location_wds = [] 43 | if len(self.Deps['ord']) + len(self.rels['ord_prep']) == 0: 44 | # 1) left sth (no ordinal word) 45 | location_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.location_table['words']] 46 | # 2) sth in/on/at/to the left (no ordinal word) 47 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 48 | position_wds = [dep[3] for dep in commonDeps if dep[3] in self.config.position_table['words']] 49 | for wd in position_wds: 50 | of_exist = [dep[3] for dep in self.rels['prep_of'] if dep[1] == wd] 51 | if len(of_exist) == 0: 52 | location_wds += [wd] 53 | # 3) sth in/on/at/to the left of the image (no ordinal word) 54 | AllowWds = ['image', 'picture', 'im', 'pic'] 55 | location_wds += [dep[1] for dep in self.rels['prep_of'] if dep[1] in position_wds and dep[3] in AllowWds] 56 | # add to r4 57 | for wd in location_wds: 58 | ix = self.config.location_table['wordtoix'][wd] 59 | self.r4 += [self.config.location_table['ixtoword'][ix]] 60 | 61 | # r5, r6: relative location and object 62 | ''' 63 | e.g., case 1 and 2: 64 | sent = 'players at the door' or 'players from the river' 65 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 66 | ('det', 'door', '4', 'the', '3'), 67 | ('prep_at', 'players', '1', 'door', '4')] 68 | case 3: 69 | sent = 'players on the left of the dog' 70 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 71 | ('det', 'left', '4', 'the', '3'), 72 | ('prep_on', 'players', '1', 'left', '4'), 73 | ('det', 'dog', '7', 'the', '6'), 74 | ('prep_of', 'left', '4', 'dog', '7')] 75 | case 4: 76 | sent = 'second left man' 77 | dependencies = [('root', 'ROOT', '0', 'man', '3'), 78 | ('amod', 'man', '3', 'second', '1'), 79 | ('amod', 'man', '3', 'left', '2')] 80 | case 5: 81 | sent = 'second man from left' 82 | dependencies = [('root', 'ROOT', '0', 'man', '2'), 83 | ('amod', 'man', '2', 'second', '1'), 84 | ('prepc_from', 
'man', '2', 'left', '4')] 85 | case 6: 86 | sent = 'man second from right' 87 | dependencies = [('root', 'ROOT', '0', 'second', '2'), 88 | ('nn', 'second', '2', 'man', '1'), 89 | ('prep_from', 'second', '2', 'right', '4')] 90 | case 7: 91 | sent = 'second from right man' 92 | dependencies = [('root', 'ROOT', '0', 'second', '1'), 93 | ('amod', 'man', '4', 'right', '3'), 94 | ('prep_from', 'second', '1', 'man', '4')] 95 | Note, in vicente's matlab, the parsing differs at adding punct in the end. 96 | ''' 97 | if len(self.Deps['ord']) + len(self.rels['ord_prep']) == 0: 98 | # 1) the dog from the river 99 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 100 | rel_pairs = [(dep[0], dep[3]) for dep in self.Deps['prep'] if dep[0] in self.config.relative_preps_table['words'] 101 | if dep[3] not in ForbiddenWds] 102 | for pair in rel_pairs: 103 | self.r5 += [pair[0]] 104 | self.r6 += [pair[1]] 105 | # 2) the dog on/in/at the table 106 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 107 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 108 | rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] not in ForbiddenWds] 109 | for pair in rel_pairs: 110 | self.r5 += [pair[0]] 111 | self.r6 += [pair[1]] 112 | # 3) players on the left of the dog 113 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 114 | rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] in self.config.position_table['words']] 115 | ForbiddenWds = ['image', 'picture', 'im', 'pic'] 116 | for rel, position_wd in rel_pairs: 117 | of_exist = [dep[3] for dep in self.rels['prep_of'] if dep[1] == position_wd] 118 | if len(of_exist) > 0: 119 | for of_object in of_exist: 120 | if of_object not in ForbiddenWds: 121 | self.r5 += [rel+'_'+position_wd] 122 | self.r6 += [of_object] 123 | else: 124 | position_wds, ordinary_wds = [], [] 125 | if len(self.Deps['ord']) > 0: 126 | ordinary_wds = [dep[3] for dep in self.Deps['ord']] 127 | # 4) second left man 128 | position_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.position_table['words']] 129 | # 5) second man from left 130 | position_wds += [dep[3] for dep in self.rels['prep_from'] if dep[3] in self.config.position_table['words']] 131 | if len(self.rels['ord_prep']) > 0: 132 | ordinary_wds = [dep[1] for dep in self.rels['ord_prep']] 133 | # 6) man second from right 134 | position_wds = [dep[3] for dep in self.rels['ord_prep'] if dep[3] in self.config.position_table['words']] 135 | # 7) second from right man 136 | position_wds += [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.position_table['words']] 137 | # add to r5 and r6 138 | if len(position_wds) > 0: 139 | self.r5 += [ordinary_wds[0]+'_'+position_wds[0]] 140 | self.r6 += ['self'] 141 | 142 | # r7: generic attribute 143 | ForbiddenWds = self.config.size_table['words'] + self.config.color_table['words'] + self.config.position_table['words'] \ 144 | + self.config.location_table['words'] + self.config.ordinal_table['words'] 145 | generic_wds = [dep[3] for dep in self.Deps['att'] if dep[3] not in ForbiddenWds] 146 | for gwd in generic_wds: 147 | gpos = [wd[1]['PartOfSpeech'] for wd in self._words if wd[0] == gwd][0] 148 | if gpos[:2] == 'JJ': 149 | self.r7 += [gwd] 150 | 151 | self.r2 = ['none'] if len(self.r2) == 0 else self.r2 152 | self.r3 = ['none'] if len(self.r3) == 0 else self.r3 153 | self.r4 = ['none'] if len(self.r4) == 0 else self.r4 154 | 
self.r5 = ['none'] if len(self.r5) == 0 else self.r5 155 | self.r6 = ['none'] if len(self.r6) == 0 else self.r6 156 | self.r7 = ['none'] if len(self.r7) == 0 else self.r7 157 | 158 | # left words -> r8 159 | left_wds = [word[0] for word in self.leftWords()] 160 | self.r8 = ['none'] if len(left_wds) == 0 else left_wds 161 | 162 | return {'r1': self.r1, 'r2': self.r2, 'r3': self.r3, 'r4': self.r4, 'r5': self.r5, 'r6': self.r6, 'r7': self.r7, 'r8': self.r8} 163 | 164 | 165 | if __name__ == '__main__': 166 | import sys 167 | from pprint import pprint 168 | import os.path as osp 169 | # set nltk data path 170 | import nltk 171 | nltk.data.path.append('/Users/liyu/Documents/nltk_data') 172 | sys.path.insert(0, '../..') 173 | from pyutils.corenlp import StanfordCoreNLP 174 | core = StanfordCoreNLP('../../models/stanford-corenlp-full-2015-01-29') 175 | 176 | # sent = 'face of woman to the left' 177 | # sent = 'guy in blue' 178 | sent = 'a sandal colour teddy bear in between the other two teddys' 179 | parse = core.raw_parse(sent)['sentences'][0] 180 | pprint(parse['dependencies']) 181 | 182 | attParser = CocoParser() 183 | attParser.reset(parse) 184 | pprint(attParser.decompose()) 185 | pprint(attParser.leftWords()) 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /pyutils/attparser/cocoParser_punct.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | """ 4 | r1: [lemma of head word] 5 | r2: [color word describing r1] 6 | r3: [size word describing r1] 7 | r4: [location word describing r1], e.g., upper dog, dog on the left (of the picture) 8 | r5: [relative location and object], e.g., person under the door, dog on the table, dog on the left of the cat, 9 | Note, we also take "second cat from left", i.e., [r5 = second_left, r6 = self] 10 | r6: [generic attribute describing r1], i.e., other JJ and dep attributes describing head word 11 | """ 12 | 13 | from baseParser import BaseParser 14 | 15 | class CocoParser(BaseParser): 16 | 17 | def __init__(self): 18 | BaseParser.__init__(self, 'refcoco') 19 | 20 | def reset(self, parse): 21 | BaseParser.reset(self, parse) 22 | # Now, let's extract dependencies related to ordinary words 23 | self.Deps['ord'] = [dep for dep in self._dependencies if dep[1] == self.r1[0] and dep[3] in 24 | self.config.ordinal_table['words']] if self.r1[0]!='none' else [] 25 | self.rels['ord_prep'] = [dep for dep in self.rels['prep'] if dep[1] in self.config.ordinal_table['words']] 26 | 27 | def decompose(self): 28 | # r2: color 29 | color_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.color_table['wordtoix']] 30 | color_wds += [dep[3] for dep in self.Deps['prep_in'] if dep[3] in self.config.color_table['wordtoix']] 31 | for wd in color_wds: 32 | ix = self.config.color_table['wordtoix'][wd] 33 | self.r2 += [self.config.color_table['ixtoword'][ix]] 34 | 35 | # r3: size 36 | size_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.size_table['words']] 37 | for wd in size_wds: 38 | ix = self.config.size_table['wordtoix'][wd] 39 | self.r3 += [self.config.size_table['ixtoword'][ix]] 40 | 41 | # r4: absolute location (no ordinal word) 42 | location_wds = [] 43 | if len(self.Deps['ord']) + len(self.rels['ord_prep']) == 0: 44 | # 1) left sth. 45 | location_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.location_table['words']] 46 | # 2) sth in/on the left. 
47 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 48 | position_deps = [dep for dep in commonDeps if dep[3] in self.config.position_table['words']] 49 | if len(self.Deps['prep_of']) == 0: 50 | location_wds += [dep[3] for dep in position_deps] 51 | else: 52 | # 3) sth in/on/at the left of the image. 53 | # 4) sth of sth in/on/at the left. 54 | ForbiddenWds = ['image', 'picture', 'im', 'pic'] 55 | position_id = min([dep[4] for dep in position_deps]) if len(position_deps) > 0 else 0 56 | position_of_objects = [dep[3] for dep in self.Deps['prep_of'] if dep[3] not in ForbiddenWds and dep[4] > position_id] 57 | if len(position_of_objects) == 0: 58 | location_wds += [dep[3] for dep in position_deps] 59 | # add to r4 60 | for wd in location_wds: 61 | ix = self.config.location_table['wordtoix'][wd] 62 | self.r4 += [self.config.location_table['ixtoword'][ix]] 63 | 64 | # r5, r6: relative location and object 65 | ''' 66 | e.g., case 1 and 2: 67 | sent = 'players at the door.' or 'players from the river.' 68 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 69 | ('det', 'door', '4', 'the', '3'), 70 | ('prep_at', 'players', '1', 'door', '4')] 71 | case 3: 72 | sent = 'players on the left of the dog.' 73 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 74 | ('det', 'left', '4', 'the', '3'), 75 | ('prep_on', 'players', '1', 'left', '4'), 76 | ('det', 'dog', '7', 'the', '6'), 77 | ('prep_of', 'players', '1', 'dog', '7')] 78 | 79 | case 4: 80 | sent = 'second left man.' 81 | dependencies = [('root', 'ROOT', '0', 'man', '3'), 82 | ('amod', 'man', '3', 'second', '1'), 83 | ('amod', 'man', '3', 'left', '2')] 84 | case 5: 85 | sent = 'second man from left.' 86 | dependencies = [('root', 'ROOT', '0', 'left', '4'), 87 | ('amod', 'man', '2', 'second', '1'), 88 | ('nsubj', 'left', '4', 'man', '2'), 89 | ('prep', 'man', '2', 'from', '3')] 90 | case 6: 91 | sent = 'man second from right.' 92 | dependencies = [('root', 'ROOT', '0', 'second', '2'), 93 | ('nn', 'second', '2', 'man', '1'), 94 | ('prep_from', 'second', '2', 'right', '4')] 95 | case 7: 96 | sent = 'second from right man.' 97 | dependencies = [('root', 'ROOT', '0', 'second', '1'), 98 | ('amod', 'man', '4', 'right', '3'), 99 | ('prep_from', 'second', '1', 'man', '4')] 100 | Note, in vicente's matlab, the parsing differs at adding punct in the end. 101 | ''' 102 | if len(self.Deps['ord']) + len(self.rels['ord_prep']) == 0: 103 | # 1) the dog from the river 104 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 105 | rel_pairs = [(dep[0], dep[3]) for dep in self.Deps['prep'] if dep[0] in self.config.relative_preps_table['words'] 106 | if dep[3] not in ForbiddenWds] 107 | for pair in rel_pairs: 108 | self.r5 += [pair[0]] 109 | self.r6 += [pair[1]] 110 | # 2) the dog on/in/at the table 111 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 112 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 113 | rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] not in ForbiddenWds] 114 | for pair in rel_pairs: 115 | self.r5 += [pair[0]] 116 | self.r6 += [pair[1]] 117 | # 3) the dog on/in/at/to the left of table. 118 | # 4) the face of woman on the left of the window. 
Note we only detect position_of_objects 119 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 120 | ForbiddenWds = ['image', 'picture', 'im', 'pic'] 121 | position_deps = [dep for dep in commonDeps if dep[3] in self.config.position_table['words']] 122 | position_id = min([dep[4] for dep in position_deps]) if len(position_deps) > 0 else 0 # find the earliest position for 'left', 'right', 'top', ... 123 | position_of_objects = [dep[3] for dep in self.Deps['prep_of'] if dep[3] not in ForbiddenWds and dep[4] > position_id] # 'of' must appear after position_id 124 | for dep in position_deps: 125 | if len(position_of_objects) > 0: 126 | for of_object in position_of_objects: 127 | self.r5 += [dep[0]+'_'+dep[3]] 128 | self.r6 += [of_object] 129 | else: 130 | position_wds, ordinary_wds = [], [] 131 | if len(self.Deps['ord']) > 0: 132 | ordinary_wds = [dep[3] for dep in self.Deps['ord']] 133 | # 4) second left man 134 | position_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.position_table['words']] 135 | # 5) second man from left 136 | position_wds += [dep[3] for dep in self._dependencies if dep[3] in self.config.position_table['words']] # no pattern, so search from all dependencies 137 | if len(self.rels['ord_prep']) > 0: 138 | ordinary_wds = [dep[1] for dep in self.rels['ord_prep']] 139 | # 6) man second from right 140 | position_wds = [dep[3] for dep in self.rels['ord_prep'] if dep[3] in self.config.position_table['words']] 141 | # 7) second from right man 142 | position_wds += [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.position_table['words']] 143 | # add to r5 and r6 144 | if len(position_wds) > 0: 145 | self.r5 += [ordinary_wds[0]+'_'+position_wds[0]] 146 | self.r6 += ['self'] 147 | 148 | # r7: generic attribute 149 | ForbiddenWds = self.config.size_table['words'] + self.config.color_table['words'] + self.config.position_table['words'] \ 150 | + self.config.location_table['words'] + self.config.ordinal_table['words'] 151 | generic_wds = [dep[3] for dep in self.Deps['att'] if dep[3] not in ForbiddenWds] 152 | for gwd in generic_wds: 153 | gpos = [wd[1]['PartOfSpeech'] for wd in self._words if wd[0] == gwd][0] 154 | if gpos[:2] == 'JJ': 155 | self.r7 += [gwd] 156 | 157 | self.r2 = ['none'] if len(self.r2) == 0 else self.r2 158 | self.r3 = ['none'] if len(self.r3) == 0 else self.r3 159 | self.r4 = ['none'] if len(self.r4) == 0 else self.r4 160 | self.r5 = ['none'] if len(self.r5) == 0 else self.r5 161 | self.r6 = ['none'] if len(self.r6) == 0 else self.r6 162 | self.r7 = ['none'] if len(self.r7) == 0 else self.r7 163 | 164 | # left words -> r8 165 | left_wds = [word[0] for word in self.leftWords()] 166 | self.r8 = ['none'] if len(left_wds) == 0 else left_wds 167 | 168 | return {'r1': self.r1, 'r2': self.r2, 'r3': self.r3, 'r4': self.r4, 'r5': self.r5, 'r6': self.r6, 'r7': self.r7, 'r8': self.r8} 169 | 170 | if __name__ == '__main__': 171 | import sys 172 | from pprint import pprint 173 | import os.path as osp 174 | ROOT_DIR = osp.abspath('/playpen/licheng/Documents/referit') 175 | sys.path.insert(0, osp.join(ROOT_DIR, 'lib', 'utils')) 176 | from corenlp.corenlp import StanfordCoreNLP 177 | parser_path = osp.join(ROOT_DIR, 'lib', 'utils', 'corenlp', 'stanford-corenlp-full-2015-01-30') 178 | stanfordParser = StanfordCoreNLP(parser_path) 179 | 180 | sent = 'a bunch of flower at the door.' 
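    # Note: this demo block assumes a local Stanford CoreNLP v3.5.x model at the
    # hard-coded ROOT_DIR/parser_path above; point parser_path at your own
    # download before running this file directly.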
181 | parse = stanfordParser.raw_parse(sent)['sentences'][0] 182 | pprint(parse['dependencies']) 183 | 184 | attParser = CocoParser() 185 | attParser.reset(parse) 186 | pprint(attParser.decompose()) 187 | pprint(attParser.leftWords()) 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /pyutils/attparser/config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | class config(): 4 | def __init__(self): 5 | self.attribute_names = ['entrylevel', 'color', 'size', 'absolute_location', 'relative_location', 'relative_object', 'generic'] 6 | 7 | def buildTable(self, words): 8 | table = {'wordtoix': {}, 'ixtoword': {}, 'words': []} 9 | for ix, wd in enumerate(words): 10 | if wd.find(',') > 0: 11 | jx = wd.find(',') 12 | wd1, wd2 = wd[:jx].strip(), wd[jx+1:].strip() 13 | table['wordtoix'][wd1], table['wordtoix'][wd2] = ix, ix 14 | table['ixtoword'][ix] = wd1 15 | else: 16 | table['wordtoix'][wd] = ix 17 | table['ixtoword'][ix] = wd 18 | table['words'] = table['wordtoix'].keys() 19 | return table 20 | 21 | 22 | class configCLEF(config): 23 | def __init__(self): 24 | config.__init__(self) 25 | # color 26 | self.color_words = ['white', 'green, greenish', 'blue, bluish', 'red', 'yellow, yellowish', 'black', 'brown, brownish', 27 | 'pink', 'dark, darker', 'orange', 'gray, grey', 'purple', 'beige', 'bright'] 28 | self.color_table = self.buildTable(self.color_words) 29 | # size 30 | self.size_words = ['big', 'small', 'tall', 'large', 'little', 'short', 'tiny', 'long', 'huge'] 31 | self.size_table = self.buildTable(self.size_words) 32 | # location 33 | self.location_words = ['right', 'left', 'top', 'bottom', 'middle, mid', 'second, 2nd', 'first, 1st', 'front', 34 | 'closest, nearest', 'center, central', 'third, 3rd', 'corner', 'upper', 'back, behind', 35 | 'far', 'anywhere', 'leftmost', 'lower', 'rightmost', 'farthest, furthest', 'next', 'last', 36 | 'fourth, 4th', 'up, above', 'below, down', 'side'] 37 | self.location_table = self.buildTable(self.location_words) 38 | # position 39 | self.position_words = ['right', 'left', 'top', 'bottom', 'middle, center, centre', 'front', 'back'] 40 | self.position_table = self.buildTable(self.position_words) 41 | # relative preps 42 | self.relative_preps_words = ['prep_above', 'prep_about', 'prep_below', 'prep_behind', 'prep_beneath', 'prep_beside', 43 | 'prep_between', 'prep_close_to', 'prep_by', 'prep_in_front_of', 'prep_against', 44 | 'prep_from', 'prep_next_to', 'prep_through', 'prep_under', 'prep_underneath', 'prep_with', 45 | 'prep_near', 'prep_inside'] 46 | self.relative_preps_table = self.buildTable(self.relative_preps_words) 47 | # ordinal number 48 | self.ordinal_words = ['first', 'second', 'third', 'fourth', 'fifth', 'most'] 49 | self.ordinal_table = self.buildTable(self.ordinal_words) 50 | 51 | class configCOCO(config): 52 | def __init__(self): 53 | config.__init__(self) 54 | # color 55 | self.color_words = ['white', 'green', 'blue', 'red', 'yellow', 'black', 'brown', 'pink', 'dark, darker', 'orange', 56 | 'gray', 'purple', 'beige', 'bright'] 57 | self.color_table = self.buildTable(self.color_words) 58 | # size 59 | self.size_words = ['big, bigger', 'small, smaller', 'tall, taller', 'large, larger', 'little', 'short, shorter', 60 | 'tiny', 'long, longer', 'huge'] 61 | self.size_table = self.buildTable(self.size_words) 62 | # location 
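        # A quick illustration of buildTable above (a sketch against the tables defined in this
        # class): comma-separated synonyms share one index, so in the location table built below
        # 'closest' and 'nearest' map to the same ix, ixtoword[ix] gives back 'closest', and
        # table['words'] lists every synonym.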
63 | self.location_words = ['right', 'left', 'top', 'bottom', 'middle, mid', 'front', 'closest, nearest', 'center, central', 64 | 'corner', 'upper', 'back, behind', 'far', 'leftmost', 'lower, low', 'rightmost', 65 | 'farthest, furthest', 'next', 'last', 'up, above', 'below, down', 'side'] 66 | self.location_table = self.buildTable(self.location_words) 67 | # position 68 | self.position_words = ['right', 'left', 'top', 'bottom', 'middle, center, centre', 'front', 'back'] 69 | self.position_table = self.buildTable(self.position_words) 70 | # relative preps 71 | self.relative_preps_words = ['prep_above', 'prep_about', 'prep_below', 'prep_behind', 'prep_beneath', 'prep_beside', 72 | 'prep_between', 'prep_close_to', 'prep_by', 'prep_in_front_of', 'prep_against', 73 | 'prep_from', 'prep_next_to', 'prep_through', 'prep_under', 'prep_underneath', 'prep_with', 74 | 'prep_near', 'prep_inside', 'prepc_from'] 75 | self.relative_preps_table = self.buildTable(self.relative_preps_words) 76 | # ordinal number 77 | self.ordinal_words = ['first', 'second', 'third', 'fourth', 'fifth'] 78 | self.ordinal_table = self.buildTable(self.ordinal_words) 79 | 80 | 81 | if __name__ == '__main__': 82 | c = configCOCO() 83 | print c.color_table 84 | print c.size_table 85 | print c.location_table 86 | print c.position_table['words'] 87 | print c.relative_preps_table 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /pyutils/attparser/head.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | from nltk.tree import * 4 | 5 | ing_allowed = ['duckling', 'frosting', 'something', 'anything', 'thing', 'king', 'nothing', 6 | 'ring', 'wing', 'darling', 'building', 'painting', 'everything', 'string', 7 | 'ceiling', 'pudding', ] 8 | not_allowed = ['first', 'second', 'third', 'fourth', 'front', 'fifth', 'right', 'left'] 9 | 10 | def findFirstbreadthFirst(T, label): 11 | # input: tree, and label ('NN' or 'NP') 12 | # return: tree, or None 13 | myqueue = [] 14 | label_len = len(label) 15 | for i in range(len(T)): 16 | myqueue.append(str(T[i])) # push the sons 17 | while len(myqueue) > 0: 18 | cur_T = Tree.fromstring( myqueue.pop(0) ) # pop the front node as current tree 19 | cur_label = cur_T.label() 20 | if len(cur_label)>=label_len and cur_label[:label_len] == label: 21 | if cur_T[0] in not_allowed: # in case parser take 'first', 'second' as noun 22 | continue 23 | if cur_T[0][-3:] == 'ing' and cur_T[0] not in ing_allowed: # in case parser take both 'man' and 'standing' as Noun for 'man standing under the tree' 24 | continue 25 | return cur_T 26 | else: 27 | if not isinstance(cur_T[0], str): # if not the leaf node, i.e., 'dog', 'tree' 28 | for i in range(len(cur_T)): 29 | myqueue.append(str(cur_T[i])) 30 | return None 31 | 32 | def findLastbreadthFirst(T, label): 33 | myqueue = [] 34 | label_len = len(label) 35 | for i in reversed(range(len(T))): 36 | myqueue.append(str(T[i])) # push the sons 37 | while len(myqueue) > 0: 38 | cur_T = Tree.fromstring( myqueue.pop(0) ) # pop the front node as current tree 39 | cur_label = cur_T.label() 40 | if len(cur_label)>=label_len and cur_label[:label_len] == label: 41 | if cur_T[0] in not_allowed: 42 | # in case parser take 'first', 'second' as noun 43 | continue 44 | if cur_T[0][-3:] == 'ing' and cur_T[0] not in ing_allowed: # in case parser take 'standing' as Noun for 'man standing under the tree' 45 | continue 46 | return cur_T 47 | else: 
48 | if not isinstance(cur_T[0], str): # if not the leaf node, i.e., 'dog', 'tree' 49 | for i in reversed(range(len(cur_T))): # push_back from the last to the first 50 | myqueue.append(str(cur_T[i])) 51 | return None 52 | 53 | def findHead(T, mode='vicente'): 54 | if mode == 'vicente': # find the left-most NP, and then its left-most NN 55 | if not T[0].label() == 'NP': 56 | foundNP = findFirstbreadthFirst(T[0], 'NP') 57 | if foundNP: 58 | head = findFirstbreadthFirst(foundNP, 'NN') 59 | else: 60 | head = findFirstbreadthFirst(T[0], 'NN') 61 | else: 62 | head = findFirstbreadthFirst(T[0], 'NN') 63 | if head == None: 64 | return None, -1 65 | else: 66 | head = head[0] 67 | idx = [pos[0] for pos in T.pos()].index(head) 68 | return head, idx 69 | elif mode == 'licheng': # find bottom-left NP first, then search its rightmost NN son 70 | np_exist = T 71 | np_found = findFirstbreadthFirst(np_exist, 'NP') 72 | while np_found: 73 | np_exist = np_found 74 | np_found = findFirstbreadthFirst(np_exist, 'NP') 75 | if np_exist != T: 76 | head_tr = findLastbreadthFirst(np_exist, 'NN') 77 | if not head_tr: # if this NP tree has no NN son, we just take the first NN as head. 78 | head_tr = findFirstbreadthFirst(T[0], 'NN') 79 | else: 80 | head_tr = findFirstbreadthFirst(T[0], 'NN') 81 | 82 | if head_tr == None or (head_tr != None and head_tr[0] in not_allowed): 83 | return None, -1 84 | else: 85 | head = head_tr[0] 86 | idx = [pos[0] for pos in T.pos()].index(head) 87 | return head, idx 88 | 89 | if __name__ == '__main__': 90 | import sys 91 | from pprint import pprint 92 | import os.path as osp 93 | sys.path.insert(0, '../..') 94 | from pyutils.corenlp import StanfordCoreNLP 95 | core = StanfordCoreNLP('../../models/stanford-corenlp-full-2015-01-29') 96 | 97 | # sent = "baseball man" 98 | # sent = 'a running person under the tree.' 99 | sent = 'a sandal colour teddy bear in between the other two teddys' 100 | parse = core.raw_parse(sent)['sentences'][0] 101 | parse_tree = parse['parsetree'] 102 | t = Tree.fromstring(parse_tree) 103 | t.draw() 104 | print t 105 | print parse['dependencies'] 106 | 107 | # vicente version 108 | head, idx = findHead(t, mode='vicente') 109 | print 'vicente - head: %s, idx: %s' % (head, idx) 110 | # licheng version 111 | head, idx = findHead(t, mode='licheng') 112 | print 'ylc_leftNP_rightNN - head: %s, idx: %s' % (head, idx) 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /pyutils/attparser/simpleParser.py: -------------------------------------------------------------------------------- 1 | """ 2 | This parser is to extract "real" relative position word, as well as absolute position word. 3 | For clefParser and cocoParser, there are quite amount of inaccurate (r5, r6), e.g., (prep_in, shirt) for "The boy in white shirt." 4 | On the one hand, this belongs to details of the referred object itself; On the other hand, this doesn't actually reflect position. 5 | Here, we extract: 6 | 1) NN, JJ, VB, ... as object's attribute words, 7 | 2) big, large, ... as object's relative size words, 8 | 3) left, right, top, ... as absolute position words, 9 | 4) object, ... as relative location and object pairs, 10 | 11 | Approach: we rely on the parsed r1-r7 from clef/coco Parser, and categorize them into the above four types. 
12 | 1) r1, r2, r7 and r8 into attribute words (without forbidden words) 13 | 2) check "prep_with" and "prep_in" of (r5, r6), put some of them into attribute words 14 | 3) 15 | 16 | """ 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /pyutils/corenlp/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /pyutils/corenlp/__init__.py: -------------------------------------------------------------------------------- 1 | # corenlp 2 | # Copyright 2013- Hiroyoshi Komatsu 3 | # See LICENSE for details. 4 | 5 | """ 6 | Stanford CoreNLP Python wrapper 7 | """ 8 | __version__ = '1.0.3' 9 | __author__ = 'Hiroyoshi Komatsu' 10 | __license__ = 'GNU v2+' 11 | 12 | # classes 13 | from .corenlp import StanfordCoreNLP, ParserError, TimeoutError, ProcessError 14 | # functions 15 | from .corenlp import batch_parse 16 | -------------------------------------------------------------------------------- /pyutils/corenlp/__main__.py: -------------------------------------------------------------------------------- 1 | from . import corenlp 2 | 3 | corenlp.main() 4 | -------------------------------------------------------------------------------- /pyutils/corenlp/client.py: -------------------------------------------------------------------------------- 1 | import json 2 | # from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp 3 | import jsonrpclib 4 | from pprint import pprint 5 | 6 | 7 | class StanfordNLP: 8 | def __init__(self, port_number=8080): 9 | self.server = jsonrpclib.Server("http://localhost:%d" % port_number) 10 | 11 | def parse(self, text): 12 | return json.loads(self.server.parse(text)) 13 | 14 | nlp = StanfordNLP() 15 | result = nlp.parse("Hello world! It is so beautiful.") 16 | pprint(result) 17 | 18 | from nltk.tree import Tree 19 | tree = Tree.parse(result['sentences'][0]['parsetree']) 20 | pprint(tree) 21 | -------------------------------------------------------------------------------- /pyutils/corenlp/corenlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # corenlp - Python interface to Stanford Core NLP tools 4 | # Copyright (c) 2012 Dustin Smith 5 | # https://github.com/dasmith/stanford-corenlp-python 6 | # 7 | # This program is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License 9 | # as published by the Free Software Foundation; either version 2 10 | # of the License, or (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with this program; if not, write to the Free Software 19 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
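# A minimal usage sketch for this wrapper (a sketch only; the model directory is an
# assumption -- point it at the same local CoreNLP unpack that head.py's __main__ uses):
#
#   from pyutils.corenlp import StanfordCoreNLP
#   core = StanfordCoreNLP('models/stanford-corenlp-full-2015-01-29')
#   parse = core.raw_parse('the man on the left')['sentences'][0]
#   parse['parsetree'], parse['dependencies'], parse['words']
#
# batch_parse(input_folder) below runs the same pipeline over a folder of .txt files
# through CoreNLP's XML output instead of the interactive shell.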
20 | 21 | import json 22 | import optparse 23 | import os 24 | import re 25 | import sys 26 | import traceback 27 | import pexpect 28 | import tempfile 29 | import shutil 30 | import re 31 | from progressbar import ProgressBar, Fraction 32 | from unidecode import unidecode 33 | from subprocess import call 34 | import glob 35 | 36 | use_winpexpect = True 37 | 38 | try: 39 | import winpexpect 40 | except ImportError: 41 | use_winpexpect = False 42 | 43 | VERBOSE = False 44 | STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5 45 | WORD_PATTERN = re.compile('\[([^\]]+)\]') 46 | CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"") 47 | 48 | DIRECTORY = "stanford-corenlp-full-2013-06-20" 49 | 50 | 51 | class bc: 52 | HEADER = '\033[95m' 53 | OKBLUE = '\033[94m' 54 | OKGREEN = '\033[92m' 55 | WARNING = '\033[93m' 56 | FAIL = '\033[91m' 57 | ENDC = '\033[0m' 58 | 59 | 60 | class ProcessError(Exception): 61 | 62 | def __init__(self, value): 63 | self.value = value 64 | 65 | def __str__(self): 66 | return repr(self.value) 67 | 68 | 69 | class ParserError(Exception): 70 | 71 | def __init__(self, value): 72 | self.value = value 73 | 74 | def __str__(self): 75 | return repr(self.value) 76 | 77 | 78 | class TimeoutError(Exception): 79 | 80 | def __init__(self, value): 81 | self.value = value 82 | 83 | def __str__(self): 84 | return repr(self.value) 85 | 86 | 87 | class OutOfMemoryError(Exception): 88 | 89 | def __init__(self, value): 90 | self.value = value 91 | 92 | def __str__(self): 93 | return repr(self.value) 94 | 95 | 96 | def init_corenlp_command(corenlp_path, memory, properties): 97 | """ 98 | Checks the location of the jar files. 99 | Spawns the server as a process. 100 | """ 101 | 102 | # TODO: Can edit jar constants 103 | jar_mask = "*.jar" 104 | jars = glob.glob(os.path.join(corenlp_path, jar_mask)) 105 | 106 | java_path = "java" 107 | classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP" 108 | # include the properties file, so you can change defaults 109 | # but any changes in output format will break parse_parser_results() 110 | current_dir_pr = os.path.join(os.path.dirname(os.path.abspath(__file__)), properties) 111 | if os.path.exists(properties): 112 | props = "-props %s" % (properties.replace(" ", "\\ ")) 113 | elif os.path.exists(current_dir_pr): 114 | props = "-props %s" % (current_dir_pr.replace(" ", "\\ ")) 115 | else: 116 | raise Exception("Error! Cannot locate: %s" % properties) 117 | 118 | # add memory limit on JVM 119 | if memory: 120 | limit = "-Xmx%s" % memory 121 | else: 122 | limit = "" 123 | 124 | return "%s %s -cp %s %s %s" % (java_path, limit, ':'.join(jars), classname, props) 125 | 126 | def parse_bracketed(s): 127 | '''Parse word features [abc=... def = ...] 
128 | Also manages to parse out features that have XML within them 129 | ''' 130 | word = None 131 | attrs = {} 132 | temp = {} 133 | # Substitute XML tags, to replace them later 134 | for i, tag in enumerate(re.findall(r"(<[^<>]+>.*<\/[^<>]+>)", s)): 135 | temp["^^^%d^^^" % i] = tag 136 | s = s.replace(tag, "^^^%d^^^" % i) 137 | # Load key-value pairs, substituting as necessary 138 | for attr, val in re.findall(r"([^=\s]*)=([^\s]*)", s): 139 | if val in temp: 140 | val = remove_escapes(temp[val]) 141 | if attr == 'Text': 142 | word = remove_escapes(val) 143 | else: 144 | attrs[attr] = remove_escapes(val) 145 | return (word, attrs) 146 | 147 | 148 | def parse_parser_results(text): 149 | """ This is the nasty bit of code to interact with the command-line 150 | interface of the CoreNLP tools. Takes a string of the parser results 151 | and then returns a Python list of dictionaries, one for each parsed 152 | sentence. 153 | """ 154 | results = {"sentences": []} 155 | state = STATE_START 156 | for line in unidecode(text.decode('utf-8')).split("\n"): 157 | line = line.strip() 158 | 159 | if line.startswith("Sentence #"): 160 | sentence = {'words': [], 'parsetree': [], 'dependencies': []} 161 | results["sentences"].append(sentence) 162 | state = STATE_TEXT 163 | 164 | elif state == STATE_TEXT: 165 | sentence['text'] = remove_escapes(line) 166 | state = STATE_WORDS 167 | 168 | elif state == STATE_WORDS: 169 | if not line.startswith("[Text="): 170 | raise ParserError('Parse error. Could not find "[Text=" in: %s' % line) 171 | for s in WORD_PATTERN.findall(line): 172 | sentence['words'].append(parse_bracketed(s)) 173 | state = STATE_TREE 174 | 175 | elif state == STATE_TREE: 176 | if len(line) == 0: 177 | state = STATE_DEPENDENCY 178 | sentence['parsetree'] = " ".join(sentence['parsetree']) 179 | else: 180 | sentence['parsetree'].append(remove_escapes(line)) 181 | 182 | elif state == STATE_DEPENDENCY: 183 | if len(line) == 0: 184 | state = STATE_COREFERENCE 185 | else: 186 | split_entry = re.split("\(|, |-", line[:-1]) 187 | if len(split_entry) == 5: 188 | rel, left, leftindex, right, rightindex = split_entry 189 | leftindex = re.sub("[^0-9]", "", leftindex) 190 | rightindex = re.sub("[^0-9]", "", rightindex) 191 | sentence['dependencies'].append(tuple([rel, 192 | remove_escapes(left), leftindex, remove_escapes(right), 193 | rightindex])) 194 | 195 | elif state == STATE_COREFERENCE: 196 | if "Coreference set" in line: 197 | if 'coref' not in results: 198 | results['coref'] = [] 199 | coref_set = [] 200 | results['coref'].append(coref_set) 201 | else: 202 | for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(line): 203 | src_i, src_pos, src_l, src_r = int(src_i) - 1, int(src_pos) - 1, int(src_l) - 1, int(src_r) - 1 204 | sink_i, sink_pos, sink_l, sink_r = int(sink_i) - 1, int(sink_pos) - 1, int(sink_l) - 1, int(sink_r) - 1 205 | coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) 206 | 207 | return results 208 | 209 | 210 | def parse_parser_xml_results(xml, file_name="", raw_output=False): 211 | import xmltodict 212 | from collections import OrderedDict 213 | 214 | def extract_words_from_xml(sent_node): 215 | exted = map(lambda x: x['word'], sent_node['tokens']['token']) 216 | return exted 217 | 218 | # Turning the raw xml into a raw python dictionary: 219 | raw_dict = xmltodict.parse(xml) 220 | if raw_output: 221 | return raw_dict 222 | 223 | document = raw_dict[u'root'][u'document'] 
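    # Descriptive note (assuming xmltodict's usual nesting for CoreNLP XML): raw_dict mirrors
    # the XML tree, roughly root -> document -> sentences -> sentence[j] -> {tokens, parse,
    # dependencies}; the comprehension below flattens that into the same per-sentence dict
    # layout ('dependencies', 'text', 'parsetree', 'words') that parse_parser_results() builds
    # from the interactive-shell output.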
224 | 225 | # Making a raw sentence list of dictionaries: 226 | raw_sent_list = document[u'sentences'][u'sentence'] 227 | 228 | # Convert sentences to the format like python 229 | # TODO: If there is only one sentence in input sentence, 230 | # raw_sent_list is dict and cannot decode following code... 231 | sentences = [{'dependencies': [[dep['dep'][i]['@type'], 232 | dep['dep'][i]['governor']['#text'], 233 | dep['dep'][i]['governor']['@idx'], 234 | dep['dep'][i]['dependent']['#text'], 235 | dep['dep'][i]['dependent']['@idx']] 236 | for dep in raw_sent_list[j][u'dependencies'] 237 | if 'dep' in dep 238 | for i in xrange(len(dep['dep'])) 239 | if dep['@type'] == 'collapsed-ccprocessed-dependencies'], 240 | 'text': extract_words_from_xml(raw_sent_list[j]), 241 | 'parsetree': str(raw_sent_list[j]['parse']), 242 | 'words': [[str(token['word']), OrderedDict([ 243 | ('CharacterOffsetEnd', str(token['CharacterOffsetEnd'])), 244 | ('CharacterOffsetBegin', str(token['CharacterOffsetBegin'])), 245 | ('PartOfSpeech', str(token['POS'])), 246 | ('Lemma', str(token['lemma']))])] 247 | for index, token in enumerate(raw_sent_list[j][u'tokens'][u'token'])]} 248 | 249 | for j in xrange(len(raw_sent_list))] 250 | 251 | 252 | results = {'sentences': sentences} 253 | 254 | if file_name: 255 | results['file_name'] = file_name 256 | 257 | return results 258 | 259 | 260 | def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=False, properties='default.properties'): 261 | """Because interaction with the command-line interface of the CoreNLP 262 | tools is limited to very short text bits, it is necessary to parse xml 263 | output""" 264 | #First, we change to the directory where we place the xml files from the 265 | #parser: 266 | 267 | xml_dir = tempfile.mkdtemp() 268 | file_list = tempfile.NamedTemporaryFile() 269 | 270 | #we get a list of the cleaned files that we want to parse: 271 | 272 | files = [os.path.join(input_dir , f) for f in os.listdir(input_dir) if f.endswith(".txt")] 273 | 274 | #creating the file list of files to parse 275 | 276 | file_list.write('\n'.join(files)) 277 | file_list.seek(0) 278 | 279 | command = init_corenlp_command(corenlp_path, memory, properties)\ 280 | + ' -filelist %s -outputDirectory %s' % (file_list.name, xml_dir) 281 | 282 | #creates the xml file of parser output: 283 | 284 | call(command, shell=True) 285 | 286 | #reading in the raw xml file: 287 | # result = [] 288 | try: 289 | for output_file in os.listdir(xml_dir): 290 | with open(os.path.join(xml_dir + output_file), 'r') as xml: 291 | # parsed = xml.read() 292 | file_name = re.sub('.xml$', '', os.path.basename(output_file)) 293 | # result.append(parse_parser_xml_results(xml.read(), file_name, 294 | # raw_output=raw_output)) 295 | yield parse_parser_xml_results(xml.read(), file_name, 296 | raw_output=raw_output) 297 | finally: 298 | file_list.close() 299 | shutil.rmtree(xml_dir) 300 | # return result 301 | 302 | 303 | class StanfordCoreNLP: 304 | 305 | """ 306 | Command-line interaction with Stanford's CoreNLP java utilities. 307 | Can be run as a JSON-RPC server or imported as a module. 
308 | """ 309 | 310 | def _spawn_corenlp(self): 311 | if VERBOSE: 312 | print self.start_corenlp 313 | if use_winpexpect: 314 | self.corenlp = winpexpect.winspawn(self.start_corenlp, maxread=8192, 315 | searchwindowsize=80) 316 | else: 317 | self.corenlp = pexpect.spawn(self.start_corenlp, maxread=8192, 318 | searchwindowsize=80) 319 | 320 | # show progress bar while loading the models 321 | if VERBOSE: 322 | widgets = ['Loading Models: ', Fraction()] 323 | pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start() 324 | # Model timeouts: 325 | # pos tagger model (~5sec) 326 | # NER-all classifier (~33sec) 327 | # NER-muc classifier (~60sec) 328 | # CoNLL classifier (~50sec) 329 | # PCFG (~3sec) 330 | timeouts = [20, 200, 600, 600, 20] 331 | for i in xrange(5): 332 | self.corenlp.expect("done.", timeout=timeouts[i]) # Load model 333 | pbar.update(i + 1) 334 | self.corenlp.expect("Entering interactive shell.") 335 | pbar.finish() 336 | 337 | # interactive shell 338 | self.corenlp.expect("\nNLP> ") 339 | 340 | def __init__(self, corenlp_path=DIRECTORY, memory="3g", properties='default.properties', serving=False): 341 | """ 342 | Checks the location of the jar files. 343 | Spawns the server as a process. 344 | """ 345 | 346 | # spawn the server 347 | self.serving = serving 348 | self.start_corenlp = init_corenlp_command(corenlp_path, memory, properties) 349 | self._spawn_corenlp() 350 | 351 | def close(self, force=True): 352 | global use_winpexpect 353 | if use_winpexpect: 354 | self.corenlp.terminate() 355 | else: 356 | self.corenlp.terminate(force) 357 | 358 | 359 | def isalive(self): 360 | return self.corenlp.isalive() 361 | 362 | def __del__(self): 363 | # If our child process is still around, kill it 364 | if self.isalive(): 365 | self.close() 366 | 367 | def _parse(self, text): 368 | """ 369 | This is the core interaction with the parser. 370 | 371 | It returns a Python data-structure, while the parse() 372 | function returns a JSON object 373 | """ 374 | 375 | # CoreNLP interactive shell cannot recognize newline 376 | if '\n' in text or '\r' in text: 377 | to_send = re.sub("[\r\n]", " ", text).strip() 378 | else: 379 | to_send = text 380 | 381 | # clean up anything leftover 382 | def clean_up(): 383 | while True: 384 | try: 385 | self.corenlp.read_nonblocking(8192, 0.1) 386 | except pexpect.TIMEOUT: 387 | break 388 | clean_up() 389 | 390 | self.corenlp.sendline(to_send) 391 | 392 | # How much time should we give the parser to parse it? 393 | # the idea here is that you increase the timeout as a 394 | # function of the text's length. 
395 | # max_expected_time = max(5.0, 3 + len(to_send) / 5.0) 396 | max_expected_time = max(300.0, len(to_send) / 3.0) 397 | 398 | # repeated_input = self.corenlp.except("\n") # confirm it 399 | t = self.corenlp.expect(["\nNLP> ", pexpect.TIMEOUT, pexpect.EOF, 400 | "\nWARNING: Parsing of sentence failed, possibly because of out of memory."], 401 | timeout=max_expected_time) 402 | incoming = self.corenlp.before 403 | if t == 1: 404 | # TIMEOUT, clean up anything left in buffer 405 | clean_up() 406 | print >>sys.stderr, {'error': "timed out after %f seconds" % max_expected_time, 407 | 'input': to_send, 408 | 'output': incoming} 409 | raise TimeoutError("Timed out after %d seconds" % max_expected_time) 410 | elif t == 2: 411 | # EOF, probably crash CoreNLP process 412 | print >>sys.stderr, {'error': "CoreNLP terminates abnormally while parsing", 413 | 'input': to_send, 414 | 'output': incoming} 415 | raise ProcessError("CoreNLP process terminates abnormally while parsing") 416 | elif t == 3: 417 | # out of memory 418 | print >>sys.stderr, {'error': "WARNING: Parsing of sentence failed, possibly because of out of memory.", 419 | 'input': to_send, 420 | 'output': incoming} 421 | raise OutOfMemoryError 422 | 423 | if VERBOSE: 424 | print "%s\n%s" % ('=' * 40, incoming) 425 | try: 426 | results = parse_parser_results(incoming) 427 | except Exception as e: 428 | if VERBOSE: 429 | print traceback.format_exc() 430 | raise e 431 | 432 | return results 433 | 434 | def raw_parse(self, text): 435 | """ 436 | This function takes a text string, sends it to the Stanford parser, 437 | reads in the result, parses the results and returns a list 438 | with one dictionary entry for each parsed sentence. 439 | """ 440 | try: 441 | r = self._parse(text) 442 | return r 443 | except Exception as e: 444 | print e # Should probably log somewhere instead of printing 445 | self.corenlp.close() 446 | self._spawn_corenlp() 447 | if self.serving: # We don't want to raise the exception when acting as a server 448 | return [] 449 | raise e 450 | 451 | def parse(self, text): 452 | """ 453 | This function takes a text string, sends it to the Stanford parser, 454 | reads in the result, parses the results and returns a list 455 | with one dictionary entry for each parsed sentence, in JSON format. 456 | """ 457 | return json.dumps(self.raw_parse(text)) 458 | 459 | 460 | def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=False): 461 | """ 462 | This function takes input files, 463 | sends list of input files to the Stanford parser, 464 | reads in the results from temporary folder in your OS and 465 | returns a generator object of list that consist of dictionary entry. 466 | If raw_output is true, the dictionary returned will correspond exactly to XML. 467 | ( The function needs xmltodict, 468 | and doesn't need init 'StanfordCoreNLP' class. ) 469 | """ 470 | if not os.path.exists(input_folder): 471 | raise Exception("input_folder does not exist") 472 | 473 | return parse_xml_output(input_folder, corenlp_path, memory, raw_output=raw_output) 474 | 475 | def remove_escapes(text): 476 | """Given a string, remove PTB3 escape characters. 
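    For example, '-LRB- left -RRB-' should come back as '( left )'; both the lower- and
    upper-case bracket tokens in the table below are substituted.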
477 | """ 478 | escapes = {"-lrb-": "(", 479 | "-rrb-": ")", 480 | "-lsb-": "[", 481 | "-rsb-": "]", 482 | "-lcb-": "{", 483 | "-rcb-": "}", 484 | "-LRB-": "(", 485 | "-RRB-": ")", 486 | "-LSB-": "[", 487 | "-RSB-": "]", 488 | "-LCB-": "{", 489 | "-RCB-": "}"} 490 | if text: 491 | pattern = re.compile('|'.join(re.escape(key) for key in escapes.keys())) 492 | return pattern.sub(lambda x: escapes[x.group()], text) 493 | 494 | if __name__ == '__main__': 495 | """ 496 | The code below starts an JSONRPC server 497 | """ 498 | from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer 499 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 500 | parser.add_option('-p', '--port', default='8080', 501 | help='Port to serve on (default 8080)') 502 | parser.add_option('-H', '--host', default='127.0.0.1', 503 | help='Host to serve on (default localhost; 0.0.0.0 to make public)') 504 | parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose', 505 | help="Quiet mode, don't print status msgs to stdout") 506 | parser.add_option('-S', '--corenlp', default=DIRECTORY, 507 | help='Stanford CoreNLP tool directory (default %s)' % DIRECTORY) 508 | parser.add_option('-P', '--properties', default='default.properties', 509 | help='Stanford CoreNLP properties fieles (default: default.properties)') 510 | options, args = parser.parse_args() 511 | VERBOSE = options.verbose 512 | # server = jsonrpc.Server(jsonrpc.JsonRpc20(), 513 | # jsonrpc.TransportTcpIp(addr=(options.host, int(options.port)))) 514 | try: 515 | server = SimpleJSONRPCServer((options.host, int(options.port))) 516 | 517 | nlp = StanfordCoreNLP(options.corenlp, properties=options.properties, serving=True) 518 | server.register_function(nlp.parse) 519 | server.register_function(nlp.raw_parse) 520 | 521 | print 'Serving on http://%s:%s' % (options.host, options.port) 522 | # server.serve() 523 | server.serve_forever() 524 | except KeyboardInterrupt: 525 | print >>sys.stderr, "Bye." 526 | exit() 527 | 528 | -------------------------------------------------------------------------------- /pyutils/corenlp/default.properties: -------------------------------------------------------------------------------- 1 | annotators = tokenize, ssplit, pos, lemma, parse 2 | 3 | # A true-casing annotator is also available (see below) 4 | #annotators = tokenize, ssplit, pos, lemma, truecase 5 | 6 | # A simple regex NER annotator is also available 7 | # annotators = tokenize, ssplit, regexner 8 | 9 | #Use these as EOS punctuation and discard them from the actual sentence content 10 | #These are HTML tags that get expanded internally to correct syntax, e.g., from "p" to "
<p>", "</p>
" etc. 11 | #Will have no effect if the "cleanxml" annotator is used 12 | #ssplit.htmlBoundariesToDiscard = p,text 13 | 14 | # 15 | # None of these paths are necessary anymore: we load all models from the JAR file 16 | # 17 | 18 | #pos.model = /u/nlp/data/pos-tagger/wsj3t0-18-left3words/left3words-distsim-wsj-0-18.tagger 19 | ## slightly better model but much slower: 20 | ##pos.model = /u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/bidirectional-distsim-wsj-0-18.tagger 21 | 22 | #ner.model.3class = /u/nlp/data/ner/goodClassifiers/all.3class.distsim.crf.ser.gz 23 | #ner.model.7class = /u/nlp/data/ner/goodClassifiers/muc.distsim.crf.ser.gz 24 | #ner.model.MISCclass = /u/nlp/data/ner/goodClassifiers/conll.distsim.crf.ser.gz 25 | 26 | #regexner.mapping = /u/nlp/data/TAC-KBP2010/sentence_extraction/type_map_clean 27 | #regexner.ignorecase = false 28 | 29 | #nfl.gazetteer = /scr/nlp/data/machine-reading/Machine_Reading_P1_Reading_Task_V2.0/data/SportsDomain/NFLScoring_UseCase/NFLgazetteer.txt 30 | #nfl.relation.model = /scr/nlp/data/ldc/LDC2009E112/Machine_Reading_P1_NFL_Scoring_Training_Data_V1.2/models/nfl_relation_model.ser 31 | #nfl.entity.model = /scr/nlp/data/ldc/LDC2009E112/Machine_Reading_P1_NFL_Scoring_Training_Data_V1.2/models/nfl_entity_model.ser 32 | #printable.relation.beam = 20 33 | 34 | #parser.model = /u/nlp/data/lexparser/englishPCFG.ser.gz 35 | 36 | #srl.verb.args=/u/kristina/srl/verbs.core_args 37 | #srl.model.cls=/u/nlp/data/srl/trainedModels/englishPCFG/cls/train.ann 38 | #srl.model.id=/u/nlp/data/srl/trainedModels/englishPCFG/id/train.ann 39 | 40 | #coref.model=/u/nlp/rte/resources/anno/coref/corefClassifierAll.March2009.ser.gz 41 | #coref.name.dir=/u/nlp/data/coref/ 42 | #wordnet.dir=/u/nlp/data/wordnet/wordnet-3.0-prolog 43 | 44 | #dcoref.demonym = /scr/heeyoung/demonyms.txt 45 | #dcoref.animate = /scr/nlp/data/DekangLin-Animacy-Gender/Animacy/animate.unigrams.txt 46 | #dcoref.inanimate = /scr/nlp/data/DekangLin-Animacy-Gender/Animacy/inanimate.unigrams.txt 47 | #dcoref.male = /scr/nlp/data/Bergsma-Gender/male.unigrams.txt 48 | #dcoref.neutral = /scr/nlp/data/Bergsma-Gender/neutral.unigrams.txt 49 | #dcoref.female = /scr/nlp/data/Bergsma-Gender/female.unigrams.txt 50 | #dcoref.plural = /scr/nlp/data/Bergsma-Gender/plural.unigrams.txt 51 | #dcoref.singular = /scr/nlp/data/Bergsma-Gender/singular.unigrams.txt 52 | 53 | 54 | # This is the regular expression that describes which xml tags to keep 55 | # the text from. In order to on off the xml removal, add cleanxml 56 | # to the list of annotators above after "tokenize". 57 | #clean.xmltags = .* 58 | # A set of tags which will force the end of a sentence. HTML example: 59 | # you would not want to end on , but you would want to end on
<p>
. 60 | # Once again, a regular expression. 61 | # (Blank means there are no sentence enders.) 62 | #clean.sentenceendingtags = 63 | # Whether or not to allow malformed xml 64 | # StanfordCoreNLP.properties 65 | #wordnet.dir=models/wordnet-3.0-prolog 66 | -------------------------------------------------------------------------------- /pyutils/corenlp/progressbar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: iso-8859-1 -*- 3 | # 4 | # progressbar - Text progressbar library for python. 5 | # Copyright (c) 2005 Nilton Volpato 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public 18 | # License along with this library; if not, write to the Free Software 19 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 | 21 | 22 | """Text progressbar library for python. 23 | 24 | This library provides a text mode progressbar. This is typically used 25 | to display the progress of a long running operation, providing a 26 | visual clue that processing is underway. 27 | 28 | The ProgressBar class manages the progress, and the format of the line 29 | is given by a number of widgets. A widget is an object that may 30 | display diferently depending on the state of the progress. There are 31 | three types of widget: 32 | - a string, which always shows itself; 33 | - a ProgressBarWidget, which may return a diferent value every time 34 | it's update method is called; and 35 | - a ProgressBarWidgetHFill, which is like ProgressBarWidget, except it 36 | expands to fill the remaining width of the line. 37 | 38 | The progressbar module is very easy to use, yet very powerful. And 39 | automatically supports features like auto-resizing when available. 40 | """ 41 | 42 | __author__ = "Nilton Volpato" 43 | __author_email__ = "first-name dot last-name @ gmail.com" 44 | __date__ = "2006-05-07" 45 | __version__ = "2.2" 46 | 47 | # Changelog 48 | # 49 | # 2006-05-07: v2.2 fixed bug in windows 50 | # 2005-12-04: v2.1 autodetect terminal width, added start method 51 | # 2005-12-04: v2.0 everything is now a widget (wow!) 52 | # 2005-12-03: v1.0 rewrite using widgets 53 | # 2005-06-02: v0.5 rewrite 54 | # 2004-??-??: v0.1 first version 55 | 56 | import sys 57 | import time 58 | from array import array 59 | try: 60 | from fcntl import ioctl 61 | import termios 62 | except ImportError: 63 | pass 64 | import signal 65 | 66 | 67 | class ProgressBarWidget(object): 68 | """This is an element of ProgressBar formatting. 69 | 70 | The ProgressBar object will call it's update value when an update 71 | is needed. It's size may change between call, but the results will 72 | not be good if the size changes drastically and repeatedly. 73 | """ 74 | def update(self, pbar): 75 | """Returns the string representing the widget. 
76 | 77 | The parameter pbar is a reference to the calling ProgressBar, 78 | where one can access attributes of the class for knowing how 79 | the update must be made. 80 | 81 | At least this function must be overriden.""" 82 | pass 83 | 84 | 85 | class ProgressBarWidgetHFill(object): 86 | """This is a variable width element of ProgressBar formatting. 87 | 88 | The ProgressBar object will call it's update value, informing the 89 | width this object must the made. This is like TeX \\hfill, it will 90 | expand to fill the line. You can use more than one in the same 91 | line, and they will all have the same width, and together will 92 | fill the line. 93 | """ 94 | def update(self, pbar, width): 95 | """Returns the string representing the widget. 96 | 97 | The parameter pbar is a reference to the calling ProgressBar, 98 | where one can access attributes of the class for knowing how 99 | the update must be made. The parameter width is the total 100 | horizontal width the widget must have. 101 | 102 | At least this function must be overriden.""" 103 | pass 104 | 105 | 106 | class ETA(ProgressBarWidget): 107 | "Widget for the Estimated Time of Arrival" 108 | def format_time(self, seconds): 109 | return time.strftime('%H:%M:%S', time.gmtime(seconds)) 110 | 111 | def update(self, pbar): 112 | if pbar.currval == 0: 113 | return 'ETA: --:--:--' 114 | elif pbar.finished: 115 | return 'Time: %s' % self.format_time(pbar.seconds_elapsed) 116 | else: 117 | elapsed = pbar.seconds_elapsed 118 | eta = elapsed * pbar.maxval / pbar.currval - elapsed 119 | return 'ETA: %s' % self.format_time(eta) 120 | 121 | 122 | class FileTransferSpeed(ProgressBarWidget): 123 | "Widget for showing the transfer speed (useful for file transfers)." 124 | def __init__(self): 125 | self.fmt = '%6.2f %s' 126 | self.units = ['B', 'K', 'M', 'G', 'T', 'P'] 127 | 128 | def update(self, pbar): 129 | if pbar.seconds_elapsed < 2e-6: # == 0: 130 | bps = 0.0 131 | else: 132 | bps = float(pbar.currval) / pbar.seconds_elapsed 133 | spd = bps 134 | for u in self.units: 135 | if spd < 1000: 136 | break 137 | spd /= 1000 138 | return self.fmt % (spd, u + '/s') 139 | 140 | 141 | class RotatingMarker(ProgressBarWidget): 142 | "A rotating marker for filling the bar of progress." 143 | def __init__(self, markers='|/-\\'): 144 | self.markers = markers 145 | self.curmark = -1 146 | 147 | def update(self, pbar): 148 | if pbar.finished: 149 | return self.markers[0] 150 | self.curmark = (self.curmark + 1) % len(self.markers) 151 | return self.markers[self.curmark] 152 | 153 | 154 | class Percentage(ProgressBarWidget): 155 | "Just the percentage done." 156 | def update(self, pbar): 157 | return '%3d%%' % pbar.percentage() 158 | 159 | 160 | class Fraction(ProgressBarWidget): 161 | "Just the fraction done." 162 | def update(self, pbar): 163 | return "%d/%d" % (pbar.currval, pbar.maxval) 164 | 165 | 166 | class Bar(ProgressBarWidgetHFill): 167 | "The bar of progress. It will strech to fill the line." 
168 | def __init__(self, marker='#', left='|', right='|'): 169 | self.marker = marker 170 | self.left = left 171 | self.right = right 172 | 173 | def _format_marker(self, pbar): 174 | if isinstance(self.marker, (str, unicode)): 175 | return self.marker 176 | else: 177 | return self.marker.update(pbar) 178 | 179 | def update(self, pbar, width): 180 | percent = pbar.percentage() 181 | cwidth = width - len(self.left) - len(self.right) 182 | marked_width = int(percent * cwidth / 100) 183 | m = self._format_marker(pbar) 184 | bar = (self.left + (m * marked_width).ljust(cwidth) + self.right) 185 | return bar 186 | 187 | 188 | class ReverseBar(Bar): 189 | "The reverse bar of progress, or bar of regress. :)" 190 | def update(self, pbar, width): 191 | percent = pbar.percentage() 192 | cwidth = width - len(self.left) - len(self.right) 193 | marked_width = int(percent * cwidth / 100) 194 | m = self._format_marker(pbar) 195 | bar = (self.left + (m * marked_width).rjust(cwidth) + self.right) 196 | return bar 197 | 198 | default_widgets = [Percentage(), ' ', Bar()] 199 | 200 | 201 | class ProgressBar(object): 202 | """This is the ProgressBar class, it updates and prints the bar. 203 | 204 | The term_width parameter may be an integer. Or None, in which case 205 | it will try to guess it, if it fails it will default to 80 columns. 206 | 207 | The simple use is like this: 208 | >>> pbar = ProgressBar().start() 209 | >>> for i in xrange(100): 210 | ... # do something 211 | ... pbar.update(i+1) 212 | ... 213 | >>> pbar.finish() 214 | 215 | But anything you want to do is possible (well, almost anything). 216 | You can supply different widgets of any type in any order. And you 217 | can even write your own widgets! There are many widgets already 218 | shipped and you should experiment with them. 219 | 220 | When implementing a widget update method you may access any 221 | attribute or function of the ProgressBar object calling the 222 | widget's update method. The most important attributes you would 223 | like to access are: 224 | - currval: current value of the progress, 0 <= currval <= maxval 225 | - maxval: maximum (and final) value of the progress 226 | - finished: True if the bar is have finished (reached 100%), False o/w 227 | - start_time: first time update() method of ProgressBar was called 228 | - seconds_elapsed: seconds elapsed since start_time 229 | - percentage(): percentage of the progress (this is a method) 230 | """ 231 | def __init__(self, maxval=100, widgets=default_widgets, term_width=None, 232 | fd=sys.stderr, force_update=False): 233 | assert maxval > 0 234 | self.maxval = maxval 235 | self.widgets = widgets 236 | self.fd = fd 237 | self.signal_set = False 238 | if term_width is None: 239 | try: 240 | self.handle_resize(None, None) 241 | signal.signal(signal.SIGWINCH, self.handle_resize) 242 | self.signal_set = True 243 | except: 244 | self.term_width = 79 245 | else: 246 | self.term_width = term_width 247 | 248 | self.currval = 0 249 | self.finished = False 250 | self.prev_percentage = -1 251 | self.start_time = None 252 | self.seconds_elapsed = 0 253 | self.force_update = force_update 254 | 255 | def handle_resize(self, signum, frame): 256 | h, w = array('h', ioctl(self.fd, termios.TIOCGWINSZ, '\0' * 8))[:2] 257 | self.term_width = w 258 | 259 | def percentage(self): 260 | "Returns the percentage of the progress." 
261 | return self.currval * 100.0 / self.maxval 262 | 263 | def _format_widgets(self): 264 | r = [] 265 | hfill_inds = [] 266 | num_hfill = 0 267 | currwidth = 0 268 | for i, w in enumerate(self.widgets): 269 | if isinstance(w, ProgressBarWidgetHFill): 270 | r.append(w) 271 | hfill_inds.append(i) 272 | num_hfill += 1 273 | elif isinstance(w, (str, unicode)): 274 | r.append(w) 275 | currwidth += len(w) 276 | else: 277 | weval = w.update(self) 278 | currwidth += len(weval) 279 | r.append(weval) 280 | for iw in hfill_inds: 281 | r[iw] = r[iw].update(self, 282 | (self.term_width - currwidth) / num_hfill) 283 | return r 284 | 285 | def _format_line(self): 286 | return ''.join(self._format_widgets()).ljust(self.term_width) 287 | 288 | def _need_update(self): 289 | if self.force_update: 290 | return True 291 | return int(self.percentage()) != int(self.prev_percentage) 292 | 293 | def reset(self): 294 | if not self.finished and self.start_time: 295 | self.finish() 296 | self.finished = False 297 | self.currval = 0 298 | self.start_time = None 299 | self.seconds_elapsed = None 300 | self.prev_percentage = None 301 | return self 302 | 303 | def update(self, value): 304 | "Updates the progress bar to a new value." 305 | assert 0 <= value <= self.maxval 306 | self.currval = value 307 | if not self._need_update() or self.finished: 308 | return 309 | if not self.start_time: 310 | self.start_time = time.time() 311 | self.seconds_elapsed = time.time() - self.start_time 312 | self.prev_percentage = self.percentage() 313 | if value != self.maxval: 314 | self.fd.write(self._format_line() + '\r') 315 | else: 316 | self.finished = True 317 | self.fd.write(self._format_line() + '\n') 318 | 319 | def start(self): 320 | """Start measuring time, and prints the bar at 0%. 321 | 322 | It returns self so you can use it like this: 323 | >>> pbar = ProgressBar().start() 324 | >>> for i in xrange(100): 325 | ... # do something 326 | ... pbar.update(i+1) 327 | ... 
328 | >>> pbar.finish() 329 | """ 330 | self.update(0) 331 | return self 332 | 333 | def finish(self): 334 | """Used to tell the progress is finished.""" 335 | self.update(self.maxval) 336 | if self.signal_set: 337 | signal.signal(signal.SIGWINCH, signal.SIG_DFL) 338 | 339 | 340 | def example1(): 341 | widgets = ['Test: ', Percentage(), ' ', Bar(marker=RotatingMarker()), 342 | ' ', ETA(), ' ', FileTransferSpeed()] 343 | pbar = ProgressBar(widgets=widgets, maxval=10000000).start() 344 | for i in range(1000000): 345 | # do something 346 | pbar.update(10 * i + 1) 347 | pbar.finish() 348 | return pbar 349 | 350 | 351 | def example2(): 352 | class CrazyFileTransferSpeed(FileTransferSpeed): 353 | "It's bigger between 45 and 80 percent" 354 | def update(self, pbar): 355 | if 45 < pbar.percentage() < 80: 356 | return 'Bigger Now ' + FileTransferSpeed.update(self, pbar) 357 | else: 358 | return FileTransferSpeed.update(self, pbar) 359 | 360 | widgets = [CrazyFileTransferSpeed(), ' <<<', 361 | Bar(), '>>> ', Percentage(), ' ', ETA()] 362 | pbar = ProgressBar(widgets=widgets, maxval=10000000) 363 | # maybe do something 364 | pbar.start() 365 | for i in range(2000000): 366 | # do something 367 | pbar.update(5 * i + 1) 368 | pbar.finish() 369 | return pbar 370 | 371 | 372 | def example3(): 373 | widgets = [Bar('>'), ' ', ETA(), ' ', ReverseBar('<')] 374 | pbar = ProgressBar(widgets=widgets, maxval=10000000).start() 375 | for i in range(1000000): 376 | # do something 377 | pbar.update(10 * i + 1) 378 | pbar.finish() 379 | return pbar 380 | 381 | 382 | def example4(): 383 | widgets = ['Test: ', Percentage(), ' ', 384 | Bar(marker='0', left='[', right=']'), 385 | ' ', ETA(), ' ', FileTransferSpeed()] 386 | pbar = ProgressBar(widgets=widgets, maxval=500) 387 | pbar.start() 388 | for i in range(100, 500 + 1, 50): 389 | time.sleep(0.2) 390 | pbar.update(i) 391 | pbar.finish() 392 | return pbar 393 | 394 | 395 | def example5(): 396 | widgets = ['Test: ', Fraction(), ' ', Bar(marker=RotatingMarker()), 397 | ' ', ETA(), ' ', FileTransferSpeed()] 398 | pbar = ProgressBar(widgets=widgets, maxval=10, force_update=True).start() 399 | for i in range(1, 11): 400 | # do something 401 | time.sleep(0.5) 402 | pbar.update(i) 403 | pbar.finish() 404 | return pbar 405 | 406 | 407 | def main(): 408 | example1() 409 | print 410 | example2() 411 | print 412 | example3() 413 | print 414 | example4() 415 | print 416 | example5() 417 | print 418 | 419 | if __name__ == '__main__': 420 | main() 421 | -------------------------------------------------------------------------------- /senna_sents.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code chunk sentences into NP, VP, PP, O, etc. 3 | It uses the SENNA tool, (https://github.com/biplab-iitb/practNLPTools, https://pypi.python.org/pypi/practnlptools/1.0), 4 | to perform the chunking. 
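
A minimal sketch of a single annotation (assuming practnlptools and its bundled SENNA
binaries are installed and working; the example sentence is arbitrary):

    from practnlptools.tools import Annotator
    annotator = Annotator()
    senna = annotator.getAnnotations('man in a red shirt on the left')
    senna['chunk']   # per-token (word, chunk-tag) pairs with NP/VP/PP/O-style tags
    senna['pos']     # per-token (word, POS) pairs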
5 | 6 | The results will be saved in cache/chunked_sents/dataset_splitBy/sents.json 7 | The sents.json = [{sent_id, sent, senna}], where senna = {chunk, pos, srl, syntax_tree, verbs, words, ner} 8 | """ 9 | import sys 10 | import os 11 | import os.path as osp 12 | from pprint import pprint 13 | from Queue import Queue 14 | from threading import Thread, Lock 15 | import time 16 | import argparse 17 | import json 18 | # import SENNA tool 19 | from practnlptools.tools import Annotator 20 | 21 | def senna_sents(sents, params): 22 | """ 23 | The input sents is list of [{sent_id, sent, raw, tokens}] 24 | Return sents of [{sent_id, sent, raw, tokens, chunk}] 25 | """ 26 | num_sents = len(sents) 27 | 28 | # enqueue 29 | q = Queue() 30 | for i in range(num_sents): 31 | q.put((i, sents[i])) 32 | 33 | # work: dequeue and do job 34 | def worker(): 35 | annotator = Annotator() 36 | while True: 37 | i, sent = q.get() 38 | try: 39 | senna = annotator.getAnnotations(sent['sent']) 40 | except: 41 | print('exception found.') 42 | senna = annotator.getAnnotations('none') 43 | if i % 100 == 0: 44 | print('%s/%s done.' % (i, num_sents)) 45 | sents[i]['senna'] = senna 46 | sents[i]['senna'].pop('dep_parse', None) # including chunk, pos, srl, syntax_tree, verbs, words, ner 47 | q.task_done() 48 | 49 | # workers 50 | for w in range(params['num_workers']): 51 | t = Thread(target=worker) 52 | t.daemon = True 53 | t.start() 54 | q.join() 55 | 56 | 57 | def main(params): 58 | 59 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 60 | if not osp.isdir('cache/senna_sents/' + dataset_splitBy): 61 | os.makedirs('cache/senna_sents/' + dataset_splitBy) 62 | 63 | # we have to prepare current folder path 64 | # practnlptools might change current folder to python's site-packages 65 | cur_folder = os.path.abspath('.') 66 | 67 | # load refer 68 | sys.path.insert(0, 'pyutils/refer') 69 | from refer import REFER 70 | refer = REFER(params['data_root'], params['dataset'], params['splitBy']) 71 | 72 | # read sents and pop unnecessary keys 73 | sents = refer.Sents.values() 74 | for sent in sents: 75 | sent.pop('raw', None) 76 | 77 | # parse sents 78 | senna_sents(sents, params) 79 | 80 | # save results 81 | output_path = osp.join(cur_folder, 'cache/senna_sents/'+dataset_splitBy, 'sents.json') 82 | with open(output_path, 'w') as io: 83 | json.dump(sents, io) 84 | print('senna parsed sents.json saved in %s.' % output_path) 85 | 86 | 87 | if __name__ == '__main__': 88 | 89 | # input 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('--data_root', default='data', help='dataset root directory') 92 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 93 | parser.add_argument('--splitBy', default='unc', help='split By') 94 | parser.add_argument('--num_workers', type=int, default=2, help='number of workers') 95 | args = parser.parse_args() 96 | params = vars(args) 97 | 98 | # main 99 | main(params) -------------------------------------------------------------------------------- /write_atts_html.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path as osp 4 | from pprint import pprint 5 | import time 6 | import argparse 7 | import json 8 | 9 | attribute_names = {'r1': 'entry-level name', 'r2': 'color', 'r3': 'size', 'r4': 'abs. location', 10 | 'r5': 'rel. location', 'r6': 'rel. 
object', 'r7': 'other atts.', 'r8': 'left words'} 11 | 12 | def analyze(sents): 13 | # do some statistics 14 | usage = {'r1': 0, 'r2': 0, 'r3': 0, 'r4': 0, 'r5': 0, 'r6': 0, 'r7': 0, 'r8': 0} 15 | for sent in sents: 16 | for r in usage: 17 | usage[r] = usage[r] + 1 if sent['atts'][r] != ['none'] else usage[r] 18 | for r in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8']: 19 | usage[r] /= float(len(sents)) 20 | print('Usage of %s is %.2f%%.' % (r, usage[r] * 100)) 21 | return usage 22 | 23 | def main(params): 24 | 25 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 26 | if not osp.isdir('cache/atts_html/' + dataset_splitBy): 27 | os.makedirs('cache/atts_html/' + dataset_splitBy) 28 | 29 | # load parsed sents with attributes, where sents.json = 30 | # [{sent_id, sent, parse, atts, left, raw, tokens}] 31 | # where parse = {dependencies, parsetree, text, workds}, atts = {r1, r2, ...}, left = [(wd, POS)] 32 | path_to_parsed_atts = osp.join('cache/parsed_atts', dataset_splitBy, 'sents.json') 33 | sents = json.load(open(path_to_parsed_atts)) 34 | 35 | # analyze 36 | usage = analyze(sents) 37 | 38 | # write htmls 39 | num_per_page = params['num_per_page'] 40 | for page_id, s in enumerate(range(0, len(sents), num_per_page)): 41 | html = open(osp.join('cache/atts_html', dataset_splitBy, str(page_id)+'.html'), 'w') 42 | html.write('

Show %s sentences and their attribute parses.' % len(sents)) 43 | html.write('') 44 | html.write('') 45 | html.write('') 46 | html.write('') 47 | html.write('') 48 | html.write('') 49 | html.write('') 50 | for j in range(s, min(s+num_per_page, len(sents))): 51 | if j % 2 == 0: 52 | color_str = '#eef' 53 | else: 54 | color_str = '#fee' 55 | # fetch info of this sent 56 | sent_id = sents[j]['sent_id'] 57 | sent_txt = sents[j]['sent'].encode('ascii', 'ignore').decode('ascii') 58 | atts = sents[j]['atts'] 59 | left = sents[j]['left'] 60 | # write a row of the info 61 | html.write('' % (color_str, j)) 62 | html.write('' % sent_id) 63 | html.write('' % sent_txt) 64 | for r in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7']: 65 | att = atts[r][0] if atts[r][0] != 'none' else '' 66 | html.write('' % att) 67 | html.write('') 71 | html.write('') 72 | html.write('
sent_idReferring-expressionEntry-level nameColorSizeAbs. LocationRel. LocationRel. ObjectOtherLeft Words
%06d%s%s%s') 68 | for l in left: 69 | html.write('%s [%s], ' % (l[0], l[1])) 70 | html.write('
') 73 | html.write('') 74 | print('Page %s written.' % page_id) 75 | 76 | # write summary 77 | html = open(osp.join('cache/atts_html', dataset_splitBy, 'main.html'), 'w') 78 | html.write('') 79 | html.write('') 80 | for r in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8']: 81 | html.write('

usage of %s [%s] is %.2f%%

' % (r, attribute_names[r], usage[r]*100)) 82 | html.write('
    ') 83 | for page_id, s in enumerate(range(0, len(sents), num_per_page)): 84 | page_html = str(page_id)+'.html' 85 | print(page_html) 86 | html.write('
  • page_id%s
  • ' % (page_html, page_id)) 87 | html.write('
') 88 | 89 | 90 | if __name__ == '__main__': 91 | 92 | # input 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 95 | parser.add_argument('--splitBy', default='unc', help='split By') 96 | parser.add_argument('--num_per_page', type=int, default=10000, help='number of pages to be written') 97 | args = parser.parse_args() 98 | params = vars(args) 99 | 100 | # main 101 | main(params) 102 | -------------------------------------------------------------------------------- /write_chunk_html.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path as osp 4 | from pprint import pprint 5 | import time 6 | import argparse 7 | import json 8 | import operator 9 | import random 10 | random.seed(8) 11 | 12 | def write_structures(html, sents): 13 | """ 14 | The input sents = [{sent_id, sent, chunk, NPs, senna, tokens}] 15 | where chunk is list of [(phrase, phrase_type)], and NPs is list of noun phrases 16 | We analyze phrase structure 17 | """ 18 | struct_to_num = {} 19 | struct_to_examples = {} 20 | for sent in sents: 21 | chunk = sent['chunk'] 22 | struct = ' '.join([ck[1] for ck in chunk]) 23 | struct_to_num[struct] = struct_to_num.get(struct, 0) + 1 24 | if struct not in struct_to_examples: 25 | struct_to_examples[struct] = [] 26 | struct_to_examples[struct] += [sent['sent']] 27 | sorted_structs = sorted(struct_to_num.items(), key=operator.itemgetter(1)) 28 | sorted_structs.reverse() 29 | 30 | html.write('
30 |     html.write('<table border>')
31 |     html.write('<tr style="font-weight:bold">')
32 |     html.write('<td></td><td>Top Phrase Structs</td><td>Number</td><td>Percentage</td><td>Accumulated</td><td>Example</td></tr>')
33 |     total_num = sum(struct_to_num.values())
34 |     acc = 0
35 |     for j, (struct, num) in enumerate(sorted_structs[:20]):
36 |         acc += num
37 |         html.write('<tr><td>%02d</td>' % j)
38 |         html.write('<td>%s</td>' % struct)
39 |         html.write('<td>%s</td>' % num)
40 |         html.write('<td>%.2f%%</td>' % (num*100.0/total_num))
41 |         html.write('<td>%.2f%%</td>' % (acc*100.0/total_num))
42 |         html.write('<td>%s</td>' % (random.choice(struct_to_examples[struct])))
43 |         html.write('</tr>')
44 |     html.write('</table>')
45 |     html.write('<br>')
46 | 
47 | def write_info(html, sents):
48 |     # NP usage in the raw chunks
49 |     NP_usage = 0
50 |     for sent in sents:
51 |         chunk = sent['chunk']
52 |         NPs = [ck for ck in chunk if ck[1] == 'NP']
53 |         if len(NPs) > 0:
54 |             NP_usage += 1
55 |     html.write('<br>%.2f%% expressions have NPs' % (NP_usage*100.0/len(sents)))
56 | 
57 |     # NP usage in the filtered NPs
58 |     cleaned_NP_usage = 0
59 |     for sent in sents:
60 |         if len(sent['NPs']) > 0:
61 |             cleaned_NP_usage += 1
62 |     html.write(', and %.2f%% cleaned NPs.<br>' % (cleaned_NP_usage*100.0/len(sents)))
63 | 
64 |     # average #NP in each expression
65 |     total_NPs, total_cleaned_NPs, total_PPs, total_VPs, total_ADVPs, total_ADJPs = 0, 0, 0, 0, 0, 0
66 |     total_wds = 0
67 |     total_NP_wds = 0
68 |     total_cleaned_NP_wds = 0
69 |     for sent in sents:
70 |         for ck in sent['chunk']:
71 |             if ck[1] == 'NP':
72 |                 total_NPs += 1
73 |                 total_NP_wds += len(ck[0].split())
74 |             if ck[1] == 'PP':
75 |                 total_PPs += 1
76 |             if ck[1] == 'ADVP':
77 |                 total_ADVPs += 1
78 |             if ck[1] == 'ADJP':
79 |                 total_ADJPs += 1
80 |         total_wds += len(sent['tokens'])
81 |         # check cleaned NPs
82 |         total_cleaned_NPs += len(sent['NPs'])
83 |         total_cleaned_NP_wds += sum([len(phrase.split()) for phrase in sent['NPs']])
84 | 
85 |     html.write('<br>Each expression has %.2f NPs (%.2f cleaned NPs), %.2f PPs, %.2f ADVPs, %.2f ADJPs.<br>' % (total_NPs*1.0/len(sents),
86 |         total_cleaned_NPs*1.0 / len(sents), total_PPs*1.0/len(sents), total_ADVPs*1.0/len(sents), total_ADJPs*1.0/len(sents)))
87 |     html.write('<br>Each expression has %.2f words, among which are %.2f NP words.<br>' % (total_wds*1.0/len(sents), total_NP_wds*1.0/len(sents)))
88 |     html.write('<br>Each NP has %.2f words' % (total_NP_wds*1.0/total_NPs))
89 |     html.write(', and each cleaned NP has %.2f words.<br>' % (total_cleaned_NP_wds*1.0 / total_cleaned_NPs))
90 | 
91 | 
92 | def main(params):
93 | 
94 |     dataset_splitBy = params['dataset'] + '_' + params['splitBy']
95 |     if not osp.isdir('cache/chunk_html/' + dataset_splitBy):
96 |         os.makedirs('cache/chunk_html/' + dataset_splitBy)
97 | 
98 |     # load chunked sents = [{sent_id, sent, chunk, NPs, senna, tokens}]
99 |     # where chunk is list of [(phrase, phrase_type)]
100 |     # and NPs is list of noun phrases
101 |     path_to_chunked_sents = osp.join('cache/chunked_sents', dataset_splitBy, 'sents.json')
102 |     sents = json.load(open(path_to_chunked_sents))
103 | 
104 |     # write htmls
105 |     num_per_page = params['num_per_page']
106 |     for page_id, s in enumerate(range(0, len(sents), num_per_page)):
107 |         html = open(osp.join('cache/chunk_html', dataset_splitBy, str(page_id)+'.html'), 'w')
108 |         html.write('<html><body>Show %s sentences and their phrase structures.' % len(sents))
109 |         html.write('<table border>')
110 |         html.write('<tr style="font-weight:bold"><td></td>')
111 |         html.write('<td>sent_id</td>')
112 |         html.write('<td>Referring-expression</td>')
113 |         html.write('<td>Phrase Structure</td>')
114 |         html.write('<td>Noun Phrase(s)</td>')
115 |         html.write('<td>Noun Word(s)</td></tr>')
116 |         for j in range(s, min(s+num_per_page, len(sents))):
117 |             if j % 2 == 0:
118 |                 color_str = '#eef'
119 |             else:
120 |                 color_str = '#fee'
121 |             # fetch info of this sent
122 |             sent_id = sents[j]['sent_id']
123 |             sent_txt = sents[j]['sent'].encode('ascii', 'ignore').decode('ascii')
124 |             chunk_txt = ' '.join(['(%s, %s)' % (ck[0], ck[1]) for ck in sents[j]['chunk']])
125 |             NPs_txt = ' '.join(['(%s, NP)' % phrase for phrase in sents[j]['NPs']])
126 |             NNs_txt = ' '.join(['(%s, NN)' % phrase for phrase in sents[j]['NNs']])
127 |             # write a row of the info
128 |             html.write('<tr style="background-color:%s"><td>%06d</td>' % (color_str, j))
129 |             html.write('<td>%s</td>' % sent_id)
130 |             html.write('<td>%s</td>' % sent_txt)
131 |             html.write('<td>%s</td>' % chunk_txt)
132 |             html.write('<td>%s</td>' % NPs_txt)
133 |             html.write('<td>%s</td>' % NNs_txt)
134 |             html.write('</tr>')
135 |         html.write('</table>')
136 |         html.write('</body>')
137 |         html.write('</html>')
138 |         print('Page %s written.' % page_id)
139 | 
140 |     # write summary
141 |     html = open(osp.join('cache/chunk_html', dataset_splitBy, 'main.html'), 'w')
142 |     html.write('<html>')
143 |     html.write('<body>')
144 | 
145 |     # write phrase structures
146 |     write_structures(html, sents)
147 | 
148 |     # write other info
149 |     write_info(html, sents)
150 | 
151 |     # write pages
152 |     html.write('<ul>')
153 |     for page_id, s in enumerate(range(0, len(sents), num_per_page)):
154 |         page_html = str(page_id)+'.html'
155 |         print(page_html)
156 |         html.write('<li><a href="%s">page_id%s</a></li>' % (page_html, page_id))
157 |     html.write('</ul></body></html>')
158 | 
159 | 
160 | if __name__ == '__main__':
161 | 
162 |     # input
163 |     parser = argparse.ArgumentParser()
164 |     parser.add_argument('--dataset', default='refcoco', help='dataset name')
165 |     parser.add_argument('--splitBy', default='unc', help='split By')
166 |     parser.add_argument('--num_per_page', type=int, default=10000, help='number of sentences shown per page')
167 |     args = parser.parse_args()
168 |     params = vars(args)
169 | 
170 |     # main
171 |     main(params)
--------------------------------------------------------------------------------