├── .gitmodules
├── README.md
├── analyze_chunk.py
├── cache
│   └── .gitignore
├── chunk_sents.py
├── data
│   ├── .gitignore
│   └── README.md
├── models
│   ├── .gitignore
│   └── README.md
├── parse_atts.py
├── parse_sents.py
├── pyutils
│   ├── .gitignore
│   ├── __init__.py
│   ├── attparser
│   │   ├── __init__.py
│   │   ├── baseParser.py
│   │   ├── clefParser.py
│   │   ├── cocoParser.py
│   │   ├── cocoParser_punct.py
│   │   ├── config.py
│   │   ├── head.py
│   │   └── simpleParser.py
│   └── corenlp
│       ├── .gitignore
│       ├── __init__.py
│       ├── __main__.py
│       ├── client.py
│       ├── corenlp.py
│       ├── default.properties
│       └── progressbar.py
├── senna_sents.py
├── write_atts_html.py
└── write_chunk_html.py

/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "pyutils/refer"]
 2 | path = pyutils/refer
 3 | url = https://github.com/lichengunc/refer
 4 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # refer-parser2
 2 | Referring Expression Parser
 3 | 
 4 | 
 5 | ## Introduction
 6 | Our parser provides the following functions:
 7 | * parse sentences in multi-thread mode using Stanford CoreNLP and SENNA
 8 | * find the head noun word of each sentence
 9 | * find the 7 attribute words defined in the ReferitGame paper
10 | * chunk sentences into phrase structures
11 | * write HTML visualizations
12 | 
13 | ## Requirements
14 | This code is written in Python and requires the following libraries:
15 | ```bash
16 | practnlptools
17 | nltk
18 | corenlp
19 | unidecode
20 | ```
21 | We keep only a pruned core of corenlp-python in this repository; the original repository can be found [here](https://bitbucket.org/jeremybmerrill/corenlp-python.git).
22 | Note that this (our) corenlp is able to load [v3.5.1](http://nlp.stanford.edu/software/stanford-corenlp-full-2015-01-29.zip) and [v3.5.2](http://nlp.stanford.edu/software/stanford-corenlp-full-2015-04-20.zip), but not v3.6.0.
23 | Also note that the Stanford NLP group switched to the Universal Dependencies standard as of v3.5.2.
24 | We also use [SENNA](http://ronan.collobert.com/senna/)'s Python wrapper, [practnlptools](https://pypi.python.org/pypi/practnlptools/1.0), to chunk each sentence into phrase structures.
25 | 
26 | ## How to use
27 | 1a) Parse expressions using the Stanford parser:
28 | ```bash
29 | python parse_sents.py --dataset refcoco --splitBy unc --num_workers 4
30 | ```
31 | 1b) Parse expressions into [Vicente's R1-R7 attributes](http://tamaraberg.com/papers/referit.pdf):
32 | ```bash
33 | python parse_atts.py --dataset refcoco --splitBy unc
34 | ```
35 | 1c) Visualize the decomposed attributes:
36 | ```bash
37 | python write_atts_html.py --dataset refcoco --splitBy unc
38 | ```
39 | 
40 | 2a) Parse expressions using the SENNA parser:
41 | ```bash
42 | python senna_sents.py --dataset refcoco --splitBy unc --num_workers 4
43 | ```
44 | 2b) Chunk expressions into phrase structures:
45 | ```bash
46 | python chunk_sents.py --dataset refcoco --splitBy unc
47 | ```
48 | 2c) Analyze the phrase structures from the chunking results:
49 | ```bash
50 | python analyze_chunk.py --dataset refcoco --splitBy unc
51 | ```
52 | 2d) Visualize the phrase structures:
53 | ```bash
54 | python write_chunk_html.py --dataset refcoco --splitBy unc
55 | ```
56 | 
57 | ## Download
58 | * [**Parsed expressions**](http://bvision.cs.unc.edu/licheng/MattNet/refer-parser2/cache/parsed_atts.zip) using [Vicente's R1-R7 attributes](http://tamaraberg.com/papers/referit.pdf)
59 | 
60 | ### License
61 | BSD License.
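### Example: running the full pipeline
As a quick reference, the steps above can be chained as below. This is a sketch assuming the default `refcoco`/`unc` split and that the data and models have been downloaded as described in `data/README.md` and `models/README.md` (the `pyutils/refer` submodule also needs to be initialized).
```bash
# Stanford branch: dependency parsing -> R1-R7 attribute decomposition -> HTML visualization
python parse_sents.py --dataset refcoco --splitBy unc --num_workers 4
python parse_atts.py --dataset refcoco --splitBy unc
python write_atts_html.py --dataset refcoco --splitBy unc

# SENNA branch: SENNA parsing -> chunking -> analysis -> HTML visualization
python senna_sents.py --dataset refcoco --splitBy unc --num_workers 4
python chunk_sents.py --dataset refcoco --splitBy unc
python analyze_chunk.py --dataset refcoco --splitBy unc
python write_chunk_html.py --dataset refcoco --splitBy unc
```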
62 | 63 | -------------------------------------------------------------------------------- /analyze_chunk.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path as osp 4 | from pprint import pprint 5 | import time 6 | import argparse 7 | import json 8 | import operator 9 | import random 10 | 11 | def analyze_structure(sents): 12 | """ 13 | The input sents = [{sent_id, sent, chunk, NPs, senna, tokens}] 14 | where chunk is list of [(phrase, phrase_type)], and NPs is list of noun phrases 15 | We analyze phrase structure 16 | """ 17 | struct_to_num = {} 18 | struct_to_examples = {} 19 | for sent in sents: 20 | chunk = sent['chunk'] 21 | struct = ' '.join([ck[1] for ck in chunk]) 22 | struct_to_num[struct] = struct_to_num.get(struct, 0) + 1 23 | if struct not in struct_to_examples: 24 | struct_to_examples[struct] = [] 25 | struct_to_examples[struct] += [sent['sent']] 26 | sorted_structs = sorted(struct_to_num.items(), key=operator.itemgetter(1)) 27 | sorted_structs.reverse() 28 | 29 | print('%25s: %10s %6s %8s' % ('structure', 'number', 'perc.', 'acc.')) 30 | total_num = sum(struct_to_num.values()) 31 | acc = 0 32 | for struct, num in sorted_structs[:20]: 33 | acc += num 34 | print('%25s: %10d %6.3f%% %4.3f%%, e.g., %s' % (struct, num, num*100.0/total_num, acc*100.0/total_num, random.choice(struct_to_examples[struct]))) 35 | 36 | def analyze_NP(sents): 37 | # NP usage in the raw chunks 38 | NP_usage = 0 39 | for sent in sents: 40 | chunk = sent['chunk'] 41 | NPs = [ck for ck in chunk if ck[1] == 'NP'] 42 | if len(NPs) > 0: 43 | NP_usage += 1 44 | print('%.2f%% (%s/%s) expressions have NPs.' % (NP_usage*100.0/len(sents), NP_usage, len(sents))) 45 | 46 | # NP usage in the filtered NPs 47 | cleaned_NP_usage = 0 48 | for sent in sents: 49 | if len(sent['NPs']) > 0: 50 | cleaned_NP_usage += 1 51 | print('%.2f%% (%s/%s) expressions have cleaned NPs.' % (cleaned_NP_usage*100.0/len(sents), cleaned_NP_usage, len(sents))) 52 | 53 | # average #NP in each expression 54 | total_NPs, total_cleaned_NPs, total_PPs, total_VPs, total_ADVPs, total_ADJPs = 0, 0, 0, 0, 0, 0 55 | total_wds = 0 56 | total_NP_wds = 0 57 | total_cleaned_NP_wds = 0 58 | for sent in sents: 59 | for ck in sent['chunk']: 60 | if ck[1] == 'NP': 61 | total_NPs += 1 62 | total_NP_wds += len(ck[0].split()) 63 | if ck[1] == 'PP': 64 | total_PPs += 1 65 | if ck[1] == 'ADVP': 66 | total_ADVPs += 1 67 | if ck[1] == 'ADJP': 68 | total_ADJPs += 1 69 | total_wds += len(sent['tokens']) 70 | # check cleaned NPs 71 | total_cleaned_NPs += len(sent['NPs']) 72 | total_cleaned_NP_wds += sum([len(phrase.split()) for phrase in sent['NPs']]) 73 | 74 | print('Each expression and has %.2f NPs (%.2f cleaned NPs), %.2f PPs, %.2f ADVPs, %.2f ADJPs,' % (total_NPs*1.0/len(sents), 75 | total_cleaned_NPs*1.0 / len(sents), total_PPs*1.0/len(sents), total_ADVPs*1.0/len(sents), total_ADJPs*1.0/len(sents))) 76 | print('Each expression has %.2f words, among which are %.2f NP words.' % (total_wds/len(sents), total_NP_wds*1.0 / len(sents) )) 77 | print('Each NP has %.2f words.' % (total_NP_wds*1.0/total_NPs)) 78 | print('Each cleaned NP has %.2f words.' 
% (total_cleaned_NP_wds*1.0 / total_cleaned_NPs)) 79 | 80 | 81 | def main(params): 82 | 83 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 84 | if not osp.isdir('cache/chunk_html/' + dataset_splitBy): 85 | os.makedirs('cache/chunk_html/' + dataset_splitBy) 86 | 87 | # load chunked sents = [{sent_id, sent, chunk, NPs, senna, tokens}] 88 | # where chunk is list of [(phrase, phrase_type)] 89 | # and NPs is list of noun phrases 90 | path_to_chunked_sents = osp.join('cache/chunked_sents', dataset_splitBy, 'sents.json') 91 | sents = json.load(open(path_to_chunked_sents)) 92 | 93 | # analyze phrase structure 94 | analyze_structure(sents) 95 | 96 | # analyze the usage of NPs 97 | analyze_NP(sents) 98 | 99 | 100 | if __name__ == '__main__': 101 | 102 | # input 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 105 | parser.add_argument('--splitBy', default='unc', help='split By') 106 | parser.add_argument('--num_per_page', type=int, default=10000, help='number of pages to be written') 107 | args = parser.parse_args() 108 | params = vars(args) 109 | 110 | # main 111 | main(params) -------------------------------------------------------------------------------- /cache/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /chunk_sents.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code will convert senna's chunk 3 | [('Biplab', 'S-NP'), ('is', 'S-VP'), ('a', 'B-NP'), ('good', 'I-NP'), ('boy', 'E-NP'), ('.', 'O')] 4 | into [(NP, Biplab), (VP, is), (NP, good boy), (O, .)] 5 | 6 | We will also do cleaning on the chunked phrase, by excluding all location words like 'left', 'right', etc. 
7 | For example, (NP, right white dog) -> (NP, white dog) 8 | 9 | We read cache/senna_sents/dataset_splitBy/sents.json and save the chucking redsults into 10 | cache/chunked_sents/dataset_splitBy/sents.json 11 | """ 12 | import sys 13 | import os 14 | import os.path as osp 15 | from pprint import pprint 16 | import time 17 | import argparse 18 | import json 19 | 20 | # nltk's stopping words 21 | import nltk 22 | # nltk.data.path.append('/Users/liyu/Documents/nltk_data') 23 | nltk.data.path.append('/mnt/ilcompf6d0/user/liyu/Developments/nltk_data') 24 | from nltk.corpus import stopwords 25 | stop_words = stopwords.words("english") + ['.', ',', ':', '(', ')', '"', "'s", '!', "'", 26 | 'between', 'against', 'above', 'below', 'up', 'down', 'out', 'off', 'over'] 27 | stop_words.remove('and') # we may need 'and' token, e.g., black and white 28 | 29 | # location words 30 | location_words = ['right', 'left', 'top', 'bottom', 'middle', 'mid', 'second', '2nd', 'first', '1st', 'front', 31 | 'closest', 'nearest', 'center', 'central', 'third', '3rd', 'corner', 'upper', 'back', 'behind', 'far', 'anywhere', 32 | 'leftmost', 'lower', 'rightmost', 'farthest', 'furthest', 'next', 'last', 'fourth', '4th', 'up', 'above', 'below', 33 | 'down', 'side'] 34 | 35 | # color words 36 | color_words = ['white', 'green', 'blue', 'red', 'yellow', 'black', 'brown', 'pink', 'dark', 'darker', 'orange', 37 | 'gray', 'grey', 'purple', 'beige', 'bright'] 38 | 39 | # size words 40 | size_words = ['big', 'bigger', 'biggest', 'small', 'smaller', 'smallest', 'tall', 'taller', 'tallest', 'large', 41 | 'larger', 'largest', 'little', 'short', 'shorter', 'tiny', 'long', 'longer', 'longest', 'huge'] 42 | 43 | def extract_chunk(senna): 44 | """ 45 | senna = {chunk, pos, srl, syntax_tree, verbs, words, ner} 46 | where chunk = [(the, B-NP), (lady, E-NP), ...], there are B, I, E, S, O prefix in total. 47 | We extract the chunk in to [(phrase, phrase_type)], e.g., 48 | [('the lady', 'NP'), ('with', 'PP'), 'the blue shirt', 'NP'] 49 | 50 | Besides, we specifically deal with such case: 51 | sent = 'boy', senna's chunk = [('boy', 'O')], senna's pos = [('boy', 'NN')] 52 | We also consider this single word to be NP 53 | """ 54 | raw_chunk = senna['chunk'] 55 | chunk = [] 56 | phrase, pix = '', 0 57 | for c in raw_chunk: 58 | if pix > 0: 59 | phrase += ' ' 60 | phrase += c[0] 61 | pix += 1 62 | if 'E-' in c[1] or 'S-' in c[1]: 63 | ptype = c[1][2:] 64 | chunk += [(phrase, ptype)] 65 | phrase, pix = '', 0 66 | if c[1] == 'O': 67 | if len(raw_chunk) == 1: 68 | if senna['pos'][0][1] == 'NN': # when sentence = 'boy', senna ouputs 'O' but we take it as 'NP' 69 | chunk += [(phrase, 'NP')] 70 | else: 71 | chunk += [(phrase, 'O')] 72 | else: 73 | chunk += [(phrase, 'O')] 74 | phrase, pix = '', 0 75 | # in case the last phrase has no "-E" to finish 76 | if phrase != '': 77 | chunk += [(phrase, c[1][2:])] 78 | return chunk 79 | 80 | def extract_NPs(chunk): 81 | """ 82 | Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), 'the blue shirt', 'NP'], 83 | we extract the NPs with stopping and location words filtered out, and return list of noun phrases. 
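    For example (an illustrative case, not taken from the data), the chunk
    [('the left lady', 'NP'), ('in', 'PP'), ('the blue shirt', 'NP')]
    would give NPs = ['lady', 'blue shirt'], since 'the' is a stop word and
    'left' is a location word; color words are kept here and only dropped
    later in extract_NNs.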
84 | """ 85 | forbid_wds = stop_words + location_words 86 | NPs = [] 87 | for phrase, ptype in chunk: 88 | if ptype == 'NP': 89 | filtered_wds = [] 90 | for wd in phrase.split(): 91 | if wd not in forbid_wds: 92 | filtered_wds += [wd] 93 | if len(' '.join(filtered_wds)) > 0: 94 | NPs += [' '.join(filtered_wds)] 95 | return NPs 96 | 97 | def extract_NNs(chunk, pos): 98 | """ 99 | Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), 'the blue shirt', 'NP'], 100 | and pos [(word, pos)], e.g., [('man', 'NN')] 101 | we extract from NPs with stopping, location, color, size words filtered out, 102 | and return list of NN words only. 103 | """ 104 | forbid_wds = stop_words + location_words + color_words + size_words 105 | NNs = [] 106 | for phrase, ptype in chunk: 107 | if ptype == 'NP': 108 | filtered_wds = [] 109 | for wd in phrase.split(): 110 | wd_pos = [p[1] for p in pos if p[0] == wd][0] 111 | if wd not in forbid_wds and wd_pos != 'JJ' and wd_pos != 'CD': # we don't need JJ nor CD words neither. 112 | filtered_wds += [wd] 113 | if len(' '.join(filtered_wds)) > 0: 114 | NNs += [' '.join(filtered_wds)] 115 | return NNs 116 | 117 | def main(params): 118 | 119 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 120 | if not osp.isdir('cache/chunked_sents/'+dataset_splitBy): 121 | os.makedirs('cache/chunked_sents/'+dataset_splitBy) 122 | 123 | # load senna_sents = [{sent_id, tokens, sent, senna}] 124 | # where senna = {chunk, pos, srl, syntax_tree, verbs, words, ner} 125 | path_to_senna_sents = osp.join('cache/senna_sents', dataset_splitBy, 'sents.json') 126 | sents = json.load(open(path_to_senna_sents)) 127 | 128 | # chunk convert 129 | for i, sent in enumerate(sents): 130 | senna = sent['senna'] 131 | chunk = extract_chunk(senna) 132 | NPs = extract_NPs(chunk) 133 | NNs = extract_NNs(chunk, senna['pos']) 134 | # deal with special case: chunk failed 135 | # won't extract NPs nor NNs for this faked ones. 136 | if ' '.join([ck[0] for ck in chunk]) == 'none': 137 | print('raise chunk error!') 138 | chunk = [(sent['sent'], 'NP')] 139 | sent['chunk'] = chunk 140 | sent['NPs'] = NPs 141 | sent['NNs'] = NNs 142 | if i % 1000 == 0: 143 | print('%s/%s done.' % (i+1, len(sents))) 144 | 145 | # save 146 | cur_folder = os.path.abspath('.') 147 | output_path = osp.join(cur_folder, 'cache/chunked_sents/'+dataset_splitBy, 'sents.json') 148 | with open(output_path, 'w') as io: 149 | json.dump(sents, io) 150 | print('chunked_sents saved in %s.' % output_path) 151 | 152 | 153 | if __name__ == '__main__': 154 | 155 | # input 156 | parser = argparse.ArgumentParser() 157 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 158 | parser.add_argument('--splitBy', default='unc', help='dataset name') 159 | args = parser.parse_args() 160 | params = vars(args) 161 | 162 | # main 163 | main(params) -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !README.md 3 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Download 2 | Download my cleaned data and extract them into this folder. 
 3 | - 1) http://tlberg.cs.unc.edu/licheng/referit/data/refclef.zip
 4 | - 2) http://tlberg.cs.unc.edu/licheng/referit/data/refcoco.zip
 5 | - 3) http://tlberg.cs.unc.edu/licheng/referit/data/refcoco+.zip
 6 | - 4) http://tlberg.cs.unc.edu/licheng/referit/data/refcocog.zip
 7 | 
 8 | Besides, make a folder named "images".
 9 | Add "mscoco" into "images/".
10 | Download the MSCOCO images from [mscoco](http://mscoco.org/dataset/#overview).
11 | 
12 | Add "saiapr_tc-12" into "images/". I only extracted the related images, a 19,997-image subset of the original [imageCLEF](http://imageclef.org/SIAPRdata) collection. Please download the subset [here](http://tlberg.cs.unc.edu/licheng/referit/data/images/saiapr_tc-12.zip).
13 | 
--------------------------------------------------------------------------------
/models/.gitignore:
--------------------------------------------------------------------------------
 1 | *
 2 | !README.md
 3 | 
--------------------------------------------------------------------------------
/models/README.md:
--------------------------------------------------------------------------------
 1 | This folder should contain two pre-trained models:
 2 | 1. corenlp v3.5.2
 3 | 2. googlenews-vectors-negative300.bin
 4 | Check the README.md in the main repository and download these two models into this folder.
 5 | 
--------------------------------------------------------------------------------
/parse_atts.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This code will call pyutils/attparser to parse each sentence into 7 attributes.
 3 | The parsing rules follow Vicente's paper "ReferitGame" (EMNLP 2014).
 4 | Specifically, r1 = entry-level name, r2 = color, r3 = size, r4 = abs. location,
 5 | r5 = rel. location, r6 = rel. object, r7 = generic, r8 = the leftover words
 6 | 
 7 | Before running this code, make sure you have already run parse_sents.py, whose output is
 8 | sents = [{sent_id, sent, parse, raw, tokens}]
 9 | The attparser will fetch the parse of each sent, then decompose it into the above categories.
10 | 
11 | The output will be saved in 'cache/parsed_atts/dataset_splitBy/sents.json', where
12 | sents = [{sent_id, sent, parse, raw, tokens, atts, left}]
13 | """
14 | import sys
15 | import os
16 | import os.path as osp
17 | from pprint import pprint
18 | import time
19 | import argparse
20 | import json
21 | from pyutils.attparser import cocoParser, clefParser
22 | # set nltk data path
23 | import nltk
24 | # nltk.data.path.append('/Users/liyu/Documents/nltk_data')
25 | nltk.data.path.append('/mnt/ilcompf6d0/user/liyu/Developments/nltk_data')
26 | 
27 | def analyze(sents):
28 |     # do some statistics
29 |     usage = {'r1': 0, 'r2': 0, 'r3': 0, 'r4': 0, 'r5': 0, 'r6': 0, 'r7': 0, 'r8': 0}
30 |     for sent in sents:
31 |         for r in usage:
32 |             usage[r] = usage[r] + 1 if sent['atts'][r] != ['none'] else usage[r]
33 |     for r in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8']:
34 |         usage[r] /= float(len(sents))
35 |         print('Usage of %s is %.2f%%.'
% (r, usage[r] * 100)) 36 | 37 | def main(params): 38 | 39 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 40 | if not osp.isdir('cache/parsed_atts/' + dataset_splitBy): 41 | os.makedirs('cache/parsed_atts/' + dataset_splitBy) 42 | 43 | # load parsed sents, where sents.json = 44 | # [{sent_id, sent, parse, raw, tokens}], where parse = {dependencies, parsetree, text, workds} 45 | path_to_parsed_sents = osp.join('cache/parsed_sents', dataset_splitBy, 'sents.json') 46 | sents = json.load(open(path_to_parsed_sents)) 47 | 48 | # parse attributes for each sent 49 | if 'refcoco' in params['dataset']: 50 | attparser = cocoParser.CocoParser() 51 | elif 'refclef' in params['dataset']: 52 | attparser = clefParser.ClefParser() 53 | 54 | for i, sent in enumerate(sents): 55 | parse = sent['parse'] 56 | try: 57 | attparser.reset(parse) 58 | sent['atts'] = attparser.decompose() # return list of atts, i.e., {r1: [man], r2: [blue], r3: [], ...} 59 | sent['left'] = attparser.leftWords() # return list of (wd, pos), excluding stopping words 60 | except: 61 | sent['atts'] = {'r1': ['none'], 'r2': ['none'], 'r3': ['none'], 'r4': ['none'], 'r5': ['none'], 62 | 'r6': ['none'], 'r7': ['none'], 'r8': ['none']} 63 | sent['left'] = attparser.leftWords() 64 | if i % 100 == 0: 65 | print('%s/%s has been decomposed into attributes r1-r8.' % (i+1, len(sents))) 66 | 67 | # analyze 68 | analyze(sents) 69 | 70 | # save 71 | with open(osp.join('cache/parsed_atts/', dataset_splitBy, 'sents.json'), 'w') as io: 72 | json.dump(sents, io) 73 | 74 | 75 | if __name__ == '__main__': 76 | 77 | # input 78 | parser = argparse.ArgumentParser() 79 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 80 | parser.add_argument('--splitBy', default='unc', help='split By') 81 | args = parser.parse_args() 82 | params = vars(args) 83 | 84 | # main 85 | main(params) 86 | 87 | -------------------------------------------------------------------------------- /parse_sents.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code parse sentences into dependencies, parsetree, text and workds using Stanford-CoreNLP-Parser, 3 | but current corenlp is only able to load v3.5.1 and v3.5.2. 4 | 5 | The parsed sentences are saved in cache/parsed_sents/dataset_splitBy/sents.json 6 | The sents.json = [{sent_id, sent, parse, raw, tokens}], where parse = {dependencies, parsetree, text, workds} 7 | """ 8 | import sys 9 | import os 10 | import os.path as osp 11 | from pprint import pprint 12 | from nltk.tree import * 13 | from Queue import Queue 14 | from threading import Thread, Lock 15 | import time 16 | import argparse 17 | import json 18 | from pyutils.corenlp import StanfordCoreNLP 19 | 20 | def load_corenlp(params): 21 | # load corenlp 22 | b = time.time() 23 | core = StanfordCoreNLP(params['corenlp_model']) 24 | print('corenlp model loaded in %.2f seconds.' % (time.time() - b)) 25 | return core 26 | 27 | def parse_sents(sents, params): 28 | """ 29 | The input sents is list of [{sent_id, sent, raw, tokens}] 30 | The parse results if {dependencies: [(det, dog, the), (root, ROOT, dog)...] 
31 | parsetree: u'(ROOT (NP (NP (DT the) (JJ left) (NN dog)) (PP (IN on) (NP (DT the) (NN tree)))))' 32 | text: u'the left dog on the tree' 33 | words: [(u'the', 34 | {u'CharacterOffsetBegin': u'0', 35 | u'CharacterOffsetEnd': u'3', 36 | u'Lemma': u'the', 37 | u'NamedEntityTag': u'O', 38 | u'PartOfSpeech': u'DT'}), ...]} 39 | Return sents = [{sent_id, sent, parse, raw, tokens}] 40 | """ 41 | num_sents = len(sents) 42 | 43 | # enqueue 44 | q = Queue() 45 | for i in range(num_sents): 46 | q.put((i, sents[i])) 47 | 48 | # work: dequeue and do job 49 | def worker(): 50 | core = load_corenlp(params) 51 | while True: 52 | i, sent = q.get() 53 | try: 54 | output = core.raw_parse(sent['sent'])['sentences'][0] 55 | except: 56 | output = core.raw_parse('none')['sentences'][0] 57 | if i % 100 == 0: 58 | print('%s/%s done.' % (i, num_sents)) 59 | sents[i]['parse'] = output 60 | q.task_done() 61 | 62 | # workers 63 | for w in range(params['num_workers']): 64 | t = Thread(target=worker) 65 | t.daemon = True 66 | t.start() 67 | q.join() 68 | 69 | 70 | def main(params): 71 | 72 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 73 | if not osp.isdir('cache/parsed_sents/'+dataset_splitBy): 74 | os.makedirs('cache/parsed_sents/'+dataset_splitBy) 75 | 76 | # load refer 77 | sys.path.insert(0, 'pyutils/refer') 78 | from refer import REFER 79 | refer = REFER(params['data_root'], params['dataset'], params['splitBy']) 80 | 81 | # parse sents 82 | sents = refer.Sents.values() 83 | parse_sents(sents, params) 84 | 85 | # save 86 | with open(osp.join('cache/parsed_sents/'+dataset_splitBy, 'sents.json'), 'w') as io: 87 | json.dump(sents, io) 88 | 89 | 90 | if __name__ == '__main__': 91 | 92 | # input 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('--data_root', default='data', help='dataset root directory') 95 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 96 | parser.add_argument('--splitBy', default='unc', help='split By') 97 | parser.add_argument('--corenlp_model', default='models/stanford-corenlp-full-2015-01-29') 98 | parser.add_argument('--num_workers', type=int, default=2, help='number of workers') 99 | args = parser.parse_args() 100 | params = vars(args) 101 | 102 | # main 103 | main(params) 104 | 105 | 106 | -------------------------------------------------------------------------------- /pyutils/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /pyutils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lichengunc/refer-parser2/a5214d0c4b086e1da5ccd92fd105d7c95a6f6fc3/pyutils/__init__.py -------------------------------------------------------------------------------- /pyutils/attparser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lichengunc/refer-parser2/a5214d0c4b086e1da5ccd92fd105d7c95a6f6fc3/pyutils/attparser/__init__.py -------------------------------------------------------------------------------- /pyutils/attparser/baseParser.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | """ 4 | BaseParser defines: 5 | reset: initialize parse, head word, rels and Deps. 
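leftWords: return the (word, POS) pairs left over after removing the r1-r7 attribute words and stop words.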
6 | """ 7 | 8 | from nltk.tree import * 9 | import sys 10 | from nltk.corpus import stopwords 11 | import os.path as osp 12 | import config 13 | import head 14 | 15 | class BaseParser(): 16 | def __init__(self, dataset): 17 | if dataset == 'refclef': 18 | self.config = config.configCLEF() 19 | self._headMode = 'vicente' 20 | elif dataset == 'refcoco' or dataset == 'refcoco+': 21 | self.config = config.configCOCO() 22 | self._headMode = 'licheng' 23 | else: 24 | print 'No configuration set yet.' 25 | sys.exit() 26 | 27 | def reset(self, parse): 28 | # load parse 29 | self._tree = Tree.fromstring(parse['parsetree']) 30 | self._dependencies = parse['dependencies'] 31 | self._words = parse['words'] 32 | self._text = parse['text'] 33 | 34 | # reset seven attributes 35 | self.r1, self.r2, self.r3, self.r4, self.r5, self.r6, self.r7 = [], [], [], [], [], [], [] 36 | 37 | # find head word 38 | self.head_word, _ = head.findHead(self._tree, mode = self._headMode) 39 | if self.head_word != '' and self.head_word != None: 40 | self.r1 = [wd[1]['Lemma'] for wd in self._words if wd[0] == self.head_word] 41 | self.r1 = [self.r1[0]] # we only need one 42 | else: 43 | self.r1 = ['none'] 44 | 45 | # dependency's relations that have 'prep' 46 | rels_prep = [dep for dep in self._dependencies if 'prep' in dep[0]] 47 | rels_prep_in = [dep for dep in self._dependencies if 'prep_in' in dep[0]] 48 | rels_prep_on = [dep for dep in self._dependencies if 'prep_on' in dep[0]] 49 | rels_prep_at = [dep for dep in self._dependencies if 'prep_at' in dep[0]] 50 | rels_prep_to = [dep for dep in self._dependencies if 'prep_to' in dep[0]] 51 | rels_prep_from = [dep for dep in self._dependencies if 'prep_from' in dep[0] or 'prepc_from' in dep[0]] 52 | rels_prep_of = [dep for dep in self._dependencies if 'prep_of' in dep[0]] 53 | rels_det = [dep for dep in self._dependencies if 'det' in dep[0]] 54 | 55 | # dependency's sources equal to head_word 56 | rels_direct = [dep for dep in self._dependencies if dep[1] == self.head_word] if self.r1[0]!='none' else [] 57 | direct_att_dep = [dep for dep in rels_direct if dep not in rels_prep + rels_det] 58 | prep_dep = [dep for dep in rels_direct if dep in rels_prep] 59 | prep_in_dep = [dep for dep in rels_direct if dep in rels_prep_in] 60 | prep_on_dep = [dep for dep in rels_direct if dep in rels_prep_on] 61 | prep_of_dep = [dep for dep in rels_direct if dep in rels_prep_of] 62 | prep_from_dep = [dep for dep in rels_direct if dep in rels_prep_from] 63 | prep_at_dep = [dep for dep in rels_direct if dep in rels_prep_at] 64 | prep_to_dep = [dep for dep in rels_direct if dep in rels_prep_to] 65 | 66 | # initialize types of dependencies 67 | self.rels = {} 68 | self.rels['prep'] = rels_prep 69 | self.rels['prep_in'] = rels_prep_in 70 | self.rels['prep_on'] = rels_prep_on 71 | self.rels['prep_at'] = rels_prep_at 72 | self.rels['prep_to'] = rels_prep_to 73 | self.rels['prep_from'] = rels_prep_from 74 | self.rels['prep_of'] = rels_prep_of 75 | 76 | # initialize types of dependencies whose source is head word 77 | # Deps denots Direct dependencies 78 | self.Deps = {} 79 | self.Deps['att'] = direct_att_dep 80 | self.Deps['prep'] = prep_dep 81 | self.Deps['prep_in'] = prep_in_dep 82 | self.Deps['prep_on'] = prep_on_dep 83 | self.Deps['prep_of'] = prep_of_dep 84 | self.Deps['prep_from'] = prep_from_dep 85 | self.Deps['prep_at'] = prep_at_dep 86 | self.Deps['prep_to'] = prep_to_dep 87 | 88 | def leftWords(self): 89 | all_wds = [word[0] for word in self._words] 90 | att_wds = [self.head_word] + self.r2 + 
self.r3 + self.r4 + self.r7 91 | # we then add r5, r6 to att_wds, need some tricks 92 | for wd in self.r5: 93 | if 'prep' in wd: 94 | wd = wd[5:] # prep_on_left -> on_left 95 | idx = wd.find('_') 96 | if idx >= 0: 97 | att_wds += [wd[:idx], wd[idx+1:]] 98 | else: 99 | att_wds += [wd] # prep_from -> from 100 | else: # ordinary_position, e.g., second_left 101 | idx = wd.find('_') 102 | att_wds += [wd[:idx], wd[idx+1:]] 103 | for wd in self.r6: 104 | att_wds = att_wds + [wd] if wd != 'self' else att_wds 105 | # the left word set 106 | left_wds = list(set(all_wds).difference(set(att_wds))) 107 | # word to POS dictionary 108 | wdToPOSs = {word[0]: [] for word in self._words} 109 | for word in self._words: 110 | wdToPOSs[word[0]] += [word[1]] 111 | # return left words 112 | # stopwds = ['the', 'of', 'a', 'an', ',', '.', 'on', 'in', 'from', 'at', 'of', 'to', 'and', 'or', '(', ')', 'that', 'this', 'it'] 113 | stopwds = stopwords.words("english") + ['.', ',', ':', '(', ')', '"', "'s", '!', 'between', 'against', 'above', 114 | 'below', 'up', 'down', 'out', 'off', 'over'] 115 | left_words = [(wd, wdToPOSs[wd][0]['PartOfSpeech']) for wd in left_wds if wd not in stopwds] 116 | return left_words 117 | 118 | if __name__ == '__main__': 119 | from pprint import pprint 120 | 121 | ROOT_DIR = osp.abspath('/playpen/licheng/Documents/referit') 122 | sys.path.insert(0, osp.join(ROOT_DIR, 'lib', 'utils')) 123 | from corenlp.corenlp import StanfordCoreNLP 124 | parser_path = osp.join(ROOT_DIR, 'lib', 'utils', 'corenlp', 'stanford-corenlp-full-2015-01-30') 125 | stanfordParser = StanfordCoreNLP(parser_path) 126 | 127 | sent = 'players close to us in dark uniform' 128 | parse = stanfordParser.raw_parse(sent)['sentences'][0] 129 | pprint(parse) 130 | 131 | attParser = BaseParser('refclef') 132 | attParser.reset(parse) 133 | 134 | 135 | -------------------------------------------------------------------------------- /pyutils/attparser/clefParser.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | """ 4 | r1: [lemma of head word] 5 | r2: [color word describing r1] 6 | r3: [size word describing r1] 7 | r4: [location word describing r1], e.g., upper dog, dog on the left (of the picture) 8 | r5: [relative location and object], e.g., person under the door, dog on the table, dog on the left of the cat 9 | r6: [generic attribute describing r1], i.e., other JJ attributes describing head word 10 | """ 11 | 12 | from baseParser import BaseParser 13 | 14 | class ClefParser(BaseParser): 15 | 16 | def __init__(self): 17 | BaseParser.__init__(self, 'refclef') 18 | 19 | def decompose(self): 20 | # r2: color 21 | color_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.color_table['words']] 22 | color_wds += [dep[3] for dep in self.Deps['prep_in'] if dep[3] in self.config.color_table['words']] 23 | for wd in color_wds: 24 | ix = self.config.color_table['wordtoix'][wd] 25 | self.r2 += [self.config.color_table['ixtoword'][ix]] 26 | 27 | # r3: size 28 | size_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.size_table['words']] 29 | for wd in size_wds: 30 | ix = self.config.size_table['wordtoix'][wd] 31 | self.r3 += [self.config.size_table['ixtoword'][ix]] 32 | 33 | # r4: absolute location 34 | # 1) left sth 35 | location_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.location_table['words']] 36 | # 2) sth in/on/at/to the left 37 | commonDeps = 
self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 38 | position_wds = [dep[3] for dep in commonDeps if dep[3] in self.config.position_table['words']] 39 | for wd in position_wds: 40 | of_exist = [dep[3] for dep in self.rels['prep_of'] if dep[1] == wd] 41 | if len(of_exist) == 0: 42 | location_wds += [wd] 43 | # 2) sth in/on/at/to the left of the image 44 | AllowWds = ['image', 'picture', 'im', 'pic'] 45 | location_wds += [dep[1] for dep in self.rels['prep_of'] if dep[1] in position_wds and dep[3] in AllowWds] 46 | # add to r4 47 | for wd in location_wds: 48 | ix = self.config.location_table['wordtoix'][wd] 49 | self.r4 += [self.config.location_table['ixtoword'][ix]] 50 | 51 | # r5, r6: relative location and object 52 | ''' 53 | e.g., sent = 'players at the door' 54 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 55 | ('det', 'door', '4', 'the', '3'), 56 | ('prep_at', 'players', '1', 'door', '4')] 57 | 58 | sent = 'players on the left of the dog' 59 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 60 | ('det', 'left', '4', 'the', '3'), 61 | ('prep_on', 'players', '1', 'left', '4'), 62 | ('det', 'dog', '7', 'the', '6'), 63 | ('prep_of', 'left', '4', 'dog', '7')] 64 | 65 | Note, in vicente's matlab, the parsing differs at adding punctuation in the end. 66 | ''' 67 | # 1) the dog from the river 68 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 69 | rel_pairs = [(dep[0], dep[3]) for dep in self.Deps['prep'] if dep[0] in self.config.relative_preps_table['words'] 70 | if dep[3] not in ForbiddenWds] 71 | for pair in rel_pairs: 72 | self.r5 += [pair[0]] 73 | self.r6 += [pair[1]] 74 | # 2) the dog on/in/at the table 75 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 76 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 77 | rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] not in ForbiddenWds] 78 | for pair in rel_pairs: 79 | self.r5 += [pair[0]] 80 | self.r6 += [pair[1]] 81 | # 3) the dog on/in/at/to the left of table 82 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 83 | rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] in self.config.position_table['words']] 84 | ForbiddenWds = ['image', 'picture', 'im', 'pic'] 85 | for rel, position_wd in rel_pairs: 86 | of_exist = [dep[3] for dep in self.rels['prep_of'] if dep[1] == position_wd] 87 | if len(of_exist) > 0: 88 | for of_object in of_exist: 89 | if of_object not in ForbiddenWds: 90 | self.r5 += [rel+'_'+position_wd] 91 | self.r6 += [of_object] 92 | 93 | # r7: generic attribute 94 | ForbiddenWds = self.config.size_table['words'] + self.config.color_table['words'] \ 95 | + self.config.location_table['words'] 96 | generic_wds = [dep[3] for dep in self.Deps['att'] if dep[3] not in ForbiddenWds] 97 | for gwd in generic_wds: 98 | gpos = [wd[1]['PartOfSpeech'] for wd in self._words if wd[0] == gwd][0] 99 | if gpos[:2] == 'JJ': 100 | self.r7 += [gwd] 101 | 102 | self.r2 = ['none'] if len(self.r2) == 0 else self.r2 103 | self.r3 = ['none'] if len(self.r3) == 0 else self.r3 104 | self.r4 = ['none'] if len(self.r4) == 0 else self.r4 105 | self.r5 = ['none'] if len(self.r5) == 0 else self.r5 106 | self.r6 = ['none'] if len(self.r6) == 0 else self.r6 107 | self.r7 = ['none'] if len(self.r7) == 0 else self.r7 108 | 109 | # left words -> r8 110 | left_wds = [word[0] for word in self.leftWords()] 111 | self.r8 = ['none'] if len(left_wds) 
== 0 else left_wds 112 | 113 | return {'r1': self.r1, 'r2': self.r2, 'r3': self.r3, 'r4': self.r4, 'r5': self.r5, 'r6': self.r6, 'r7': self.r7, 'r8': self.r8} 114 | 115 | 116 | # def decompose(self): 117 | # # r2: color 118 | # color_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.color_table['wordtoix']] 119 | # color_wds += [dep[3] for dep in self.Deps['prep_in'] if dep[3] in self.config.color_table['wordtoix']] 120 | # for wd in color_wds: 121 | # ix = self.config.color_table['wordtoix'][wd] 122 | # self.r2 += [self.config.color_table['ixtoword'][ix]] 123 | # 124 | # # r3: size 125 | # size_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.size_table['words']] 126 | # for wd in size_wds: 127 | # ix = self.config.size_table['wordtoix'][wd] 128 | # self.r3 += [self.config.size_table['ixtoword'][ix]] 129 | # 130 | # # r4: absolute location 131 | # # 1) left sth 132 | # location_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.location_table['words']] 133 | # # 2) sth in/on/at the left. 134 | # commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 135 | # position_deps = [dep for dep in commonDeps if dep[3] in self.config.position_table['words']] 136 | # if len(self.Deps['prep_of']) == 0: 137 | # location_wds += [dep[3] for dep in position_deps] 138 | # else: 139 | # # 3) sth in/on/at the left of the picture. 140 | # # 4) sth of sth in/on/at the left. Note we allow 'of' appear before 'left/right/...' 141 | # ForbiddenWds = ['image', 'picture', 'im', 'pic'] 142 | # position_id = min([dep[4] for dep in position_deps]) if len(position_deps) > 0 else 0 143 | # position_of_objects = [dep[3] for dep in self.Deps['prep_of'] if dep[3] not in ForbiddenWds and dep[4] > position_id] 144 | # if len(position_of_objects) == 0: 145 | # location_wds += [dep[3] for dep in position_deps] 146 | # 147 | # # add to r4 148 | # for wd in location_wds: 149 | # ix = self.config.location_table['wordtoix'][wd] 150 | # self.r4 += [self.config.location_table['ixtoword'][ix]] 151 | # 152 | # # r5, r6: relative location and object 153 | # ''' 154 | # e.g., sent = 'players at the door.' 155 | # dependencies = [('root', 'ROOT', '0', 'players', '1'), 156 | # ('det', 'door', '4', 'the', '3'), 157 | # ('prep_at', 'players', '1', 'door', '4')] 158 | # 159 | # sent = 'players on the left of the dog.' 160 | # dependencies = [('root', 'ROOT', '0', 'players', '1'), 161 | # ('det', 'left', '4', 'the', '3'), 162 | # ('prep_on', 'players', '1', 'left', '4'), 163 | # ('det', 'dog', '7', 'the', '6'), 164 | # ('prep_of', 'players', '1', 'dog', '7')] 165 | # 166 | # Following vicente's matlab, this parsing is for sentence with punctuation at the end. 167 | # ''' 168 | # # 1) the dog from the river. 169 | # ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 170 | # rel_pairs = [(dep[0], dep[3]) for dep in self.Deps['prep'] if dep[0] in self.config.relative_preps_table['words'] 171 | # if dep[3] not in ForbiddenWds] 172 | # for pair in rel_pairs: 173 | # self.r5 += [pair[0]] 174 | # self.r6 += [pair[1]] 175 | # # 2) the dog on/in/at the table. 
176 | # commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 177 | # ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 178 | # rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] not in ForbiddenWds] 179 | # for pair in rel_pairs: 180 | # self.r5 += [pair[0]] 181 | # self.r6 += [pair[1]] 182 | # # 3) the dog on/in/at/to the left of table. 183 | # # 4) the face of woman on the left of the window. Note we only detect position_of_objects 184 | # commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 185 | # ForbiddenWds = ['image', 'picture', 'im', 'pic'] 186 | # position_deps = [dep for dep in commonDeps if dep[3] in self.config.position_table['words']] 187 | # position_id = min([dep[4] for dep in position_deps]) if len(position_deps) > 0 else 0 # find the earliest position for 'left', 'right', 'top', ... 188 | # position_of_objects = [dep[3] for dep in self.Deps['prep_of'] if dep[3] not in ForbiddenWds and dep[4] > position_id] # 'of' must appear after position_id 189 | # for dep in position_deps: 190 | # if len(position_of_objects) > 0: 191 | # for of_object in position_of_objects: 192 | # self.r5 += [dep[0]+'_'+dep[3]] 193 | # self.r6 += [of_object] 194 | # 195 | # # r7: generic attribute 196 | # ForbiddenWds = self.config.size_table['words'] + self.config.color_table['words'] \ 197 | # + self.config.location_table['words'] 198 | # generic_wds = [dep[3] for dep in self.Deps['att'] if dep[3] not in ForbiddenWds] 199 | # for gwd in generic_wds: 200 | # gpos = [wd[1]['PartOfSpeech'] for wd in self._words if wd[0] == gwd][0] 201 | # if gpos[:2] == 'JJ': 202 | # self.r7 += [gwd] 203 | # 204 | # self.r2 = ['none'] if len(self.r2) == 0 else self.r2 205 | # self.r3 = ['none'] if len(self.r3) == 0 else self.r3 206 | # self.r4 = ['none'] if len(self.r4) == 0 else self.r4 207 | # self.r5 = ['none'] if len(self.r5) == 0 else self.r5 208 | # self.r6 = ['none'] if len(self.r6) == 0 else self.r6 209 | # self.r7 = ['none'] if len(self.r7) == 0 else self.r7 210 | # 211 | # # left words -> r8 212 | # left_wds = [word[0] for word in self.leftWords()] 213 | # self.r8 = ['none'] if len(left_wds) == 0 else left_wds 214 | # 215 | # return {'r1': self.r1, 'r2': self.r2, 'r3': self.r3, 'r4': self.r4, 'r5': self.r5, 'r6': self.r6, 'r7': self.r7, 'r8': self.r8} 216 | 217 | 218 | if __name__ == '__main__': 219 | import sys 220 | from pprint import pprint 221 | import os.path as osp 222 | ROOT_DIR = osp.abspath('/playpen/licheng/Documents/referit') 223 | sys.path.insert(0, osp.join(ROOT_DIR, 'lib', 'utils')) 224 | from corenlp.corenlp import StanfordCoreNLP 225 | parser_path = osp.join(ROOT_DIR, 'lib', 'utils', 'corenlp', 'stanford-corenlp-full-2015-01-30') 226 | stanfordParser = StanfordCoreNLP(parser_path) 227 | 228 | sent = 'woman in red shirt' 229 | parse = stanfordParser.raw_parse(sent)['sentences'][0] 230 | pprint(parse['dependencies']) 231 | 232 | attParser = ClefParser() 233 | attParser.reset(parse) 234 | pprint(attParser.decompose()) 235 | pprint(attParser.leftWords()) 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | -------------------------------------------------------------------------------- /pyutils/attparser/cocoParser.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | """ 4 | r1: [lemma of head word] 5 | r2: [color word describing r1] 6 | r3: [size word describing r1] 7 | r4: [location word describing r1], 
e.g., upper dog, dog on the left (of the picture) 8 | r5: [relative location and object], e.g., person under the door, dog on the table, dog on the left of the cat, 9 | Note, we also take "second cat from left", i.e., [r5 = second_left, r6 = self] 10 | r6: [generic attribute describing r1], i.e., other JJ and dep attributes describing head word 11 | """ 12 | 13 | from baseParser import BaseParser 14 | 15 | class CocoParser(BaseParser): 16 | 17 | def __init__(self): 18 | BaseParser.__init__(self, 'refcoco') 19 | 20 | def reset(self, parse): 21 | BaseParser.reset(self, parse) 22 | # Now, let's extract dependencies related to ordinary words 23 | self.Deps['ord'] = [dep for dep in self._dependencies if dep[1] == self.r1[0] and dep[3] in 24 | self.config.ordinal_table['words']] if self.r1[0]!='none' else [] 25 | self.rels['ord_prep'] = [dep for dep in self.rels['prep'] if dep[1] in self.config.ordinal_table['words']] 26 | 27 | def decompose(self): 28 | # r2: color 29 | color_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.color_table['wordtoix']] 30 | color_wds += [dep[3] for dep in self.Deps['prep_in'] if dep[3] in self.config.color_table['wordtoix']] 31 | for wd in color_wds: 32 | ix = self.config.color_table['wordtoix'][wd] 33 | self.r2 += [self.config.color_table['ixtoword'][ix]] 34 | 35 | # r3: size 36 | size_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.size_table['words']] 37 | for wd in size_wds: 38 | ix = self.config.size_table['wordtoix'][wd] 39 | self.r3 += [self.config.size_table['ixtoword'][ix]] 40 | 41 | # r4: absolute location 42 | location_wds = [] 43 | if len(self.Deps['ord']) + len(self.rels['ord_prep']) == 0: 44 | # 1) left sth (no ordinal word) 45 | location_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.location_table['words']] 46 | # 2) sth in/on/at/to the left (no ordinal word) 47 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 48 | position_wds = [dep[3] for dep in commonDeps if dep[3] in self.config.position_table['words']] 49 | for wd in position_wds: 50 | of_exist = [dep[3] for dep in self.rels['prep_of'] if dep[1] == wd] 51 | if len(of_exist) == 0: 52 | location_wds += [wd] 53 | # 3) sth in/on/at/to the left of the image (no ordinal word) 54 | AllowWds = ['image', 'picture', 'im', 'pic'] 55 | location_wds += [dep[1] for dep in self.rels['prep_of'] if dep[1] in position_wds and dep[3] in AllowWds] 56 | # add to r4 57 | for wd in location_wds: 58 | ix = self.config.location_table['wordtoix'][wd] 59 | self.r4 += [self.config.location_table['ixtoword'][ix]] 60 | 61 | # r5, r6: relative location and object 62 | ''' 63 | e.g., case 1 and 2: 64 | sent = 'players at the door' or 'players from the river' 65 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 66 | ('det', 'door', '4', 'the', '3'), 67 | ('prep_at', 'players', '1', 'door', '4')] 68 | case 3: 69 | sent = 'players on the left of the dog' 70 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 71 | ('det', 'left', '4', 'the', '3'), 72 | ('prep_on', 'players', '1', 'left', '4'), 73 | ('det', 'dog', '7', 'the', '6'), 74 | ('prep_of', 'left', '4', 'dog', '7')] 75 | case 4: 76 | sent = 'second left man' 77 | dependencies = [('root', 'ROOT', '0', 'man', '3'), 78 | ('amod', 'man', '3', 'second', '1'), 79 | ('amod', 'man', '3', 'left', '2')] 80 | case 5: 81 | sent = 'second man from left' 82 | dependencies = [('root', 'ROOT', '0', 'man', '2'), 83 | ('amod', 'man', '2', 'second', '1'), 84 | ('prepc_from', 
'man', '2', 'left', '4')] 85 | case 6: 86 | sent = 'man second from right' 87 | dependencies = [('root', 'ROOT', '0', 'second', '2'), 88 | ('nn', 'second', '2', 'man', '1'), 89 | ('prep_from', 'second', '2', 'right', '4')] 90 | case 7: 91 | sent = 'second from right man' 92 | dependencies = [('root', 'ROOT', '0', 'second', '1'), 93 | ('amod', 'man', '4', 'right', '3'), 94 | ('prep_from', 'second', '1', 'man', '4')] 95 | Note, in vicente's matlab, the parsing differs at adding punct in the end. 96 | ''' 97 | if len(self.Deps['ord']) + len(self.rels['ord_prep']) == 0: 98 | # 1) the dog from the river 99 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 100 | rel_pairs = [(dep[0], dep[3]) for dep in self.Deps['prep'] if dep[0] in self.config.relative_preps_table['words'] 101 | if dep[3] not in ForbiddenWds] 102 | for pair in rel_pairs: 103 | self.r5 += [pair[0]] 104 | self.r6 += [pair[1]] 105 | # 2) the dog on/in/at the table 106 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 107 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 108 | rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] not in ForbiddenWds] 109 | for pair in rel_pairs: 110 | self.r5 += [pair[0]] 111 | self.r6 += [pair[1]] 112 | # 3) players on the left of the dog 113 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 114 | rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] in self.config.position_table['words']] 115 | ForbiddenWds = ['image', 'picture', 'im', 'pic'] 116 | for rel, position_wd in rel_pairs: 117 | of_exist = [dep[3] for dep in self.rels['prep_of'] if dep[1] == position_wd] 118 | if len(of_exist) > 0: 119 | for of_object in of_exist: 120 | if of_object not in ForbiddenWds: 121 | self.r5 += [rel+'_'+position_wd] 122 | self.r6 += [of_object] 123 | else: 124 | position_wds, ordinary_wds = [], [] 125 | if len(self.Deps['ord']) > 0: 126 | ordinary_wds = [dep[3] for dep in self.Deps['ord']] 127 | # 4) second left man 128 | position_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.position_table['words']] 129 | # 5) second man from left 130 | position_wds += [dep[3] for dep in self.rels['prep_from'] if dep[3] in self.config.position_table['words']] 131 | if len(self.rels['ord_prep']) > 0: 132 | ordinary_wds = [dep[1] for dep in self.rels['ord_prep']] 133 | # 6) man second from right 134 | position_wds = [dep[3] for dep in self.rels['ord_prep'] if dep[3] in self.config.position_table['words']] 135 | # 7) second from right man 136 | position_wds += [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.position_table['words']] 137 | # add to r5 and r6 138 | if len(position_wds) > 0: 139 | self.r5 += [ordinary_wds[0]+'_'+position_wds[0]] 140 | self.r6 += ['self'] 141 | 142 | # r7: generic attribute 143 | ForbiddenWds = self.config.size_table['words'] + self.config.color_table['words'] + self.config.position_table['words'] \ 144 | + self.config.location_table['words'] + self.config.ordinal_table['words'] 145 | generic_wds = [dep[3] for dep in self.Deps['att'] if dep[3] not in ForbiddenWds] 146 | for gwd in generic_wds: 147 | gpos = [wd[1]['PartOfSpeech'] for wd in self._words if wd[0] == gwd][0] 148 | if gpos[:2] == 'JJ': 149 | self.r7 += [gwd] 150 | 151 | self.r2 = ['none'] if len(self.r2) == 0 else self.r2 152 | self.r3 = ['none'] if len(self.r3) == 0 else self.r3 153 | self.r4 = ['none'] if len(self.r4) == 0 else self.r4 154 | 
self.r5 = ['none'] if len(self.r5) == 0 else self.r5 155 | self.r6 = ['none'] if len(self.r6) == 0 else self.r6 156 | self.r7 = ['none'] if len(self.r7) == 0 else self.r7 157 | 158 | # left words -> r8 159 | left_wds = [word[0] for word in self.leftWords()] 160 | self.r8 = ['none'] if len(left_wds) == 0 else left_wds 161 | 162 | return {'r1': self.r1, 'r2': self.r2, 'r3': self.r3, 'r4': self.r4, 'r5': self.r5, 'r6': self.r6, 'r7': self.r7, 'r8': self.r8} 163 | 164 | 165 | if __name__ == '__main__': 166 | import sys 167 | from pprint import pprint 168 | import os.path as osp 169 | # set nltk data path 170 | import nltk 171 | nltk.data.path.append('/Users/liyu/Documents/nltk_data') 172 | sys.path.insert(0, '../..') 173 | from pyutils.corenlp import StanfordCoreNLP 174 | core = StanfordCoreNLP('../../models/stanford-corenlp-full-2015-01-29') 175 | 176 | # sent = 'face of woman to the left' 177 | # sent = 'guy in blue' 178 | sent = 'a sandal colour teddy bear in between the other two teddys' 179 | parse = core.raw_parse(sent)['sentences'][0] 180 | pprint(parse['dependencies']) 181 | 182 | attParser = CocoParser() 183 | attParser.reset(parse) 184 | pprint(attParser.decompose()) 185 | pprint(attParser.leftWords()) 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /pyutils/attparser/cocoParser_punct.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | """ 4 | r1: [lemma of head word] 5 | r2: [color word describing r1] 6 | r3: [size word describing r1] 7 | r4: [location word describing r1], e.g., upper dog, dog on the left (of the picture) 8 | r5: [relative location and object], e.g., person under the door, dog on the table, dog on the left of the cat, 9 | Note, we also take "second cat from left", i.e., [r5 = second_left, r6 = self] 10 | r6: [generic attribute describing r1], i.e., other JJ and dep attributes describing head word 11 | """ 12 | 13 | from baseParser import BaseParser 14 | 15 | class CocoParser(BaseParser): 16 | 17 | def __init__(self): 18 | BaseParser.__init__(self, 'refcoco') 19 | 20 | def reset(self, parse): 21 | BaseParser.reset(self, parse) 22 | # Now, let's extract dependencies related to ordinary words 23 | self.Deps['ord'] = [dep for dep in self._dependencies if dep[1] == self.r1[0] and dep[3] in 24 | self.config.ordinal_table['words']] if self.r1[0]!='none' else [] 25 | self.rels['ord_prep'] = [dep for dep in self.rels['prep'] if dep[1] in self.config.ordinal_table['words']] 26 | 27 | def decompose(self): 28 | # r2: color 29 | color_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.color_table['wordtoix']] 30 | color_wds += [dep[3] for dep in self.Deps['prep_in'] if dep[3] in self.config.color_table['wordtoix']] 31 | for wd in color_wds: 32 | ix = self.config.color_table['wordtoix'][wd] 33 | self.r2 += [self.config.color_table['ixtoword'][ix]] 34 | 35 | # r3: size 36 | size_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.size_table['words']] 37 | for wd in size_wds: 38 | ix = self.config.size_table['wordtoix'][wd] 39 | self.r3 += [self.config.size_table['ixtoword'][ix]] 40 | 41 | # r4: absolute location (no ordinal word) 42 | location_wds = [] 43 | if len(self.Deps['ord']) + len(self.rels['ord_prep']) == 0: 44 | # 1) left sth. 45 | location_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.location_table['words']] 46 | # 2) sth in/on the left. 
47 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 48 | position_deps = [dep for dep in commonDeps if dep[3] in self.config.position_table['words']] 49 | if len(self.Deps['prep_of']) == 0: 50 | location_wds += [dep[3] for dep in position_deps] 51 | else: 52 | # 3) sth in/on/at the left of the image. 53 | # 4) sth of sth in/on/at the left. 54 | ForbiddenWds = ['image', 'picture', 'im', 'pic'] 55 | position_id = min([dep[4] for dep in position_deps]) if len(position_deps) > 0 else 0 56 | position_of_objects = [dep[3] for dep in self.Deps['prep_of'] if dep[3] not in ForbiddenWds and dep[4] > position_id] 57 | if len(position_of_objects) == 0: 58 | location_wds += [dep[3] for dep in position_deps] 59 | # add to r4 60 | for wd in location_wds: 61 | ix = self.config.location_table['wordtoix'][wd] 62 | self.r4 += [self.config.location_table['ixtoword'][ix]] 63 | 64 | # r5, r6: relative location and object 65 | ''' 66 | e.g., case 1 and 2: 67 | sent = 'players at the door.' or 'players from the river.' 68 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 69 | ('det', 'door', '4', 'the', '3'), 70 | ('prep_at', 'players', '1', 'door', '4')] 71 | case 3: 72 | sent = 'players on the left of the dog.' 73 | dependencies = [('root', 'ROOT', '0', 'players', '1'), 74 | ('det', 'left', '4', 'the', '3'), 75 | ('prep_on', 'players', '1', 'left', '4'), 76 | ('det', 'dog', '7', 'the', '6'), 77 | ('prep_of', 'players', '1', 'dog', '7')] 78 | 79 | case 4: 80 | sent = 'second left man.' 81 | dependencies = [('root', 'ROOT', '0', 'man', '3'), 82 | ('amod', 'man', '3', 'second', '1'), 83 | ('amod', 'man', '3', 'left', '2')] 84 | case 5: 85 | sent = 'second man from left.' 86 | dependencies = [('root', 'ROOT', '0', 'left', '4'), 87 | ('amod', 'man', '2', 'second', '1'), 88 | ('nsubj', 'left', '4', 'man', '2'), 89 | ('prep', 'man', '2', 'from', '3')] 90 | case 6: 91 | sent = 'man second from right.' 92 | dependencies = [('root', 'ROOT', '0', 'second', '2'), 93 | ('nn', 'second', '2', 'man', '1'), 94 | ('prep_from', 'second', '2', 'right', '4')] 95 | case 7: 96 | sent = 'second from right man.' 97 | dependencies = [('root', 'ROOT', '0', 'second', '1'), 98 | ('amod', 'man', '4', 'right', '3'), 99 | ('prep_from', 'second', '1', 'man', '4')] 100 | Note, in vicente's matlab, the parsing differs at adding punct in the end. 101 | ''' 102 | if len(self.Deps['ord']) + len(self.rels['ord_prep']) == 0: 103 | # 1) the dog from the river 104 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 105 | rel_pairs = [(dep[0], dep[3]) for dep in self.Deps['prep'] if dep[0] in self.config.relative_preps_table['words'] 106 | if dep[3] not in ForbiddenWds] 107 | for pair in rel_pairs: 108 | self.r5 += [pair[0]] 109 | self.r6 += [pair[1]] 110 | # 2) the dog on/in/at the table 111 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at'] 112 | ForbiddenWds = self.config.position_table['words']+self.config.color_table['words'] 113 | rel_pairs = [(dep[0], dep[3]) for dep in commonDeps if dep[3] not in ForbiddenWds] 114 | for pair in rel_pairs: 115 | self.r5 += [pair[0]] 116 | self.r6 += [pair[1]] 117 | # 3) the dog on/in/at/to the left of table. 118 | # 4) the face of woman on the left of the window. 
Note we only detect position_of_objects 119 | commonDeps = self.Deps['prep_on']+self.Deps['prep_in']+self.Deps['prep_at']+self.Deps['prep_to'] 120 | ForbiddenWds = ['image', 'picture', 'im', 'pic'] 121 | position_deps = [dep for dep in commonDeps if dep[3] in self.config.position_table['words']] 122 | position_id = min([dep[4] for dep in position_deps]) if len(position_deps) > 0 else 0 # find the earliest position for 'left', 'right', 'top', ... 123 | position_of_objects = [dep[3] for dep in self.Deps['prep_of'] if dep[3] not in ForbiddenWds and dep[4] > position_id] # 'of' must appear after position_id 124 | for dep in position_deps: 125 | if len(position_of_objects) > 0: 126 | for of_object in position_of_objects: 127 | self.r5 += [dep[0]+'_'+dep[3]] 128 | self.r6 += [of_object] 129 | else: 130 | position_wds, ordinary_wds = [], [] 131 | if len(self.Deps['ord']) > 0: 132 | ordinary_wds = [dep[3] for dep in self.Deps['ord']] 133 | # 4) second left man 134 | position_wds = [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.position_table['words']] 135 | # 5) second man from left 136 | position_wds += [dep[3] for dep in self._dependencies if dep[3] in self.config.position_table['words']] # no pattern, so search from all dependencies 137 | if len(self.rels['ord_prep']) > 0: 138 | ordinary_wds = [dep[1] for dep in self.rels['ord_prep']] 139 | # 6) man second from right 140 | position_wds = [dep[3] for dep in self.rels['ord_prep'] if dep[3] in self.config.position_table['words']] 141 | # 7) second from right man 142 | position_wds += [dep[3] for dep in self.Deps['att'] if dep[3] in self.config.position_table['words']] 143 | # add to r5 and r6 144 | if len(position_wds) > 0: 145 | self.r5 += [ordinary_wds[0]+'_'+position_wds[0]] 146 | self.r6 += ['self'] 147 | 148 | # r7: generic attribute 149 | ForbiddenWds = self.config.size_table['words'] + self.config.color_table['words'] + self.config.position_table['words'] \ 150 | + self.config.location_table['words'] + self.config.ordinal_table['words'] 151 | generic_wds = [dep[3] for dep in self.Deps['att'] if dep[3] not in ForbiddenWds] 152 | for gwd in generic_wds: 153 | gpos = [wd[1]['PartOfSpeech'] for wd in self._words if wd[0] == gwd][0] 154 | if gpos[:2] == 'JJ': 155 | self.r7 += [gwd] 156 | 157 | self.r2 = ['none'] if len(self.r2) == 0 else self.r2 158 | self.r3 = ['none'] if len(self.r3) == 0 else self.r3 159 | self.r4 = ['none'] if len(self.r4) == 0 else self.r4 160 | self.r5 = ['none'] if len(self.r5) == 0 else self.r5 161 | self.r6 = ['none'] if len(self.r6) == 0 else self.r6 162 | self.r7 = ['none'] if len(self.r7) == 0 else self.r7 163 | 164 | # left words -> r8 165 | left_wds = [word[0] for word in self.leftWords()] 166 | self.r8 = ['none'] if len(left_wds) == 0 else left_wds 167 | 168 | return {'r1': self.r1, 'r2': self.r2, 'r3': self.r3, 'r4': self.r4, 'r5': self.r5, 'r6': self.r6, 'r7': self.r7, 'r8': self.r8} 169 | 170 | if __name__ == '__main__': 171 | import sys 172 | from pprint import pprint 173 | import os.path as osp 174 | ROOT_DIR = osp.abspath('/playpen/licheng/Documents/referit') 175 | sys.path.insert(0, osp.join(ROOT_DIR, 'lib', 'utils')) 176 | from corenlp.corenlp import StanfordCoreNLP 177 | parser_path = osp.join(ROOT_DIR, 'lib', 'utils', 'corenlp', 'stanford-corenlp-full-2015-01-30') 178 | stanfordParser = StanfordCoreNLP(parser_path) 179 | 180 | sent = 'a bunch of flower at the door.' 
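    # Note: this demo block assumes a local Stanford CoreNLP v3.5.x model at the
    # hard-coded ROOT_DIR/parser_path above; point parser_path at your own
    # download before running this file directly.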
181 | parse = stanfordParser.raw_parse(sent)['sentences'][0] 182 | pprint(parse['dependencies']) 183 | 184 | attParser = CocoParser() 185 | attParser.reset(parse) 186 | pprint(attParser.decompose()) 187 | pprint(attParser.leftWords()) 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /pyutils/attparser/config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | class config(): 4 | def __init__(self): 5 | self.attribute_names = ['entrylevel', 'color', 'size', 'absolute_location', 'relative_location', 'relative_object', 'generic'] 6 | 7 | def buildTable(self, words): 8 | table = {'wordtoix': {}, 'ixtoword': {}, 'words': []} 9 | for ix, wd in enumerate(words): 10 | if wd.find(',') > 0: 11 | jx = wd.find(',') 12 | wd1, wd2 = wd[:jx].strip(), wd[jx+1:].strip() 13 | table['wordtoix'][wd1], table['wordtoix'][wd2] = ix, ix 14 | table['ixtoword'][ix] = wd1 15 | else: 16 | table['wordtoix'][wd] = ix 17 | table['ixtoword'][ix] = wd 18 | table['words'] = table['wordtoix'].keys() 19 | return table 20 | 21 | 22 | class configCLEF(config): 23 | def __init__(self): 24 | config.__init__(self) 25 | # color 26 | self.color_words = ['white', 'green, greenish', 'blue, bluish', 'red', 'yellow, yellowish', 'black', 'brown, brownish', 27 | 'pink', 'dark, darker', 'orange', 'gray, grey', 'purple', 'beige', 'bright'] 28 | self.color_table = self.buildTable(self.color_words) 29 | # size 30 | self.size_words = ['big', 'small', 'tall', 'large', 'little', 'short', 'tiny', 'long', 'huge'] 31 | self.size_table = self.buildTable(self.size_words) 32 | # location 33 | self.location_words = ['right', 'left', 'top', 'bottom', 'middle, mid', 'second, 2nd', 'first, 1st', 'front', 34 | 'closest, nearest', 'center, central', 'third, 3rd', 'corner', 'upper', 'back, behind', 35 | 'far', 'anywhere', 'leftmost', 'lower', 'rightmost', 'farthest, furthest', 'next', 'last', 36 | 'fourth, 4th', 'up, above', 'below, down', 'side'] 37 | self.location_table = self.buildTable(self.location_words) 38 | # position 39 | self.position_words = ['right', 'left', 'top', 'bottom', 'middle, center, centre', 'front', 'back'] 40 | self.position_table = self.buildTable(self.position_words) 41 | # relative preps 42 | self.relative_preps_words = ['prep_above', 'prep_about', 'prep_below', 'prep_behind', 'prep_beneath', 'prep_beside', 43 | 'prep_between', 'prep_close_to', 'prep_by', 'prep_in_front_of', 'prep_against', 44 | 'prep_from', 'prep_next_to', 'prep_through', 'prep_under', 'prep_underneath', 'prep_with', 45 | 'prep_near', 'prep_inside'] 46 | self.relative_preps_table = self.buildTable(self.relative_preps_words) 47 | # ordinal number 48 | self.ordinal_words = ['first', 'second', 'third', 'fourth', 'fifth', 'most'] 49 | self.ordinal_table = self.buildTable(self.ordinal_words) 50 | 51 | class configCOCO(config): 52 | def __init__(self): 53 | config.__init__(self) 54 | # color 55 | self.color_words = ['white', 'green', 'blue', 'red', 'yellow', 'black', 'brown', 'pink', 'dark, darker', 'orange', 56 | 'gray', 'purple', 'beige', 'bright'] 57 | self.color_table = self.buildTable(self.color_words) 58 | # size 59 | self.size_words = ['big, bigger', 'small, smaller', 'tall, taller', 'large, larger', 'little', 'short, shorter', 60 | 'tiny', 'long, longer', 'huge'] 61 | self.size_table = self.buildTable(self.size_words) 62 | # location 
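        # A quick illustration of buildTable above (a sketch against the tables defined in this
        # class): comma-separated synonyms share one index, so in the location table built below
        # 'closest' and 'nearest' map to the same ix, ixtoword[ix] gives back 'closest', and
        # table['words'] lists every synonym.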
63 | self.location_words = ['right', 'left', 'top', 'bottom', 'middle, mid', 'front', 'closest, nearest', 'center, central', 64 | 'corner', 'upper', 'back, behind', 'far', 'leftmost', 'lower, low', 'rightmost', 65 | 'farthest, furthest', 'next', 'last', 'up, above', 'below, down', 'side'] 66 | self.location_table = self.buildTable(self.location_words) 67 | # position 68 | self.position_words = ['right', 'left', 'top', 'bottom', 'middle, center, centre', 'front', 'back'] 69 | self.position_table = self.buildTable(self.position_words) 70 | # relative preps 71 | self.relative_preps_words = ['prep_above', 'prep_about', 'prep_below', 'prep_behind', 'prep_beneath', 'prep_beside', 72 | 'prep_between', 'prep_close_to', 'prep_by', 'prep_in_front_of', 'prep_against', 73 | 'prep_from', 'prep_next_to', 'prep_through', 'prep_under', 'prep_underneath', 'prep_with', 74 | 'prep_near', 'prep_inside', 'prepc_from'] 75 | self.relative_preps_table = self.buildTable(self.relative_preps_words) 76 | # ordinal number 77 | self.ordinal_words = ['first', 'second', 'third', 'fourth', 'fifth'] 78 | self.ordinal_table = self.buildTable(self.ordinal_words) 79 | 80 | 81 | if __name__ == '__main__': 82 | c = configCOCO() 83 | print c.color_table 84 | print c.size_table 85 | print c.location_table 86 | print c.position_table['words'] 87 | print c.relative_preps_table 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /pyutils/attparser/head.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | from nltk.tree import * 4 | 5 | ing_allowed = ['duckling', 'frosting', 'something', 'anything', 'thing', 'king', 'nothing', 6 | 'ring', 'wing', 'darling', 'building', 'painting', 'everything', 'string', 7 | 'ceiling', 'pudding', ] 8 | not_allowed = ['first', 'second', 'third', 'fourth', 'front', 'fifth', 'right', 'left'] 9 | 10 | def findFirstbreadthFirst(T, label): 11 | # input: tree, and label ('NN' or 'NP') 12 | # return: tree, or None 13 | myqueue = [] 14 | label_len = len(label) 15 | for i in range(len(T)): 16 | myqueue.append(str(T[i])) # push the sons 17 | while len(myqueue) > 0: 18 | cur_T = Tree.fromstring( myqueue.pop(0) ) # pop the front node as current tree 19 | cur_label = cur_T.label() 20 | if len(cur_label)>=label_len and cur_label[:label_len] == label: 21 | if cur_T[0] in not_allowed: # in case parser take 'first', 'second' as noun 22 | continue 23 | if cur_T[0][-3:] == 'ing' and cur_T[0] not in ing_allowed: # in case parser take both 'man' and 'standing' as Noun for 'man standing under the tree' 24 | continue 25 | return cur_T 26 | else: 27 | if not isinstance(cur_T[0], str): # if not the leaf node, i.e., 'dog', 'tree' 28 | for i in range(len(cur_T)): 29 | myqueue.append(str(cur_T[i])) 30 | return None 31 | 32 | def findLastbreadthFirst(T, label): 33 | myqueue = [] 34 | label_len = len(label) 35 | for i in reversed(range(len(T))): 36 | myqueue.append(str(T[i])) # push the sons 37 | while len(myqueue) > 0: 38 | cur_T = Tree.fromstring( myqueue.pop(0) ) # pop the front node as current tree 39 | cur_label = cur_T.label() 40 | if len(cur_label)>=label_len and cur_label[:label_len] == label: 41 | if cur_T[0] in not_allowed: 42 | # in case parser take 'first', 'second' as noun 43 | continue 44 | if cur_T[0][-3:] == 'ing' and cur_T[0] not in ing_allowed: # in case parser take 'standing' as Noun for 'man standing under the tree' 45 | continue 46 | return cur_T 47 | else: 
48 | if not isinstance(cur_T[0], str): # if not the leaf node, i.e., 'dog', 'tree' 49 | for i in reversed(range(len(cur_T))): # push_back from the last to the first 50 | myqueue.append(str(cur_T[i])) 51 | return None 52 | 53 | def findHead(T, mode='vicente'): 54 | if mode == 'vicente': # find the left-most NP, and then its left-most NN 55 | if not T[0].label() == 'NP': 56 | foundNP = findFirstbreadthFirst(T[0], 'NP') 57 | if foundNP: 58 | head = findFirstbreadthFirst(foundNP, 'NN') 59 | else: 60 | head = findFirstbreadthFirst(T[0], 'NN') 61 | else: 62 | head = findFirstbreadthFirst(T[0], 'NN') 63 | if head == None: 64 | return None, -1 65 | else: 66 | head = head[0] 67 | idx = [pos[0] for pos in T.pos()].index(head) 68 | return head, idx 69 | elif mode == 'licheng': # find bottom-left NP first, then search its rightmost NN son 70 | np_exist = T 71 | np_found = findFirstbreadthFirst(np_exist, 'NP') 72 | while np_found: 73 | np_exist = np_found 74 | np_found = findFirstbreadthFirst(np_exist, 'NP') 75 | if np_exist != T: 76 | head_tr = findLastbreadthFirst(np_exist, 'NN') 77 | if not head_tr: # if this NP tree has no NN son, we just take the first NN as head. 78 | head_tr = findFirstbreadthFirst(T[0], 'NN') 79 | else: 80 | head_tr = findFirstbreadthFirst(T[0], 'NN') 81 | 82 | if head_tr == None or (head_tr != None and head_tr[0] in not_allowed): 83 | return None, -1 84 | else: 85 | head = head_tr[0] 86 | idx = [pos[0] for pos in T.pos()].index(head) 87 | return head, idx 88 | 89 | if __name__ == '__main__': 90 | import sys 91 | from pprint import pprint 92 | import os.path as osp 93 | sys.path.insert(0, '../..') 94 | from pyutils.corenlp import StanfordCoreNLP 95 | core = StanfordCoreNLP('../../models/stanford-corenlp-full-2015-01-29') 96 | 97 | # sent = "baseball man" 98 | # sent = 'a running person under the tree.' 99 | sent = 'a sandal colour teddy bear in between the other two teddys' 100 | parse = core.raw_parse(sent)['sentences'][0] 101 | parse_tree = parse['parsetree'] 102 | t = Tree.fromstring(parse_tree) 103 | t.draw() 104 | print t 105 | print parse['dependencies'] 106 | 107 | # vicente version 108 | head, idx = findHead(t, mode='vicente') 109 | print 'vicente - head: %s, idx: %s' % (head, idx) 110 | # licheng version 111 | head, idx = findHead(t, mode='licheng') 112 | print 'ylc_leftNP_rightNN - head: %s, idx: %s' % (head, idx) 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /pyutils/attparser/simpleParser.py: -------------------------------------------------------------------------------- 1 | """ 2 | This parser is to extract "real" relative position word, as well as absolute position word. 3 | For clefParser and cocoParser, there are quite amount of inaccurate (r5, r6), e.g., (prep_in, shirt) for "The boy in white shirt." 4 | On the one hand, this belongs to details of the referred object itself; On the other hand, this doesn't actually reflect position. 5 | Here, we extract: 6 | 1) NN, JJ, VB, ... as object's attribute words, 7 | 2) big, large, ... as object's relative size words, 8 | 3) left, right, top, ... as absolute position words, 9 | 4) object, ... as relative location and object pairs, 10 | 11 | Approach: we rely on the parsed r1-r7 from clef/coco Parser, and categorize them into the above four types. 
12 | 1) r1, r2, r7 and r8 into attribute words (without forbidden words) 13 | 2) check "prep_with" and "prep_in" of (r5, r6), put some of them into attribute words 14 | 3) 15 | 16 | """ 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /pyutils/corenlp/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /pyutils/corenlp/__init__.py: -------------------------------------------------------------------------------- 1 | # corenlp 2 | # Copyright 2013- Hiroyoshi Komatsu 3 | # See LICENSE for details. 4 | 5 | """ 6 | Stanford CoreNLP Python wrapper 7 | """ 8 | __version__ = '1.0.3' 9 | __author__ = 'Hiroyoshi Komatsu' 10 | __license__ = 'GNU v2+' 11 | 12 | # classes 13 | from .corenlp import StanfordCoreNLP, ParserError, TimeoutError, ProcessError 14 | # functions 15 | from .corenlp import batch_parse 16 | -------------------------------------------------------------------------------- /pyutils/corenlp/__main__.py: -------------------------------------------------------------------------------- 1 | from . import corenlp 2 | 3 | corenlp.main() 4 | -------------------------------------------------------------------------------- /pyutils/corenlp/client.py: -------------------------------------------------------------------------------- 1 | import json 2 | # from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp 3 | import jsonrpclib 4 | from pprint import pprint 5 | 6 | 7 | class StanfordNLP: 8 | def __init__(self, port_number=8080): 9 | self.server = jsonrpclib.Server("http://localhost:%d" % port_number) 10 | 11 | def parse(self, text): 12 | return json.loads(self.server.parse(text)) 13 | 14 | nlp = StanfordNLP() 15 | result = nlp.parse("Hello world! It is so beautiful.") 16 | pprint(result) 17 | 18 | from nltk.tree import Tree 19 | tree = Tree.parse(result['sentences'][0]['parsetree']) 20 | pprint(tree) 21 | -------------------------------------------------------------------------------- /pyutils/corenlp/corenlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # corenlp - Python interface to Stanford Core NLP tools 4 | # Copyright (c) 2012 Dustin Smith 5 | # https://github.com/dasmith/stanford-corenlp-python 6 | # 7 | # This program is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License 9 | # as published by the Free Software Foundation; either version 2 10 | # of the License, or (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with this program; if not, write to the Free Software 19 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
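# A minimal usage sketch for this wrapper (a sketch only; the model directory is an
# assumption -- point it at the same local CoreNLP unpack that head.py's __main__ uses):
#
#   from pyutils.corenlp import StanfordCoreNLP
#   core = StanfordCoreNLP('models/stanford-corenlp-full-2015-01-29')
#   parse = core.raw_parse('the man on the left')['sentences'][0]
#   parse['parsetree'], parse['dependencies'], parse['words']
#
# batch_parse(input_folder) below runs the same pipeline over a folder of .txt files
# through CoreNLP's XML output instead of the interactive shell.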
20 | 21 | import json 22 | import optparse 23 | import os 24 | import re 25 | import sys 26 | import traceback 27 | import pexpect 28 | import tempfile 29 | import shutil 30 | import re 31 | from progressbar import ProgressBar, Fraction 32 | from unidecode import unidecode 33 | from subprocess import call 34 | import glob 35 | 36 | use_winpexpect = True 37 | 38 | try: 39 | import winpexpect 40 | except ImportError: 41 | use_winpexpect = False 42 | 43 | VERBOSE = False 44 | STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5 45 | WORD_PATTERN = re.compile('\[([^\]]+)\]') 46 | CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"") 47 | 48 | DIRECTORY = "stanford-corenlp-full-2013-06-20" 49 | 50 | 51 | class bc: 52 | HEADER = '\033[95m' 53 | OKBLUE = '\033[94m' 54 | OKGREEN = '\033[92m' 55 | WARNING = '\033[93m' 56 | FAIL = '\033[91m' 57 | ENDC = '\033[0m' 58 | 59 | 60 | class ProcessError(Exception): 61 | 62 | def __init__(self, value): 63 | self.value = value 64 | 65 | def __str__(self): 66 | return repr(self.value) 67 | 68 | 69 | class ParserError(Exception): 70 | 71 | def __init__(self, value): 72 | self.value = value 73 | 74 | def __str__(self): 75 | return repr(self.value) 76 | 77 | 78 | class TimeoutError(Exception): 79 | 80 | def __init__(self, value): 81 | self.value = value 82 | 83 | def __str__(self): 84 | return repr(self.value) 85 | 86 | 87 | class OutOfMemoryError(Exception): 88 | 89 | def __init__(self, value): 90 | self.value = value 91 | 92 | def __str__(self): 93 | return repr(self.value) 94 | 95 | 96 | def init_corenlp_command(corenlp_path, memory, properties): 97 | """ 98 | Checks the location of the jar files. 99 | Spawns the server as a process. 100 | """ 101 | 102 | # TODO: Can edit jar constants 103 | jar_mask = "*.jar" 104 | jars = glob.glob(os.path.join(corenlp_path, jar_mask)) 105 | 106 | java_path = "java" 107 | classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP" 108 | # include the properties file, so you can change defaults 109 | # but any changes in output format will break parse_parser_results() 110 | current_dir_pr = os.path.join(os.path.dirname(os.path.abspath(__file__)), properties) 111 | if os.path.exists(properties): 112 | props = "-props %s" % (properties.replace(" ", "\\ ")) 113 | elif os.path.exists(current_dir_pr): 114 | props = "-props %s" % (current_dir_pr.replace(" ", "\\ ")) 115 | else: 116 | raise Exception("Error! Cannot locate: %s" % properties) 117 | 118 | # add memory limit on JVM 119 | if memory: 120 | limit = "-Xmx%s" % memory 121 | else: 122 | limit = "" 123 | 124 | return "%s %s -cp %s %s %s" % (java_path, limit, ':'.join(jars), classname, props) 125 | 126 | def parse_bracketed(s): 127 | '''Parse word features [abc=... def = ...] 
128 | Also manages to parse out features that have XML within them 129 | ''' 130 | word = None 131 | attrs = {} 132 | temp = {} 133 | # Substitute XML tags, to replace them later 134 | for i, tag in enumerate(re.findall(r"(<[^<>]+>.*<\/[^<>]+>)", s)): 135 | temp["^^^%d^^^" % i] = tag 136 | s = s.replace(tag, "^^^%d^^^" % i) 137 | # Load key-value pairs, substituting as necessary 138 | for attr, val in re.findall(r"([^=\s]*)=([^\s]*)", s): 139 | if val in temp: 140 | val = remove_escapes(temp[val]) 141 | if attr == 'Text': 142 | word = remove_escapes(val) 143 | else: 144 | attrs[attr] = remove_escapes(val) 145 | return (word, attrs) 146 | 147 | 148 | def parse_parser_results(text): 149 | """ This is the nasty bit of code to interact with the command-line 150 | interface of the CoreNLP tools. Takes a string of the parser results 151 | and then returns a Python list of dictionaries, one for each parsed 152 | sentence. 153 | """ 154 | results = {"sentences": []} 155 | state = STATE_START 156 | for line in unidecode(text.decode('utf-8')).split("\n"): 157 | line = line.strip() 158 | 159 | if line.startswith("Sentence #"): 160 | sentence = {'words': [], 'parsetree': [], 'dependencies': []} 161 | results["sentences"].append(sentence) 162 | state = STATE_TEXT 163 | 164 | elif state == STATE_TEXT: 165 | sentence['text'] = remove_escapes(line) 166 | state = STATE_WORDS 167 | 168 | elif state == STATE_WORDS: 169 | if not line.startswith("[Text="): 170 | raise ParserError('Parse error. Could not find "[Text=" in: %s' % line) 171 | for s in WORD_PATTERN.findall(line): 172 | sentence['words'].append(parse_bracketed(s)) 173 | state = STATE_TREE 174 | 175 | elif state == STATE_TREE: 176 | if len(line) == 0: 177 | state = STATE_DEPENDENCY 178 | sentence['parsetree'] = " ".join(sentence['parsetree']) 179 | else: 180 | sentence['parsetree'].append(remove_escapes(line)) 181 | 182 | elif state == STATE_DEPENDENCY: 183 | if len(line) == 0: 184 | state = STATE_COREFERENCE 185 | else: 186 | split_entry = re.split("\(|, |-", line[:-1]) 187 | if len(split_entry) == 5: 188 | rel, left, leftindex, right, rightindex = split_entry 189 | leftindex = re.sub("[^0-9]", "", leftindex) 190 | rightindex = re.sub("[^0-9]", "", rightindex) 191 | sentence['dependencies'].append(tuple([rel, 192 | remove_escapes(left), leftindex, remove_escapes(right), 193 | rightindex])) 194 | 195 | elif state == STATE_COREFERENCE: 196 | if "Coreference set" in line: 197 | if 'coref' not in results: 198 | results['coref'] = [] 199 | coref_set = [] 200 | results['coref'].append(coref_set) 201 | else: 202 | for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(line): 203 | src_i, src_pos, src_l, src_r = int(src_i) - 1, int(src_pos) - 1, int(src_l) - 1, int(src_r) - 1 204 | sink_i, sink_pos, sink_l, sink_r = int(sink_i) - 1, int(sink_pos) - 1, int(sink_l) - 1, int(sink_r) - 1 205 | coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) 206 | 207 | return results 208 | 209 | 210 | def parse_parser_xml_results(xml, file_name="", raw_output=False): 211 | import xmltodict 212 | from collections import OrderedDict 213 | 214 | def extract_words_from_xml(sent_node): 215 | exted = map(lambda x: x['word'], sent_node['tokens']['token']) 216 | return exted 217 | 218 | # Turning the raw xml into a raw python dictionary: 219 | raw_dict = xmltodict.parse(xml) 220 | if raw_output: 221 | return raw_dict 222 | 223 | document = raw_dict[u'root'][u'document'] 
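    # Descriptive note (assuming xmltodict's usual nesting for CoreNLP XML): raw_dict mirrors
    # the XML tree, roughly root -> document -> sentences -> sentence[j] -> {tokens, parse,
    # dependencies}; the comprehension below flattens that into the same per-sentence dict
    # layout ('dependencies', 'text', 'parsetree', 'words') that parse_parser_results() builds
    # from the interactive-shell output.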
224 | 225 | # Making a raw sentence list of dictionaries: 226 | raw_sent_list = document[u'sentences'][u'sentence'] 227 | 228 | # Convert sentences to the format like python 229 | # TODO: If there is only one sentence in input sentence, 230 | # raw_sent_list is dict and cannot decode following code... 231 | sentences = [{'dependencies': [[dep['dep'][i]['@type'], 232 | dep['dep'][i]['governor']['#text'], 233 | dep['dep'][i]['governor']['@idx'], 234 | dep['dep'][i]['dependent']['#text'], 235 | dep['dep'][i]['dependent']['@idx']] 236 | for dep in raw_sent_list[j][u'dependencies'] 237 | if 'dep' in dep 238 | for i in xrange(len(dep['dep'])) 239 | if dep['@type'] == 'collapsed-ccprocessed-dependencies'], 240 | 'text': extract_words_from_xml(raw_sent_list[j]), 241 | 'parsetree': str(raw_sent_list[j]['parse']), 242 | 'words': [[str(token['word']), OrderedDict([ 243 | ('CharacterOffsetEnd', str(token['CharacterOffsetEnd'])), 244 | ('CharacterOffsetBegin', str(token['CharacterOffsetBegin'])), 245 | ('PartOfSpeech', str(token['POS'])), 246 | ('Lemma', str(token['lemma']))])] 247 | for index, token in enumerate(raw_sent_list[j][u'tokens'][u'token'])]} 248 | 249 | for j in xrange(len(raw_sent_list))] 250 | 251 | 252 | results = {'sentences': sentences} 253 | 254 | if file_name: 255 | results['file_name'] = file_name 256 | 257 | return results 258 | 259 | 260 | def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=False, properties='default.properties'): 261 | """Because interaction with the command-line interface of the CoreNLP 262 | tools is limited to very short text bits, it is necessary to parse xml 263 | output""" 264 | #First, we change to the directory where we place the xml files from the 265 | #parser: 266 | 267 | xml_dir = tempfile.mkdtemp() 268 | file_list = tempfile.NamedTemporaryFile() 269 | 270 | #we get a list of the cleaned files that we want to parse: 271 | 272 | files = [os.path.join(input_dir , f) for f in os.listdir(input_dir) if f.endswith(".txt")] 273 | 274 | #creating the file list of files to parse 275 | 276 | file_list.write('\n'.join(files)) 277 | file_list.seek(0) 278 | 279 | command = init_corenlp_command(corenlp_path, memory, properties)\ 280 | + ' -filelist %s -outputDirectory %s' % (file_list.name, xml_dir) 281 | 282 | #creates the xml file of parser output: 283 | 284 | call(command, shell=True) 285 | 286 | #reading in the raw xml file: 287 | # result = [] 288 | try: 289 | for output_file in os.listdir(xml_dir): 290 | with open(os.path.join(xml_dir + output_file), 'r') as xml: 291 | # parsed = xml.read() 292 | file_name = re.sub('.xml$', '', os.path.basename(output_file)) 293 | # result.append(parse_parser_xml_results(xml.read(), file_name, 294 | # raw_output=raw_output)) 295 | yield parse_parser_xml_results(xml.read(), file_name, 296 | raw_output=raw_output) 297 | finally: 298 | file_list.close() 299 | shutil.rmtree(xml_dir) 300 | # return result 301 | 302 | 303 | class StanfordCoreNLP: 304 | 305 | """ 306 | Command-line interaction with Stanford's CoreNLP java utilities. 307 | Can be run as a JSON-RPC server or imported as a module. 
308 | """ 309 | 310 | def _spawn_corenlp(self): 311 | if VERBOSE: 312 | print self.start_corenlp 313 | if use_winpexpect: 314 | self.corenlp = winpexpect.winspawn(self.start_corenlp, maxread=8192, 315 | searchwindowsize=80) 316 | else: 317 | self.corenlp = pexpect.spawn(self.start_corenlp, maxread=8192, 318 | searchwindowsize=80) 319 | 320 | # show progress bar while loading the models 321 | if VERBOSE: 322 | widgets = ['Loading Models: ', Fraction()] 323 | pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start() 324 | # Model timeouts: 325 | # pos tagger model (~5sec) 326 | # NER-all classifier (~33sec) 327 | # NER-muc classifier (~60sec) 328 | # CoNLL classifier (~50sec) 329 | # PCFG (~3sec) 330 | timeouts = [20, 200, 600, 600, 20] 331 | for i in xrange(5): 332 | self.corenlp.expect("done.", timeout=timeouts[i]) # Load model 333 | pbar.update(i + 1) 334 | self.corenlp.expect("Entering interactive shell.") 335 | pbar.finish() 336 | 337 | # interactive shell 338 | self.corenlp.expect("\nNLP> ") 339 | 340 | def __init__(self, corenlp_path=DIRECTORY, memory="3g", properties='default.properties', serving=False): 341 | """ 342 | Checks the location of the jar files. 343 | Spawns the server as a process. 344 | """ 345 | 346 | # spawn the server 347 | self.serving = serving 348 | self.start_corenlp = init_corenlp_command(corenlp_path, memory, properties) 349 | self._spawn_corenlp() 350 | 351 | def close(self, force=True): 352 | global use_winpexpect 353 | if use_winpexpect: 354 | self.corenlp.terminate() 355 | else: 356 | self.corenlp.terminate(force) 357 | 358 | 359 | def isalive(self): 360 | return self.corenlp.isalive() 361 | 362 | def __del__(self): 363 | # If our child process is still around, kill it 364 | if self.isalive(): 365 | self.close() 366 | 367 | def _parse(self, text): 368 | """ 369 | This is the core interaction with the parser. 370 | 371 | It returns a Python data-structure, while the parse() 372 | function returns a JSON object 373 | """ 374 | 375 | # CoreNLP interactive shell cannot recognize newline 376 | if '\n' in text or '\r' in text: 377 | to_send = re.sub("[\r\n]", " ", text).strip() 378 | else: 379 | to_send = text 380 | 381 | # clean up anything leftover 382 | def clean_up(): 383 | while True: 384 | try: 385 | self.corenlp.read_nonblocking(8192, 0.1) 386 | except pexpect.TIMEOUT: 387 | break 388 | clean_up() 389 | 390 | self.corenlp.sendline(to_send) 391 | 392 | # How much time should we give the parser to parse it? 393 | # the idea here is that you increase the timeout as a 394 | # function of the text's length. 
395 | # max_expected_time = max(5.0, 3 + len(to_send) / 5.0) 396 | max_expected_time = max(300.0, len(to_send) / 3.0) 397 | 398 | # repeated_input = self.corenlp.except("\n") # confirm it 399 | t = self.corenlp.expect(["\nNLP> ", pexpect.TIMEOUT, pexpect.EOF, 400 | "\nWARNING: Parsing of sentence failed, possibly because of out of memory."], 401 | timeout=max_expected_time) 402 | incoming = self.corenlp.before 403 | if t == 1: 404 | # TIMEOUT, clean up anything left in buffer 405 | clean_up() 406 | print >>sys.stderr, {'error': "timed out after %f seconds" % max_expected_time, 407 | 'input': to_send, 408 | 'output': incoming} 409 | raise TimeoutError("Timed out after %d seconds" % max_expected_time) 410 | elif t == 2: 411 | # EOF, probably crash CoreNLP process 412 | print >>sys.stderr, {'error': "CoreNLP terminates abnormally while parsing", 413 | 'input': to_send, 414 | 'output': incoming} 415 | raise ProcessError("CoreNLP process terminates abnormally while parsing") 416 | elif t == 3: 417 | # out of memory 418 | print >>sys.stderr, {'error': "WARNING: Parsing of sentence failed, possibly because of out of memory.", 419 | 'input': to_send, 420 | 'output': incoming} 421 | raise OutOfMemoryError 422 | 423 | if VERBOSE: 424 | print "%s\n%s" % ('=' * 40, incoming) 425 | try: 426 | results = parse_parser_results(incoming) 427 | except Exception as e: 428 | if VERBOSE: 429 | print traceback.format_exc() 430 | raise e 431 | 432 | return results 433 | 434 | def raw_parse(self, text): 435 | """ 436 | This function takes a text string, sends it to the Stanford parser, 437 | reads in the result, parses the results and returns a list 438 | with one dictionary entry for each parsed sentence. 439 | """ 440 | try: 441 | r = self._parse(text) 442 | return r 443 | except Exception as e: 444 | print e # Should probably log somewhere instead of printing 445 | self.corenlp.close() 446 | self._spawn_corenlp() 447 | if self.serving: # We don't want to raise the exception when acting as a server 448 | return [] 449 | raise e 450 | 451 | def parse(self, text): 452 | """ 453 | This function takes a text string, sends it to the Stanford parser, 454 | reads in the result, parses the results and returns a list 455 | with one dictionary entry for each parsed sentence, in JSON format. 456 | """ 457 | return json.dumps(self.raw_parse(text)) 458 | 459 | 460 | def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=False): 461 | """ 462 | This function takes input files, 463 | sends list of input files to the Stanford parser, 464 | reads in the results from temporary folder in your OS and 465 | returns a generator object of list that consist of dictionary entry. 466 | If raw_output is true, the dictionary returned will correspond exactly to XML. 467 | ( The function needs xmltodict, 468 | and doesn't need init 'StanfordCoreNLP' class. ) 469 | """ 470 | if not os.path.exists(input_folder): 471 | raise Exception("input_folder does not exist") 472 | 473 | return parse_xml_output(input_folder, corenlp_path, memory, raw_output=raw_output) 474 | 475 | def remove_escapes(text): 476 | """Given a string, remove PTB3 escape characters. 
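    For example, '-LRB- left -RRB-' should come back as '( left )'; both the lower- and
    upper-case bracket tokens in the table below are substituted.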
477 | """ 478 | escapes = {"-lrb-": "(", 479 | "-rrb-": ")", 480 | "-lsb-": "[", 481 | "-rsb-": "]", 482 | "-lcb-": "{", 483 | "-rcb-": "}", 484 | "-LRB-": "(", 485 | "-RRB-": ")", 486 | "-LSB-": "[", 487 | "-RSB-": "]", 488 | "-LCB-": "{", 489 | "-RCB-": "}"} 490 | if text: 491 | pattern = re.compile('|'.join(re.escape(key) for key in escapes.keys())) 492 | return pattern.sub(lambda x: escapes[x.group()], text) 493 | 494 | if __name__ == '__main__': 495 | """ 496 | The code below starts an JSONRPC server 497 | """ 498 | from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer 499 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 500 | parser.add_option('-p', '--port', default='8080', 501 | help='Port to serve on (default 8080)') 502 | parser.add_option('-H', '--host', default='127.0.0.1', 503 | help='Host to serve on (default localhost; 0.0.0.0 to make public)') 504 | parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose', 505 | help="Quiet mode, don't print status msgs to stdout") 506 | parser.add_option('-S', '--corenlp', default=DIRECTORY, 507 | help='Stanford CoreNLP tool directory (default %s)' % DIRECTORY) 508 | parser.add_option('-P', '--properties', default='default.properties', 509 | help='Stanford CoreNLP properties fieles (default: default.properties)') 510 | options, args = parser.parse_args() 511 | VERBOSE = options.verbose 512 | # server = jsonrpc.Server(jsonrpc.JsonRpc20(), 513 | # jsonrpc.TransportTcpIp(addr=(options.host, int(options.port)))) 514 | try: 515 | server = SimpleJSONRPCServer((options.host, int(options.port))) 516 | 517 | nlp = StanfordCoreNLP(options.corenlp, properties=options.properties, serving=True) 518 | server.register_function(nlp.parse) 519 | server.register_function(nlp.raw_parse) 520 | 521 | print 'Serving on http://%s:%s' % (options.host, options.port) 522 | # server.serve() 523 | server.serve_forever() 524 | except KeyboardInterrupt: 525 | print >>sys.stderr, "Bye." 526 | exit() 527 | 528 | -------------------------------------------------------------------------------- /pyutils/corenlp/default.properties: -------------------------------------------------------------------------------- 1 | annotators = tokenize, ssplit, pos, lemma, parse 2 | 3 | # A true-casing annotator is also available (see below) 4 | #annotators = tokenize, ssplit, pos, lemma, truecase 5 | 6 | # A simple regex NER annotator is also available 7 | # annotators = tokenize, ssplit, regexner 8 | 9 | #Use these as EOS punctuation and discard them from the actual sentence content 10 | #These are HTML tags that get expanded internally to correct syntax, e.g., from "p" to "
<p>", "</p>
" etc. 11 | #Will have no effect if the "cleanxml" annotator is used 12 | #ssplit.htmlBoundariesToDiscard = p,text 13 | 14 | # 15 | # None of these paths are necessary anymore: we load all models from the JAR file 16 | # 17 | 18 | #pos.model = /u/nlp/data/pos-tagger/wsj3t0-18-left3words/left3words-distsim-wsj-0-18.tagger 19 | ## slightly better model but much slower: 20 | ##pos.model = /u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/bidirectional-distsim-wsj-0-18.tagger 21 | 22 | #ner.model.3class = /u/nlp/data/ner/goodClassifiers/all.3class.distsim.crf.ser.gz 23 | #ner.model.7class = /u/nlp/data/ner/goodClassifiers/muc.distsim.crf.ser.gz 24 | #ner.model.MISCclass = /u/nlp/data/ner/goodClassifiers/conll.distsim.crf.ser.gz 25 | 26 | #regexner.mapping = /u/nlp/data/TAC-KBP2010/sentence_extraction/type_map_clean 27 | #regexner.ignorecase = false 28 | 29 | #nfl.gazetteer = /scr/nlp/data/machine-reading/Machine_Reading_P1_Reading_Task_V2.0/data/SportsDomain/NFLScoring_UseCase/NFLgazetteer.txt 30 | #nfl.relation.model = /scr/nlp/data/ldc/LDC2009E112/Machine_Reading_P1_NFL_Scoring_Training_Data_V1.2/models/nfl_relation_model.ser 31 | #nfl.entity.model = /scr/nlp/data/ldc/LDC2009E112/Machine_Reading_P1_NFL_Scoring_Training_Data_V1.2/models/nfl_entity_model.ser 32 | #printable.relation.beam = 20 33 | 34 | #parser.model = /u/nlp/data/lexparser/englishPCFG.ser.gz 35 | 36 | #srl.verb.args=/u/kristina/srl/verbs.core_args 37 | #srl.model.cls=/u/nlp/data/srl/trainedModels/englishPCFG/cls/train.ann 38 | #srl.model.id=/u/nlp/data/srl/trainedModels/englishPCFG/id/train.ann 39 | 40 | #coref.model=/u/nlp/rte/resources/anno/coref/corefClassifierAll.March2009.ser.gz 41 | #coref.name.dir=/u/nlp/data/coref/ 42 | #wordnet.dir=/u/nlp/data/wordnet/wordnet-3.0-prolog 43 | 44 | #dcoref.demonym = /scr/heeyoung/demonyms.txt 45 | #dcoref.animate = /scr/nlp/data/DekangLin-Animacy-Gender/Animacy/animate.unigrams.txt 46 | #dcoref.inanimate = /scr/nlp/data/DekangLin-Animacy-Gender/Animacy/inanimate.unigrams.txt 47 | #dcoref.male = /scr/nlp/data/Bergsma-Gender/male.unigrams.txt 48 | #dcoref.neutral = /scr/nlp/data/Bergsma-Gender/neutral.unigrams.txt 49 | #dcoref.female = /scr/nlp/data/Bergsma-Gender/female.unigrams.txt 50 | #dcoref.plural = /scr/nlp/data/Bergsma-Gender/plural.unigrams.txt 51 | #dcoref.singular = /scr/nlp/data/Bergsma-Gender/singular.unigrams.txt 52 | 53 | 54 | # This is the regular expression that describes which xml tags to keep 55 | # the text from. In order to on off the xml removal, add cleanxml 56 | # to the list of annotators above after "tokenize". 57 | #clean.xmltags = .* 58 | # A set of tags which will force the end of a sentence. HTML example: 59 | # you would not want to end on , but you would want to end on
<p>
. 60 | # Once again, a regular expression. 61 | # (Blank means there are no sentence enders.) 62 | #clean.sentenceendingtags = 63 | # Whether or not to allow malformed xml 64 | # StanfordCoreNLP.properties 65 | #wordnet.dir=models/wordnet-3.0-prolog 66 | -------------------------------------------------------------------------------- /pyutils/corenlp/progressbar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: iso-8859-1 -*- 3 | # 4 | # progressbar - Text progressbar library for python. 5 | # Copyright (c) 2005 Nilton Volpato 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public 18 | # License along with this library; if not, write to the Free Software 19 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 | 21 | 22 | """Text progressbar library for python. 23 | 24 | This library provides a text mode progressbar. This is typically used 25 | to display the progress of a long running operation, providing a 26 | visual clue that processing is underway. 27 | 28 | The ProgressBar class manages the progress, and the format of the line 29 | is given by a number of widgets. A widget is an object that may 30 | display diferently depending on the state of the progress. There are 31 | three types of widget: 32 | - a string, which always shows itself; 33 | - a ProgressBarWidget, which may return a diferent value every time 34 | it's update method is called; and 35 | - a ProgressBarWidgetHFill, which is like ProgressBarWidget, except it 36 | expands to fill the remaining width of the line. 37 | 38 | The progressbar module is very easy to use, yet very powerful. And 39 | automatically supports features like auto-resizing when available. 40 | """ 41 | 42 | __author__ = "Nilton Volpato" 43 | __author_email__ = "first-name dot last-name @ gmail.com" 44 | __date__ = "2006-05-07" 45 | __version__ = "2.2" 46 | 47 | # Changelog 48 | # 49 | # 2006-05-07: v2.2 fixed bug in windows 50 | # 2005-12-04: v2.1 autodetect terminal width, added start method 51 | # 2005-12-04: v2.0 everything is now a widget (wow!) 52 | # 2005-12-03: v1.0 rewrite using widgets 53 | # 2005-06-02: v0.5 rewrite 54 | # 2004-??-??: v0.1 first version 55 | 56 | import sys 57 | import time 58 | from array import array 59 | try: 60 | from fcntl import ioctl 61 | import termios 62 | except ImportError: 63 | pass 64 | import signal 65 | 66 | 67 | class ProgressBarWidget(object): 68 | """This is an element of ProgressBar formatting. 69 | 70 | The ProgressBar object will call it's update value when an update 71 | is needed. It's size may change between call, but the results will 72 | not be good if the size changes drastically and repeatedly. 73 | """ 74 | def update(self, pbar): 75 | """Returns the string representing the widget. 
76 | 77 | The parameter pbar is a reference to the calling ProgressBar, 78 | where one can access attributes of the class for knowing how 79 | the update must be made. 80 | 81 | At least this function must be overriden.""" 82 | pass 83 | 84 | 85 | class ProgressBarWidgetHFill(object): 86 | """This is a variable width element of ProgressBar formatting. 87 | 88 | The ProgressBar object will call it's update value, informing the 89 | width this object must the made. This is like TeX \\hfill, it will 90 | expand to fill the line. You can use more than one in the same 91 | line, and they will all have the same width, and together will 92 | fill the line. 93 | """ 94 | def update(self, pbar, width): 95 | """Returns the string representing the widget. 96 | 97 | The parameter pbar is a reference to the calling ProgressBar, 98 | where one can access attributes of the class for knowing how 99 | the update must be made. The parameter width is the total 100 | horizontal width the widget must have. 101 | 102 | At least this function must be overriden.""" 103 | pass 104 | 105 | 106 | class ETA(ProgressBarWidget): 107 | "Widget for the Estimated Time of Arrival" 108 | def format_time(self, seconds): 109 | return time.strftime('%H:%M:%S', time.gmtime(seconds)) 110 | 111 | def update(self, pbar): 112 | if pbar.currval == 0: 113 | return 'ETA: --:--:--' 114 | elif pbar.finished: 115 | return 'Time: %s' % self.format_time(pbar.seconds_elapsed) 116 | else: 117 | elapsed = pbar.seconds_elapsed 118 | eta = elapsed * pbar.maxval / pbar.currval - elapsed 119 | return 'ETA: %s' % self.format_time(eta) 120 | 121 | 122 | class FileTransferSpeed(ProgressBarWidget): 123 | "Widget for showing the transfer speed (useful for file transfers)." 124 | def __init__(self): 125 | self.fmt = '%6.2f %s' 126 | self.units = ['B', 'K', 'M', 'G', 'T', 'P'] 127 | 128 | def update(self, pbar): 129 | if pbar.seconds_elapsed < 2e-6: # == 0: 130 | bps = 0.0 131 | else: 132 | bps = float(pbar.currval) / pbar.seconds_elapsed 133 | spd = bps 134 | for u in self.units: 135 | if spd < 1000: 136 | break 137 | spd /= 1000 138 | return self.fmt % (spd, u + '/s') 139 | 140 | 141 | class RotatingMarker(ProgressBarWidget): 142 | "A rotating marker for filling the bar of progress." 143 | def __init__(self, markers='|/-\\'): 144 | self.markers = markers 145 | self.curmark = -1 146 | 147 | def update(self, pbar): 148 | if pbar.finished: 149 | return self.markers[0] 150 | self.curmark = (self.curmark + 1) % len(self.markers) 151 | return self.markers[self.curmark] 152 | 153 | 154 | class Percentage(ProgressBarWidget): 155 | "Just the percentage done." 156 | def update(self, pbar): 157 | return '%3d%%' % pbar.percentage() 158 | 159 | 160 | class Fraction(ProgressBarWidget): 161 | "Just the fraction done." 162 | def update(self, pbar): 163 | return "%d/%d" % (pbar.currval, pbar.maxval) 164 | 165 | 166 | class Bar(ProgressBarWidgetHFill): 167 | "The bar of progress. It will strech to fill the line." 
168 | def __init__(self, marker='#', left='|', right='|'): 169 | self.marker = marker 170 | self.left = left 171 | self.right = right 172 | 173 | def _format_marker(self, pbar): 174 | if isinstance(self.marker, (str, unicode)): 175 | return self.marker 176 | else: 177 | return self.marker.update(pbar) 178 | 179 | def update(self, pbar, width): 180 | percent = pbar.percentage() 181 | cwidth = width - len(self.left) - len(self.right) 182 | marked_width = int(percent * cwidth / 100) 183 | m = self._format_marker(pbar) 184 | bar = (self.left + (m * marked_width).ljust(cwidth) + self.right) 185 | return bar 186 | 187 | 188 | class ReverseBar(Bar): 189 | "The reverse bar of progress, or bar of regress. :)" 190 | def update(self, pbar, width): 191 | percent = pbar.percentage() 192 | cwidth = width - len(self.left) - len(self.right) 193 | marked_width = int(percent * cwidth / 100) 194 | m = self._format_marker(pbar) 195 | bar = (self.left + (m * marked_width).rjust(cwidth) + self.right) 196 | return bar 197 | 198 | default_widgets = [Percentage(), ' ', Bar()] 199 | 200 | 201 | class ProgressBar(object): 202 | """This is the ProgressBar class, it updates and prints the bar. 203 | 204 | The term_width parameter may be an integer. Or None, in which case 205 | it will try to guess it, if it fails it will default to 80 columns. 206 | 207 | The simple use is like this: 208 | >>> pbar = ProgressBar().start() 209 | >>> for i in xrange(100): 210 | ... # do something 211 | ... pbar.update(i+1) 212 | ... 213 | >>> pbar.finish() 214 | 215 | But anything you want to do is possible (well, almost anything). 216 | You can supply different widgets of any type in any order. And you 217 | can even write your own widgets! There are many widgets already 218 | shipped and you should experiment with them. 219 | 220 | When implementing a widget update method you may access any 221 | attribute or function of the ProgressBar object calling the 222 | widget's update method. The most important attributes you would 223 | like to access are: 224 | - currval: current value of the progress, 0 <= currval <= maxval 225 | - maxval: maximum (and final) value of the progress 226 | - finished: True if the bar is have finished (reached 100%), False o/w 227 | - start_time: first time update() method of ProgressBar was called 228 | - seconds_elapsed: seconds elapsed since start_time 229 | - percentage(): percentage of the progress (this is a method) 230 | """ 231 | def __init__(self, maxval=100, widgets=default_widgets, term_width=None, 232 | fd=sys.stderr, force_update=False): 233 | assert maxval > 0 234 | self.maxval = maxval 235 | self.widgets = widgets 236 | self.fd = fd 237 | self.signal_set = False 238 | if term_width is None: 239 | try: 240 | self.handle_resize(None, None) 241 | signal.signal(signal.SIGWINCH, self.handle_resize) 242 | self.signal_set = True 243 | except: 244 | self.term_width = 79 245 | else: 246 | self.term_width = term_width 247 | 248 | self.currval = 0 249 | self.finished = False 250 | self.prev_percentage = -1 251 | self.start_time = None 252 | self.seconds_elapsed = 0 253 | self.force_update = force_update 254 | 255 | def handle_resize(self, signum, frame): 256 | h, w = array('h', ioctl(self.fd, termios.TIOCGWINSZ, '\0' * 8))[:2] 257 | self.term_width = w 258 | 259 | def percentage(self): 260 | "Returns the percentage of the progress." 
261 | return self.currval * 100.0 / self.maxval 262 | 263 | def _format_widgets(self): 264 | r = [] 265 | hfill_inds = [] 266 | num_hfill = 0 267 | currwidth = 0 268 | for i, w in enumerate(self.widgets): 269 | if isinstance(w, ProgressBarWidgetHFill): 270 | r.append(w) 271 | hfill_inds.append(i) 272 | num_hfill += 1 273 | elif isinstance(w, (str, unicode)): 274 | r.append(w) 275 | currwidth += len(w) 276 | else: 277 | weval = w.update(self) 278 | currwidth += len(weval) 279 | r.append(weval) 280 | for iw in hfill_inds: 281 | r[iw] = r[iw].update(self, 282 | (self.term_width - currwidth) / num_hfill) 283 | return r 284 | 285 | def _format_line(self): 286 | return ''.join(self._format_widgets()).ljust(self.term_width) 287 | 288 | def _need_update(self): 289 | if self.force_update: 290 | return True 291 | return int(self.percentage()) != int(self.prev_percentage) 292 | 293 | def reset(self): 294 | if not self.finished and self.start_time: 295 | self.finish() 296 | self.finished = False 297 | self.currval = 0 298 | self.start_time = None 299 | self.seconds_elapsed = None 300 | self.prev_percentage = None 301 | return self 302 | 303 | def update(self, value): 304 | "Updates the progress bar to a new value." 305 | assert 0 <= value <= self.maxval 306 | self.currval = value 307 | if not self._need_update() or self.finished: 308 | return 309 | if not self.start_time: 310 | self.start_time = time.time() 311 | self.seconds_elapsed = time.time() - self.start_time 312 | self.prev_percentage = self.percentage() 313 | if value != self.maxval: 314 | self.fd.write(self._format_line() + '\r') 315 | else: 316 | self.finished = True 317 | self.fd.write(self._format_line() + '\n') 318 | 319 | def start(self): 320 | """Start measuring time, and prints the bar at 0%. 321 | 322 | It returns self so you can use it like this: 323 | >>> pbar = ProgressBar().start() 324 | >>> for i in xrange(100): 325 | ... # do something 326 | ... pbar.update(i+1) 327 | ... 
328 | >>> pbar.finish() 329 | """ 330 | self.update(0) 331 | return self 332 | 333 | def finish(self): 334 | """Used to tell the progress is finished.""" 335 | self.update(self.maxval) 336 | if self.signal_set: 337 | signal.signal(signal.SIGWINCH, signal.SIG_DFL) 338 | 339 | 340 | def example1(): 341 | widgets = ['Test: ', Percentage(), ' ', Bar(marker=RotatingMarker()), 342 | ' ', ETA(), ' ', FileTransferSpeed()] 343 | pbar = ProgressBar(widgets=widgets, maxval=10000000).start() 344 | for i in range(1000000): 345 | # do something 346 | pbar.update(10 * i + 1) 347 | pbar.finish() 348 | return pbar 349 | 350 | 351 | def example2(): 352 | class CrazyFileTransferSpeed(FileTransferSpeed): 353 | "It's bigger between 45 and 80 percent" 354 | def update(self, pbar): 355 | if 45 < pbar.percentage() < 80: 356 | return 'Bigger Now ' + FileTransferSpeed.update(self, pbar) 357 | else: 358 | return FileTransferSpeed.update(self, pbar) 359 | 360 | widgets = [CrazyFileTransferSpeed(), ' <<<', 361 | Bar(), '>>> ', Percentage(), ' ', ETA()] 362 | pbar = ProgressBar(widgets=widgets, maxval=10000000) 363 | # maybe do something 364 | pbar.start() 365 | for i in range(2000000): 366 | # do something 367 | pbar.update(5 * i + 1) 368 | pbar.finish() 369 | return pbar 370 | 371 | 372 | def example3(): 373 | widgets = [Bar('>'), ' ', ETA(), ' ', ReverseBar('<')] 374 | pbar = ProgressBar(widgets=widgets, maxval=10000000).start() 375 | for i in range(1000000): 376 | # do something 377 | pbar.update(10 * i + 1) 378 | pbar.finish() 379 | return pbar 380 | 381 | 382 | def example4(): 383 | widgets = ['Test: ', Percentage(), ' ', 384 | Bar(marker='0', left='[', right=']'), 385 | ' ', ETA(), ' ', FileTransferSpeed()] 386 | pbar = ProgressBar(widgets=widgets, maxval=500) 387 | pbar.start() 388 | for i in range(100, 500 + 1, 50): 389 | time.sleep(0.2) 390 | pbar.update(i) 391 | pbar.finish() 392 | return pbar 393 | 394 | 395 | def example5(): 396 | widgets = ['Test: ', Fraction(), ' ', Bar(marker=RotatingMarker()), 397 | ' ', ETA(), ' ', FileTransferSpeed()] 398 | pbar = ProgressBar(widgets=widgets, maxval=10, force_update=True).start() 399 | for i in range(1, 11): 400 | # do something 401 | time.sleep(0.5) 402 | pbar.update(i) 403 | pbar.finish() 404 | return pbar 405 | 406 | 407 | def main(): 408 | example1() 409 | print 410 | example2() 411 | print 412 | example3() 413 | print 414 | example4() 415 | print 416 | example5() 417 | print 418 | 419 | if __name__ == '__main__': 420 | main() 421 | -------------------------------------------------------------------------------- /senna_sents.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code chunk sentences into NP, VP, PP, O, etc. 3 | It uses the SENNA tool, (https://github.com/biplab-iitb/practNLPTools, https://pypi.python.org/pypi/practnlptools/1.0), 4 | to perform the chunking. 
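
A minimal sketch of a single annotation (assuming practnlptools and its bundled SENNA
binaries are installed and working; the example sentence is arbitrary):

    from practnlptools.tools import Annotator
    annotator = Annotator()
    senna = annotator.getAnnotations('man in a red shirt on the left')
    senna['chunk']   # per-token (word, chunk-tag) pairs with NP/VP/PP/O-style tags
    senna['pos']     # per-token (word, POS) pairs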
5 | 6 | The results will be saved in cache/chunked_sents/dataset_splitBy/sents.json 7 | The sents.json = [{sent_id, sent, senna}], where senna = {chunk, pos, srl, syntax_tree, verbs, words, ner} 8 | """ 9 | import sys 10 | import os 11 | import os.path as osp 12 | from pprint import pprint 13 | from Queue import Queue 14 | from threading import Thread, Lock 15 | import time 16 | import argparse 17 | import json 18 | # import SENNA tool 19 | from practnlptools.tools import Annotator 20 | 21 | def senna_sents(sents, params): 22 | """ 23 | The input sents is list of [{sent_id, sent, raw, tokens}] 24 | Return sents of [{sent_id, sent, raw, tokens, chunk}] 25 | """ 26 | num_sents = len(sents) 27 | 28 | # enqueue 29 | q = Queue() 30 | for i in range(num_sents): 31 | q.put((i, sents[i])) 32 | 33 | # work: dequeue and do job 34 | def worker(): 35 | annotator = Annotator() 36 | while True: 37 | i, sent = q.get() 38 | try: 39 | senna = annotator.getAnnotations(sent['sent']) 40 | except: 41 | print('exception found.') 42 | senna = annotator.getAnnotations('none') 43 | if i % 100 == 0: 44 | print('%s/%s done.' % (i, num_sents)) 45 | sents[i]['senna'] = senna 46 | sents[i]['senna'].pop('dep_parse', None) # including chunk, pos, srl, syntax_tree, verbs, words, ner 47 | q.task_done() 48 | 49 | # workers 50 | for w in range(params['num_workers']): 51 | t = Thread(target=worker) 52 | t.daemon = True 53 | t.start() 54 | q.join() 55 | 56 | 57 | def main(params): 58 | 59 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 60 | if not osp.isdir('cache/senna_sents/' + dataset_splitBy): 61 | os.makedirs('cache/senna_sents/' + dataset_splitBy) 62 | 63 | # we have to prepare current folder path 64 | # practnlptools might change current folder to python's site-packages 65 | cur_folder = os.path.abspath('.') 66 | 67 | # load refer 68 | sys.path.insert(0, 'pyutils/refer') 69 | from refer import REFER 70 | refer = REFER(params['data_root'], params['dataset'], params['splitBy']) 71 | 72 | # read sents and pop unnecessary keys 73 | sents = refer.Sents.values() 74 | for sent in sents: 75 | sent.pop('raw', None) 76 | 77 | # parse sents 78 | senna_sents(sents, params) 79 | 80 | # save results 81 | output_path = osp.join(cur_folder, 'cache/senna_sents/'+dataset_splitBy, 'sents.json') 82 | with open(output_path, 'w') as io: 83 | json.dump(sents, io) 84 | print('senna parsed sents.json saved in %s.' % output_path) 85 | 86 | 87 | if __name__ == '__main__': 88 | 89 | # input 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('--data_root', default='data', help='dataset root directory') 92 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 93 | parser.add_argument('--splitBy', default='unc', help='split By') 94 | parser.add_argument('--num_workers', type=int, default=2, help='number of workers') 95 | args = parser.parse_args() 96 | params = vars(args) 97 | 98 | # main 99 | main(params) -------------------------------------------------------------------------------- /write_atts_html.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path as osp 4 | from pprint import pprint 5 | import time 6 | import argparse 7 | import json 8 | 9 | attribute_names = {'r1': 'entry-level name', 'r2': 'color', 'r3': 'size', 'r4': 'abs. location', 10 | 'r5': 'rel. location', 'r6': 'rel. 
object', 'r7': 'other atts.', 'r8': 'left words'} 11 | 12 | def analyze(sents): 13 | # do some statistics 14 | usage = {'r1': 0, 'r2': 0, 'r3': 0, 'r4': 0, 'r5': 0, 'r6': 0, 'r7': 0, 'r8': 0} 15 | for sent in sents: 16 | for r in usage: 17 | usage[r] = usage[r] + 1 if sent['atts'][r] != ['none'] else usage[r] 18 | for r in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8']: 19 | usage[r] /= float(len(sents)) 20 | print('Usage of %s is %.2f%%.' % (r, usage[r] * 100)) 21 | return usage 22 | 23 | def main(params): 24 | 25 | dataset_splitBy = params['dataset'] + '_' + params['splitBy'] 26 | if not osp.isdir('cache/atts_html/' + dataset_splitBy): 27 | os.makedirs('cache/atts_html/' + dataset_splitBy) 28 | 29 | # load parsed sents with attributes, where sents.json = 30 | # [{sent_id, sent, parse, atts, left, raw, tokens}] 31 | # where parse = {dependencies, parsetree, text, workds}, atts = {r1, r2, ...}, left = [(wd, POS)] 32 | path_to_parsed_atts = osp.join('cache/parsed_atts', dataset_splitBy, 'sents.json') 33 | sents = json.load(open(path_to_parsed_atts)) 34 | 35 | # analyze 36 | usage = analyze(sents) 37 | 38 | # write htmls 39 | num_per_page = params['num_per_page'] 40 | for page_id, s in enumerate(range(0, len(sents), num_per_page)): 41 | html = open(osp.join('cache/atts_html', dataset_splitBy, str(page_id)+'.html'), 'w') 42 | html.write('

Show %s sentences and their attribute parses.' % len(sents)) 43 | html.write('') 44 | html.write('') 45 | html.write('') 46 | html.write('') 47 | html.write('') 48 | html.write('') 49 | html.write('') 50 | for j in range(s, min(s+num_per_page, len(sents))): 51 | if j % 2 == 0: 52 | color_str = '#eef' 53 | else: 54 | color_str = '#fee' 55 | # fetch info of this sent 56 | sent_id = sents[j]['sent_id'] 57 | sent_txt = sents[j]['sent'].encode('ascii', 'ignore').decode('ascii') 58 | atts = sents[j]['atts'] 59 | left = sents[j]['left'] 60 | # write a row of the info 61 | html.write('' % (color_str, j)) 62 | html.write('' % sent_id) 63 | html.write('' % sent_txt) 64 | for r in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7']: 65 | att = atts[r][0] if atts[r][0] != 'none' else '' 66 | html.write('' % att) 67 | html.write('') 71 | html.write('') 72 | html.write('
sent_idReferring-expressionEntry-level nameColorSizeAbs. LocationRel. LocationRel. ObjectOtherLeft Words
%06d%s%s%s') 68 | for l in left: 69 | html.write('%s [%s], ' % (l[0], l[1])) 70 | html.write('
') 73 | html.write('') 74 | print('Page %s written.' % page_id) 75 | 76 | # write summary 77 | html = open(osp.join('cache/atts_html', dataset_splitBy, 'main.html'), 'w') 78 | html.write('') 79 | html.write('') 80 | for r in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8']: 81 | html.write('

usage of %s [%s] is %.2f%%

' % (r, attribute_names[r], usage[r]*100)) 82 | html.write('
    ') 83 | for page_id, s in enumerate(range(0, len(sents), num_per_page)): 84 | page_html = str(page_id)+'.html' 85 | print(page_html) 86 | html.write('
  • page_id%s
  • ' % (page_html, page_id)) 87 | html.write('
') 88 | 89 | 90 | if __name__ == '__main__': 91 | 92 | # input 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('--dataset', default='refcoco', help='dataset name') 95 | parser.add_argument('--splitBy', default='unc', help='split By') 96 | parser.add_argument('--num_per_page', type=int, default=10000, help='number of pages to be written') 97 | args = parser.parse_args() 98 | params = vars(args) 99 | 100 | # main 101 | main(params) 102 | -------------------------------------------------------------------------------- /write_chunk_html.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path as osp 4 | from pprint import pprint 5 | import time 6 | import argparse 7 | import json 8 | import operator 9 | import random 10 | random.seed(8) 11 | 12 | def write_structures(html, sents): 13 | """ 14 | The input sents = [{sent_id, sent, chunk, NPs, senna, tokens}] 15 | where chunk is list of [(phrase, phrase_type)], and NPs is list of noun phrases 16 | We analyze phrase structure 17 | """ 18 | struct_to_num = {} 19 | struct_to_examples = {} 20 | for sent in sents: 21 | chunk = sent['chunk'] 22 | struct = ' '.join([ck[1] for ck in chunk]) 23 | struct_to_num[struct] = struct_to_num.get(struct, 0) + 1 24 | if struct not in struct_to_examples: 25 | struct_to_examples[struct] = [] 26 | struct_to_examples[struct] += [sent['sent']] 27 | sorted_structs = sorted(struct_to_num.items(), key=operator.itemgetter(1)) 28 | sorted_structs.reverse() 29 | 30 | html.write('
30 |     html.write('<table border>')
31 |     html.write('<tr style="font-weight:bold">')
32 |     html.write('<td></td><td>Top Phrase Structs</td><td>Number</td><td>Percentage</td><td>Accumulated</td><td>Example</td></tr>')
33 |     total_num = sum(struct_to_num.values())
34 |     acc = 0
35 |     for j, (struct, num) in enumerate(sorted_structs[:20]):
36 |         acc += num
37 |         html.write('<tr><td>%02d</td>' % j)
38 |         html.write('<td>%s</td>' % struct)
39 |         html.write('<td>%s</td>' % num)
40 |         html.write('<td>%.2f%%</td>' % (num*100.0/total_num))
41 |         html.write('<td>%.2f%%</td>' % (acc*100.0/total_num))
42 |         html.write('<td>%s</td>' % (random.choice(struct_to_examples[struct])))
43 |         html.write('</tr>')
44 |     html.write('</table>')
45 |     html.write('<br>')
46 | 
47 | def write_info(html, sents):
48 |     # NP usage in the raw chunks
49 |     NP_usage = 0
50 |     for sent in sents:
51 |         chunk = sent['chunk']
52 |         NPs = [ck for ck in chunk if ck[1] == 'NP']
53 |         if len(NPs) > 0:
54 |             NP_usage += 1
55 |     html.write('<br>%.2f%% expressions have NPs' % (NP_usage*100.0/len(sents)))
56 | 
57 |     # NP usage in the filtered NPs
58 |     cleaned_NP_usage = 0
59 |     for sent in sents:
60 |         if len(sent['NPs']) > 0:
61 |             cleaned_NP_usage += 1
62 |     html.write(', and %.2f%% cleaned NPs.<br>' % (cleaned_NP_usage*100.0/len(sents)))
63 | 
64 |     # average #NP in each expression
65 |     total_NPs, total_cleaned_NPs, total_PPs, total_VPs, total_ADVPs, total_ADJPs = 0, 0, 0, 0, 0, 0
66 |     total_wds = 0
67 |     total_NP_wds = 0
68 |     total_cleaned_NP_wds = 0
69 |     for sent in sents:
70 |         for ck in sent['chunk']:
71 |             if ck[1] == 'NP':
72 |                 total_NPs += 1
73 |                 total_NP_wds += len(ck[0].split())
74 |             if ck[1] == 'PP':
75 |                 total_PPs += 1
76 |             if ck[1] == 'ADVP':
77 |                 total_ADVPs += 1
78 |             if ck[1] == 'ADJP':
79 |                 total_ADJPs += 1
80 |         total_wds += len(sent['tokens'])
81 |         # check cleaned NPs
82 |         total_cleaned_NPs += len(sent['NPs'])
83 |         total_cleaned_NP_wds += sum([len(phrase.split()) for phrase in sent['NPs']])
84 | 
85 |     html.write('<br>Each expression has %.2f NPs (%.2f cleaned NPs), %.2f PPs, %.2f ADVPs, %.2f ADJPs.<br>' % (total_NPs*1.0/len(sents),
86 |         total_cleaned_NPs*1.0 / len(sents), total_PPs*1.0/len(sents), total_ADVPs*1.0/len(sents), total_ADJPs*1.0/len(sents)))
87 |     html.write('<br>Each expression has %.2f words, among which are %.2f NP words.<br>' % (total_wds*1.0/len(sents), total_NP_wds*1.0/len(sents)))
88 |     html.write('<br>Each NP has %.2f words' % (total_NP_wds*1.0/total_NPs))
89 |     html.write(', and each cleaned NP has %.2f words.<br>' % (total_cleaned_NP_wds*1.0 / total_cleaned_NPs))
90 | 
91 | 
92 | def main(params):
93 | 
94 |     dataset_splitBy = params['dataset'] + '_' + params['splitBy']
95 |     if not osp.isdir('cache/chunk_html/' + dataset_splitBy):
96 |         os.makedirs('cache/chunk_html/' + dataset_splitBy)
97 | 
98 |     # load chunked sents = [{sent_id, sent, chunk, NPs, senna, tokens}]
99 |     # where chunk is list of [(phrase, phrase_type)]
100 |     # and NPs is list of noun phrases
101 |     path_to_chunked_sents = osp.join('cache/chunked_sents', dataset_splitBy, 'sents.json')
102 |     sents = json.load(open(path_to_chunked_sents))
103 | 
104 |     # write htmls
105 |     num_per_page = params['num_per_page']
106 |     for page_id, s in enumerate(range(0, len(sents), num_per_page)):
107 |         html = open(osp.join('cache/chunk_html', dataset_splitBy, str(page_id)+'.html'), 'w')
108 |         html.write('<html><body>Show %s sentences and their phrase structures.' % len(sents))
109 |         html.write('<table border>')
110 |         html.write('<tr style="font-weight:bold"><td></td>')
111 |         html.write('<td>sent_id</td>')
112 |         html.write('<td>Referring-expression</td>')
113 |         html.write('<td>Phrase Structure</td>')
114 |         html.write('<td>Noun Phrase(s)</td>')
115 |         html.write('<td>Noun Word(s)</td></tr>')
116 |         for j in range(s, min(s+num_per_page, len(sents))):
117 |             if j % 2 == 0:
118 |                 color_str = '#eef'
119 |             else:
120 |                 color_str = '#fee'
121 |             # fetch info of this sent
122 |             sent_id = sents[j]['sent_id']
123 |             sent_txt = sents[j]['sent'].encode('ascii', 'ignore').decode('ascii')
124 |             chunk_txt = ' '.join(['(%s, %s)' % (ck[0], ck[1]) for ck in sents[j]['chunk']])
125 |             NPs_txt = ' '.join(['(%s, NP)' % phrase for phrase in sents[j]['NPs']])
126 |             NNs_txt = ' '.join(['(%s, NN)' % phrase for phrase in sents[j]['NNs']])
127 |             # write a row of the info
128 |             html.write('<tr style="background-color:%s"><td>%06d</td>' % (color_str, j))
129 |             html.write('<td>%s</td>' % sent_id)
130 |             html.write('<td>%s</td>' % sent_txt)
131 |             html.write('<td>%s</td>' % chunk_txt)
132 |             html.write('<td>%s</td>' % NPs_txt)
133 |             html.write('<td>%s</td>' % NNs_txt)
134 |             html.write('</tr>')
135 |         html.write('</table>')
136 |         html.write('</body>')
137 |         html.write('</html>')
138 |         print('Page %s written.' % page_id)
139 | 
140 |     # write summary
141 |     html = open(osp.join('cache/chunk_html', dataset_splitBy, 'main.html'), 'w')
142 |     html.write('<html>')
143 |     html.write('<body>')
144 | 
145 |     # write phrase structures
146 |     write_structures(html, sents)
147 | 
148 |     # write other info
149 |     write_info(html, sents)
150 | 
151 |     # write pages
152 |     html.write('<ul>')
153 |     for page_id, s in enumerate(range(0, len(sents), num_per_page)):
154 |         page_html = str(page_id)+'.html'
155 |         print(page_html)
156 |         html.write('<li><a href="%s">page_id%s</a></li>' % (page_html, page_id))
157 |     html.write('</ul></body></html>')
158 | 
159 | 
160 | if __name__ == '__main__':
161 | 
162 |     # input
163 |     parser = argparse.ArgumentParser()
164 |     parser.add_argument('--dataset', default='refcoco', help='dataset name')
165 |     parser.add_argument('--splitBy', default='unc', help='split By')
166 |     parser.add_argument('--num_per_page', type=int, default=10000, help='number of sentences shown per page')
167 |     args = parser.parse_args()
168 |     params = vars(args)
169 | 
170 |     # main
171 |     main(params)
--------------------------------------------------------------------------------