292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
--------------------------------------------------------------------------------
/sample_data/parallel_data_example.json:
--------------------------------------------------------------------------------
1 | [{"headline": "Five taken to hospital with minor injuries after crash on A17 near Sleaford", "sentence": "Five people have been taken to hospital with minor injuries following a crash on the A17 near Sleaford this morning."}, {"headline": "Several school districts hold classes on Presidents' day to make up for days missed", "sentence": "Several school districts in Hampton Roads are holding classes this Presidents' Day to make up for days missed because of the snow."}, {"headline": "Luis Suarez spotted in London:", "sentence": "Luis Suarez was spotted in London this afternoon and this has led the Daily Star to link the Liverpool striker to a potential move to Chelsea or Arsenal."}, {"headline": "Woman injured by falling tree", "sentence": "A woman was injured by a falling tree in the Gresham neighborhood, according to the Chicago Fire Department."}, {"headline": "Benjamin Zephaniah to lead poetry day for ex-offenders", "sentence": "Birmingham poet Benjamin Zephaniah is today leading an interactive poetry day for ex-offenders in Birmingham."}, {"headline": "Vodafone seeks regulatory approval to take full control of its Indian unit", "sentence": "British mobile phone giant Vodafone said Tuesday it was seeking regulatory approval to take full control of its Indian unit for $1.65 billion, after New Delhi relaxed foreign ownership rules in the sector."}, {"headline": "Markets remain under pressure;", "sentence": "Markets continued to remain under pressure on Thursday morning as financial heavyweights like ICICI Bank, HDFC, and HDFC Bank declined by 1-2% each."}, {"headline": "Rick Riordan reveals cover for 'the Staff of Serapis'", "sentence": "Rick Riordan has revealed the cover for his latest crossover short story, ``Staff of Serapis,'' which features Annabeth Chase and Sadie Kane."}, {"headline": "Russia sending thousands of troops to Crimea", "sentence": "Ukraine accused Russia Saturday of sending thousands of extra troops into Crimea as the Kremlin vowed to help 
restore calm on the flashpoint peninsula and Washington warned of ``costs'' to Moscow should it use force."}, {"headline": "Man charged with killing father cleared by grand jury", "sentence": "A man charged with killing his father in a fight outside a busy Orlando restaurant was just cleared by a grand jury."}, {"headline": "Attacks across Iraq kill at least 20 people", "sentence": "A new wave of attacks across Iraq killed at least 20 people and wounded dozens on Monday as the government pressed on with its offensive to hunt down al-Qaida-linked militants in the country's volatile western desert."}, {"headline": "New Zealand shares fall for third session", "sentence": "NEW Zealand shares fell for a third session as institutional investors re-weighted their portfolios ahead of changes to the market index next week."}, {"headline": "Aaron Donald wins Bronko Nagurski Trophy", "sentence": "At a ceremony held in Charlotte, NC, Monday night, Pitt defensive tackle Aaron Donald won the 2013 Bronko Nagurski Trophy."}, {"headline": "One in four men admits raping woman --", "sentence": "One in four men admitted raping a woman once in their life according to a report from United Nations."}, {"headline": "Russia ``fully compensating'' for environmental effects of Olympic Games", "sentence": "Deputy Prime Minister Dmitry Kozak, in charge of the government's Olympic preparations, told an IOC conference in Sochi Wednesday that Russia was ``fully compensating'' for any environmental effects of the Games."}, {"headline": "Cairn Energy plans nine wells in 2014 exploration programme --", "sentence": "Oil & gas giant Cairn Energy Plc., in its pre-close update, said that it plans to drill nine wells in its 2014 exploration programme across an attractive mix of frontier and mature basins."}, {"headline": "Three hacked to death over land dispute", "sentence": "Three members of a family were hacked to death over a land dispute between two groups of people in Garhaiwadi village of Bihar's 
Kishanganj district, police said."}, {"headline": "Gannett completes acquisition of Belo", "sentence": "MCLEAN, Va., Dec. 23, 2013 /PRNewswire/ Gannett Co., Inc. announced today that it has completed its previously announced acquisition of Belo Corp. for $13.75 per share in cash, in addition to the assumption of $715 million of outstanding debt, for a total transaction value of $2.2 billion."}, {"headline": "US champion Abbott falls hard in short program", "sentence": "SOCHI, Russia US champion Jeremy Abbott fell hard on an attempted quadruple toe loop Thursday in the men's short program at the Olympics, yet finished the routine."}, {"headline": "Kathy Griffin to host TrevorLIVE", "sentence": "Long time LGBTQ ally and activist, multi-Grammy Award-nominated, two-time Emmy Award-winning comedian and #1 New York Times bestselling author Kathy Griffin will host TrevorLIVE in Los Angeles on December 8th at the Hollywood Palladium, presented by Audi of America and Wells Fargo."}, {"headline": "Positive signs emerging:", "sentence": "The Australian Treasury believes positive signs are emerging in the Australian economy."}, {"headline": "Vine bans sexually explicit content", "sentence": "Vine, the mobile app owned by Twitter, has banned sexually explicit content, effective immediately."}, {"headline": "United Kingdom denies visa to Cuban antiterrorist fighter Rene Gonzalez", "sentence": "The government of the United Kingdom denied visa to Cuban antiterrorist fighter Rene Gonzalez, invited to the International Commission of Inquiry about the case of the Five, to be run on March 7-8 in London."}, {"headline": "Dollar falls against rivals", "sentence": "The dollar fell against its major rivals as investors questioned the timing of a potential reduction in Federal Reserve stimulus."}, {"headline": "Maya Rudolph welcomes fourth child!", "sentence": "Maya Rudolph and longtime partner Paul Thomas Anderson have welcomed their fourth child, a source confirms to Us Weekly."}, 
{"headline": "Kristen Stewart flashes bra:", "sentence": "KStew showed some skin and flashed her black bra while taking her furry new friend for a stroll."}, {"headline": "Vonn returns to skiing over weekend in Chile", "sentence": "Lindsey Vonn returned to skiing over the weekend in Chile nearly seven months after a season-ending crash that required surgery on her right knee."}, {"headline": "Two Chinese war ships arrive at the port of Trincomalee", "sentence": "Two Chinese war ships, ``JING GANGSHA'' and ``HENG SHUI'' arrived at the port of Trincomalee on 13 th January 2014 on a good will visit."}, {"headline": "Art. Burning as it were a lamp by Enrique Martinez Celaya", "sentence": "Art. Burning as it were a lamp, a project by Enrique Martinez Celaya on view during Art Basel Miami Beach."}, {"headline": "Flash floods hit Chai Chee after heavy downpour", "sentence": "Flash floods hit Chai Chee on Monday afternoon after a heavy downpour, leaving some vehicles stranded."}, {"headline": "Largest diamond ever sold at auction fetches $30.6 million", "sentence": "This 118-carat egg-sized jewel, the largest diamond ever sold at auction, fetched $30.6 million at Sotheby's in Hong Kong yesterday, or $259,322 per carat."}, {"headline": "Modi sent emissaries to create 'soft corner':", "sentence": "Srinagar, Apr 18 Hardline Hurriyat Conference chairman Syed Ali Shah Geelani today claimed that BJP's prime ministerial candidate Narendra Modi had sent emissaries to him and separatist leadership in Jammu and Kashmir to create a ``soft corner'' for him by making a ``commitment'' to seek a solution to Kashmir issue."}, {"headline": "Chinese shares close lower Wednesday", "sentence": "Chinese shares closed lower Wednesday dragged down by the bio-pharmaceutical sector and small enterprises with growth potential."}, {"headline": "Kate Bush announces first set of live shows in 35 years", "sentence": "British performer Kate Bush has announced her first set of live shows in 35 years, 
entitled Behind the Dawn."}, {"headline": "Rajasthan Government misusing public funds:", "sentence": "An RTI activist has alleged that the Rajasthan Government was 'misusing' public funds for advertising its achievements in newspapers and TV channels in an election year."}, {"headline": "Yellow fever mosquito found at Holy Cross cemetery in Menlo Park", "sentence": "A yellow fever mosquito which can carry several viruses included dengue fever was found at Holy Cross cemetery in Menlo Park, the California Department of Public Health confirmed Friday."}, {"headline": "Lea Michele found out about Monteith`s death through phone call", "sentence": "Lea Michele reportedly found out about the death of her boyfriend Cory Monteith in Vancouver through a phone call."}, {"headline": "Rocker Lou Reed of Velvet Underground dies at 71", "sentence": "Lou Reed, the pioneering musician who fronted influential rock band The Velvet Underground and won mainstream acclaim with solo songs ``Walk on the Wild Side'' and ``Perfect Day,'' died on Sunday aged 71."}, {"headline": "Millennium development goals have had 'little impact' on lives of leprosy sufferers", "sentence": "The Millennium Development Goals have had a huge influence on international development but ``little impact'' on the lives of leprosy sufferers, according to The Leprosy Mission."}, {"headline": "Female puffin hatched at National aquarium", "sentence": "A female puffin hatched at the National Aquarium is doing well in its specially constructed burrow."}, {"headline": "Bruce Springsteen dedicates song to Trayvon Martin", "sentence": "NEW YORK Bruce Springsteen dedicated his protest song ``American Skin '' to teenager Trayvon Martin during a concert in Limerick, Ireland."}, {"headline": "Car washed off flooded causeway near Sarina", "sentence": "POLICE were called by passers-by after a car was washed off a flooded causeway on Pier Rd, near Sarina."}, {"headline": "Blue Jackets hire agent Bill Zito as assistant general 
manager", "sentence": "The Columbus Blue Jackets have hired prominent player agent Bill Zito as assistant general manager."}, {"headline": "Firefighters prepare for conditions 'as bad as it gets'", "sentence": "Firefighters in New South Wales are preparing for weather conditions they say will be 'as bad as it gets' and are calling on residents who don't have a fire plan to evacuate."}, {"headline": "Pita Pit closes in Edwardsville", "sentence": "Pita restaurant chain Pita Pit has closed in Edwardsville."}, {"headline": "Central African Republic poses 'serious threat':", "sentence": "The UN Security Council warned Wednesday that turmoil in the Central African Republic poses a ``serious threat'' to the country and the region, and urged new measures to restore stability, AFP reports."}, {"headline": "CISF trooper foils suicide bid in Delhi metro", "sentence": "Just a week after a CISF trooper foiled a suicide bid by a woman in the Delhi metro, another woman trooper from the same force prevented two women commuters from ending their lives, an official said Monday."}, {"headline": "Man was shot in head in front of teammates as he walked off soccer pitch", "sentence": "A MAN who was shot in the head in front of his teammates as he walked off a soccer pitch knew his life was in danger, an inquest heard."}, {"headline": "Manitowoc man accused of embezzling from Sheboygan firm", "sentence": "A Manitowoc man is accused of embezzling over $300,000 from a Sheboygan firm that makes chairs for health care patients and institutions."}, {"headline": "Gang of youths rob man in Oldbrook underpass for just \u00a310", "sentence": "A gang of youths between eight and sixteen robbed a man in an Oldbrook underpass for just \u00a310."}, {"headline": "Bombay high court directs BEST staff to call off strike & report to work", "sentence": "The Bombay high court on Tuesday directed the BEST staff, including drivers and conductors, to call off their strike and report to work immediately."}, 
{"headline": "Highway 12 over White Pass closed by slide", "sentence": "A 40-mile section of Highway 12 over White Pass was closed by a mud and rock slide Saturday, and crews will need to inspect the area in the daylight Sunday before estimating when the roadway will reopen."}, {"headline": "Germany, Belgium, Switzerland reach World Cup", "sentence": "THREE-TIME champions Germany, Belgium and Switzerland reached the World Cup finals on Friday as England, Russia and Bosnia-Herzegovina edged closer to Brazil."}, {"headline": "Neo-Nazis sentenced to six years in prison", "sentence": "Seven Austrian neo-Nazis were sentenced to up to six years in prison in a case the judge said should serve as an example to others in the country."}, {"headline": "Pakistan to resume cross-border trade in Kashmir", "sentence": "Pakistan said on Thursday it has decided to resume cross-border trade in Kashmir after weeks of suspension over the arrest of a Pakistani driver by the Indian authorities on drugs smuggling charges."}, {"headline": "Salem's comeback bid falls short vs. 
Alliance", "sentence": "ALLIANCE-The Salem baseball team made a late comeback bid but fell short to Alliance 7-5 Wednesday."}, {"headline": "Spain condemns mortar attack on Russian embassy in Damascus", "sentence": "Spain Friday strongly condemned the mortar attack against the Russian embassy in Damascus which killed one Syrian and injured nine others."}, {"headline": "Syrian dissident writer arrested", "sentence": "Syrian dissident writer and journalist Akram Al Bunni has been arrested by security forces, his brother, a prominent rights lawyer, told AFP on Sunday."}, {"headline": "Taxi drivers don't know speed limits", "sentence": "A research project has found that taxi drivers often don't know what the speed limit is."}, {"headline": "Venezuelan government to continue pace of land expropriations for ``agrarian socialism''", "sentence": "This year the Venezuelan government plans to continue its pace of land expropriations in order to move towards what it terms ``agrarian socialism''."}, {"headline": "OG&E warns customers of scam", "sentence": "OG&E is warning customers about a prepaid debit card scam that is targeting utility customers across the country."}, {"headline": "Bradley Manning sentenced to 35 years in prison", "sentence": "US whistleblower Bradley Manning, charged with releasing over 700,000 battlefield reports from Iraq and Afghanistan to Wikileaks, received a sentence of 35 years in prison from a military court Wednesday."}, {"headline": "Ballarpur Industries may touch Rs 21-22:", "sentence": "Sharmila Joshi of sharmilajoshi.com feels that Ballarpur Industries may touch Rs 21-22 in next 9-12 months."}, {"headline": "McAuliffe to bring pre-inauguration celebration to sw va Saturday", "sentence": "Governor-elect Terry McAuliffe is bringing pre-inauguration celebrations to southwest Virginia Saturday night with a regional inaugural ball in Abingdon."}, {"headline": "People roulette online chat", "sentence": "The second wild is the picture of all five of 
the Girls people roulette online chat Guns."}, {"headline": "Bale enjoying the expectation", "sentence": "Gareth Bale has conceded that he enjoys the 'expectation' on his shoulders at Real Madrid."}, {"headline": "Costumes could contain toxic chemicals", "sentence": "But one state lawmaker says some costumes contain toxic chemicals and she wants those toxins identified and labeled."}, {"headline": "Man arrested after child porn found on computer", "sentence": "A 48-year-old Golden Valley man was arrested Friday after child porn was found on his computer."}, {"headline": "Heritage Valley to advertise rates at outpatient clinics", "sentence": "Beaver-based Heritage Valley Health System plans to advertise rates for the top 25 medical services at its ConvenientCare outpatient clinics, the first effort of its kind in western Pennsylvania."}, {"headline": "Hundreds march for 'freedom' in sudan", "sentence": "Hundreds of men and women marched for ``freedom'' in the Sudanese capital today despite the deployment of militia, troops and riot police, AFP correspondents reported."}, {"headline": "Exercise good for treating heart disease", "sentence": "Exercise may be just as good as medication to treat heart disease and should be included as a comparison when new drugs are being developed and tested, scientists said on Wednesday."}, {"headline": "IT stocks extend Friday's gain", "sentence": "Nine IT stocks were up 0.27% to 2.98% at 09:57 IST on BSE, extending Friday's gain triggered by Infosys raising its revenue growth guidance in both rupee and dollar terms for the year ending 31 March 2014."}, {"headline": "World leaders pay tribute to Nelson Mandela", "sentence": "World leaders from across the globe come together to pay tribute to Nelson Mandela, who died last week at the age of 95."}, {"headline": "Redbox rents 3 billionth disc", "sentence": "Redbox rented its 3 billionth disc this month, just 16 months after renting its 2 billionth disc in March 2012."}, {"headline": 
"Jennifer Holliday coming to Atlanta Botanical Garden", "sentence": "Broadway's original Dreamgirl Jennifer Holliday is coming to the Atlanta Botanical Garden for a concert benefiting Actor's Express."}, {"headline": "University of Dayton basketball player charged with assault", "sentence": "Court records show a University of Dayton basketball player has been charged with domestic violence and assault."}, {"headline": "Silver Standard Resources to purchase Marigold mine", "sentence": "Silver Standard Resources Inc. announces today that it has entered into a Purchase and Sale Agreement with subsidiaries of Goldcorp Inc. and Barrick Gold Corporation to purchase 100% of the Marigold mine, a producing gold mine in Nevada, USA for cash consideration of $275 million."}, {"headline": "The attack on Syria has already failed", "sentence": "The moment Secretary of State John Kerry and President Obama began making speeches instead of launching missiles, the attack on Syria had already failed."}, {"headline": "Master plan begins forming implementation task groups", "sentence": "The Flint master plan team will begin forming implementation task groups to coordinate and implement the strategies developed in Imagine Flint Master Plan."}, {"headline": "Italy recalls ambassador from India", "sentence": "Italy on Tuesday recalled its ambassador from India in protest over a new delay in the legal proceedings against two Italian marines accused of killing two Indian fishermen."}, {"headline": "Kimiko Date-Krumm retires in the first round of Malaysian Open", "sentence": "Japanese veteran Kimiko Date-Krumm was forced to retire in the first round of the BMW Malaysian Open in Kuala Lampur on Tuesday."}, {"headline": "Husband of murdered woman Jill Meagher returning to Ireland", "sentence": "The husband of murdered Melbourne woman Jill Meagher will return to Ireland later this month ``to clear his head'' while fighting for parole board changes."}, {"headline": "Delays are often a simple 
result of supply and demand", "sentence": "Boaters can become frustrated when a repair or upgrade takes a long time, but delays are often a simple result of supply and demand."}, {"headline": "Gene Haas granted Formula One license by the FIA", "sentence": "It's been officially confirmed that NASCAR Sprint Cup team owner Gene Haas has been granted a Formula One license by the FIA."}, {"headline": "Priest who performs exorcisms signs movie deal", "sentence": "An Indiana priest who gained fame when he performed exorcisms on a local family has signed a movie deal that will tell the story on screen."}, {"headline": "24 veterans receive Medal of Honor decades after service", "sentence": "24 veterans will receive the Medal of Honor next month, decades after completing their military service."}, {"headline": "Midtown Global Market hosts Day of the dead", "sentence": "You and your family can get in on the action this weekend at Midtown Global Market in Minneapolis, where they're hosting the Mexican celebration Dia de los Muertos, or Day of the Dead."}, {"headline": "Stillman to furlough for employees in April, May", "sentence": "Stillman College will furlough employees in April and May as a way to avoid pay cuts or layoffs, according to a statement released Wednesday."}, {"headline": "Klobuchar to highlight efforts to combat heroin use", "sentence": "With the pace of drug overdose deaths on the rise in the Twin Cities, Democratic US Sen. 
Amy Klobuchar will host an event Sunday at the Hazelden's youth addiction treatment center in Plymouth to highlight efforts to combat heroin use in the state."}, {"headline": "Chris Hemsworth - Chris Hemsworth flees floods", "sentence": "Chris Hemsworth and the crew of his new movie 'In the Heart of the Sea' were forced to flee flash floods in the Canary Islands yesterday."}, {"headline": "Treatment for ringing in the ears", "sentence": "Tinnitus Miracle created by Thomas Coleman is a new revolutionary treatment for ringing in the ears that teaches people how to alleviate, even eliminate tinnitus efficiently."}, {"headline": "Adrian Beltre could return by Sunday", "sentence": "Adrian Beltre could return to the Texas Rangers' lineup as soon as Sunday after feeling tightness in his left quadriceps muscle during a late game Thursday night, and left-hander Matt Harrison is on schedule to make his Cactus League debut Monday night."}, {"headline": "Hartselle defeats Athens", "sentence": "No. 5 Hartselle defeated Athens 14-6 in a key region game at JP Cain Stadium Friday night."}, {"headline": "Shankar Chaudhary, 50 others acquitted in 2002 riots case", "sentence": "BJP MLA from Tharad and party's general secretary Shankar Chaudhary along with 50 others was acquitted on Saturday in a 2002 riots case, in which two people lost their lives in firing and public and private properties were damaged in Radhanpur town."}, {"headline": "Mayors urge Congress to pass immigration reform", "sentence": "Mayors from cities across Kansas urge members of Congress to pass comprehensive immigration reform."}, {"headline": "Kin of rape accused try to chop off victim's tongue", "sentence": "In an audacious incident, kin of a rape accused today tried to chop off the teenage victim's tongue in Pratapgarh district in order to prevent her from giving her statement in the court."}, {"headline": "Two soldiers wounded in north Lebanon", "sentence": "Two soldiers were wounded Friday night in north 
Lebanon while trying to reopen a road blocked by protesters seeking the release of a ``wanted man.''"}, {"headline": "Early education is vital", "sentence": "Early childhood education is vital to your child's development."}, {"headline": "Office 365 boosts productivity:", "sentence": "MICROSOFT Philippines is rolling out Office 365, its latest product that aims to boost productivity of schools and small and medium businesses."}, {"headline": "Mike Trout leaves game with tightness in his right hamstring", "sentence": "Mike Trout left the Angels game in the sixth inning today with tightness in his right hamstring, but he isn't worried about it being a long-term injury."}]
--------------------------------------------------------------------------------
/python/parallel_data_gen.py:
--------------------------------------------------------------------------------
1 |
2 | import copy
3 | import pydot
4 | import os
5 | from matplotlib.pyplot import imshow
6 | import numpy as np
7 | import matplotlib.image as mpimg
8 | import matplotlib.pyplot as plt
9 |
10 |
11 | import spacy
12 | from spacy.en import English
13 | from spacy.tokens.span import Span
14 | nlp = English()
15 | import copy
16 | from nltk.stem.porter import PorterStemmer
17 |
18 | from spacy.attrs import ORTH, DEP, HEAD
19 |
20 |
# NOTE(review): presumably dependency labels grouped by how they are merged
# into a parent node; the code consuming these groups is not visible here.
merge_rules = {
    "group1": [
        "aux", "auxpass", "det", "nummod", "case",
        "prt", "poss", "of", "nmod", "compound",
        "neg", "xcomp", "quantmod", "advmod", "attr",
        "pobj", "as", "aux", "dobj", "amod",
        "npadvmod",
    ],
    "group2": ["cc"],
    "group3": ["mark", ","],
}

# NOTE(review): looks like label -> example indices where processing
# succeeded; confirm what the integers index into.
worked = {
    "npadvmod": [30, 37],
    "to": [40],
    "mark": [67],
    "attr": [58],
    "pobj": [69],
    "punct": [69],
    "conj": [69],
    "dobj": [77],
    "nsubj": [84, 86],
    "amod": [98],
    "ccomp": [4],
}

# NOTE(review): looks like label -> example indices where processing
# failed; confirm what the integers index into.
not_worked = {
    "npadvmod": [30],
    "mark": [41],
    "cc": [38],
    "advcl": [41],
    "pobj": [45],
    "with": [63],
    "conj": [71, 76],
    "nsubj": [47, 77],
    "ccomp": [16],
}
34 |
35 |
36 | # Resize and clean edges
def plot_im(im, dpi=80):
    """Show an image array in a borderless figure sized to its pixel count.

    Args:
        im: Image array whose ``shape`` unpacks into three values; the first
            two are taken as (rows, columns), i.e. (height, width) in pixels.
        dpi: Dots per inch used to convert the pixel counts into inches.
    """
    height_px, width_px, _ = im.shape
    # figsize is (width, height) in inches. The builtin float replaces
    # np.float, an alias NumPy deprecated in 1.20 and removed in 1.24.
    size = (width_px / float(dpi), height_px / float(dpi))

    fig = plt.figure(figsize=size, dpi=dpi)
    # A single axes covering the whole figure, so no margin surrounds the image.
    ax = fig.add_axes([0, 0, 1, 1])

    # Hide all four spines.
    for side in ('right', 'left', 'top', 'bottom'):
        ax.spines[side].set_color('none')

    # Remove tick marks and tick labels on both axes.
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_ticks_position('none')
        axis.set_ticklabels([])

    ax.imshow(im)
    plt.show()
59 |
60 |
def get_decode(s):
    """Return the text form of ``s`` encoded as UTF-8 bytes.

    On Python 2 the value is converted with ``unicode``; on Python 3, where
    that builtin does not exist, ``str`` is used instead so the call no
    longer fails with ``NameError``.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is the text type
    return text_type(s).encode("utf-8")
63 |
64 |
class Tree_node():
    """Accessor wrapper around the dictionary describing one tree node.

    The wrapped dict is read through these keys: ``'word'`` (a list of word
    dicts carrying ``'id'``, ``'tag'``, ``'stem'`` and ``'form'``),
    ``'head_word_index'`` (position of the head word inside ``'word'``),
    ``'form'`` and ``'edge'`` (a dict with ``'label'`` and ``'parent_id'``).
    """

    def __init__(self, node):
        """Keep a reference to the node dictionary (it is not copied)."""
        self.node = node

    # ---- head word accessors ----

    def head_word(self):
        """Return the word dict selected by ``head_word_index``."""
        words = self.node[u'word']
        return words[self.node['head_word_index']]

    def id(self):
        """Return the id of the head word; this serves as the node's id."""
        return self.head_word()['id']

    def head_word_tag(self):
        """Return the tag of the head word."""
        return self.head_word()['tag']

    def head_word_stem(self):
        """Return the stem of the head word."""
        return self.head_word()['stem']

    # ---- per-word accessors ----

    def _word_field(self, key):
        """Collect ``key`` from every word dict of this node, in order."""
        return [entry[key] for entry in self.node['word']]

    def tags(self):
        """Return the tag of each word in the node."""
        return self._word_field('tag')

    def stems(self):
        """Return the stem of each word in the node."""
        return self._word_field('stem')

    def forms(self):
        """Return the form of each word in the node."""
        return self._word_field('form')

    def ids(self):
        """Return the id of each word in the node."""
        return self._word_field('id')

    # ---- node-level accessors ----

    def word(self):
        """Return the list of word dicts."""
        return self.node['word']

    def form(self):
        """Return the node's form string."""
        return self.node['form']

    def edge(self):
        """Return the edge dict linking this node to its parent."""
        return self.node['edge']

    def edge_label(self):
        """Return the label of the edge to the parent."""
        return self.edge()['label']

    def edge_parent_id(self):
        """Return the id the edge points to."""
        return self.edge()['parent_id']

    # ---- mutators (all write into the wrapped dict) ----

    def set_parent_id(self, parent_id):
        """Point the edge at ``parent_id``."""
        self.edge()['parent_id'] = parent_id

    def set_form(self, form):
        """Replace the node's form string."""
        self.node['form'] = form

    def set_word(self, word):
        """Replace the node's list of word dicts."""
        self.node['word'] = word

    def set_head_word_index(self, index):
        """Select which entry of the word list is the head word."""
        self.node['head_word_index'] = index

    def set_edge_label(self, label):
        """Replace the label of the edge to the parent."""
        self.edge()['label'] = label

    # ---- display helpers ----

    def node_forms_and_ids(self):
        """Return the words as space-separated ``form_id`` pairs."""
        pieces = []
        for entry in self.word():
            pieces.append(entry[u'form'] + "_" + str(entry[u'id']))
        return " ".join(pieces)

    def describe(self):
        """Return a one-line description of the node, passed through get_decode."""
        summary = "node: {:<20} head_word_id:{:<20}".format(self.node_forms_and_ids(), self.id())
        return get_decode(summary)
152 |
153 |
154 |
155 | class Parsed_Tree():
156 | '''
157 | Manage and maintain an Tree
158 | '''
159 |
160 | # Initialize from string
161 | def __init__(self, nodes):
162 | self.tree = nodes
163 |
def get_copy(self):
    """Return a deep copy of this object, independent of the original."""
    duplicate = copy.deepcopy(self)
    return duplicate
166 |
167 | # Delete node
def remove_node(self, node):
    """Remove ``node`` from ``self.tree`` via the container's ``remove``."""
    nodes = self.tree
    nodes.remove(node)
170 |
171 | # Add node
def append_node(self, node):
    """Add ``node`` to the end of ``self.tree``."""
    nodes = self.tree
    nodes.append(node)
174 |
175 | # Any children of A will point to B
def update_children(self, A, B):
    """Re-parent every child of A so that it points at B's id."""
    orphans = self.children(A)
    for orphan in orphans:
        orphan.set_parent_id(B.id())
179 |
180 | # Merge A to B (parent of A)
def merge(self, A, B):
    """Fold node A into node B (A's parent) and drop A from the tree.

    B receives the words of both nodes ordered by word id, a form built by
    joining their forms with spaces, and a head index that still selects
    B's original head word. A's children are re-pointed at B, then A is
    removed.

    Args:
        A: The node being absorbed.
        B: The node absorbing A; it is modified in place.
    """
    head_id = B.head_word()['id']

    merged_words = sorted(A.word() + B.word(), key=lambda w: w['id'])
    forms = [w[u"form"] for w in merged_words]
    # Locate the head by id rather than by surface form: a word coming from A
    # with the same form but a smaller id would otherwise be taken as the head.
    head_index = next(pos for pos, w in enumerate(merged_words)
                      if w['id'] == head_id)

    B.set_word(merged_words)
    B.set_form(" ".join(forms))
    B.set_head_word_index(head_index)

    # Re-parent after B is updated, then drop the absorbed node.
    self.update_children(A, B)
    self.remove_node(A)
194 |
195 | # Insert between A, B(child of A)
def insert_between(self, node, A, B):
    """Rewire edges so that ``node`` sits between A and its child B.

    Only parent ids are changed; ``node`` is not appended to ``self.tree``
    by this method.
    """
    # node now hangs off A ...
    upper_id = A.id()
    node.set_parent_id(upper_id)
    # ... and B hangs off node.
    inserted_id = node.id()
    B.set_parent_id(inserted_id)
201 |
202 | # Get children node
def children(self, node):
    """Return every tree node whose edge points at ``node``'s id.

    A node whose parent id equals its own id is therefore listed among its
    own children.
    """
    return [candidate for candidate in self.tree
            if node.id() == candidate.edge_parent_id()]
209 |
210 | # Find parent node
def find_parent_node(self, node):
    """Look up the node referenced by ``node``'s edge parent id.

    Returns whatever ``find_node_by_id`` returns for that id.
    """
    parent_id = node.edge_parent_id()
    return self.find_node_by_id(parent_id)
213 |
214 | # check consistency
def consistency(self):
    """Assert that every node has a findable parent and exactly one root exists.

    Raises:
        AssertionError: if a parent lookup yields a falsy value or the number
            of roots differs from one.
    """
    parents = [self.find_parent_node(member) for member in self.tree]
    assert all(parents)
    roots = self.all_roots()
    assert len(roots) == 1
218 |
219 | # check if node is root node
def is_root(self, node):
    """Tell whether ``node`` is a root, i.e. its edge points at its own id."""
    own_id = node.id()
    parent_id = node.edge_parent_id()
    return own_id == parent_id
222 |
223 | # Get all root nodes if it has more than one (shouldn't), used for check consistency
def all_roots(self):
    """Return, in tree order, every node for which ``is_root`` holds.

    A well-formed tree yields a single entry; ``consistency`` relies on this.
    """
    roots = []
    for candidate in self.tree:
        if self.is_root(candidate):
            roots.append(candidate)
    return roots
227 |
228 | # Get root node
def root_node(self):
    """Return the first root found in the tree.

    Raises:
        IndexError: if the tree has no root.
    """
    roots = self.all_roots()
    return roots[0]
231 |
232 | # Get path to root
def path_to_root(self, node):
    """Return the nodes from ``node`` up to the root, both ends included."""
    trail = [node]
    # Keep climbing from the last collected node until it is a root.
    while not self.is_root(trail[-1]):
        trail.append(self.find_parent_node(trail[-1]))
    return trail
241 |
242 | # Get path from A to B
def path(self, A, B, debug = False):
    """Describe the route from A to B as three lists of node ids.

    Returns:
        A tuple ``(up, top, down)``: ``up`` holds the ids climbed from A
        before reaching an ancestor shared with B, ``top`` holds the first
        shared ancestor met on A's way to the root (at most one id), and
        ``down`` holds the ids descending from below that ancestor to B.
    """
    self.consistency()
    ids_from_a = [step.id() for step in self.path_to_root(A)]
    # B's path is flipped so it reads root -> B.
    ids_to_b = [step.id() for step in self.path_to_root(B)][::-1]

    shared = set(ids_from_a) & set(ids_to_b)

    up = [node_id for node_id in ids_from_a if node_id not in shared]
    down = [node_id for node_id in ids_to_b if node_id not in shared]

    # What is left of A's path once the climb is removed are the shared
    # ancestors, nearest first; only the nearest is reported.
    common_ancestors = [node_id for node_id in ids_from_a if node_id in shared]
    top = common_ancestors[:1]

    if debug:
        print("up:", up)
        print("top", top)
        print("down:", down)

    return up, top, down
269 |
270 | # Add an dummy on top of original root
def add_dummy_root(self):
    """Append a synthetic ROOT node (id -1) and hang the current root under it.

    The dummy node points at itself. It is appended before the root lookup,
    so the lookup sees both it and any pre-existing root and picks whichever
    comes first in ``self.tree``.
    """
    dummy_root_id = -1

    dummy_word = {u'tag': u'ROOT',
                  u'id': dummy_root_id,
                  u'form': u'ROOT',
                  u'stem': u'ROOT'}
    self_edge = {u'parent_id': dummy_root_id, u'label': u'ROOT_To_Self'}
    dummy_root = {u'form': u'ROOT',
                  u'head_word_index': 0,
                  u'word': [dummy_word],
                  u'edge': self_edge}

    self.append_node(Tree_node(dummy_root))

    # Re-point the first self-pointing node found in the tree at the dummy.
    former_root = self.root_node()
    former_root.set_parent_id(dummy_root_id)
290 |
291 | # Get node given id
def find_node_by_id(self, id, debug=False):
    """Return the first node whose ``ids()`` contains ``id``, or ``None``.

    Args:
        id: the word id to look for.
        debug: when true, print the target id and every node's ``id()``
            before searching.
    """
    if debug:
        print("Debug ----- find_node_by_id ----- ")
        print("target id:", id)
        print([node.id() for node in self.tree])

    for candidate in self.tree:
        if id in candidate.ids():
            return candidate
    return None
303 |
304 | # Check if node is in the tree
def is_node_in(self, node):
    """Return True when a lookup of ``node.id()`` finds a node in this tree."""
    match = self.find_node_by_id(node.id())
    return bool(match)
310 |
311 | # Find neighbor nodes
def find_neighbor(self, node, debug = False):
    """Return the nodes whose ids sit immediately left and right of ``node``.

    Neighbours are chosen by id order over ``tree_node.id()`` of every
    node in ``self.tree``: the left neighbour has the largest id smaller
    than ``node.id()``, the right neighbour the smallest id larger than it.

    Args:
        node: the reference node.
        debug: when true, print the ids considered and the chosen ids.

    Returns:
        ``(left_node, right_node)``; either element is ``None`` when no
        such neighbour exists.
    """
    target_id = node.id()
    ascending_ids = sorted(tree_node.id() for tree_node in self.tree)

    # Generator + next() avoids subscripting filter(), which is an iterator
    # on Python 3 and cannot be sliced.
    right = next((i for i in ascending_ids if i > target_id), None)
    left = next((i for i in reversed(ascending_ids) if i < target_id), None)

    # Compare against None explicitly: 0 is a valid node id and must not be
    # mistaken for "no neighbour".
    left_node = self.find_node_by_id(left) if left is not None else None
    right_node = self.find_node_by_id(right) if right is not None else None

    if debug:
        print("find_neighbor:")
        # Printed in descending order, as before.
        print("ids: {}".format(ascending_ids[::-1]))
        print("node: {}, left: {}, right: {}".format(node.id(), left, right))
    return left_node, right_node
330 |
331 | # Print tree
def print_edges(self, debug=False):
    """Print one line per labelled edge of the tree.

    Nodes whose ``edge_label()`` is falsy are skipped. When the parent
    can be found, the line shows the parent's form and tags; otherwise
    the lookup is repeated in debug mode and the raw parent id is shown.

    Args:
        debug: accepted for interface compatibility; not used here.
    """
    for node in self.tree:
        parent_id = node.edge_parent_id()
        parent_node = self.find_node_by_id(parent_id)

        if not node.edge_label():
            continue

        if parent_node:
            print(get_decode("{:<20}:{:<20}{}->{}[{}]".format(
                node.id(),
                node.edge_label(),
                node.form(),
                parent_node.form(),
                get_decode(",".join(parent_node.tags())))))
        else:
            # Dangling parent id: dump the lookup details, then show the raw id.
            self.find_node_by_id(parent_id, debug=True)
            print(get_decode("{:<20}:{:<20}{}->{}".format(
                node.id(), node.edge_label(), node.form(), parent_id)))
349 | # Print a graphic tree
def print_graph(self, color_settings=None):
    """Render the tree with pydot and display it as an image.

    The graph is written to ``graph.png`` in the current directory, read
    back with ``mpimg.imread``, shown through ``plot_im(img, dpi=40)``,
    and the file is removed afterwards.

    Args:
        color_settings: optional mapping ``color -> iterable of node ids``;
            nodes not listed under any color are filled ``'gray'``.
            Defaults to no custom colors.
    """
    # A None default avoids sharing one mutable dict between calls.
    if color_settings is None:
        color_settings = {}

    graph = pydot.Dot(graph_type='digraph')

    id_to_color = {}
    for color, node_ids in color_settings.items():
        for node_id in node_ids:
            id_to_color[node_id] = color

    name_to_node = {}
    for node_id in [tree_node.id() for tree_node in self.tree]:
        fill = id_to_color.get(node_id, 'gray')
        node_label = self.find_node_by_id(node_id).node_forms_and_ids()
        name_to_node[node_label] = pydot.Node(node_label, style="filled", fillcolor=fill)

    for graph_node in name_to_node.values():
        graph.add_node(graph_node)

    # get_edges() yields (parent label, child label) pairs.
    for parent_name, child_name in self.get_edges():
        graph.add_edge(pydot.Edge(name_to_node[parent_name], name_to_node[child_name]))

    graph.write_png('graph.png')
    try:
        img = mpimg.imread('graph.png')
        plot_im(img, dpi=40)
    finally:
        # Clean up the temporary image even if reading or plotting fails.
        os.remove("graph.png")
380 |
381 | # Get an edge from a node to its parent
def get_edge(self, node):
    """Return the edge of ``node`` as ``(parent label, node label)``.

    Both labels come from ``node_forms_and_ids()``; the parent is looked
    up through ``find_node_by_id(node.edge_parent_id())``.
    """
    parent_node = self.find_node_by_id(node.edge_parent_id())
    parent_label = parent_node.node_forms_and_ids()
    child_label = node.node_forms_and_ids()
    return (parent_label, child_label)
386 |
387 | # Get tree edges
def get_edges(self):
    """Return ``get_edge(node)`` for every node, in ``self.tree`` order."""
    edges = []
    for member in self.tree:
        edges.append(self.get_edge(member))
    return edges
390 |
391 |
392 |
393 |
394 |
395 |
396 |
class Sentence_Reduction(object):
    """Reduce a sentence tree to the part that covers a headline.

    Each step stores its result on the instance and later steps read it:

    * ``transfer_tree``   reads ``sentence_tree``, writes ``transfered_tree``
    * ``flat_tree``       reads ``transfered_tree``, writes ``flatten_tree``
    * ``transfer_headline`` reads ``headline_tree``, writes
      ``transfered_headline``
    * ``reduce_sentence_by_headline`` reads ``flatten_tree`` and
      ``transfered_headline``, writes ``reduced_tree``
    * ``generate_reduced_sentence`` reads ``reduced_tree``, writes
      ``reduced_sentence``

    The ``merge_rules`` groups used below are defined elsewhere in the file.
    """

    def __init__(self, sentence_tree, headline_tree):
        """Store the two parsed trees; all derived results start as ``None``.

        Args:
            sentence_tree: tree built from the full sentence.
            headline_tree: tree built from the headline.
        """
        # Sentence parsed into a tree structure
        self.sentence_tree = sentence_tree
        # Headline parsed into a tree structure
        self.headline_tree = headline_tree
        # Headline after transfer_headline()
        self.transfered_headline = None
        # Sentence tree after transfer_tree()
        self.transfered_tree = None
        # transfered_tree after flat_tree()
        self.flatten_tree = None
        # flatten_tree reduced against the headline
        self.reduced_tree = None
        # Sentence text generated from reduced_tree
        self.reduced_sentence = None

    def transfer_tree(self, debug=False):
        """Build ``transfered_tree`` from a copy of ``sentence_tree``.

        Steps: check consistency, add a dummy root, drop nodes matching the
        (currently empty) ignore list, relabel ``prep``/``punct`` edges with
        the head word's form, and move ``merge_rules['group2']`` nodes
        between the common ancestor and the branch of their right neighbour.

        Args:
            debug: accepted for interface compatibility; not used here.
        """
        self.transfered_tree = self.sentence_tree.get_copy()
        self.transfered_tree.consistency()

        # Add dummy root
        self.transfered_tree.add_dummy_root()

        def ignore_node(node):
            # Drop nodes whose form is in the ignore list, handing their
            # children over to the parent first.
            # ignore_rules = ["``", "''", "'"]
            ignore_rules = []
            if node.form() in ignore_rules:
                parent_node = self.transfered_tree.find_parent_node(node)
                self.transfered_tree.update_children(node, parent_node)
                return True
            return False

        self.transfered_tree.tree[:] = [node for node in self.transfered_tree.tree
                                        if not ignore_node(node)]

        # -- preposition, punctuation replacement: use the word itself as label
        relabel_with_form = ['prep', 'punct']
        for node in self.transfered_tree.tree:
            if node.edge_label() in relabel_with_form:
                node.set_edge_label(node.head_word()['form'])

        # -- move conjunction word
        for node in self.transfered_tree.tree:
            if node.edge_label() not in merge_rules['group2']:
                continue
            _, right_neighbor = self.transfered_tree.find_neighbor(node)
            if right_neighbor is None:
                # Nothing follows this word, so there is no branch to attach
                # it to; path() cannot handle a missing node.
                continue
            up, top, down = self.transfered_tree.path(node, right_neighbor)
            if up and down:
                A_node = self.transfered_tree.find_node_by_id(top[0])
                B_node = self.transfered_tree.find_node_by_id(down[0])
                self.transfered_tree.insert_between(node, A_node, B_node)

    def flat_tree(self):
        """Build ``flatten_tree`` by merging ``group1`` nodes into their parents.

        Works on a copy of ``transfered_tree``.
        """
        self.flatten_tree = self.transfered_tree.get_copy()
        self.flatten_tree.consistency()

        # Iterate over a snapshot: the node list is iterated while merge()
        # is applied to the tree.
        for node in list(self.flatten_tree.tree):
            if node.edge_label() in merge_rules['group1']:
                self.flatten_tree.merge(node, self.flatten_tree.find_parent_node(node))

    def transfer_headline(self, debug = False):
        """Build ``transfered_headline`` from a copy of ``headline_tree``.

        Nodes whose head-word tag is in the headline ignore list are removed
        after ``update_children`` is called with the node and its parent.

        Args:
            debug: when true, print ``stem--tag`` for every headline node.
        """
        self.transfered_headline = self.headline_tree.get_copy()
        self.transfered_headline.consistency()

        def ignore_node(node):
            headline_ignore_rules = ['IN', '``', "''", "DT", ':', '.', 'POS']
            if node.head_word_tag() in headline_ignore_rules:
                parent_node = self.transfered_headline.find_parent_node(node)
                self.transfered_headline.update_children(node, parent_node)
                return True
            return False

        if debug:
            for node in self.headline_tree.tree:
                print("{}--{}".format(get_decode(node.head_word_stem()), get_decode(node.head_word_tag())))

        self.transfered_headline.tree[:] = [node for node in self.transfered_headline.tree
                                            if not ignore_node(node)]

    def reduce_sentence_by_headline(self, addtional_stem = None, debug=False):
        """Build ``reduced_tree``: the flattened tree restricted to the headline.

        A node is kept when it shares at least one stem with the stems still
        unmatched in the headline; every matched stem is consumed once. Nodes
        lying on the path between kept nodes are added, followed by
        ``merge_rules['group3']`` connect words that have the preceding word
        id and some right neighbour in the reduced tree. Finally, every node
        whose parent is missing from the reduced tree is made its own parent.

        Args:
            addtional_stem: optional callable applied to each lower-cased
                word form; when omitted, the nodes' own stems are used.
            debug: accepted for interface compatibility; not used here.

        Returns:
            ``(reduced_tree_ids, path_node_ids, connect_nodes_ids)``: ids of
            the directly matched nodes, of the nodes added from paths, and of
            the added connect words.
        """
        def get_node_set(node):
            # Lower-cased stems of the node, split on whitespace.
            if addtional_stem:
                stems = [addtional_stem(form.lower()) for form in node.forms()]
            else:
                stems = [stem.lower() for stem in node.stems()]
            return " ".join(stems).split()

        def check_common_and_update(node, debug = False):
            # True when the node shares a stem with the remaining headline
            # stems; each shared stem is removed once from the pool.
            common = set(get_node_set(node)) & set(headline_stems)
            if not common:
                return False
            for item in common:
                headline_stems.remove(item)
            if debug:
                print("modified_headline_stems: ", headline_stems)
            return True

        # Start
        self.reduced_tree = self.flatten_tree.get_copy()
        self.reduced_tree.consistency()

        # Flat list of all headline stems
        headline_stems = [stem
                          for headline_node in self.transfered_headline.tree
                          for stem in get_node_set(headline_node)]

        # Collect connect words before filtering, for later use
        connect_nodes = [node for node in self.reduced_tree.tree
                         if node.edge_label() in merge_rules['group3']]

        # Keep nodes that carry a headline stem
        self.reduced_tree.tree[:] = [node for node in self.reduced_tree.tree
                                     if check_common_and_update(node)]

        # if headline_stems:
        #     print("{} -- Found unmatched headlines: {}".format(self.reduce_sentence_by_headline.__name__, headline_stems))

        reduced_tree_ids = [n.id() for n in self.reduced_tree.tree]

        # Add nodes lying on the path between kept nodes
        nodes_on_the_path = []
        processed_ids = set()
        for kept_node in self.reduced_tree.tree:
            path = self.flatten_tree.path_to_root(kept_node)
            path_label = [self.reduced_tree.is_node_in(path_node) for path_node in path]
            # Everything strictly between the first and the last kept node on
            # the path is added to the reduced tree.
            start = path_label.index(True) + 1
            end = len(path_label) - path_label[::-1].index(True) - 1

            for node_on_path in path[start:end]:
                if (not self.reduced_tree.is_node_in(node_on_path)
                        and node_on_path.id() not in processed_ids):
                    processed_ids.add(node_on_path.id())
                    nodes_on_the_path.append(node_on_path)

        path_node_ids = [n.id() for n in nodes_on_the_path]

        self.reduced_tree.tree += nodes_on_the_path

        def use_connect_word(node):
            # A connect word such as "that" or "which" is used only when the
            # word right before it (id - 1) is in the reduced tree and some
            # word after it is selected as well.
            left_word_id = node.id() - 1
            _, right_node = self.reduced_tree.find_neighbor(node)
            return self.reduced_tree.find_node_by_id(left_word_id) and right_node

        # Add connect words to the reduced tree if needed
        connect_nodes[:] = [node for node in connect_nodes
                            if use_connect_word(node) and not self.reduced_tree.is_node_in(node)]

        for node in connect_nodes:
            print("Found connect node: {}".format(node.describe()))

        connect_nodes_ids = [n.id() for n in connect_nodes]

        self.reduced_tree.tree += connect_nodes

        # Make the reduced tree consistent: orphans become their own parent
        for reduced_node in self.reduced_tree.tree:
            if not self.reduced_tree.find_parent_node(reduced_node):
                reduced_node.set_parent_id(reduced_node.id())

        return reduced_tree_ids, path_node_ids, connect_nodes_ids

    def generate_reduced_sentence(self):
        """Join the words of ``reduced_tree`` in id order into ``reduced_sentence``."""
        id_word_pairs = [(word[u'id'], word[u'form'])
                         for reduced_node in self.reduced_tree.tree
                         for word in reduced_node.word()]

        id_word_pairs.sort(key=lambda pair: pair[0])

        self.reduced_sentence = " ".join([form for _, form in id_word_pairs])
596 |
597 |
598 |
599 |
600 | # Construct tree from sentence
def parse_info(sentence):
    """Parse ``sentence`` with ``nlp`` and return one ``Tree_node`` per token.

    Every node describes a single word: its surface form, tag and lemma,
    with the token index as id. The edge carries the dependency label and
    the index of the token's syntactic head.

    Args:
        sentence: the text to parse.

    Returns:
        A list of ``Tree_node`` objects in token order.
    """
    doc = nlp(sentence)

    # token.head.i is the absolute index of the head. It replaces
    # "index + relative HEAD offset" read from doc.to_array([HEAD]), where
    # the offset can come back as an unsigned value and wrap around for
    # heads located to the left of the token.
    nodes = [{u"form": token.orth_,
              u"head_word_index": 0,
              u"word": [{u"id": token.i,
                         u"tag": token.tag_,
                         u"form": token.orth_,
                         u"stem": token.lemma_
                         }],
              u"edge": {u"parent_id": token.head.i, u"label": token.dep_}
              }
             for token in doc]

    return [Tree_node(node) for node in nodes]
--------------------------------------------------------------------------------