├── .classpath
├── .gitignore
├── .metadata
    ├── .lock
    ├── .log
    ├── .mylyn
    │   └── repositories.xml.zip
    ├── .plugins
    │   ├── org.eclipse.core.resources
    │   │   ├── .root
    │   │   │   ├── .indexes
    │   │   │   │   ├── history.version
    │   │   │   │   ├── properties.index
    │   │   │   │   └── properties.version
    │   │   │   └── 2.tree
    │   │   └── .safetable
    │   │   │   └── org.eclipse.core.resources
    │   ├── org.eclipse.core.runtime
    │   │   └── .settings
    │   │   │   ├── org.eclipse.core.resources.prefs
    │   │   │   ├── org.eclipse.e4.ui.css.swt.theme.prefs
    │   │   │   ├── org.eclipse.e4.ui.workbench.renderers.swt.prefs
    │   │   │   ├── org.eclipse.jdt.ui.prefs
    │   │   │   ├── org.eclipse.m2e.discovery.prefs
    │   │   │   ├── org.eclipse.mylyn.context.core.prefs
    │   │   │   ├── org.eclipse.mylyn.monitor.ui.prefs
    │   │   │   ├── org.eclipse.mylyn.tasks.ui.prefs
    │   │   │   ├── org.eclipse.team.ui.prefs
    │   │   │   ├── org.eclipse.ui.editors.prefs
    │   │   │   ├── org.eclipse.ui.ide.prefs
    │   │   │   ├── org.eclipse.ui.prefs
    │   │   │   ├── org.eclipse.ui.workbench.prefs
    │   │   │   └── org.python.pydev.prefs
    │   ├── org.eclipse.e4.workbench
    │   │   └── workbench.xmi
    │   ├── org.eclipse.epp.logging.aeri.ui
    │   │   ├── history
    │   │   │   ├── _0.fdt
    │   │   │   ├── _0.fdx
    │   │   │   ├── _0.fnm
    │   │   │   ├── _0.frq
    │   │   │   ├── _0.nrm
    │   │   │   ├── _0.tii
    │   │   │   ├── _0.tis
    │   │   │   ├── segments.gen
    │   │   │   └── segments_1
    │   │   ├── remote-index
    │   │   │   ├── _2.fdt
    │   │   │   ├── _2.fdx
    │   │   │   ├── _2.fnm
    │   │   │   ├── _2.frq
    │   │   │   ├── _2.nrm
    │   │   │   ├── _2.prx
    │   │   │   ├── _2.tii
    │   │   │   ├── _2.tis
    │   │   │   ├── segments.gen
    │   │   │   └── segments_3
    │   │   └── server-config.json
    │   ├── org.eclipse.jdt.core
    │   │   ├── assumedExternalFilesCache
    │   │   ├── externalFilesCache
    │   │   ├── nonChainingJarsCache
    │   │   └── variablesAndContainers.dat
    │   ├── org.eclipse.jdt.ui
    │   │   ├── OpenTypeHistory.xml
    │   │   ├── QualifiedTypeNameHistory.xml
    │   │   └── dialog_settings.xml
    │   ├── org.eclipse.m2e.logback.configuration
    │   │   ├── 0.log
    │   │   └── logback.1.6.2.20150902-0002.xml
    │   ├── org.eclipse.oomph.setup.ui
    │   │   └── dialog_settings.xml
    │   ├── org.eclipse.oomph.setup
    │   │   └── workspace.setup
    │   ├── org.eclipse.ui.ide
    │   │   └── dialog_settings.xml
    │   └── org.eclipse.ui.workbench
    │   │   ├── dialog_settings.xml
    │   │   └── workingsets.xml
    └── version.ini
├── .project
├── .pydevproject
├── .settings
    ├── org.eclipse.jdt.core.prefs
    ├── org.eclipse.jdt.ui.prefs
    └── org.eclipse.m2e.core.prefs
├── HyperVec.jar
├── README.md
├── code_mapping_across_languages
    ├── AP_evaluation_code
    │   ├── common.py
    │   ├── test_default.py
    │   └── test_norm.py
    ├── alignment_files
    │   ├── de_en.align
    │   └── it_en.align
    ├── convert_w2vTXT_to_w2vBIN.py
    ├── credits_to_CLIC_trento.txt
    ├── mappingcode
    │   ├── __init__.py
    │   ├── demo.sh~
    │   ├── learn_mat.sh
    │   ├── space.py
    │   ├── space.pyc
    │   ├── test_tm.py
    │   ├── test_tm2.py
    │   ├── test_tm_pred.py
    │   ├── train_tm.py
    │   ├── translate_tm.py
    │   ├── utils.py
    │   └── utils.pyc
    ├── perform_mapping.sh
    └── vocabulary file
    │   ├── german_voc_wikipedia.txt.gz
    │   └── italian_voc_wikipedia.txt.gz
├── config.cfg
├── create_features.py
├── datasets_across_languages
    ├── eval_DE
    │   ├── noun_hyp_vs_ant.txt
    │   ├── noun_hyp_vs_syn.txt
    │   └── noun_hyp_vs_synant.txt
    └── eval_IT
    │   ├── noun_hyp_vs-ant.txt
    │   ├── noun_hyp_vs-syn-ant.txt
    │   └── noun_hyp_vs-syn.txt
├── datasets_classification
    ├── ABIBLESS.txt
    ├── AWBLESS.txt
    ├── BLESS.txt
    ├── eval-bless.jar
    ├── eval-dir.jar
    └── readme_how_to.txt
├── evaluation_scripts
    ├── common.py
    └── corrEval.py
├── get-pretrainedHyperVecEmbeddings
    └── download_embeddings.sh
├── hypernymy_resources
    ├── cohyponym_n.txt.gz
    ├── cohyponym_v.txt.gz
    ├── hypernym_n.txt.gz
    └── hypernym_v.txt.gz
├── pom.xml
└── src
    ├── common
        ├── DataStructureUtils.java
        ├── IOUtils.java
        ├── MathUtils.java
        ├── MeanAveragePrecision.java
        ├── SigmoidTable.java
        ├── SimpleMatrixUtils.java
        ├── TanhTable.java
        ├── WordForm.java
        ├── correlation
        │   ├── AreaUnderCurve.java
        │   └── MenCorrelation.java
        ├── exception
        │   ├── OutOfVocabularyException.java
        │   └── ValueException.java
        └── wordnet
        │   ├── LexicalHypernym.java
        │   ├── LexicalResource.java
        │   ├── LexicalResourceAdj.java
        │   ├── LexicalResourceNoun.java
        │   ├── LexicalResourceVerb.java
        │   ├── Synset.java
        │   ├── WordNetAdj.java
        │   ├── WordNetNoun.java
        │   ├── WordNetReader.java
        │   └── WordNetVerb.java
    ├── demo
        ├── HyperVecLearning.java
        └── W2vProperties.java
    ├── io
        ├── sentence
        │   ├── PlainSentenceInputStream.java
        │   ├── SentenceInputStream.java
        │   ├── SubSamplingSentenceInputStream.java
        │   └── TreeInputStream.java
        └── word
        │   ├── CombinedWordInputStream.java
        │   ├── Phrase.java
        │   ├── PushBackWordStream.java
        │   ├── WordFilter.java
        │   └── WordInputStream.java
    ├── neural
        └── function
        │   ├── ActivationFunction.java
        │   ├── Correlation.java
        │   ├── Sigmoid.java
        │   └── Tanh.java
    ├── space
        ├── AbstractSemanticSpace.java
        ├── Neighbor.java
        ├── RawSemanticSpace.java
        ├── SemanticSpace.java
        └── Similarity.java
    ├── tree
        ├── CcgTree.java
        └── Tree.java
    ├── vocab
        ├── HuffmanTree.java
        ├── Vocab.java
        ├── VocabEntry.java
        ├── VocabEntryFilter.java
        └── filter
        │   └── MinFrequencyVocabFilter.java
    └── word2vec
        ├── AbstractWord2Vec.java
        ├── MultiThreadWord2Vec.java
        ├── UniGram.java
        └── multitask
            └── Hyper2Vec.java


/.classpath:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <classpath>
 3 | 	<classpathentry kind="src" output="target/classes" path="src">
 4 | 		<attributes>
 5 | 			<attribute name="optional" value="true"/>
 6 | 			<attribute name="maven.pomderived" value="true"/>
 7 | 		</attributes>
 8 | 	</classpathentry>
 9 | 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7">
10 | 		<attributes>
11 | 			<attribute name="maven.pomderived" value="true"/>
12 | 		</attributes>
13 | 	</classpathentry>
14 | 	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
15 | 		<attributes>
16 | 			<attribute name="maven.pomderived" value="true"/>
17 | 		</attributes>
18 | 	</classpathentry>
19 | 	<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
20 | 	<classpathentry kind="output" path="target/classes"/>
21 | </classpath>
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 | /target
3 | .attach_pid*
4 | 


--------------------------------------------------------------------------------
/.metadata/.lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.lock


--------------------------------------------------------------------------------
/.metadata/.log:
--------------------------------------------------------------------------------
 1 | !SESSION 2016-01-11 12:36:53.838 -----------------------------------------------
 2 | eclipse.buildId=4.5.1.M20150904-0015
 3 | java.version=1.7.0_79
 4 | java.vendor=Oracle Corporation
 5 | BootLoader constants: OS=macosx, ARCH=x86_64, WS=cocoa, NL=en_US
 6 | Framework arguments:  -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation
 7 | Command-line arguments:  -os macosx -ws cocoa -arch x86_64 -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation
 8 | 
 9 | !ENTRY org.eclipse.core.net 1 0 2016-01-11 12:36:55.177
10 | !MESSAGE System property http.nonProxyHosts has been set to local|*.local|169.254/16|*.169.254/16 by an external source. This value will be overwritten using the values from the preferences
11 | 
12 | !ENTRY org.eclipse.jface 2 0 2016-01-11 12:37:55.985
13 | !MESSAGE Keybinding conflicts occurred.  They may interfere with normal accelerator operation.
14 | !SUBENTRY 1 org.eclipse.jface 2 0 2016-01-11 12:37:55.985
15 | !MESSAGE A conflict occurred for ALT+COMMAND+R:
16 | Binding(ALT+COMMAND+R,
17 | 	ParameterizedCommand(Command(org.python.pydev.debug.setnext,Set Next Statement,
18 | 		,
19 | 		Category(org.python.pydev.ui.category.run,PyDev - Run,Python run category,true),
20 | 		org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@1ee8d4b6,
21 | 		,,true),null),
22 | 	org.eclipse.ui.defaultAcceleratorConfiguration,
23 | 	org.eclipse.ui.contexts.window,,,system)
24 | Binding(ALT+COMMAND+R,
25 | 	ParameterizedCommand(Command(org.eclipse.jdt.ui.edit.text.java.rename.element,Rename - Refactoring ,
26 | 		Rename the selected element,
27 | 		Category(org.eclipse.jdt.ui.category.refactoring,Refactor - Java,Java Refactoring Actions,true),
28 | 		org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@47e50894,
29 | 		,,true),null),
30 | 	org.eclipse.ui.defaultAcceleratorConfiguration,
31 | 	org.eclipse.ui.contexts.window,,cocoa,system)
32 | !SESSION 2016-03-23 14:56:47.781 -----------------------------------------------
33 | eclipse.buildId=4.5.1.M20150904-0015
34 | java.version=1.7.0_79
35 | java.vendor=Oracle Corporation
36 | BootLoader constants: OS=macosx, ARCH=x86_64, WS=cocoa, NL=en_US
37 | Framework arguments:  -product org.eclipse.epp.package.java.product -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation
38 | Command-line arguments:  -os macosx -ws cocoa -arch x86_64 -product org.eclipse.epp.package.java.product -data /Volumes/Data/Doctorate/Implementation/w2vcomp -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation
39 | 
40 | !ENTRY org.eclipse.core.net 1 0 2016-03-23 14:56:48.939
41 | !MESSAGE System property http.nonProxyHosts has been set to local|*.local|169.254/16|*.169.254/16 by an external source. This value will be overwritten using the values from the preferences
42 | 
43 | !ENTRY org.eclipse.jface 2 0 2016-03-23 14:56:52.297
44 | !MESSAGE Keybinding conflicts occurred.  They may interfere with normal accelerator operation.
45 | !SUBENTRY 1 org.eclipse.jface 2 0 2016-03-23 14:56:52.297
46 | !MESSAGE A conflict occurred for ALT+COMMAND+R:
47 | Binding(ALT+COMMAND+R,
48 | 	ParameterizedCommand(Command(org.python.pydev.debug.setnext,Set Next Statement,
49 | 		,
50 | 		Category(org.python.pydev.ui.category.run,PyDev - Run,Python run category,true),
51 | 		org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@33b88372,
52 | 		,,true),null),
53 | 	org.eclipse.ui.defaultAcceleratorConfiguration,
54 | 	org.eclipse.ui.contexts.window,,,system)
55 | Binding(ALT+COMMAND+R,
56 | 	ParameterizedCommand(Command(org.eclipse.jdt.ui.edit.text.java.rename.element,Rename - Refactoring ,
57 | 		Rename the selected element,
58 | 		Category(org.eclipse.jdt.ui.category.refactoring,Refactor - Java,Java Refactoring Actions,true),
59 | 		org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@16bdcbe5,
60 | 		,,true),null),
61 | 	org.eclipse.ui.defaultAcceleratorConfiguration,
62 | 	org.eclipse.ui.contexts.window,,cocoa,system)
63 | 


--------------------------------------------------------------------------------
/.metadata/.mylyn/repositories.xml.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.mylyn/repositories.xml.zip


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.resources/.root/.indexes/history.version:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.resources/.root/.indexes/properties.index:
--------------------------------------------------------------------------------
1 |     /  org.eclipse.jdt.core stateVersionNumber 28


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.resources/.root/.indexes/properties.version:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.resources/.root/2.tree:
--------------------------------------------------------------------------------
1 |                                org.eclipse.jdt.core                


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.resources/.safetable/org.eclipse.core.resources:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.core.resources/.safetable/org.eclipse.core.resources


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | version=1
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.e4.ui.css.swt.theme.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | themeid=org.eclipse.e4.ui.css.theme.e4_default
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.e4.ui.workbench.renderers.swt.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | enableMRU=true
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.jdt.ui.prefs:
--------------------------------------------------------------------------------
 1 | content_assist_proposals_background=255,255,255
 2 | content_assist_proposals_foreground=0,0,0
 3 | eclipse.preferences.version=1
 4 | fontPropagated=true
 5 | org.eclipse.jdt.internal.ui.navigator.layout=2
 6 | org.eclipse.jdt.ui.editor.tab.width=
 7 | org.eclipse.jdt.ui.formatterprofiles.version=12
 8 | org.eclipse.jdt.ui.javadoclocations.migrated=true
 9 | org.eclipse.jface.textfont=1|Monaco|13.0|0|COCOA|1|Monaco;
10 | proposalOrderMigrated=true
11 | sourceHoverBackgroundColor=236,235,236
12 | spelling_locale_initialized=true
13 | tabWidthPropagated=true
14 | useAnnotationsPrefPage=true
15 | useQuickDiffPrefPage=true
16 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.m2e.discovery.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.m2e.discovery.pref.projects=
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.mylyn.context.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | mylyn.attention.migrated=true
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.mylyn.monitor.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.mylyn.monitor.activity.tracking.enabled.checked=true
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.mylyn.tasks.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | migrated.task.repositories.secure.store=true
3 | org.eclipse.mylyn.tasks.ui.filters.nonmatching=true
4 | org.eclipse.mylyn.tasks.ui.filters.nonmatching.encouraged=true
5 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.team.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.team.ui.first_time=false
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.editors.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | lineNumberRuler=true
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.ide.prefs:
--------------------------------------------------------------------------------
1 | TASKS_FILTERS_MIGRATE=true
2 | eclipse.preferences.version=1
3 | platformState=1450704678997
4 | quickStart=false
5 | tipsAndTricks=true
6 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | showIntro=false
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.workbench.prefs:
--------------------------------------------------------------------------------
1 | //org.eclipse.ui.commands/state/org.eclipse.ui.navigator.resources.nested.changeProjectPresentation/org.eclipse.ui.commands.radioState=false
2 | ColorsAndFontsPreferencePage.expandedCategories=Torg.eclipse.ui.workbenchMisc
3 | ColorsAndFontsPreferencePage.selectedElement=Forg.eclipse.jface.textfont
4 | ENABLED_DECORATORS=org.eclipse.m2e.core.mavenVersionDecorator\:true,org.eclipse.buildship.ui.gradledecorator\:true,org.eclipse.egit.ui.internal.decorators.GitLightweightDecorator\:true,org.eclipse.jdt.ui.override.decorator\:true,org.eclipse.jdt.ui.interface.decorator\:true,org.eclipse.jdt.ui.buildpath.decorator\:true,org.eclipse.m2e.core.maven2decorator\:true,org.eclipse.mylyn.context.ui.decorator.interest\:true,org.eclipse.mylyn.tasks.ui.decorators.task\:true,org.eclipse.mylyn.team.ui.changeset.decorator\:true,org.eclipse.ui.LinkedResourceDecorator\:true,org.eclipse.ui.SymlinkDecorator\:true,org.eclipse.ui.VirtualResourceDecorator\:true,org.eclipse.ui.ContentTypeDecorator\:true,org.eclipse.ui.ResourceFilterDecorator\:false,org.python.pydev.navigator.decorator.problemsLabelDecorator\:true,
5 | PLUGINS_NOT_ACTIVATED_ON_STARTUP=org.eclipse.m2e.discovery;
6 | eclipse.preferences.version=1
7 | org.eclipse.jface.textfont=1|Monaco|13.0|0|COCOA|1|Monaco;
8 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.python.pydev.prefs:
--------------------------------------------------------------------------------
1 | INTERPRETERS_CHECKED_ONCE=true
2 | eclipse.preferences.version=1
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fdt:
--------------------------------------------------------------------------------
1 |      0.6


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fdx:
--------------------------------------------------------------------------------
1 |           


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fnm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fnm


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.frq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.frq


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.nrm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.nrm


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tii


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tis:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tis


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments.gen


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments_1


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdt


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdx


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fnm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fnm


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.frq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.frq


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.nrm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.nrm


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tii


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tis:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tis


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments.gen


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments_3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments_3


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/server-config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": "v1",
 3 |   "title": "Eclipse.org Error Reporting Server",
 4 |   "description": "Automated Error Reporting for eclipse.org",
 5 |   "timestamp": 1458741415931,
 6 |   "ttl": 20160,
 7 |   "helpUrl": "https://wiki.eclipse.org/EPP/Logging",
 8 |   "feedbackUrl": "https://docs.google.com/a/codetrails.com/forms/d/1wd9AzydLv_TMa7ZBXHO7zQIhZjZCJRNMed-6J4fVNsc/viewform",
 9 |   "aboutUrl": "https://dev.eclipse.org/recommenders/community/confess/#/about",
10 |   "submitUrl": "https://dev.eclipse.org/recommenders/community/confess/0.6/reports/",
11 |   "maxReportSize": 5242880,
12 |   "problemsUrl": "https://www.eclipse.org/downloads/download.php?r\u003d1\u0026file\u003d/technology/epp/logging/problems.zip",
13 |   "problemsTtl": 20160,
14 |   "queryUrl": "https://dev.eclipse.org/recommenders/community/confess/0.6/query/",
15 |   "connectTimeout": 10000,
16 |   "socketTimeout": 100000,
17 |   "acceptedProducts": [
18 |     "org.eclipse.*"
19 |   ],
20 |   "acceptedPlugins": [
21 |     "org.eclipse.*",
22 |     "org.apache.log4j.*",
23 |     "com.codetrails.*"
24 |   ],
25 |   "acceptedPackages": [
26 |     "org.eclipse.*",
27 |     "org.apache.*",
28 |     "java.*",
29 |     "javax.*",
30 |     "javafx.*",
31 |     "sun.*",
32 |     "com.sun.*",
33 |     "com.codetrails.*",
34 |     "com.google.*",
35 |     "org.osgi.*",
36 |     "ch.qos.*",
37 |     "org.slf4j.*"
38 |   ],
39 |   "acceptOtherPackages": true,
40 |   "acceptUiFreezes": true,
41 |   "ignoredStatuses": [
42 |     "org.eclipse.equinox.p2.*::",
43 |     "org.eclipse.epp.mpc.ui:java.io.IOException:",
44 |     "org.eclipse.epp.mpc.ui:java.net.SocketTimeoutException:",
45 |     "org.eclipse.oomph.setup.core:$org.apache.http.ConnectionClosedException:",
46 |     "org.eclipse.ui::Conflicting handlers for*",
47 |     "org.eclipse.jface:java.io.IOException:Unable to resolve plug-in*",
48 |     "org.eclipse.core.runtime::Invalid input url*",
49 |     "org.eclipse.core.filesystem::Could not move*",
50 |     "org.eclipse.core.filesystem::Could not delete*",
51 |     "org.eclipse.pde.core::The current target platform contains errors*",
52 |     ":org.eclipse.equinox.security.storage.StorageException:",
53 |     ":org.eclipse.ecf.filetransfer.*:",
54 |     ":java.net.*:"
55 |   ],
56 |   "problemsZipLastDownloadTimestamp": 1458741427690
57 | }


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.core/assumedExternalFilesCache:
--------------------------------------------------------------------------------
1 |     


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.core/externalFilesCache:
--------------------------------------------------------------------------------
1 |     


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.core/nonChainingJarsCache:
--------------------------------------------------------------------------------
1 |     


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.core/variablesAndContainers.dat:
--------------------------------------------------------------------------------
1 |                JRE_SRC    M2_REPO    
2 | JUNIT_HOME    JRE_SRCROOT    JRE_LIB    JUNIT_SRC_HOME


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.ui/OpenTypeHistory.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <typeInfoHistroy/>
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.ui/QualifiedTypeNameHistory.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <qualifiedTypeNameHistroy/>
3 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.ui/dialog_settings.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <section name="Workbench">
3 | 	<section name="JavaElementSearchActions">
4 | 	</section>
5 | </section>
6 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.m2e.logback.configuration/0.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.m2e.logback.configuration/0.log


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.m2e.logback.configuration/logback.1.6.2.20150902-0002.xml:
--------------------------------------------------------------------------------
 1 | <configuration scan="true">
 2 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 3 |     <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
 4 |       <pattern>%date [%thread] %-5level %logger{35} - %msg%n</pattern>
 5 |     </encoder>
 6 |     <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
 7 |       <level>OFF</level> <!-- change to DEBUG to mimic '-consolelog' behaviour -->
 8 |     </filter>
 9 |   </appender>
10 | 
11 |   <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
12 |     <File>${org.eclipse.m2e.log.dir}/0.log</File>
13 |     <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
14 |       <FileNamePattern>${org.eclipse.m2e.log.dir}/%i.log</FileNamePattern>
15 |       <MinIndex>1</MinIndex>
16 |       <MaxIndex>10</MaxIndex>
17 |     </rollingPolicy>
18 |     <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
19 |       <MaxFileSize>100MB</MaxFileSize>
20 |     </triggeringPolicy>
21 |     <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
22 |       <pattern>%date [%thread] %-5level %logger{35} - %msg%n</pattern>
23 |     </encoder>
24 |   </appender>
25 | 
26 |   <appender name="EclipseLog" class="org.eclipse.m2e.logback.appender.EclipseLogAppender">
27 |     <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
28 |       <level>WARN</level>
29 |     </filter>
30 |   </appender>
31 | 
32 |   <appender name="MavenConsoleLog" class="org.eclipse.m2e.logback.appender.MavenConsoleAppender">
33 |   </appender>
34 |         
35 |   <root level="INFO">
36 |     <appender-ref ref="FILE" />
37 |     <appender-ref ref="STDOUT" />
38 |     <appender-ref ref="EclipseLog" />
39 |     <appender-ref ref="MavenConsoleLog" />
40 |   </root>
41 | 
42 |   <logger name="com.ning.http.client" level="INFO" />
43 | </configuration>
44 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.oomph.setup.ui/dialog_settings.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <section name="Workbench">
 3 | 	<section name="SetupWizard">
 4 | 		<section name="VariablePage">
 5 | 		</section>
 6 | 		<section name="ProgressPage">
 7 | 		</section>
 8 | 		<section name="ConfirmationPage">
 9 | 		</section>
10 | 	</section>
11 | </section>
12 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.oomph.setup/workspace.setup:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <setup:Workspace
3 |     xmi:version="2.0"
4 |     xmlns:xmi="http://www.omg.org/XMI"
5 |     xmlns:setup="http://www.eclipse.org/oomph/setup/1.0"
6 |     name="workspace"/>
7 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.ui.ide/dialog_settings.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <section name="Workbench">
 3 | 	<section name="ExternalProjectImportWizard">
 4 | 		<item value="false" key="WizardProjectsImportPage.STORE_ARCHIVE_SELECTED"/>
 5 | 		<item value="false" key="WizardProjectsImportPage.STORE_COPY_PROJECT_ID"/>
 6 | 		<item value="false" key="WizardProjectsImportPage.STORE_NESTED_PROJECTS"/>
 7 | 		<list key="WizardProjectsImportPage.STORE_DIRECTORIES">
 8 | 			<item value="/Volumes/Data/Doctorate/Implementation/w2vcomp"/>
 9 | 		</list>
10 | 		<list key="WizardProjectsImportPage.STORE_ARCHIVES">
11 | 			<item value=""/>
12 | 		</list>
13 | 	</section>
14 | </section>
15 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.ui.workbench/dialog_settings.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <section name="Workbench">
 3 | 	<section name="org.eclipse.ui.internal.QuickAccess">
 4 | 		<item value="800" key="dialogWidth"/>
 5 | 		<item value="500" key="dialogHeight"/>
 6 | 		<list key="orderedProviders">
 7 | 		</list>
 8 | 		<list key="textArray">
 9 | 		</list>
10 | 		<list key="orderedElements">
11 | 		</list>
12 | 		<list key="textEntries">
13 | 		</list>
14 | 	</section>
15 | 	<section name="ImportExportAction">
16 | 	</section>
17 | </section>
18 | 


--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.ui.workbench/workingsets.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <workingSetManager>
3 | <workingSet aggregate="true" factoryID="org.eclipse.ui.internal.WorkingSetFactory" id="1452512278752_0" label="Window Working Set" name="Aggregate for window 1452512278752"/>
4 | </workingSetManager>


--------------------------------------------------------------------------------
/.metadata/version.ini:
--------------------------------------------------------------------------------
1 | #Wed Mar 23 14:56:49 CET 2016
2 | org.eclipse.core.runtime=2
3 | org.eclipse.platform=4.5.1.v20150904-0015
4 | 


--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>HyperVec</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.python.pydev.PyDevBuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.jdt.core.javabuilder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 		<buildCommand>
19 | 			<name>org.eclipse.m2e.core.maven2Builder</name>
20 | 			<arguments>
21 | 			</arguments>
22 | 		</buildCommand>
23 | 	</buildSpec>
24 | 	<natures>
25 | 		<nature>org.eclipse.m2e.core.maven2Nature</nature>
26 | 		<nature>org.eclipse.jdt.core.javanature</nature>
27 | 		<nature>org.python.pydev.pythonNature</nature>
28 | 	</natures>
29 | </projectDescription>
30 | 


--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
4 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
5 | </pydev_project>
6 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | formatter_profile=_Nghia
3 | formatter_settings_version=12
4 | org.eclipse.jdt.ui.exception.name=e
5 | org.eclipse.jdt.ui.gettersetter.use.is=true
6 | org.eclipse.jdt.ui.keywordthis=false
7 | org.eclipse.jdt.ui.overrideannotation=true
8 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 | 


--------------------------------------------------------------------------------
/HyperVec.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/HyperVec.jar


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## HyperVec
 2 | Hierarchical Embeddings for Hypernymy Detection and Directionality
 3 | 
 4 | ### Prerequisite
 5 |   - [spaCy](https://spacy.io): for parsing, version 2.0.11
 6 |   - a corpus such as wikipedia corpus (plain-text)
 7 | 
 8 | ### Preprocess
 9 |  - Create the feature files:
10 |  
11 |     ```python create_features.py -input corpus-file.txt -output output-file-name -pos pos_tag```
12 |     
13 |     in which: pos_tag is either NN (for the noun features) or VB (for the verb features)     
14 | 
15 | ### Configuration
16 | See the config.cfg to set agruments for model.
17 | 
18 | ### Training embeddings
19 |   ```java -jar HyperVec.jar config.cfg vector-size window-size```
20 |   
21 |   For example, training embeddings with 100 dimensions; window-size = 5:
22 | 
23 |   ```java -jar HyperVec.jar config.cfg 100 5```
24 |   
25 |  ### Pretrained (hypervec) embeddings
26 |  The embeddings used in our paper can be downloaded by using the script in `get-pretrainedHyperVecEmbeddings/download_embeddings.sh`. Note that the script downloads 9 files and concatenates them again to a single file (`hypervec.txt.gz`). The format is the default word2vec format: first line with header information, other lines word followed by whitespace seperated vector.
27 | 
28 | Information about the embeddings: creatd using the ENCOW14A corpus (14.5bn token), 100 dimensions, sym. window of 5, 15 negative samples, 0.025 learning rate, threshhold set to 0.05. The resulting vocabulary contains about 2.7m words. 
29 |   
30 | ### Example usage: Evaluation BLESS,BIBLESS and AWBLESS
31 | To reproduce our experiments from Table 3 use the code in the `datasets_classification/`, 
32 | assuming your vector file is located in the same folder and named `hypervec.txt.gz`. 
33 |   `java -jar eval-dir.jar hypervec.txt.gz` (Evaluate directionality on `BLESS.txt` using hyperscore)
34 |   `java -jar eval-bless.jar hypervec.txt.gz 2 1000` (Evaluate classification on `BIBLESS.txt, AWBLESS.txt` using 2% of the training data and 1000 random iterations)
35 |   
36 | 
37 | ### Citation info
38 | If you use the code or the created feature norms, please [cite our paper (Bibtex)](http://www2.ims.uni-stuttgart.de/bibliographie/entry/2811b00e1bbd503adf28648ddb737132dc67a091/), the paper can be found here: [PDF](http://www.aclweb.org/anthology/D17-1022), the poster from EMNLP can be found here: [Poster](http://www.ims.uni-stuttgart.de/institut/mitarbeiter/koepermn/publications/poster_EMNLP2017.pdf)
39 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/AP_evaluation_code/common.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from numpy import fromstring, dtype
 3 | from numpy.linalg import norm
 4 | 
 5 | def smart_open(fname, mode='rb'):
 6 |     if fname.endswith('.gz'):
 7 |         import gzip
 8 |         return gzip.open(fname, mode)
 9 |     elif fname.endswith('.bz2'):
10 |         import bz2
11 |         return bz2.BZ2File(fname, mode)
12 |     else:
13 |         return open(fname, mode)
14 | 
15 | def load_vecs(binary_file, binary=1):
16 |     vecs = []
17 |     vocab = []
18 |     if binary==1:
19 |         with smart_open(binary_file, 'rb') as f:
20 |             header = to_unicode(f.readline())
21 |             vocab_size, vector_size = map(int, header.split())
22 |             binary_len = dtype(np.float32).itemsize * vector_size
23 |             for _ in range(vocab_size):
24 |                 word = []
25 |                 while True:
26 |                     ch = f.read(1)
27 |                     if ch == b' ':
28 |                         break
29 |                     if ch != b'\n':
30 |                         word.append(ch)
31 |                 word = to_unicode(b''.join(word))
32 |                 vocab.append(word)
33 |                 vec = fromstring(f.read(binary_len), dtype=np.float32)
34 |                 vecs.append(vec)
35 |     else:
36 |         with smart_open(binary_file, 'rb') as f:
37 |             header = to_unicode(f.readline())
38 |             if len(header.split()) == 2: vocab_size, vector_size = map(int, header.split())
39 |             elif len(header.split()) > 2:
40 |                 parts = header.rstrip().split(" ")
41 |                 word, vec = parts[0], list(map(np.float32, parts[1:]))
42 |                 vocab.append(to_unicode(word))
43 |                 vecs.append(vec)
44 |             for _, line in enumerate(f):
45 |                 parts = to_unicode(line.rstrip()).split(" ")
46 |                 word, vec = parts[0], list(map(np.float32, parts[1:]))
47 |                 vocab.append(to_unicode(word))
48 |                 vecs.append(vec)
49 |     #embs_dim = len(vecs[1])   
50 |     #UNKNOWN_WORD = np.random.uniform(-0.25,0.25,embs_dim)
51 |     #vecs = np.vstack((UNKNOWN_WORD, vecs))               
52 |     #vocab = ['#UNKNOWN#'] + list(vocab)
53 |     #words = {word:idx for idx,word in enumerate(vocab)}
54 |     
55 |     return vecs, vocab
56 | 
57 | def to_utf8(text, errors='strict', encoding='utf8'):
58 |     """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8."""
59 |     if isinstance(text, unicode):
60 |         return text.encode('utf8')
61 |     # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
62 |     else:
63 |         return unicode(text, encoding, errors=errors).encode('utf8')
64 | 
65 | def to_unicode(text, encoding='utf8', errors='strict'):
66 |     """Convert a string (bytestring in `encoding` or unicode), to unicode."""
67 |     if isinstance(text, unicode):
68 |         return text
69 |     else:
70 |         return unicode(text, encoding=encoding, errors=errors)


--------------------------------------------------------------------------------
/code_mapping_across_languages/AP_evaluation_code/test_default.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import sys
 3 | from sklearn.metrics import average_precision_score
 4 | from numpy.linalg import norm
 5 | import common
 6 | 
 7 | def cosine_sim(u, v):
 8 |     return np.dot(u,v)/(norm(u)*norm(v))
 9 | 
10 | def computeAP(targets, preds):
11 |     paired = zip(preds, targets)
12 |     sorted_paired = sorted(paired, key=lambda x:x[0], reverse=True)
13 |     preds, targets = zip(*sorted_paired)
14 |     preds, targets = list(preds), list(targets)
15 |     
16 |     ap = 0.0
17 |     retrievedCounter = 0.0;
18 |     relevantCounter = 0.0;
19 | 
20 |     for i in range(len(targets)):
21 |         retrievedCounter += 1
22 |         if int(targets[i]) == 1:
23 |             relevantCounter += 1
24 |             ap += relevantCounter / retrievedCounter
25 |     ap /= relevantCounter
26 |     return ap
27 | 
28 | def _filter(word):
29 |     word = word.split('-')
30 |     if len(word) > 2:
31 |         f_word = '-'.join(word[:-1])
32 |     else:
33 |         f_word = word[0]
34 |     return f_word
35 | 
36 | def load_dataset(dataset_file):
37 |     dataset = []
38 |     with open(dataset_file, 'r') as fin:
39 |         for line in fin:
40 |             left, right, label = line.strip().split('\t')
41 |             dataset.append((left, right, int(label)))
42 |     return dataset
43 | 
44 | def compute_similarity(dataset, embs):
45 |     data = []
46 |     for (left, right, label) in dataset:
47 |         if left in embs and right in embs:
48 |             #direct = norm(embs[right]) / norm(embs[left])
49 |             score = cosine_sim(embs[left], embs[right]) #* direct
50 |             data.append((left, right, label, score))
51 |         else:
52 |             continue
53 |     return data
54 | 
55 | def build_data(dataset_file, embeddings_file):
56 |     vecs, words = common.load_vecs(embeddings_file, binary=1) #TODO: set binary=0 to read text file
57 |     embs = {word:vecs[idx] for idx,word in enumerate(words)}
58 |     dataset = load_dataset(dataset_file)
59 |     data = compute_similarity(dataset, embs)
60 |     
61 |     return data
62 | 
63 | def ap_evaluation(data, cutoff=-1):
64 |     
65 |     data = sorted(data, key=lambda line:line[-1], reverse=True)
66 |     targets, scores = [], []
67 |     for (left, right, label, score) in data:
68 |         targets.append(label)
69 |         scores.append(score)
70 |     if cutoff > 0:
71 |         ap_score = average_precision_score(targets[:cutoff], scores[:cutoff])
72 |         #ap_score = computeAP(targets, scores)
73 |         print 'AP at %d cutoff: %f' %(cutoff, ap_score)
74 |     else:
75 |         ap_score = average_precision_score(targets, scores)
76 |         #ap_score = computeAP(targets, scores)
77 |         print 'AP score: %f' %ap_score     
78 |     
79 |     return ap_score
80 | 
81 | if __name__=='__main__':
82 |     dataset_file = sys.argv[1]
83 |     embeddings_file = sys.argv[2]
84 |     data = build_data(dataset_file, embeddings_file)
85 |     ap_evaluation(data)
86 | 
87 | 
88 | 
89 | 
90 | 
91 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/AP_evaluation_code/test_norm.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import sys
 3 | from sklearn.metrics import average_precision_score
 4 | from numpy.linalg import norm
 5 | import common
 6 | 
 7 | def cosine_sim(u, v):
 8 |     return np.dot(u,v)/(norm(u)*norm(v))
 9 | 
10 | def computeAP(targets, preds):
11 |     paired = zip(preds, targets)
12 |     sorted_paired = sorted(paired, key=lambda x:x[0], reverse=True)
13 |     preds, targets = zip(*sorted_paired)
14 |     preds, targets = list(preds), list(targets)
15 |     
16 |     ap = 0.0
17 |     retrievedCounter = 0.0;
18 |     relevantCounter = 0.0;
19 | 
20 |     for i in range(len(targets)):
21 |         retrievedCounter += 1
22 |         if int(targets[i]) == 1:
23 |             relevantCounter += 1
24 |             ap += relevantCounter / retrievedCounter
25 |     ap /= relevantCounter
26 |     return ap
27 | 
28 | def _filter(word):
29 |     word = word.split('-')
30 |     if len(word) > 2:
31 |         f_word = '-'.join(word[:-1])
32 |     else:
33 |         f_word = word[0]
34 |     return f_word
35 | 
36 | def load_dataset(dataset_file):
37 |     dataset = []
38 |     with open(dataset_file, 'r') as fin:
39 |         for line in fin:
40 |             left, right, label = line.strip().split('\t')
41 |             dataset.append((left, right, int(label)))
42 |     return dataset
43 | 
44 | def compute_similarity(dataset, embs):
45 |     data = []
46 |     for (left, right, label) in dataset:
47 |         if left in embs and right in embs:
48 |             direct = norm(embs[right]) / norm(embs[left])
49 |             score = cosine_sim(embs[left], embs[right]) * direct
50 |             data.append((left, right, label, score))
51 |         else:
52 |             continue
53 |     return data
54 | 
55 | def build_data(dataset_file, embeddings_file):
56 |     vecs, words = common.load_vecs(embeddings_file, binary=1) #TODO: set binary=0 to read text file
57 |     embs = {word:vecs[idx] for idx,word in enumerate(words)}
58 |     dataset = load_dataset(dataset_file)
59 |     data = compute_similarity(dataset, embs)
60 |     
61 |     return data
62 | 
63 | def ap_evaluation(data, cutoff=-1):
64 |     
65 |     data = sorted(data, key=lambda line:line[-1], reverse=True)
66 |     targets, scores = [], []
67 |     for (left, right, label, score) in data:
68 |         targets.append(label)
69 |         scores.append(score)
70 |     if cutoff > 0:
71 |         ap_score = average_precision_score(targets[:cutoff], scores[:cutoff])
72 |         #ap_score = computeAP(targets, scores)
73 |         print 'AP at %d cutoff: %f' %(cutoff, ap_score)
74 |     else:
75 |         ap_score = average_precision_score(targets, scores)
76 |         #ap_score = computeAP(targets, scores)
77 |         print 'AP score: %f' %ap_score     
78 |     
79 |     return ap_score
80 | 
81 | if __name__=='__main__':
82 |     dataset_file = sys.argv[1]
83 |     embeddings_file = sys.argv[2]
84 |     data = build_data(dataset_file, embeddings_file)
85 |     ap_evaluation(data)
86 | 
87 | 
88 | 
89 | 
90 | 
91 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/convert_w2vTXT_to_w2vBIN.py:
--------------------------------------------------------------------------------
1 | from gensim.models import word2vec
2 | import sys
3 | 
4 | # Script that converts word2vec txtfile into word2vec binary
5 | print ("Script name: %s" % str(sys.argv[1]))
6 | model = word2vec.Word2Vec.load_word2vec_format(str(sys.argv[1]),binary=False)
7 | model.save_word2vec_format(str(sys.argv[1])+'.bin',binary=True)
8 | 
9 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/credits_to_CLIC_trento.txt:
--------------------------------------------------------------------------------
1 | A huge part of this code is taken from an implementation that used to be available at http://clic.cimec.unitn.it and was also used for the paper 'Improving zero-shot learning by mitigating the hubness problem' by  Georgiana Dinu, Angeliki Lazaridou, Marco Baroni https://arxiv.org/pdf/1412.6568.pdf.  


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/demo.sh~:
--------------------------------------------------------------------------------
 1 | echo "Training..."
 2 | 
 3 | python train_tm.py -o tm data/OPUS_en_it_europarl_train_5K.txt data/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl data/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl 
 4 | 
 5 | 
 6 | echo "Testing standard NN retrieval (baseline)"
 7 | 
 8 | python -c 5000 test_tm.py tm.pkl data/OPUS_en_it_europarl_test.txt data/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl data/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl 
 9 | 
10 | 
11 | 
12 | echo "Testing GC retrieval with 5000 aditional elements"
13 | 
14 | python -c 5000 test_tm.py tm.pkl data/OPUS_en_it_europarl_test.txt data/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl data/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl 
15 | 
16 | 
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/learn_mat.sh:
--------------------------------------------------------------------------------
 1 | echo "Training..."
 2 | 
 3 | 
 4 | for i in 0 1 2 3 4 5 6 7 8 9
 5 | 	do
 6 | 	python ../train_tm.py -o trainmat_${i} align.train.-${i} encow5.ppmi.train-${i} GNet_img_avg.train.-${i} &&
 7 | 	python ../train_tm.py -o testmat_${i} align.test.-${i} encow5.ppmi.test-${i} GNet_img_avg.test.-${i} 
 8 | 	done;
 9 | 
10 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/space.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | class Space(object):
 4 | 
 5 |     def __init__(self, matrix_, id2row_):
 6 | 
 7 |         self.mat = matrix_
 8 |         self.id2row = id2row_
 9 |         self.create_row2id()
10 | 
11 |     def create_row2id(self):
12 |         self.row2id = {}
13 |         for idx, word in enumerate(self.id2row):
14 |             if word in self.row2id:
15 |                 raise ValueError("Found duplicate word: %s" % (word))
16 |             self.row2id[word] = idx
17 | 
18 | 
19 |     @classmethod
20 |     def build(cls, fname, lexicon=None):
21 | 
22 |         #if lexicon is provided, only data occurring in the lexicon is loaded
23 |         id2row = []
24 |         def filter_lines(f):
25 |             for i,line in enumerate(f):
26 |                 word = line.split()[0]
27 |                 if i != 0 and (lexicon is None or word in lexicon):
28 |                     id2row.append(word)
29 |                     yield line
30 | 
31 |         #get the number of columns
32 |         with open(fname) as f:
33 |             f.readline()
34 |             ncols = len(f.readline().split())
35 | 
36 |         with open(fname) as f:
37 |             m = np.matrix(np.loadtxt(filter_lines(f),
38 |                           comments=None, usecols=range(1,ncols)))
39 | 
40 |         return Space(m, id2row)
41 | 
42 |     def normalize(self):
43 |         row_norms = np.sqrt(np.multiply(self.mat, self.mat).sum(1))
44 |         row_norms = row_norms.astype(np.double)
45 |         row_norms[row_norms != 0] = np.array(1.0/row_norms[row_norms != 0]).flatten()
46 |         self.mat = np.multiply(self.mat, row_norms)
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/space.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/mappingcode/space.pyc


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/test_tm.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import getopt
  3 | import numpy as np
  4 | import collections
  5 | import random
  6 | from space import Space
  7 | from utils import read_dict, apply_tm, score, get_valid_data
  8 | 
  9 | def usage(errno=0):
 10 |     print >>sys.stderr,\
 11 |     """
 12 |     Given a translation matrix, test data (words and their translations) and 
 13 |     source and target language vectors, it returns translations of source test 
 14 |     words and computes Top N accuracy.
 15 | 
 16 |     Usage:
 17 |     python test_tm.py [options] trans_matrix test_data source_vecs target_vecs
 18 |     \n\
 19 |     Options:
 20 |     -o --output <file>: file prefix. It prints the vectors obtained after 
 21 |                         the translation matrix is applied (.vecs.txt and .wds.txt).
 22 |                         Optional. Default is ./translated_vecs
 23 |     -c --correction <int>: Number of additional elements (ADDITIONAL TO TEST DATA)
 24 |                          to be used with Global Correction (GC) strategy. 
 25 |                          Optional. Default, baseline retrieval is run.
 26 |                           
 27 |     -h --help : help
 28 | 
 29 |     Arguments:
 30 |     trans_matrix: <file>, translation matrix
 31 |     test_data: <file>, list of source-target word pairs (space separated words, 
 32 |                 one word pair per line)
 33 |     source_vecs: <file>, vectors in source language, Space-separated, with string 
 34 |                 identifier as first column (dim+1 columns, where dim is the 
 35 |                 dimensionality of the space)
 36 |     target_vecs: <file>, vectors in target language
 37 | 
 38 | 
 39 |     Example:
 40 |     1) Retrieve translations with standard nearest neighbour retrieval
 41 | 
 42 |     python test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
 43 |     
 44 |     2) "Corrected" retrieval (GC). Use additional 2000 source space elements to 
 45 |     correct for hubs (words that appear as the nearest neighbours of many points))
 46 | 
 47 |     python -c 2000 test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
 48 | 
 49 |     """
 50 |     sys.exit(errno)
 51 | 
 52 | 
 53 | def main(sys_argv):
 54 | 
 55 |     try:
 56 |         opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
 57 |                                    ["help", "output=", "correction="])
 58 |     except getopt.GetoptError, err:
 59 |         print str(err)
 60 |         usage()
 61 |         sys.exit(1)
 62 | 
 63 |     out_file = "./translated_vecs"
 64 |     additional = None
 65 |     for opt, val in opts:
 66 |         if opt in ("-o", "--ouput"):
 67 |             out_file = val
 68 |         if opt in ("-c", "--correction"):
 69 |             try:
 70 |                 additional = int(val)
 71 |             except ValueError:
 72 |                 usage(1)
 73 |         elif opt in ("-h", "--help"):
 74 |             usage(0)
 75 |         else:
 76 |             usage(1)
 77 | 
 78 |     if len(argv) == 4:
 79 |         tm_file = argv[0] 
 80 |         test_file = argv[1]
 81 |         source_file = argv[2]	
 82 | 	target_file = argv[3]
 83 | 
 84 |     else:
 85 | 	print str(err)
 86 | 	usage(1)
 87 | 
 88 |     print "Loading the translation matrix"
 89 |     tm = np.loadtxt(tm_file)
 90 | 
 91 |     print "Reading the test data"
 92 |     test_data = read_dict(test_file)
 93 | 
 94 |     #in the _source_ space, we only need to load vectors for the words in test.
 95 |     #semantic spaces may contain additional words, ALL words in the _target_ 
 96 |     #space are used as the search space
 97 |     source_words, _ = zip(*test_data)
 98 |     source_words = set(source_words)
 99 | 
100 |     print "Reading: %s" % source_file
101 |     if not additional:
102 |         source_sp = Space.build(source_file, source_words)
103 |     else:
104 |         #read all the words in the space
105 |         lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, 
106 |                                     comments=None, usecols=(0,)).flatten())
107 |         #the max number of additional+test elements is bounded by the size 
108 |         #of the lexicon
109 |         additional = min(additional, len(lexicon) - len(source_words))
110 |         #we sample additional elements that are not already in source_words
111 |         random.seed(100)
112 |         lexicon = random.sample(list(lexicon.difference(source_words)), additional)
113 |         
114 |         #load the source space
115 |         source_sp = Space.build(source_file, source_words.union(set(lexicon)))
116 |     
117 |     source_sp.normalize()
118 | 
119 |     print "Reading: %s" % target_file
120 |     target_sp = Space.build(target_file)
121 |     target_sp.normalize()
122 | 
123 |     print "Translating" #translates all the elements loaded in the source space
124 |     mapped_source_sp = apply_tm(source_sp, tm)
125 |     
126 |     print "Retrieving translations"
127 |     test_data = get_valid_data(source_sp, target_sp, test_data)
128 | 
129 |     #turn test data into a dictionary (a word can have mutiple translation)
130 |     gold = collections.defaultdict(set)
131 |     for k, v in test_data:
132 |         gold[k].add(v)
133 | 
134 |     score(mapped_source_sp, target_sp, gold, additional)
135 | 
136 |     print "Printing mapped vectors: %s" % out_file
137 |     np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
138 |     np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
139 | 
140 | if __name__ == '__main__':
141 |     main(sys.argv)
142 | 
143 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/test_tm2.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import getopt
  3 | import numpy as np
  4 | import collections
  5 | import random
  6 | from space import Space
  7 | from utils import read_dict, apply_tm, score, get_valid_data
  8 | 
  9 | def usage(errno=0):
 10 |     print >>sys.stderr,\
 11 |     """
 12 |     Given a translation matrix, test data (words and their translations) and 
 13 |     source and target language vectors, it returns translations of source test 
 14 |     words and computes Top N accuracy.
 15 | 
 16 |     Usage:
 17 |     python test_tm.py [options] trans_matrix test_data source_vecs target_vecs
 18 |     \n\
 19 |     Options:
 20 |     -o --output <file>: file prefix. It prints the vectors obtained after 
 21 |                         the translation matrix is applied (.vecs.txt and .wds.txt).
 22 |                         Optional. Default is ./translated_vecs
 23 |     -c --correction <int>: Number of additional elements (ADDITIONAL TO TEST DATA)
 24 |                          to be used with Global Correction (GC) strategy. 
 25 |                          Optional. Default, baseline retrieval is run.
 26 |                           
 27 |     -h --help : help
 28 | 
 29 |     Arguments:
 30 |     trans_matrix: <file>, translation matrix
 31 |     test_data: <file>, list of source-target word pairs (space separated words, 
 32 |                 one word pair per line)
 33 |     source_vecs: <file>, vectors in source language, Space-separated, with string 
 34 |                 identifier as first column (dim+1 columns, where dim is the 
 35 |                 dimensionality of the space)
 36 |     target_vecs: <file>, vectors in target language
 37 | 
 38 | 
 39 |     Example:
 40 |     1) Retrieve translations with standard nearest neighbour retrieval
 41 | 
 42 |     python test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
 43 |     
 44 |     2) "Corrected" retrieval (GC). Use additional 2000 source space elements to 
 45 |     correct for hubs (words that appear as the nearest neighbours of many points))
 46 | 
 47 |     python -c 2000 test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
 48 | 
 49 |     """
 50 |     sys.exit(errno)
 51 | 
 52 | 
 53 | def main(sys_argv):
 54 | 
 55 |     try:
 56 |         opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
 57 |                                    ["help", "output=", "correction="])
 58 |     except getopt.GetoptError, err:
 59 |         print str(err)
 60 |         usage()
 61 |         sys.exit(1)
 62 | 
 63 |     out_file = "./translated_vecs2"
 64 |     additional = None
 65 |     for opt, val in opts:
 66 |         if opt in ("-o", "--ouput"):
 67 |             out_file = val
 68 |         if opt in ("-c", "--correction"):
 69 |             try:
 70 |                 additional = int(val)
 71 |             except ValueError:
 72 |                 usage(1)
 73 |         elif opt in ("-h", "--help"):
 74 |             usage(0)
 75 |         else:
 76 |             usage(1)
 77 | 
 78 |     if len(argv) == 4:
 79 |         tm_file = argv[0] 
 80 |         test_file = argv[1]
 81 |         source_file = argv[2]	
 82 | 	target_file = argv[3]
 83 | 
 84 |     else:
 85 | 	print str(err)
 86 | 	usage(1)
 87 | 
 88 |     print "Loading the translation matrix"
 89 |     tm = np.loadtxt(tm_file)
 90 | 
 91 |     print "Reading the test data"
 92 |     test_data = read_dict(test_file)
 93 | 
 94 |     #in the _source_ space, we only need to load vectors for the words in test.
 95 |     #semantic spaces may contain additional words, ALL words in the _target_ 
 96 |     #space are used as the search space
 97 |     source_words, _ = zip(*test_data)
 98 |     source_words = set(source_words)
 99 | 
100 |     print "Reading: %s" % source_file
101 |     if not additional:
102 |         source_sp = Space.build(source_file, source_words)
103 |     else:
104 |         #read all the words in the space
105 |         lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, 
106 |                                     comments=None, usecols=(0,)).flatten())
107 |         #the max number of additional+test elements is bounded by the size 
108 |         #of the lexicon
109 |         additional = min(additional, len(lexicon) - len(source_words))
110 |         #we sample additional elements that are not already in source_words
111 |         random.seed(100)
112 |         lexicon = random.sample(list(lexicon.difference(source_words)), additional)
113 |         
114 |         #load the source space
115 |         source_sp = Space.build(source_file, source_words.union(set(lexicon)))
116 |     
117 |     source_sp.normalize()
118 | 
119 |     print "Reading: %s" % target_file
120 |     target_sp = Space.build(target_file)
121 |     target_sp.normalize()
122 | 
123 |     print "Translating" #translates all the elements loaded in the source space
124 |     mapped_source_sp = apply_tm(source_sp, tm)
125 |     
126 |     print "Retrieving translations"
127 |     test_data = get_valid_data(source_sp, target_sp, test_data)
128 | 
129 |     #turn test data into a dictionary (a word can have mutiple translation)
130 |     gold = collections.defaultdict(set)
131 |     for k, v in test_data:
132 |         gold[k].add(v)
133 | 
134 |     score(mapped_source_sp, target_sp, gold, additional)
135 | 
136 |     print "Printing mapped vectors: %s" % out_file
137 |     np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
138 |     np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
139 | 
140 | if __name__ == '__main__':
141 |     main(sys.argv)
142 | 
143 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/test_tm_pred.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import getopt
  3 | import numpy as np
  4 | import collections
  5 | import random
  6 | from space import Space
  7 | from utils import read_dict, apply_tm, score, get_valid_data
  8 | 
  9 | def usage(errno=0):
 10 |     print >>sys.stderr,\
 11 |     """
 12 |     Given a translation matrix, test data (words and their translations) and 
 13 |     source and target language vectors, it returns translations of source test 
 14 |     words and computes Top N accuracy.
 15 | 
 16 |     Usage:
 17 |     python test_tm.py [options] trans_matrix test_data source_vecs target_vecs
 18 |     \n\
 19 |     Options:
 20 |     -o --output <file>: file prefix. It prints the vectors obtained after 
 21 |                         the translation matrix is applied (.vecs.txt and .wds.txt).
 22 |                         Optional. Default is ./translated_vecs
 23 |     -c --correction <int>: Number of additional elements (ADDITIONAL TO TEST DATA)
 24 |                          to be used with Global Correction (GC) strategy. 
 25 |                          Optional. Default, baseline retrieval is run.
 26 |                           
 27 |     -h --help : help
 28 | 
 29 |     Arguments:
 30 |     trans_matrix: <file>, translation matrix
 31 |     test_data: <file>, list of source-target word pairs (space separated words, 
 32 |                 one word pair per line)
 33 |     source_vecs: <file>, vectors in source language, Space-separated, with string 
 34 |                 identifier as first column (dim+1 columns, where dim is the 
 35 |                 dimensionality of the space)
 36 |     target_vecs: <file>, vectors in target language
 37 | 
 38 | 
 39 |     Example:
 40 |     1) Retrieve translations with standard nearest neighbour retrieval
 41 | 
 42 |     python test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
 43 |     
 44 |     2) "Corrected" retrieval (GC). Use additional 2000 source space elements to 
 45 |     correct for hubs (words that appear as the nearest neighbours of many points))
 46 | 
 47 |     python -c 2000 test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
 48 | 
 49 |     """
 50 |     sys.exit(errno)
 51 | 
 52 | 
 53 | def main(sys_argv):
 54 | 
 55 |     try:
 56 |         opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
 57 |                                    ["help", "output=", "correction="])
 58 |     except getopt.GetoptError, err:
 59 |         print str(err)
 60 |         usage()
 61 |         sys.exit(1)
 62 | 
 63 |     out_file = "./translated_vecs"
 64 |     additional = None
 65 |     for opt, val in opts:
 66 |         if opt in ("-o", "--ouput"):
 67 |             out_file = val
 68 |         if opt in ("-c", "--correction"):
 69 |             try:
 70 |                 additional = int(val)
 71 |             except ValueError:
 72 |                 usage(1)
 73 |         elif opt in ("-h", "--help"):
 74 |             usage(0)
 75 |         else:
 76 |             usage(1)
 77 | 
 78 |     if len(argv) == 4:
 79 |         tm_file = argv[0] 
 80 |         test_file = argv[1]
 81 |         source_file = argv[2]	
 82 | 	target_file = argv[3]
 83 | 
 84 |     else:
 85 | 	print str(err)
 86 | 	usage(1)
 87 | 
 88 |     print "Loading the translation matrix"
 89 |     tm = np.loadtxt(tm_file)
 90 | 
 91 |     print "Reading the test data"
 92 |     test_data = read_dict(test_file)
 93 | 
 94 |     #in the _source_ space, we only need to load vectors for the words in test.
 95 |     #semantic spaces may contain additional words, ALL words in the _target_ 
 96 |     #space are used as the search space
 97 |     source_words, _ = zip(*test_data)
 98 |     source_words = set(source_words)
 99 | 
100 |     print "Reading: %s" % source_file
101 |     if not additional:
102 |         source_sp = Space.build(source_file, source_words)
103 |     else:
104 |         #read all the words in the space
105 |         lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, 
106 |                                     comments=None, usecols=(0,)).flatten())
107 |         #the max number of additional+test elements is bounded by the size 
108 |         #of the lexicon
109 |         additional = min(additional, len(lexicon) - len(source_words))
110 |         #we sample additional elements that are not already in source_words
111 |         random.seed(100)
112 |         lexicon = random.sample(list(lexicon.difference(source_words)), additional)
113 |         
114 |         #load the source space
115 |         source_sp = Space.build(source_file, source_words.union(set(lexicon)))
116 |     
117 |     source_sp.normalize()
118 | 
119 |     print "Reading: %s" % target_file
120 |     target_sp = Space.build(target_file)
121 |     target_sp.normalize()
122 | 
123 |     print "Translating" #translates all the elements loaded in the source space
124 |     mapped_source_sp = apply_tm(source_sp, tm)
125 |     
126 |     print "Retrieving translations"
127 |     test_data = get_valid_data(source_sp, target_sp, test_data)
128 | 
129 |     #turn test data into a dictionary (a word can have mutiple translation)
130 |     #gold = collections.defaultdict(set)
131 |     #for k, v in test_data:
132 |     #    gold[k].add(v)
133 | 
134 |     #score(mapped_source_sp, target_sp, gold, additional)
135 | 
136 |     print "Printing mapped vectors: %s" % out_file
137 |     np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
138 |     np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
139 | 
140 | if __name__ == '__main__':
141 |     main(sys.argv)
142 | 
143 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/train_tm.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import getopt
 3 | import numpy as np
 4 | from space import Space
 5 | from utils import read_dict, train_tm
 6 | 
 7 | def usage(errno=0):
 8 |     print >>sys.stderr,\
 9 |     """
10 |     Given train data (pairs of words and their translation), source language and 
11 |     target language vectors, it outputs a translation matrix between source and 
12 |     target spaces.
13 | 
14 |     Usage:
15 |     python train_tm.py [options] train_data source_vecs target_vecs 
16 |     \n\
17 |     Options:
18 |     -o --output <file>: output file prefix. Optional. Default is ./tm
19 |     -h --help : help
20 | 
21 |     Arguments:
22 |     train_data: <file>, train dictionary, list of word pairs (space separated words, 
23 |             one word pair per line)
24 |     source_vecs: <file>, vectors in source language. Space-separated, with string 
25 |                 identifier as first column (dim+1 columns, where dim is the dimensionality
26 |                 of the space)
27 |     target_vecs: <file>, vectors in target language
28 | 
29 | 
30 |     Example:
31 |     python train_tm.py train_data.txt ENspace.pkl ITspace.pkl
32 | 
33 |     """
34 |     sys.exit(errno)
35 | 
36 | 
37 | def main(sys_argv):
38 | 
39 |     try:
40 |         opts, argv = getopt.getopt(sys_argv[1:], "ho:",
41 |                                    ["help", "output="])
42 |     except getopt.GetoptError, err:
43 |         print str(err)
44 |         usage()
45 |         sys.exit(1)
46 | 
47 |     out_file = "./tm"
48 |     for opt, val in opts:
49 |         if opt in ("-o", "--output"):
50 |             out_file = val
51 |         elif opt in ("-h", "--help"):
52 |             usage(0)
53 |         else:
54 |             usage(1)
55 | 
56 |     if len(argv) == 3:
57 |         source_file = argv[1]	
58 |         target_file = argv[2]
59 | 	dict_file = argv[0]
60 |     else:
61 | 	print str(err)
62 | 	usage(1)
63 | 
64 | 
65 |     print "Reading the training data"
66 |     train_data = read_dict(dict_file)
67 | 
68 |     #we only need to load the vectors for the words in the training data
69 |     #semantic spaces contain additional words
70 |     source_words, target_words = zip(*train_data)
71 | 
72 |     print "Reading: %s" % source_file
73 |     source_sp = Space.build(source_file, set(source_words))
74 |     source_sp.normalize()
75 | 
76 |     print "Reading: %s" % target_file
77 |     target_sp = Space.build(target_file, set(target_words))
78 |     target_sp.normalize()
79 | 
80 |     print "Learning the translation matrix"
81 |     tm = train_tm(source_sp, target_sp, train_data)
82 | 
83 |     print "Printing the translation matrix"
84 |     np.savetxt("%s.txt" % out_file, tm)
85 | 
86 | 
87 | if __name__ == '__main__':
88 |     main(sys.argv)
89 | 
90 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/translate_tm.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import getopt
  3 | import numpy as np
  4 | import collections
  5 | import random
  6 | from space import Space
  7 | from utils import read_dict, apply_tm, score, get_valid_data
  8 | 
  9 | def usage(errno=0):
 10 |     print >>sys.stderr,\
 11 |     """
 12 |     Given a translation matrix, test data (words and their translations) and 
 13 |     source and target language vectors, it returns translations of source test 
 14 |     words and computes Top N accuracy.
 15 | 
 16 |     Usage:
 17 |     python test_tm.py [options] trans_matrix test_data source_vecs target_vecs
 18 |     \n\
 19 |     Options:
 20 |     -o --output <file>: file prefix. It prints the vectors obtained after 
 21 |                         the translation matrix is applied (.vecs.txt and .wds.txt).
 22 |                         Optional. Default is ./translated_vecs
 23 |     -c --correction <int>: Number of additional elements (ADDITIONAL TO TEST DATA)
 24 |                          to be used with Global Correction (GC) strategy. 
 25 |                          Optional. Default, baseline retrieval is run.
 26 |                           
 27 |     -h --help : help
 28 | 
 29 |     Arguments:
 30 |     trans_matrix: <file>, translation matrix
 31 |     test_data: <file>, list of source-target word pairs (space separated words, 
 32 |                 one word pair per line)
 33 |     source_vecs: <file>, vectors in source language, Space-separated, with string 
 34 |                 identifier as first column (dim+1 columns, where dim is the 
 35 |                 dimensionality of the space)
 36 |     target_vecs: <file>, vectors in target language
 37 | 
 38 | 
 39 |     Example:
 40 |     1) Retrieve translations with standard nearest neighbour retrieval
 41 | 
 42 |     python test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
 43 |     
 44 |     2) "Corrected" retrieval (GC). Use additional 2000 source space elements to 
 45 |     correct for hubs (words that appear as the nearest neighbours of many points))
 46 | 
 47 |     python -c 2000 test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
 48 | 
 49 |     """
 50 |     sys.exit(errno)
 51 | 
 52 | 
 53 | def main(sys_argv):
 54 | 
 55 |     try:
 56 |         opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
 57 |                                    ["help", "output=", "correction="])
 58 |     except getopt.GetoptError, err:
 59 |         print str(err)
 60 |         usage()
 61 |         sys.exit(1)
 62 | 
 63 |     out_file = "./translated_vecs"
 64 |     additional = None
 65 |     for opt, val in opts:
 66 |         if opt in ("-o", "--ouput"):
 67 |             out_file = val
 68 |         if opt in ("-c", "--correction"):
 69 |             try:
 70 |                 additional = int(val)
 71 |             except ValueError:
 72 |                 usage(1)
 73 |         elif opt in ("-h", "--help"):
 74 |             usage(0)
 75 |         else:
 76 |             usage(1)
 77 | 
 78 |     if len(argv) == 4:
 79 |         tm_file = argv[0] 
 80 |         test_file = argv[1]
 81 |         source_file = argv[2]	
 82 | 	target_file = argv[3]
 83 | 
 84 |     else:
 85 | 	print str(err)
 86 | 	usage(1)
 87 | 
 88 |     print "Loading the translation matrix"
 89 |     tm = np.loadtxt(tm_file)
 90 | 
 91 |     print "Reading the test data"
 92 |     test_data = read_dict(test_file)
 93 | 
 94 |     #in the _source_ space, we only need to load vectors for the words in test.
 95 |     #semantic spaces may contain additional words, ALL words in the _target_ 
 96 |     #space are used as the search space
 97 |     source_words, _ = zip(*test_data)
 98 |     source_words = set(source_words)
 99 | 
100 |     print "Reading: %s" % source_file
101 |     if not additional:
102 |         source_sp = Space.build(source_file, source_words)
103 |     else:
104 |         #read all the words in the space
105 |         lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, 
106 |                                     comments=None, usecols=(0,)).flatten())
107 |         #the max number of additional+test elements is bounded by the size 
108 |         #of the lexicon
109 |         additional = min(additional, len(lexicon) - len(source_words))
110 |         #we sample additional elements that are not already in source_words
111 |         random.seed(100)
112 |         lexicon = random.sample(list(lexicon.difference(source_words)), additional)
113 |         
114 |         #load the source space
115 |         source_sp = Space.build(source_file, source_words.union(set(lexicon)))
116 |     
117 |     source_sp.normalize()
118 | 
119 |     print "Reading: %s" % target_file
120 |     target_sp = Space.build(target_file)
121 |     target_sp.normalize()
122 | 
123 |     print "Translating" #translates all the elements loaded in the source space
124 |     mapped_source_sp = apply_tm(source_sp, tm)
125 | 
126 | 
127 |     print "Printing mapped vectors: %s" % out_file
128 |     np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
129 |     np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
130 | 
131 | if __name__ == '__main__':
132 |     main(sys.argv)
133 | 
134 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/utils.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import collections
 3 | from space import Space
 4 | 
 5 | 
 6 | def prec_at(ranks, cut):
 7 |     return len([r for r in ranks if r <= cut])/float(len(ranks))
 8 | 
 9 | def get_rank(nn, gold):
10 |     for idx,word in enumerate(nn):
11 |         if word in gold:
12 |             return idx + 1
13 |     return idx + 1
14 | 
15 |         
16 | def read_dict(dict_file):
17 |     return [tuple(line.strip().split()) for line in file(dict_file)]
18 | 
19 | 
20 | def apply_tm(sp, tm):
21 |     
22 |     print "Applying the translation matrix, size of data: %d" % sp.mat.shape[0] 
23 |     return Space(sp.mat*tm, sp.id2row)
24 |     
25 | def get_valid_data(sp1, sp2, data):
26 |     return [(el1, el2) for el1,el2 in data if 
27 |             el1 in sp1.row2id and el2 in sp2.row2id]
28 | 
29 | def train_tm(sp1, sp2, data):
30 | 
31 |     data = get_valid_data(sp1, sp2, data)
32 |     print "Training using: %d word pairs" % len(data)
33 |     
34 |     els1, els2 = zip(*data)
35 |     m1 = sp1.mat[[sp1.row2id[el] for el in els1],:]
36 |     m2 = sp2.mat[[sp2.row2id[el] for el in els2],:]
37 | 
38 |     tm = np.linalg.lstsq(m1, m2, -1)[0]
39 | 
40 |     return tm 
41 |     
42 | 
43 | def score(sp1, sp2, gold, additional):
44 |     
45 |     sp1.normalize()
46 | 
47 |     print "Computing cosines and sorting target space elements"    
48 |     sim_mat = -sp2.mat*sp1.mat.T
49 |     
50 |     if additional:
51 |         #for each element, computes its rank in the ranked list of
52 |         #similarites. sorting done on the opposite axis (inverse querying) 
53 |         srtd_idx = np.argsort(np.argsort(sim_mat, axis=1), axis=1)
54 | 
55 |         #for each element, the resulting rank is combined with cosine scores. 
56 |         #the effect will be of breaking the ties, because cosines are smaller
57 |         #than 1. sorting done on the standard axis (regular NN querying)
58 |         srtd_idx = np.argsort(srtd_idx + sim_mat, axis=0)
59 |     else:
60 |         srtd_idx = np.argsort(sim_mat, axis=0)
61 | 
62 |     ranks = []
63 |     for i,el1 in enumerate(gold.keys()):
64 | 
65 |         sp1_idx = sp1.row2id[el1]
66 | 
67 |         #print the top 5 translations
68 |         translations = []        
69 |         for j in range(5):
70 |             sp2_idx = srtd_idx[j, sp1_idx]
71 |             word, score = sp2.id2row[sp2_idx], -sim_mat[sp2_idx, sp1_idx]        
72 |             translations.append("\t\t%s:%.3f" % (word, score))
73 | 
74 |         translations = "\n".join(translations) 
75 | 
76 |         #get the rank of the (highest-ranked) translation
77 |         rnk = get_rank(srtd_idx[:,sp1_idx].A.ravel(), 
78 |                         [sp2.row2id[el] for el in gold[el1]])
79 |         ranks.append(rnk)
80 | 
81 |         print ("\nId: %d Source: %s \n\tTranslation:\n%s \n\tGold: %s \n\tRank: %d" %
82 |                (len(ranks), el1, translations, gold[el1], rnk))
83 | 
84 |     print "Corrected: %s" % str(additional)
85 |     if additional:
86 |         print "Total extra elements, Test(%d) + Additional:%d" % (len(gold.keys()),
87 |                                                            sp1.mat.shape[0]) 
88 |     for k in [1,5,10]:
89 |         print "Prec@%d: %.3f" % (k, prec_at(ranks, k))
90 |         
91 | 


--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/mappingcode/utils.pyc


--------------------------------------------------------------------------------
/code_mapping_across_languages/perform_mapping.sh:
--------------------------------------------------------------------------------
 1 | MAIN="/mount/arbeitsdaten29/corpora/waterloo/img/en_vec/zeroShot/"
 2 | CODE="mappingcode/"
 3 | 
 4 | 
 5 | EN="hypercos.txt" #<- English Vectors (plain text w2v format) 
 6 | DE="de_cow_vecs.txt" #<- Source Language Vectors DE/IT
 7 | A="zero_full.align" # Alignment file format word-source TAB word-target (EN)
 8 | AV="fullvoc_de.txt" # <- Vocabulary file of the source language (used to predict every word in Source -> Target)
 9 | OUT="out/" #<- Output folder
10 | 
11 | python ${CODE}train_tm.py -o TM1 ${A} ${DE} ${EN}; # Learn the mapping Matrix
12 | python ${CODE}test_tm_pred.py TM1.txt ${AV} ${DE} ${EN};	# Apply the Mapping Matrix
13 | paste -d" " translated_vecs.wds.txt translated_vecs.vecs.txt >> ${OUT}output-vecs-tmp.txt # this is just formating
14 | rm -f translated_vecs*; # remove temporary files
15 | less ${DE} | head -1 > HEAD.txt;
16 | cat HEAD.txt ${OUT}output-vecs-tmp.txt > ${OUT}output-vecs.txt;
17 | rm -f HEAD.txt;
18 | rm -f  ${OUT}output-vecs-tmp.txt
19 | rm -f TM1;
20 | #gzip ${OUT}output-vecs.txt # <- final new file!
21 | 
22 | 
23 | # Now we can convert the vectors into binary vectors using the script convert_w2vTXT_to_w2vBIN.py
24 | python convert_w2vTXT_to_w2vBIN.py ${OUT}output-vecs.txt # (will create) output-vecs.txt.bin
25 | 
26 | # Now we can evaluate the binary embeddings 
27 | # Using hyperscore python AP_evaluation_code/test_norm.py <TaskFile> <Embeddings (Binary)>
28 | # Using default cosine   python AP_evaluation_code/test_default.py <TaskFile> <Embeddings (Binary)>


--------------------------------------------------------------------------------
/code_mapping_across_languages/vocabulary file/german_voc_wikipedia.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/vocabulary file/german_voc_wikipedia.txt.gz


--------------------------------------------------------------------------------
/code_mapping_across_languages/vocabulary file/italian_voc_wikipedia.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/vocabulary file/italian_voc_wikipedia.txt.gz


--------------------------------------------------------------------------------
/config.cfg:
--------------------------------------------------------------------------------
 1 | ProjectDir           = /mount/arbeitsdaten34/projekte/slu/KimAnh/HypeEmb
 2 | 
 3 | TrainDir             = /mount/arbeitsdaten34/projekte/slu/KimAnh/AntSynDistinction/corpus
 4 | VocabFileName        = wiki_en.vocab
 5 | 
 6 | WordVectorFileName   = hypervec.bin
 7 | 
 8 | hypeNoun		  = /projekte/semrel/Users/kim-anh/hypernyms/hypernym_n.txt
 9 | hypeVerb		  = /projekte/semrel/Users/kim-anh/hypernyms/hypernym_v.txt
10 | cohypoNoun		  = /projekte/semrel/Users/kim-anh/hypernyms/cohyponym_n.txt
11 | cohypoVerb		  = /projekte/semrel/Users/kim-anh/hypernyms/cohyponym_v.txt
12 | 
13 | featureNoun      = /mount/arbeitsdaten34/projekte/slu/KimAnh/AntSynDistinction/lexical-contrast/wiki_en_features.noun
14 | featureVerb      = /mount/arbeitsdaten34/projekte/slu/KimAnh/AntSynDistinction/lexical-contrast/wiki_en_features.verb
15 | 
16 | HierarchialSoftmax   = false
17 | NegativeSampling     = 15
18 | SubSampling          = 1e-5
19 | MinFrequency         = 50
20 | 


--------------------------------------------------------------------------------
/create_features.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import spacy
 3 | from spacy.lang.en import English
 4 | import gzip
 5 | from collections import Counter, defaultdict
 6 | import six.moves.cPickle as pickle
 7 | from itertools import count
 8 | 
 9 | def main():
10 |     """
11 |     TODO: extracts the feature files in the corpus
12 |     Usage: python create_features.py -input corpus -output output-file-name -pos
13 |     -pos: <NN/VB>
14 |     """
15 |     parser = argparse.ArgumentParser()
16 |     parser.add_argument('-input', type=str)
17 |     parser.add_argument('-output', type=str)
18 |     parser.add_argument('-pos', type=str)
19 |     args = parser.parse_args()
20 |     
21 |     nlp = English()
22 |     nlp.add_pipe(nlp.create_pipe('sentencizer'))
23 | 
24 |     window_size = 5
25 |     dfeatures = defaultdict(set)
26 |         
27 |     output_dir = '/mount/arbeitsdaten34/projekte/slu/KimAnh/Corpora/'
28 |     
29 |     vocab_to_id = defaultdict(count(0).next)
30 |     
31 |     with gzip.open(args.input,'rb') as fin:
32 |         para_num = 0
33 |         # Read each paragraph in corpus
34 |         for paragraph in fin:
35 |             # Check empty paragraph
36 |             paragraph = paragraph.strip()
37 |             if len(paragraph) == 0: continue
38 |             para_num += 1
39 |             print 'Processing para: %d' %para_num
40 |             # Parse each sentence
41 |             parsed_para = nlp(unicode(paragraph))
42 |             for sent in parsed_para.sents:
43 |                 features = process_one_sentence(sent, args.pos, window_size, vocab_to_id)
44 |                 dfeatures.update(features)
45 | 
46 |     id_to_vocab = {idx:word for word,idx in vocab_to_id.iteritems()}
47 |     save_file(dfeatures, id_to_vocab, args.output)
48 |     
49 |     print 'Parsing corpus done....!'                    
50 | 
51 | def save_file(dfeatures, id_to_vocab, outfile):
52 |     with gzip.open(outfile, 'w') as fout:
53 |         for kk,vv in dfeatures:
54 |             contexts = [id_to_vocab[idx] for idx in list(vv)]
55 |             fout.write(str(id_to_vocab[kk]))
56 |             for word in contexts:
57 |                 fout.write('\t' + str(word))
58 |             fout.write('\n')
59 |     print 'Saved file!'
60 | 
61 | def process_one_sentence(sent, pos, window_size, vocab_to_id):
62 |     features = defaultdict(set)
63 |     
64 |     for idx,token in enumerate(sent):
65 |         if token.tag_[:2] == pos and len(token.string.strip()) > 2:
66 |             for idw in range(idx-window_size, idx+window_size):
67 |                 if idw != idx and idw >= 0 and idw < len(sent):
68 |                     features[vocab_to_id[sent[idx]]].add(vocab_to_id[sent[idw]])
69 |                     
70 |     return features
71 | 
72 | if __name__=='__main__':
73 |     main()
74 | 


--------------------------------------------------------------------------------
/datasets_across_languages/eval_DE/noun_hyp_vs_ant.txt:
--------------------------------------------------------------------------------
  1 | Pflanze	Lebewesen	1
  2 | Anrufbeantworter	Gerät	1
  3 | Rakete	Flugobjekt	1
  4 | Jagd	Freizeitbeschäftigung	1
  5 | Persönlichkeit	Eigenschaft	1
  6 | Restaurierung	Pflege	1
  7 | Vizekönig	Adeliger	1
  8 | Bein	Körperteil	1
  9 | Bauabschnitt	Bau	1
 10 | Hoheit	Herrscher	1
 11 | Mittelklasse	Klasse	1
 12 | Schwergewicht	Gewichtsklasse	1
 13 | Bauabschnitt	Etappe	1
 14 | Recherche	Tätigkeit	1
 15 | MEZ	Zeitzone	1
 16 | Ekel	Gefühl	1
 17 | Trio	Gruppe	1
 18 | Kirchengemeinde	Gruppe	1
 19 | Fisch	Tier	1
 20 | Modell	Ding	1
 21 | Bock	Säugetier	1
 22 | Agentur	Firma	1
 23 | Gesamtbelastung	Belastung	1
 24 | Mittelklasse	Gesellschaftsschicht	1
 25 | Bremer	Norddeutscher	1
 26 | Kern	Pflanzenbestandteil	1
 27 | Looping	Figur	1
 28 | Bahnsteig	Verkehr	1
 29 | Komplex	Gebäude	1
 30 | Stuhl	Möbel	1
 31 | Bremer	Person	1
 32 | Sirene	Alarmgerät	1
 33 | Nutzer	Kunde	1
 34 | Hoheit	Titel	1
 35 | Torf	Material	1
 36 | Bedienung	Restaurant	1
 37 | Lieblichkeit	Attraktivität	1
 38 | Bibel	Buch	1
 39 | Bein	Gliedmaße	1
 40 | Los	Glücksspiel	1
 41 | Wache	Schutz	1
 42 | Persönlichkeit	Person	1
 43 | Bahnsteig	Haltestelle	1
 44 | Breite	Dimension	1
 45 | Bremer	Deutscher	1
 46 | Nobelpreis	Preis	1
 47 | Professor	Gelehrter	1
 48 | Gegenargument	Diskussion	1
 49 | Zeit	Dimension	1
 50 | Nutzer	Person	1
 51 | Großmeister	Meister	1
 52 | Wache	Tätigkeit	1
 53 | Killer	Krimineller	1
 54 | Schleife	Gebinde	1
 55 | Alb	Gebirgsmassiv	1
 56 | Bedienung	Dienstleistung	1
 57 | Vizekönig	Herrscher	1
 58 | Arzt	Beruf	1
 59 | Bock	Tier	1
 60 | Zivilgesellschaft	Gesellschaft	1
 61 | Ablehnung	Reaktion	1
 62 | Lieblichkeit	Eigenschaft	1
 63 | Gerät	Ding	1
 64 | Professor	Lehrperson	1
 65 | Bundesland	Region	1
 66 | Eiweiß	Nährstoff	1
 67 | Natter	Tier	1
 68 | Restaurierung	Modernisierung	1
 69 | Breite	Maß	1
 70 | Katze	Tier	1
 71 | Bahnsteig	Bahnhof	1
 72 | Glaubenssatz	Satz	1
 73 | Katze	Haustier	1
 74 | Signatur	Kennzeichen	1
 75 | Eiweiß	Ei	1
 76 | Hexe	Märchengestalt	1
 77 | Agentur	Unternehmen	1
 78 | Propagandist	Politiker	1
 79 | Signatur	Schrift	1
 80 | Entscheidungskompetenz	Kompetenz	1
 81 | Verhandlung	Gespräch	1
 82 | Temperatur	Maßeinheit	1
 83 | Wandzeitung	Zeitung	1
 84 | Gatter	Begrenzung	1
 85 | Auflösung	Ende	1
 86 | Nutzer	Mensch	1
 87 | Bauer	Beruf	1
 88 | Ausnahmefall	Fall	1
 89 | Rübe	Gemüse	1
 90 | Funk	Kommunikation	1
 91 | Pension	Einkommen	1
 92 | Pflanze	Natur	1
 93 | Atomwaffe	Waffe	1
 94 | Ablehnung	Verhalten	1
 95 | Buchhalterin	Beruf	1
 96 | Individualismus	Wertesystem	1
 97 | MEZ	Zeit	1
 98 | Signatur	Überschrift	0
 99 | Kriminalität	Sicherheit	0
100 | Bein	Arm	0
101 | Ablehnung	Annahme	0
102 | Komplex	Einfachheit	0
103 | MEZ	UTC	0
104 | Schulbuch	Roman	0
105 | Ausnahmefall	Regelfall	0
106 | Fluch	Segen	0
107 | Killer	Opfer	0
108 | Verhandlung	Bestimmung	0
109 | Breite	Enge	0
110 | Ausnahmefall	Normalfall	0
111 | Individualismus	Kollektivismus	0
112 | Hexe	Fee	0
113 | Ablehnung	Zustimmung	0
114 | Kriminalität	Friedlichkeit	0
115 | Komplex	einfach	0
116 | Lieblichkeit	Bitterkeit	0
117 | Bauer	Stadtbewohner	0
118 | Devise	Landeswährung	0
119 | Bremer	Hamburger	0
120 | Breite	Länge	0
121 | Regression	Fortschritt	0
122 | Torf	Sand	0
123 | Kaufhaus	Tante-Emma-Laden	0
124 | Gegenargument	Fürargument	0
125 | Pfeffer	Zucker	0
126 | Natter	Viper	0
127 | Hoheit	Untertan	0
128 | Hoheit	Bürger	0
129 | Deutlichkeit	Unklarheit	0
130 | Gesamtbelastung	Einzelbelastung	0
131 | Großmeister	Lehrling	0
132 | Kadett	Offizier	0
133 | Restaurierung	Verfall	0
134 | Gerät	Mensch	0
135 | Bahnsteig	Bahngleis	0
136 | Bedienung	Selbstbedienung	0
137 | Fisch	Fleisch	0
138 | Eiweiß	Kohlenhydrat	0
139 | Gesamtbelastung	Teilbelastung	0
140 | Ekel	Zuneigung	0
141 | Materialsammlung	Einzelstück	0
142 | VHS	Universität	0
143 | Auslöser	Bremser	0
144 | Funk	Kabel	0
145 | Explosion	Implosion	0
146 | Nutzer	Entwickler	0
147 | Rakete	U-Boot	0
148 | Buchhalterin	Buchhalter	0
149 | Bibel	Koran	0
150 | Stuhl	Tisch	0
151 | Alb	Flachland	0
152 | Alb	Tal	0
153 | Auflösung	Gründung	0
154 | Pfeffer	Salz	0
155 | Zeit	Raum	0
156 | Katalysator	Motor	0
157 | Kern	Hülle	0
158 | Ekel	Anziehung	0
159 | Bock	Geiß	0
160 | Regression	Progression	0
161 | Betrieb	Ruhe	0
162 | Restaurierung	Zerstörung	0
163 | Wache	Einbrecher	0
164 | Pflanze	Tier	0
165 | Natter	Vogel	0
166 | Arzt	Patient	0
167 | Klage	Zufriedenheit	0
168 | Mittelklasse	Oberklasse	0
169 | Bremer	Münchner	0
170 | Los	Niete	0
171 | Verjüngung	Alterung	0
172 | Nutzer	Hersteller	0
173 | Zeit	Unendlichkeit	0
174 | Ausnahmefall	Regel	0
175 | Sirene	Stille	0
176 | Lieblichkeit	Hässlichkeit	0
177 | Hexe	Zauberer	0
178 | Entscheidungskompetenz	Entscheidungsunfähigkeit	0
179 | VHS	DVD	0
180 | Modell	Realität	0
181 | Mittelklasse	Oberschicht	0
182 | Trio	Solo	0
183 | Bedienung	Gast	0
184 | Schwergewicht	Leichtgewicht	0
185 | Deutlichkeit	Undeutlichkeit	0
186 | Abschneiden	Ankleben	0
187 | Verhandlung	Entscheidung	0
188 | Gatter	Freiheit	0
189 | Katze	Hund	0
190 | Bahnsteig	Bushaltestelle	0
191 | Bauer	Industrieller	0
192 | Betrieb	Stillstand	0
193 | 


--------------------------------------------------------------------------------
/datasets_across_languages/eval_DE/noun_hyp_vs_syn.txt:
--------------------------------------------------------------------------------
  1 | Pflanze	Lebewesen	1
  2 | Anrufbeantworter	Gerät	1
  3 | Rakete	Flugobjekt	1
  4 | Jagd	Freizeitbeschäftigung	1
  5 | Persönlichkeit	Eigenschaft	1
  6 | Restaurierung	Pflege	1
  7 | Vizekönig	Adeliger	1
  8 | Bein	Körperteil	1
  9 | Bauabschnitt	Bau	1
 10 | Hoheit	Herrscher	1
 11 | Mittelklasse	Klasse	1
 12 | Schwergewicht	Gewichtsklasse	1
 13 | Bauabschnitt	Etappe	1
 14 | Recherche	Tätigkeit	1
 15 | MEZ	Zeitzone	1
 16 | Ekel	Gefühl	1
 17 | Trio	Gruppe	1
 18 | Kirchengemeinde	Gruppe	1
 19 | Fisch	Tier	1
 20 | Modell	Ding	1
 21 | Bock	Säugetier	1
 22 | Agentur	Firma	1
 23 | Gesamtbelastung	Belastung	1
 24 | Mittelklasse	Gesellschaftsschicht	1
 25 | Bremer	Norddeutscher	1
 26 | Kern	Pflanzenbestandteil	1
 27 | Looping	Figur	1
 28 | Bahnsteig	Verkehr	1
 29 | Komplex	Gebäude	1
 30 | Stuhl	Möbel	1
 31 | Bremer	Person	1
 32 | Sirene	Alarmgerät	1
 33 | Nutzer	Kunde	1
 34 | Hoheit	Titel	1
 35 | Torf	Material	1
 36 | Bedienung	Restaurant	1
 37 | Lieblichkeit	Attraktivität	1
 38 | Bibel	Buch	1
 39 | Bein	Gliedmaße	1
 40 | Los	Glücksspiel	1
 41 | Wache	Schutz	1
 42 | Persönlichkeit	Person	1
 43 | Bahnsteig	Haltestelle	1
 44 | Breite	Dimension	1
 45 | Bremer	Deutscher	1
 46 | Nobelpreis	Preis	1
 47 | Professor	Gelehrter	1
 48 | Gegenargument	Diskussion	1
 49 | Zeit	Dimension	1
 50 | Nutzer	Person	1
 51 | Großmeister	Meister	1
 52 | Wache	Tätigkeit	1
 53 | Killer	Krimineller	1
 54 | Schleife	Gebinde	1
 55 | Alb	Gebirgsmassiv	1
 56 | Bedienung	Dienstleistung	1
 57 | Vizekönig	Herrscher	1
 58 | Arzt	Beruf	1
 59 | Bock	Tier	1
 60 | Zivilgesellschaft	Gesellschaft	1
 61 | Ablehnung	Reaktion	1
 62 | Lieblichkeit	Eigenschaft	1
 63 | Gerät	Ding	1
 64 | Professor	Lehrperson	1
 65 | Bundesland	Region	1
 66 | Eiweiß	Nährstoff	1
 67 | Natter	Tier	1
 68 | Restaurierung	Modernisierung	1
 69 | Breite	Maß	1
 70 | Katze	Tier	1
 71 | Bahnsteig	Bahnhof	1
 72 | Glaubenssatz	Satz	1
 73 | Katze	Haustier	1
 74 | Signatur	Kennzeichen	1
 75 | Eiweiß	Ei	1
 76 | Hexe	Märchengestalt	1
 77 | Agentur	Unternehmen	1
 78 | Propagandist	Politiker	1
 79 | Signatur	Schrift	1
 80 | Entscheidungskompetenz	Kompetenz	1
 81 | Verhandlung	Gespräch	1
 82 | Temperatur	Maßeinheit	1
 83 | Wandzeitung	Zeitung	1
 84 | Gatter	Begrenzung	1
 85 | Auflösung	Ende	1
 86 | Nutzer	Mensch	1
 87 | Bauer	Beruf	1
 88 | Ausnahmefall	Fall	1
 89 | Rübe	Gemüse	1
 90 | Funk	Kommunikation	1
 91 | Pension	Einkommen	1
 92 | Pflanze	Natur	1
 93 | Atomwaffe	Waffe	1
 94 | Ablehnung	Verhalten	1
 95 | Buchhalterin	Beruf	1
 96 | Individualismus	Wertesystem	1
 97 | MEZ	Zeit	1
 98 | Wandzeitung	Aushang	0
 99 | Propagandist	Verkaufsförderer	0
100 | Schulbuch	Lehrbuch	0
101 | Hoheit	König	0
102 | Sirene	Martinshorn	0
103 | Ausnahmefall	Sonderfall	0
104 | Lieblichkeit	Anmut	0
105 | VHS	Videokassette	0
106 | Kadett	Anfänger	0
107 | Gatter	Umzäunung	0
108 | Devise	Wahlspruch	0
109 | Fisch	Meeresbewohner	0
110 | Pfeffer	Würze	0
111 | Zivilgesellschaft	Volk	0
112 | Anrufbeantworter	AB	0
113 | Katze	Stubentiger	0
114 | Professor	Dozent	0
115 | Lieblichkeit	Liebreiz	0
116 | Kirchengemeinde	Glaubensanhänger	0
117 | Wache	Aufsicht	0
118 | Gegenargument	Widerspruch	0
119 | Temperatur	Wärmegrad	0
120 | Bauer	Landwirt	0
121 | Restaurierung	Instandsetzung	0
122 | Großmeister	Sachkundiger	0
123 | Bein	Gliedmaß	0
124 | Numerus	Nummer	0
125 | Pension	Gästehaus	0
126 | Agentur	Geschäftsstelle	0
127 | Funk	Radio	0
128 | Abschneiden	Abtrennen	0
129 | Kondom	Pariser	0
130 | Trio	Dreiergruppe	0
131 | Rübe	Karotte	0
132 | Rakete	Geschoss	0
133 | Zeile	Linie	0
134 | Kaufhaus	Einkaufszentrum	0
135 | Komplex	Störung	0
136 | Trio	drei	0
137 | Glaubenssatz	Lehre	0
138 | Looping	Salto	0
139 | Explosion	Ausbruch	0
140 | Kondom	Präservativ	0
141 | Anrufbeantworter	Mailbox	0
142 | Rübe	Wurzel	0
143 | VHS	Volkshochschule	0
144 | Ablehnung	Abweisung	0
145 | Looping	Überschlag	0
146 | Kriminalität	Delinquenz	0
147 | Signatur	Unterschrift	0
148 | Verhandlung	Besprechung	0
149 | Klage	Anschuldigung	0
150 | Pflanze	Gewächs	0
151 | Atomwaffe	Kernwaffe	0
152 | Bauer	Farmer	0
153 | Zeit	Dauer	0
154 | Fisch	Wassertier	0
155 | Umgebung	Umland	0
156 | Bauabschnitt	Bausektion	0
157 | Verhandlung	Prozess	0
158 | Individualismus	Eigenheit	0
159 | Rübe	Möhre	0
160 | Anrufbeantworter	Telefonbeantworter	0
161 | Zeit	Epoche	0
162 | Invasion	Einfall	0
163 | Jagd	Hetze	0
164 | Mittelklasse	Mittelschicht	0
165 | Breite	Weite	0
166 | Schleife	Schlaufe	0
167 | Schwergewicht	Schwerpunkt	0
168 | Arzt	Doktor	0
169 | Katze	Samtpfote	0
170 | Bahnsteig	Perron	0
171 | Bundesland	Gliedstaat	0
172 | Zivilgesellschaft	Bürgergesellschaft	0
173 | Bock	Hammel	0
174 | Gesamtbelastung	Totalbelastung	0
175 | Gerät	Apparat	0
176 | Hoheit	Majestät	0
177 | Zeile	Reihe	0
178 | Mittelklasse	Mittelstand	0
179 | Katalysator	Beschleuniger	0
180 | Looping	Schleife	0
181 | Explosion	Sprengung	0
182 | Kaufhaus	Warenhaus	0
183 | Professor	Hochschullehrer	0
184 | Blatt	Zettel	0
185 | Schulbuch	Schullektüre	0
186 | Temperatur	Wärme	0
187 | Verjüngung	Verengung	0
188 | 


--------------------------------------------------------------------------------
/datasets_across_languages/eval_DE/noun_hyp_vs_synant.txt:
--------------------------------------------------------------------------------
  1 | Pflanze	Lebewesen	1
  2 | Anrufbeantworter	Gerät	1
  3 | Rakete	Flugobjekt	1
  4 | Jagd	Freizeitbeschäftigung	1
  5 | Persönlichkeit	Eigenschaft	1
  6 | Restaurierung	Pflege	1
  7 | Vizekönig	Adeliger	1
  8 | Bein	Körperteil	1
  9 | Bauabschnitt	Bau	1
 10 | Hoheit	Herrscher	1
 11 | Mittelklasse	Klasse	1
 12 | Schwergewicht	Gewichtsklasse	1
 13 | Bauabschnitt	Etappe	1
 14 | Recherche	Tätigkeit	1
 15 | MEZ	Zeitzone	1
 16 | Ekel	Gefühl	1
 17 | Trio	Gruppe	1
 18 | Kirchengemeinde	Gruppe	1
 19 | Fisch	Tier	1
 20 | Modell	Ding	1
 21 | Bock	Säugetier	1
 22 | Agentur	Firma	1
 23 | Gesamtbelastung	Belastung	1
 24 | Mittelklasse	Gesellschaftsschicht	1
 25 | Bremer	Norddeutscher	1
 26 | Kern	Pflanzenbestandteil	1
 27 | Looping	Figur	1
 28 | Bahnsteig	Verkehr	1
 29 | Komplex	Gebäude	1
 30 | Stuhl	Möbel	1
 31 | Bremer	Person	1
 32 | Sirene	Alarmgerät	1
 33 | Nutzer	Kunde	1
 34 | Hoheit	Titel	1
 35 | Torf	Material	1
 36 | Bedienung	Restaurant	1
 37 | Lieblichkeit	Attraktivität	1
 38 | Bibel	Buch	1
 39 | Bein	Gliedmaße	1
 40 | Los	Glücksspiel	1
 41 | Wache	Schutz	1
 42 | Persönlichkeit	Person	1
 43 | Bahnsteig	Haltestelle	1
 44 | Breite	Dimension	1
 45 | Bremer	Deutscher	1
 46 | Nobelpreis	Preis	1
 47 | Professor	Gelehrter	1
 48 | Gegenargument	Diskussion	1
 49 | Zeit	Dimension	1
 50 | Nutzer	Person	1
 51 | Großmeister	Meister	1
 52 | Wache	Tätigkeit	1
 53 | Killer	Krimineller	1
 54 | Schleife	Gebinde	1
 55 | Alb	Gebirgsmassiv	1
 56 | Bedienung	Dienstleistung	1
 57 | Vizekönig	Herrscher	1
 58 | Arzt	Beruf	1
 59 | Bock	Tier	1
 60 | Zivilgesellschaft	Gesellschaft	1
 61 | Ablehnung	Reaktion	1
 62 | Lieblichkeit	Eigenschaft	1
 63 | Gerät	Ding	1
 64 | Professor	Lehrperson	1
 65 | Bundesland	Region	1
 66 | Eiweiß	Nährstoff	1
 67 | Natter	Tier	1
 68 | Restaurierung	Modernisierung	1
 69 | Breite	Maß	1
 70 | Katze	Tier	1
 71 | Bahnsteig	Bahnhof	1
 72 | Glaubenssatz	Satz	1
 73 | Katze	Haustier	1
 74 | Signatur	Kennzeichen	1
 75 | Eiweiß	Ei	1
 76 | Hexe	Märchengestalt	1
 77 | Agentur	Unternehmen	1
 78 | Propagandist	Politiker	1
 79 | Signatur	Schrift	1
 80 | Entscheidungskompetenz	Kompetenz	1
 81 | Verhandlung	Gespräch	1
 82 | Temperatur	Maßeinheit	1
 83 | Wandzeitung	Zeitung	1
 84 | Gatter	Begrenzung	1
 85 | Auflösung	Ende	1
 86 | Nutzer	Mensch	1
 87 | Bauer	Beruf	1
 88 | Ausnahmefall	Fall	1
 89 | Rübe	Gemüse	1
 90 | Funk	Kommunikation	1
 91 | Pension	Einkommen	1
 92 | Pflanze	Natur	1
 93 | Atomwaffe	Waffe	1
 94 | Ablehnung	Verhalten	1
 95 | Buchhalterin	Beruf	1
 96 | Individualismus	Wertesystem	1
 97 | MEZ	Zeit	1
 98 | Signatur	Überschrift	0
 99 | Kriminalität	Sicherheit	0
100 | Bein	Arm	0
101 | Ablehnung	Annahme	0
102 | Komplex	Einfachheit	0
103 | MEZ	UTC	0
104 | Schulbuch	Roman	0
105 | Ausnahmefall	Regelfall	0
106 | Fluch	Segen	0
107 | Killer	Opfer	0
108 | Verhandlung	Bestimmung	0
109 | Breite	Enge	0
110 | Ausnahmefall	Normalfall	0
111 | Individualismus	Kollektivismus	0
112 | Hexe	Fee	0
113 | Ablehnung	Zustimmung	0
114 | Kriminalität	Friedlichkeit	0
115 | Komplex	einfach	0
116 | Lieblichkeit	Bitterkeit	0
117 | Bauer	Stadtbewohner	0
118 | Devise	Landeswährung	0
119 | Bremer	Hamburger	0
120 | Breite	Länge	0
121 | Regression	Fortschritt	0
122 | Torf	Sand	0
123 | Kaufhaus	Tante-Emma-Laden	0
124 | Gegenargument	Fürargument	0
125 | Pfeffer	Zucker	0
126 | Natter	Viper	0
127 | Hoheit	Untertan	0
128 | Hoheit	Bürger	0
129 | Deutlichkeit	Unklarheit	0
130 | Gesamtbelastung	Einzelbelastung	0
131 | Großmeister	Lehrling	0
132 | Kadett	Offizier	0
133 | Restaurierung	Verfall	0
134 | Gerät	Mensch	0
135 | Bahnsteig	Bahngleis	0
136 | Bedienung	Selbstbedienung	0
137 | Fisch	Fleisch	0
138 | Eiweiß	Kohlenhydrat	0
139 | Gesamtbelastung	Teilbelastung	0
140 | Ekel	Zuneigung	0
141 | Materialsammlung	Einzelstück	0
142 | VHS	Universität	0
143 | Auslöser	Bremser	0
144 | Funk	Kabel	0
145 | Explosion	Implosion	0
146 | Nutzer	Entwickler	0
147 | Rakete	U-Boot	0
148 | Buchhalterin	Buchhalter	0
149 | Bibel	Koran	0
150 | Stuhl	Tisch	0
151 | Alb	Flachland	0
152 | Alb	Tal	0
153 | Auflösung	Gründung	0
154 | Pfeffer	Salz	0
155 | Zeit	Raum	0
156 | Katalysator	Motor	0
157 | Kern	Hülle	0
158 | Ekel	Anziehung	0
159 | Bock	Geiß	0
160 | Regression	Progression	0
161 | Betrieb	Ruhe	0
162 | Restaurierung	Zerstörung	0
163 | Wache	Einbrecher	0
164 | Pflanze	Tier	0
165 | Natter	Vogel	0
166 | Arzt	Patient	0
167 | Klage	Zufriedenheit	0
168 | Mittelklasse	Oberklasse	0
169 | Bremer	Münchner	0
170 | Los	Niete	0
171 | Verjüngung	Alterung	0
172 | Nutzer	Hersteller	0
173 | Zeit	Unendlichkeit	0
174 | Ausnahmefall	Regel	0
175 | Sirene	Stille	0
176 | Lieblichkeit	Hässlichkeit	0
177 | Hexe	Zauberer	0
178 | Entscheidungskompetenz	Entscheidungsunfähigkeit	0
179 | VHS	DVD	0
180 | Modell	Realität	0
181 | Mittelklasse	Oberschicht	0
182 | Trio	Solo	0
183 | Bedienung	Gast	0
184 | Schwergewicht	Leichtgewicht	0
185 | Deutlichkeit	Undeutlichkeit	0
186 | Abschneiden	Ankleben	0
187 | Verhandlung	Entscheidung	0
188 | Gatter	Freiheit	0
189 | Katze	Hund	0
190 | Bahnsteig	Bushaltestelle	0
191 | Bauer	Industrieller	0
192 | Betrieb	Stillstand	0
193 | Wandzeitung	Aushang	0
194 | Propagandist	Verkaufsförderer	0
195 | Schulbuch	Lehrbuch	0
196 | Hoheit	König	0
197 | Sirene	Martinshorn	0
198 | Ausnahmefall	Sonderfall	0
199 | Lieblichkeit	Anmut	0
200 | VHS	Videokassette	0
201 | Kadett	Anfänger	0
202 | Gatter	Umzäunung	0
203 | Devise	Wahlspruch	0
204 | Fisch	Meeresbewohner	0
205 | Pfeffer	Würze	0
206 | Zivilgesellschaft	Volk	0
207 | Anrufbeantworter	AB	0
208 | Katze	Stubentiger	0
209 | Professor	Dozent	0
210 | Lieblichkeit	Liebreiz	0
211 | Kirchengemeinde	Glaubensanhänger	0
212 | Wache	Aufsicht	0
213 | Gegenargument	Widerspruch	0
214 | Temperatur	Wärmegrad	0
215 | Bauer	Landwirt	0
216 | Restaurierung	Instandsetzung	0
217 | Großmeister	Sachkundiger	0
218 | Bein	Gliedmaß	0
219 | Numerus	Nummer	0
220 | Pension	Gästehaus	0
221 | Agentur	Geschäftsstelle	0
222 | Funk	Radio	0
223 | Abschneiden	Abtrennen	0
224 | Kondom	Pariser	0
225 | Trio	Dreiergruppe	0
226 | Rübe	Karotte	0
227 | Rakete	Geschoss	0
228 | Zeile	Linie	0
229 | Kaufhaus	Einkaufszentrum	0
230 | Komplex	Störung	0
231 | Trio	drei	0
232 | Glaubenssatz	Lehre	0
233 | Looping	Salto	0
234 | Explosion	Ausbruch	0
235 | Kondom	Präservativ	0
236 | Anrufbeantworter	Mailbox	0
237 | Rübe	Wurzel	0
238 | VHS	Volkshochschule	0
239 | Ablehnung	Abweisung	0
240 | Looping	Überschlag	0
241 | Kriminalität	Delinquenz	0
242 | Signatur	Unterschrift	0
243 | Verhandlung	Besprechung	0
244 | Klage	Anschuldigung	0
245 | Pflanze	Gewächs	0
246 | Atomwaffe	Kernwaffe	0
247 | Bauer	Farmer	0
248 | Zeit	Dauer	0
249 | Fisch	Wassertier	0
250 | Umgebung	Umland	0
251 | Bauabschnitt	Bausektion	0
252 | Verhandlung	Prozess	0
253 | Individualismus	Eigenheit	0
254 | Rübe	Möhre	0
255 | Anrufbeantworter	Telefonbeantworter	0
256 | Zeit	Epoche	0
257 | Invasion	Einfall	0
258 | Jagd	Hetze	0
259 | Mittelklasse	Mittelschicht	0
260 | Breite	Weite	0
261 | Schleife	Schlaufe	0
262 | Schwergewicht	Schwerpunkt	0
263 | Arzt	Doktor	0
264 | Katze	Samtpfote	0
265 | Bahnsteig	Perron	0
266 | Bundesland	Gliedstaat	0
267 | Zivilgesellschaft	Bürgergesellschaft	0
268 | Bock	Hammel	0
269 | Gesamtbelastung	Totalbelastung	0
270 | Gerät	Apparat	0
271 | Hoheit	Majestät	0
272 | Zeile	Reihe	0
273 | Mittelklasse	Mittelstand	0
274 | Katalysator	Beschleuniger	0
275 | Looping	Schleife	0
276 | Explosion	Sprengung	0
277 | Kaufhaus	Warenhaus	0
278 | Professor	Hochschullehrer	0
279 | Blatt	Zettel	0
280 | Schulbuch	Schullektüre	0
281 | Temperatur	Wärme	0
282 | Verjüngung	Verengung	0
283 | 


--------------------------------------------------------------------------------
/datasets_classification/eval-bless.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/datasets_classification/eval-bless.jar


--------------------------------------------------------------------------------
/datasets_classification/eval-dir.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/datasets_classification/eval-dir.jar


--------------------------------------------------------------------------------
/datasets_classification/readme_how_to.txt:
--------------------------------------------------------------------------------
1 | 
2 | ## Evaluate Bless
3 | # 1] Vector file hypercos_wiki.txt.gz = word2vec format (txt) gziped.
4 | # 2] Percentage used for training (in the paper we use 2%)
5 | # 3] Number of iterations (in the paper we use 1000)
6 | java -jar eval-bless.jar hypercos_wiki.txt.gz 2 1000
7 | 


--------------------------------------------------------------------------------
/evaluation_scripts/common.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from numpy import fromstring, dtype
 3 | 
 4 | def smart_open(fname, mode='rb'):
 5 |     if fname.endswith('.gz'):
 6 |         import gzip
 7 |         return gzip.open(fname, mode)
 8 |     elif fname.endswith('.bz2'):
 9 |         import bz2
10 |         return bz2.BZ2File(fname, mode)
11 |     else:
12 |         return open(fname, mode)
13 | 
14 | def load_vecs(binary_file, binary=1):
15 |     vecs = []
16 |     vocab = []
17 |     if binary==1:
18 |         with smart_open(binary_file, 'rb') as f:
19 |             header = to_unicode(f.readline())
20 |             vocab_size, vector_size = map(int, header.split())
21 |             binary_len = dtype(np.float32).itemsize * vector_size
22 |             for _ in range(vocab_size):
23 |                 word = []
24 |                 while True:
25 |                     ch = f.read(1)
26 |                     if ch == b' ':
27 |                         break
28 |                     if ch != b'\n':
29 |                         word.append(ch)
30 |                 word = to_unicode(b''.join(word))
31 |                 vocab.append(word)
32 |                 vec = fromstring(f.read(binary_len), dtype=np.float32)
33 |                 vecs.append(vec)
34 |     else:
35 |         with smart_open(binary_file, 'rb') as f:
36 |             header = to_unicode(f.readline())
37 |             if len(header.split()) == 2: vocab_size, vector_size = map(int, header.split())
38 |             elif len(header.split()) > 2:
39 |                 parts = header.rstrip().split(" ")
40 |                 word, vec = parts[0], list(map(np.float32, parts[1:]))
41 |                 vocab.append(to_unicode(word))
42 |                 vecs.append(vec)
43 |             for _, line in enumerate(f):
44 |                 parts = to_unicode(line.rstrip()).split(" ")
45 |                 word, vec = parts[0], list(map(np.float32, parts[1:]))
46 |                 vocab.append(to_unicode(word))
47 |                 vecs.append(vec)
48 |                 
49 |     #embs_dim = len(vecs[1])   
50 |     #UNKNOWN_WORD = np.random.uniform(-0.25,0.25,embs_dim)
51 |     #vecs = np.vstack((UNKNOWN_WORD, vecs))               
52 |     #vocab = ['#UNKNOWN#'] + list(vocab)
53 |     #words = {word:idx for idx,word in enumerate(vocab)}
54 |     
55 |     return vecs, vocab
56 | 
57 | def to_utf8(text, errors='strict', encoding='utf8'):
58 |     """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8."""
59 |     if isinstance(text, unicode):
60 |         return text.encode('utf8')
61 |     # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
62 |     else:
63 |         return unicode(text, encoding, errors=errors).encode('utf8')
64 | 
65 | def to_unicode(text, encoding='utf8', errors='strict'):
66 |     """Convert a string (bytestring in `encoding` or unicode), to unicode."""
67 |     if isinstance(text, unicode):
68 |         return text
69 |     else:
70 |         return unicode(text, encoding=encoding, errors=errors)


--------------------------------------------------------------------------------
/evaluation_scripts/corrEval.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | from scipy.stats import spearmanr
 4 | from numpy.linalg import norm
 5 | import common
 6 | 
 7 | def cosine(u, v):
 8 |     return np.dot(u,v)/(norm(u)*norm(v))
 9 | 
10 | def hyper_score(u,v):
11 |     sim = np.dot(u,v)/(norm(u)*norm(v))
12 |     direct = norm(v)/norm(u)
13 |     return sim*direct
14 | 
15 | def load_data(embeddings_file, dataset_file, mode='cosine'):
16 |     golds, scores = [], []
17 |     unseen = 0
18 |     with open(dataset_file, 'r') as fin:
19 |         data = [line.strip().split(' ') for line in fin]
20 |     vecs, words = common.load_vecs(embeddings_file, binary=1)
21 |     embs = {word:vec for word,vec in zip(words,vecs)}
22 |     for rec in data:
23 |         if rec[0] in embs and rec[1] in embs:
24 |             golds.append(float(rec[5]))
25 |             if mode=='hyper':
26 |                 grade = hyper_score(embs[rec[0]], embs[rec[1]])
27 |                 scores.append(grade)
28 |             elif mode=='cosine':
29 |                 grade = cosine(embs[rec[0]], embs[rec[1]])
30 |                 scores.append(grade)
31 |         else:
32 |             unseen += 1
33 |     print 'unseen-words: %d' %unseen
34 |     return golds, scores
35 | 
36 | if __name__=='__main__':
37 |     embeddings_file = sys.argv[1]
38 |     dataset_file = sys.argv[2]
39 |     mode = sys.argv[3] # either 'cosine' or 'hyper'
40 |     golds, scores = load_data(embeddings_file, dataset_file, mode)
41 |     rho = spearmanr(golds, scores)[0]
42 |     print 'Spearman correlation: %f' %rho
43 |     
44 |     
45 |     
46 |     
47 | 


--------------------------------------------------------------------------------
/get-pretrainedHyperVecEmbeddings/download_embeddings.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # downloads hypvec_embeddings from IMS homepage
 3 | for id in 0 1 2 3 4 5 6 7 8 9
 4 | 	do
 5 | 	wget http://www.ims.uni-stuttgart.de/documents/ressourcen/experiment-daten/hypvec_embd/hyp_p${id}.gz
 6 | 	done
 7 | cat hyp_p0.gz hyp_p1.gz hyp_p2.gz hyp_p3.gz hyp_p4.gz hyp_p5.gz hyp_p6.gz hyp_p7.gz hyp_p8.gz hyp_p9.gz > hypervec.txt.gz
 8 | # rm -f hyp_p*.gz # OPTIONAL  -remove files
 9 | # gunzip hypervec.txt.gz # OPTIONAL unzip embeddings to plain text
10 | 


--------------------------------------------------------------------------------
/hypernymy_resources/cohyponym_n.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/cohyponym_n.txt.gz


--------------------------------------------------------------------------------
/hypernymy_resources/cohyponym_v.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/cohyponym_v.txt.gz


--------------------------------------------------------------------------------
/hypernymy_resources/hypernym_n.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/hypernym_n.txt.gz


--------------------------------------------------------------------------------
/hypernymy_resources/hypernym_v.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/hypernym_v.txt.gz


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 2 |   <modelVersion>4.0.0</modelVersion>
 3 |   <version>0.0.1</version>
 4 |   <build>
 5 |     <sourceDirectory>src</sourceDirectory>
 6 |     <plugins>
 7 |       <plugin>
 8 |         <artifactId>maven-compiler-plugin</artifactId>
 9 |         <version>3.1</version>
10 |         <configuration>
11 |           <source>1.7</source>
12 |           <target>1.7</target>
13 |         </configuration>
14 |       </plugin>
15 |       <plugin>
16 |         <artifactId>maven-assembly-plugin</artifactId>
17 |         <configuration>
18 |            <archive>
19 |             <manifest>
20 |               <mainClass></mainClass>
21 |             </manifest>
22 |           </archive>
23 |           <descriptorRefs>
24 |             <descriptorRef>jar-with-dependencies</descriptorRef>
25 |           </descriptorRefs>
26 |         </configuration>
27 |       </plugin>
28 |     </plugins>
29 |   </build>
30 |   <dependencies>
31 |   	<dependency>
32 |   		<groupId>junit</groupId>
33 |   		<artifactId>junit</artifactId>
34 |   		<version>4.7</version>
35 |   	</dependency>
36 |   	<dependency>
37 |   		<groupId>
38 |   			com.googlecode.efficient-java-matrix-library
39 |   		</groupId>
40 |   		<artifactId>ejml</artifactId>
41 |   		<version>0.25</version>
42 |   	</dependency>
43 |   	<dependency>
44 |   		<groupId>org.apache.commons</groupId>
45 |   		<artifactId>commons-math3</artifactId>
46 |   		<version>3.3</version>
47 |   	</dependency>
48 |   	<dependency>
49 | 	    <groupId>commons-lang</groupId>
50 | 	    <artifactId>commons-lang</artifactId>
51 | 	    <version>2.3</version>
52 |     </dependency>
53 |   	<dependency>
54 |   		<groupId>edu.stanford.nlp</groupId>
55 |   		<artifactId>stanford-corenlp</artifactId>
56 |   		<version>3.4</version>
57 |   	</dependency>
58 |   	<dependency>
59 |   		<groupId>de.erichseifert.gral</groupId>
60 |   		<artifactId>gral-core</artifactId>
61 |   		<version>0.10</version>
62 |   	</dependency>
63 |   	<dependency>
64 |         <groupId>edu.berkeley.compbio</groupId>
65 |         <artifactId>jlibsvm</artifactId>
66 |         <version>0.911</version>
67 |     </dependency>
68 |   </dependencies>
69 |   <repositories>
70 |     <repository>
71 |       <id>erichseifert.de</id>
72 |       <url>http://mvn.erichseifert.de/maven2</url>
73 |     </repository>
74 |     <repository>
75 |         <id>dev.davidsoergel.com releases</id>
76 |         <url>http://dev.davidsoergel.com/nexus/content/repositories/releases</url>
77 |         <snapshots>
78 |             <enabled>false</enabled>
79 |         </snapshots>
80 |     </repository>
81 |     <repository>
82 |         <id>dev.davidsoergel.com snapshots</id>
83 |         <url>http://dev.davidsoergel.com/nexus/content/repositories/snapshots</url>
84 |         <releases>
85 |             <enabled>false</enabled>
86 |         </releases>
87 |     </repository>
88 |   </repositories>
89 |   <groupId>SemRel</groupId>
90 |   <artifactId>HyperVec</artifactId>
91 | </project>


--------------------------------------------------------------------------------
/src/common/DataStructureUtils.java:
--------------------------------------------------------------------------------
  1 | package common;
  2 | 
  3 | import io.word.Phrase;
  4 | 
  5 | import java.util.ArrayList;
  6 | import java.util.HashMap;
  7 | import java.util.HashSet;
  8 | import java.util.List;
  9 | 
 10 | /**
 11 |  * This class provides a set of utility method to turn one data structure to
 12 |  * another
 13 |  *
 14 |  */
 15 | public class DataStructureUtils {
 16 |     
 17 |     /**
 18 |      * This template method turns an array into a HashSet of the same type 
 19 |      */
 20 |     public static <T> HashSet<T> arrayToSet(T[] inputArray) {
 21 |         HashSet<T> result = new HashSet<T>();
 22 |         if (inputArray != null) {
 23 |             for (int i = 0; i < inputArray.length; i++) {
 24 |                 result.add(inputArray[i]);
 25 |             }
 26 |         }
 27 |         return result;
 28 |     }
 29 | 
 30 |     /**
 31 |      * This template method turns an array into a HashMap that maps an element
 32 |      * of the array to its index 
 33 |      */
 34 |     public static <T> HashMap<T, Integer> arrayToMap(T[] inputArray) {
 35 |         HashMap<T, Integer> result = new HashMap<T, Integer>();
 36 |         if (inputArray != null) {
 37 |             for (int i = 0; i < inputArray.length; i++) {
 38 |                 result.put(inputArray[i], i);
 39 |             }
 40 |         }
 41 |         return result;
 42 |     }
 43 | 
 44 |     /**
 45 |      * This template method turns an array into an (Array)List of the same type 
 46 |      */
 47 |     public static <T> ArrayList<T> arrayToList(T[] inputArray) {
 48 |         ArrayList<T> result = new ArrayList<T>();
 49 |         if (inputArray != null) {
 50 |             for (int i = 0; i < inputArray.length; i++) {
 51 |                 result.add(inputArray[i]);
 52 |             }
 53 |         }
 54 |         return result;
 55 |     }
 56 | 
 57 |     /*
 58 |      * The following set of methods turn a list into an array of the same type
 59 |      * The Java compiler cannot initialize an array without knowing the type of
 60 |      * the elements. Therefore, one cannot generalize with a template method
 61 |      */
 62 |     
 63 |     public static double[][] arrayListTo2dArray(List<double[]> list) {
 64 |         double[][] array = new double[list.size()][list.get(0).length];
 65 |         list.toArray(array);
 66 |         return array;
 67 |     }
 68 |     
 69 |     /**
 70 |      * This template method turns an array into a HashMap that maps an element
 71 |      * of the array to its index 
 72 |      */
 73 |     public static <T> HashMap<T, Integer> listToMap(List<T> inputArray) {
 74 |         HashMap<T, Integer> result = new HashMap<T, Integer>();
 75 |         if (inputArray != null) {
 76 |             for (int i = 0; i < inputArray.size(); i++) {
 77 |                 result.put(inputArray.get(i), i);
 78 |             }
 79 |         }
 80 |         return result;
 81 |     }
 82 | 
 83 |     public static String[] stringListToArray(List<String> list) {
 84 |         String[] array = new String[list.size()];
 85 |         list.toArray(array);
 86 |         return array;
 87 |     }
 88 | 
 89 |     public static Phrase[] phraseListToArray(List<Phrase> list) {
 90 |         Phrase[] array = new Phrase[list.size()];
 91 |         list.toArray(array);
 92 |         return array;
 93 |     }
 94 | 
 95 |     public static int[] intListToArray(List<Integer> list) {
 96 |         int[] array = new int[list.size()];
 97 |         int i = 0;
 98 |         for (Integer element : list) {
 99 |             array[i] = element;
100 |             i++;
101 |         }
102 |         return array;
103 |     }
104 | 
105 |     /**
106 |      * Search through an small int array for a given value
107 |      * @param array
108 |      * @param key
109 |      * @return the index of the first element to have a value equal to the key
110 |      */
111 |     public static int searchSmallIntArray(int[] array, int key) {
112 |         for (int i = 0; i < array.length; i++) {
113 |             if (array[i] == key)
114 |                 return i;
115 |         }
116 |         return -1;
117 |     }
118 |     
119 | }
120 | 


--------------------------------------------------------------------------------
/src/common/MathUtils.java:
--------------------------------------------------------------------------------
 1 | package common;
 2 | 
 3 | import java.util.Random;
 4 | 
 5 | /**
 6 |  * This class contains a set of utility method for simple maths
 7 |  * (maybe should be replaced with utility method for SimpleMatrix class
 8 |  *
 9 |  */
10 | public class MathUtils {
11 |     private static Random rand = new Random();
12 |     // TODO: use some linear algebra method
13 |     
14 |     /**
15 |      * Cosine of two vectors
16 |      * @param v1: 1st vector 
17 |      * @param v2: 2nd vector
18 |      * @return cosine value
19 |      */
20 |     public static double cosine(double[] v1, double[] v2) {
21 |         double length1 = length(v1);
22 |         double length2 = length(v2);
23 |         if (length1 == 0 || length2 == 0) return 0.0;
24 |         else return dot(v1, v2) / (length1 * length2);
25 |     }
26 | 
27 |     /**
28 |      * Length of a vector
29 |      * @param v: input vector
30 |      * @return length
31 |      */
32 |     public static double length(double[] v) {
33 |         double norm = dot(v, v);
34 |         return Math.sqrt(norm);
35 |     }
36 |     
37 |     /**
38 |      * Dot product of two vectors
39 |      * @param v1 first vector
40 |      * @param v2 second vector
41 |      * @return dot product
42 |      */
43 |     public static double dot(double[] v1, double[] v2) {
44 |         double result = 0;
45 |         for (int i = 0; i < v1.length; i++) {
46 |             result += v1[i] * v2[i];
47 |         }
48 |         return result;
49 |     }
50 | 
51 |     /**
52 |      * sigmoid function
53 |      * @param f input value
54 |      * @return sigmoid(f)
55 |      */
56 |     public static double sigmoid(double x) {
57 |         // TODO: understand why they turn the formula like this (e^x faster
58 |         // than e^-x ? Rounding error?)
59 |         return 1 - (double) (1.0 / (1.0 + Math.exp(x)));
60 |     }
61 |     
62 |     /**
63 |      * tanh function
64 |      */
65 |     public static double tanh(double x) {
66 |         return 1 - (double) (2.0 / (1.0 + Math.exp(2 * x)));
67 |     }
68 |     
69 |     public static boolean isSampled(long count, long totalCount, double frequencyThreshold) {
70 |         double randomThreshold = (double) (Math.sqrt(count
71 |                 / (frequencyThreshold * totalCount)) + 1)
72 |                 * (frequencyThreshold * totalCount) / count;
73 |         if (randomThreshold >= rand.nextFloat()) {
74 |             return true;
75 |         } else {
76 |             return false;
77 |         }
78 |     }
79 |     
80 |     public static double[] cosineDerivative(double[] x, double[] a) {
81 |         double lengthX = length(x);
82 |         double lengthA = length(a);
83 |         double dotP = dot(x, a);
84 |         double rToScaleA = 1 / (lengthX * lengthA);
85 |         double rToScaleX = dotP / (lengthA * lengthX * lengthX * lengthX);
86 |         double[] result = new double[x.length];
87 |         for (int i = 0; i < x.length; i++) {
88 |             result[i] = a[i] * rToScaleA - x[i] * rToScaleX;
89 |         }
90 |         return result;
91 |     }
92 | }
93 | 


--------------------------------------------------------------------------------
/src/common/MeanAveragePrecision.java:
--------------------------------------------------------------------------------
 1 | package common;
 2 | 
 3 | import space.SemanticSpace;
 4 | import java.util.ArrayList;
 5 | import java.util.Arrays;
 6 | import java.util.Comparator;
 7 | 
 8 | import common.IOUtils;
 9 | 
10 | public class MeanAveragePrecision {
11 |     String[][] wordPairs;
12 |     double[] golds;
13 |     
14 |     public MeanAveragePrecision(String dataset) {
15 |         readDataset(dataset);
16 |     }
17 |     
18 |     public MeanAveragePrecision(String[][] wordPairs, double[] golds) {
19 |         this.wordPairs = wordPairs;
20 |         this.golds = golds;
21 |     }
22 |     
23 |     public void readDataset(String dataset) {
24 |         ArrayList<String> data = IOUtils.readFile(dataset);
25 |         golds = new double[data.size()];
26 |         wordPairs = new String[data.size()][2];
27 |         for (int i = 0; i < data.size(); i++) {
28 |             String dataPiece = data.get(i);
29 |             String elements[] = dataPiece.split("\t");
30 |             wordPairs[i][0] = elements[0];
31 |             wordPairs[i][1] = elements[1];
32 |             golds[i] = Double.parseDouble(elements[2]);
33 |             //golds[i] = Double.parseDouble(elements[3]);
34 |         }
35 |     }
36 |     
37 |     public double evaluateMAP(SemanticSpace space) {
38 |         final double[] predicts = new double[golds.length];
39 |         for (int i = 0; i < golds.length; i++) {
40 |             predicts[i] = space.getSim(wordPairs[i][0], wordPairs[i][1])
41 |                           * space.getDirection(wordPairs[i][0], wordPairs[i][1]);
42 |         }
43 |         Integer[] idxs = new Integer[golds.length];
44 |         for(int i = 0; i < golds.length; i++) idxs[i] = i;
45 |         Arrays.sort(idxs, new Comparator<Integer>(){
46 |             public int compare(Integer o1, Integer o2){
47 |                 return Double.compare(predicts[o2], predicts[o1]);
48 |             }
49 |         });
50 |         double[] sorted_preds = new double[golds.length];
51 |         for(int i = 0; i < golds.length; i++) sorted_preds[i] = golds[idxs[i]];
52 |         
53 |         double map = computeMAP(sorted_preds);
54 |         return map;
55 |     }
56 |     
57 |     public double computeMAP(double[] sorted_preds) {
58 |         double ap = 0.0;
59 |         double retrievedCounter = 0;
60 |         double relevantCounter = 0;
61 | 
62 |         for (int i = 0; i < sorted_preds.length; i++) {
63 |             retrievedCounter++;
64 |             if (sorted_preds[i] == 1.0) {
65 |                 relevantCounter++;
66 |                 ap += relevantCounter / retrievedCounter;
67 |             }
68 |         }
69 |         ap /= relevantCounter;
70 |         return ap;
71 |     }
72 |     
73 |     
74 | }


--------------------------------------------------------------------------------
/src/common/SigmoidTable.java:
--------------------------------------------------------------------------------
 1 | package common;
 2 | 
 3 | /**
 4 |  * An instance of this class pre-computed values for the sigmoid function
 5 |  * Its main purpose to increase the speed of the program (or so people say :P)
 6 |  * since e^-x takes longer time then mult/add
 7 |  *
 8 |  */
 9 | public class SigmoidTable {
10 |     
11 |     // Default parameters for the table
12 |     public static final double DEFAULT_MAX_X              = 6;
13 |     public static final int   DEFAULT_SIGMOID_TABLE_SIZE = 10000000;
14 | 
15 |     /*
16 |      * This sigmoidTable holds the precomputed sigmoid values of variables in the range
17 |      * [-maxX, maxX]
18 |      * tableSize decides the interval between two consecutive values that we
19 |      * compute the sigmoid function for, i.e. the precision of the returned
20 |      * sigmoid values
21 |      */
22 |     private double[]           sigmoidTable;
23 |     private double             maxX;
24 |     private int                tableSize;
25 | 
26 | 
27 |     public SigmoidTable(int tableSize, double maxX) {
28 |         this.tableSize = tableSize;
29 |         this.maxX = maxX;
30 |         initTable();
31 |     }
32 | 
33 |     /**
34 |      * Default constructor
35 |      * Initialize with default values
36 |      */
37 |     public SigmoidTable() {
38 |         this(DEFAULT_SIGMOID_TABLE_SIZE, DEFAULT_MAX_X);
39 |     }
40 |     
41 |     /**
42 |      * Initialize the precomputed sigmoid table.
43 |      * The table consists of "tableSize" precomputed values for sigmoid 
44 |      * function for input values from -maxX to maxX (The difference between to
45 |      * consecutive input value would be: 2 * maxX / (tableSize - 1)
46 |      */
47 |     public void initTable() {
48 |         sigmoidTable = new double[tableSize];
49 |         double step = (2 * maxX) / (tableSize - 1);
50 |         for (int i = 0; i < tableSize - 1; i++) {
51 |             double x = -maxX + i * step;
52 |             sigmoidTable[i] = MathUtils.sigmoid(x);
53 |         }
54 |     }
55 | 
56 |     /**
57 |      * Get the sigmoid function for x from the pre-computed table
58 |      */
59 |     public double getSigmoid(double x) {
60 |         if (x > maxX)
61 |             return 1;
62 |         else if (x < -maxX)
63 |             return 0;
64 |         else {
65 |             int index = (int) Math.round((x + maxX) / (2 * maxX) * (tableSize - 1));
66 |             return sigmoidTable[index];
67 |         }
68 | //        double result = MathUtils.sigmoid(x);
69 | //        return result;
70 |     }
71 | 
72 | }
73 | 


--------------------------------------------------------------------------------
/src/common/TanhTable.java:
--------------------------------------------------------------------------------
 1 | package common;
 2 | 
 3 | /**
 4 |  * An instance of this class pre-computed values for the tanh function
 5 |  * Its main purpose to increase the speed of the program (or so people say :P)
 6 |  * since e^-x takes longer time then mult/add
 7 |  *
 8 |  */
 9 | public class TanhTable {
10 |     
11 |     // Default parameters for the table
12 |     public static final double DEFAULT_MAX_X              = 6;
13 |     public static final int   DEFAULT_TANH_TABLE_SIZE = 10000000;
14 | 
15 |     /*
16 |      * This tanhTable holds the precomputed tanh values of variables in the range
17 |      * [-maxX, maxX]
18 |      * tableSize decides the interval between two consecutive values that we
19 |      * compute the tanh function for, i.e. the precision of the returned
20 |      * tanh values
21 |      */
22 |     private double[]           tanhTable;
23 |     private double             maxX;
24 |     private int               tableSize;
25 | 
26 | 
27 |     public TanhTable(int tableSize, double maxX) {
28 |         this.tableSize = tableSize;
29 |         this.maxX = maxX;
30 |         initTable();
31 |     }
32 | 
33 |     /**
34 |      * Default constructor
35 |      * Initialize with default values
36 |      */
37 |     public TanhTable() {
38 |         this(DEFAULT_TANH_TABLE_SIZE, DEFAULT_MAX_X);
39 |     }
40 |     
41 |     /**
42 |      * Initialize the precomputed tanh table.
43 |      * The table consists of "tableSize" precomputed values for tanh 
44 |      * function for input values from -maxX to maxX (The difference between to
45 |      * consecutive input value would be: 2 * maxX / (tableSize - 1)
46 |      */
47 |     public void initTable() {
48 |         tanhTable = new double[tableSize];
49 |         double step = (2 * maxX) / (tableSize - 1);
50 |         for (int i = 0; i < tableSize - 1; i++) {
51 |             double x = -maxX + i * step;
52 |             tanhTable[i] = MathUtils.tanh(x);
53 |         }
54 |     }
55 | 
56 |     /**
57 |      * Get the tanh function for x from the pre-computed table
58 |      */
59 |     public double getTanh(double x) {
60 | //        if (x > 1000) {
61 | //            System.out.println("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
62 | //            System.out.println("x: " + x);
63 | //            return 1;
64 | //        } else if (x < -1000) {
65 | //            System.out.println("-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
66 | //            return -1;
67 | //        }
68 | //        return MathUtils.tanh(x);
69 |         if (x > maxX)
70 |             return 1;
71 |         else if (x < -maxX)
72 |             return -1;
73 |         else {
74 | //            int index = (int) Math.round((x + maxX) / (2 * maxX) * (tableSize - 1));
75 | //            return tanhTable[index];
76 |             return MathUtils.tanh(x);
77 |         }
78 | 
79 |     }
80 | 
81 | }
82 | 


--------------------------------------------------------------------------------
/src/common/WordForm.java:
--------------------------------------------------------------------------------
 1 | package common;
 2 | 
 3 | /**
 4 |  * Constant values for word format
 5 |  *
 6 |  */
 7 | public class WordForm {
 8 |     public static final int WORD = 1;
 9 |     public static final int LEMMA = 2;
10 |     public static final int WORD_POS = 3;
11 |     public static final int LEMMA_POS = 4;
12 | }
13 | 


--------------------------------------------------------------------------------
/src/common/correlation/AreaUnderCurve.java:
--------------------------------------------------------------------------------
 1 | package common.correlation;
 2 | 
 3 | import java.util.Arrays;
 4 | 
 5 | import common.exception.ValueException;
 6 | 
 7 | public class AreaUnderCurve {
 8 |     public static double computeAUC(double[] golds, double[] predicted) {
 9 |         int positive = 0;
10 |         for (double score: golds) {
11 |             if (score == 1) positive++;
12 |         }
13 |         int negative = golds.length - positive;
14 | 
15 |         int total_count = golds.length;
16 |         Point[] point_set = new Point[total_count];
17 |         for (int i = 0; i < golds.length; i++) {
18 |             if (!(golds[i]==1) && !(golds[i] == 0)) {
19 |                 throw new ValueException("For evaluating AUC, gold scores are required to be 0 or 1.");
20 |             }
21 |             point_set[i] = new Point(golds[i], predicted[i]);
22 |         }
23 | 
24 |         Arrays.sort(point_set);
25 | 
26 |         double xi = 1.0;
27 |         double yi = 1.0;
28 |         double xi_old = 1.0;
29 |         double true_positive = positive;
30 |         double false_positive = negative;
31 |         double auc = 0;
32 | 
33 |         for (int i = 0; i < total_count; i++) {
34 |             if (point_set[i].gold == 1) {
35 |                 true_positive -= 1;
36 |                 yi = true_positive / positive;
37 |             } else {
38 |                 false_positive -= 1;
39 |                 xi = false_positive / negative;
40 |                 auc += (xi_old - xi) * yi;
41 |                 xi_old = xi;
42 |             }
43 |         }
44 |         return auc;
45 |     }
46 |     
47 |     
48 |     static class Point implements Comparable<Point>{
49 |         double gold;
50 |         double score;
51 |         public Point(double gold, double score) {
52 |             this.gold = gold;
53 |             this.score = score;
54 |         }
55 |         @Override
56 |         public int compareTo(Point o) {
57 |             // TODO Auto-generated method stub
58 |             if (this.score > o.score) return 1;
59 |             if (this.score < o.score) return -1;
60 |             return 0;
61 |         }
62 |     }
63 | }
64 | 


--------------------------------------------------------------------------------
/src/common/correlation/MenCorrelation.java:
--------------------------------------------------------------------------------
  1 | package common.correlation;
  2 | 
  3 | import java.util.ArrayList;
  4 | 
  5 | import org.apache.commons.math3.stat.correlation.PearsonsCorrelation;
  6 | import org.apache.commons.math3.stat.correlation.SpearmansCorrelation;
  7 | 
  8 | import space.SemanticSpace;
  9 | 
 10 | import common.IOUtils;
 11 | 
 12 | 
 13 | /**
 14 |  * This class can be used to evaluate a word vector space by computing the
 15 |  * correlation between the cosine of the words' vectors and the gold-standard
 16 |  * similarities of them (typically based on human judgment)
 17 |  * The name is kind of misleading since we can use other dataset than MEN
 18 |  *
 19 |  */
 20 | 
 21 | public class MenCorrelation {
 22 | 	String[][] wordPairs;
 23 | 	double[] golds;
 24 | 	PearsonsCorrelation pearson;
 25 | 	SpearmansCorrelation spearman;
 26 | 	String name = "";
 27 | 	
 28 | 	/**
 29 | 	 * Initialize with the path to the dataset file
 30 | 	 * @param dataset
 31 | 	 */
 32 | 	public MenCorrelation(String dataset) {
 33 | 	    pearson = new PearsonsCorrelation();
 34 | 	    spearman = new SpearmansCorrelation();
 35 | 		readDataset(dataset);
 36 | 	}
 37 | 	
 38 | 	
 39 | 	public MenCorrelation(String[][] wordPairs, double[] golds) {
 40 | 	    pearson = new PearsonsCorrelation();
 41 |         spearman = new SpearmansCorrelation();
 42 |         this.wordPairs = wordPairs;
 43 |         this.golds = golds;
 44 | 	}
 45 | 	
 46 | 
 47 |     /**
 48 |      * Read the word pairs and the gold standard from the dataset
 49 |      * @param dataset
 50 |      */
 51 | 	public void readDataset(String dataset) {
 52 | 		ArrayList<String> data = IOUtils.readFile(dataset);
 53 | 		golds = new double[data.size()];
 54 | 		wordPairs = new String[data.size()][2];
 55 | 		for (int i = 0; i < data.size(); i++) {
 56 | 			String dataPiece = data.get(i);
 57 | 			String elements[] = dataPiece.split("\t");
 58 | 			wordPairs[i][0] = elements[0];
 59 | 			wordPairs[i][1] = elements[1];
 60 | 			golds[i] = Double.parseDouble(elements[2]);
 61 | 			//golds[i] = Double.parseDouble(elements[3]);
 62 | 		}
 63 | 	}
 64 | 	
 65 | 	/**
 66 | 	 * Compute the pearson correlation of the predicted values against the gold
 67 | 	 * standard
 68 | 	 * @param predicts
 69 | 	 * @return
 70 | 	 */
 71 | 	public double pearsonCorrelation(double[] predicts) {
 72 | 	    return pearson.correlation(golds, predicts);
 73 | 	}
 74 | 	
 75 | 	/**
 76 | 	 * Compute the spearman correlation of the predicted values against the gold
 77 |      * standard 
 78 | 	 * @param predicts
 79 | 	 * @return
 80 | 	 */
 81 | 	public double spearmanCorrelation(double[] predicts) {
 82 |         return spearman.correlation(golds, predicts);
 83 |     }
 84 | 	
 85 | 	
 86 | 	/**
 87 | 	 * Evaluate the space using the pearson correlation
 88 | 	 * @param space
 89 | 	 * @return
 90 | 	 */
 91 | 	public double evaluateSpacePearson(SemanticSpace space) {
 92 | 	    double[] predicts = new double[golds.length];
 93 | 	    for (int i = 0; i < golds.length; i++) {
 94 | 	        predicts[i] = space.getSim(wordPairs[i][0], wordPairs[i][1]);
 95 | //	        System.out.println(wordPairs[i][0]);
 96 | //	        System.out.println(wordPairs[i][1]);
 97 | 	    }
 98 | 	    return pearson.correlation(golds, predicts);
 99 | 	}
100 | 	
101 | 	
102 | 	/**
103 |      * Evaluate the space using the spearman correlation
104 |      * @param space
105 |      * @return
106 |      */
107 | 	public double evaluateSpaceSpearman(SemanticSpace space) {
108 |         double[] predicts = new double[golds.length];
109 |         for (int i = 0; i < golds.length; i++) {
110 |             predicts[i] = space.getSim(wordPairs[i][0], wordPairs[i][1]);
111 |         }
112 |         return spearman.correlation(golds, predicts);
113 |     }
114 | 	
115 | 	
116 | 	/**
117 | 	 * @return the gold standard (human's judgment on the similarities)
118 | 	 */
119 | 	public double[] getGolds() {
120 | 	    return golds;
121 | 	}
122 | 	
123 | 	public void setName(String name) {
124 | 	    this.name = name;
125 | 	}
126 | 	
127 | 	public String getName() {
128 | 	    return this.name;
129 | 	}
130 | 	
131 | 	public String[][] getWordPairs() {
132 | 	    return this.wordPairs;
133 | 	}
134 | 	
135 | 	public static void main(String[] args) {
136 | 	}
137 | }
138 | 


--------------------------------------------------------------------------------
/src/common/exception/OutOfVocabularyException.java:
--------------------------------------------------------------------------------
 1 | package common.exception;
 2 | 
 3 | public class OutOfVocabularyException extends RuntimeException {
 4 | 
 5 |     /**
 6 |      * 
 7 |      */
 8 |     private static final long serialVersionUID = 1L;
 9 |     
10 |     public OutOfVocabularyException(String msg) {
11 |         super(msg);
12 |     }
13 | 
14 | }
15 | 


--------------------------------------------------------------------------------
/src/common/exception/ValueException.java:
--------------------------------------------------------------------------------
 1 | package common.exception;
 2 | 
 3 | public class ValueException extends RuntimeException {
 4 | 
 5 |     /**
 6 |      * 
 7 |      */
 8 |     private static final long serialVersionUID = 1L;
 9 |     public ValueException(String msg) {
10 |         super(msg);
11 |     }
12 | }
13 | 


--------------------------------------------------------------------------------
/src/common/wordnet/LexicalHypernym.java:
--------------------------------------------------------------------------------
  1 | package common.wordnet;
  2 | 
  3 | import common.IOUtils;
  4 | import vocab.Vocab;
  5 | 
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.HashMap;
  9 | import java.util.HashSet;
 10 | import java.util.List;
 11 | import java.util.Random;
 12 | import java.util.Set;
 13 | import com.google.common.collect.Sets;
 14 | 
 15 | 
 16 | public class LexicalHypernym {
 17 |     HashMap<Integer, HashSet<Integer>> features;
 18 |     HashMap<Integer, HashSet<Integer>> cohyponyms;
 19 |     HashMap<Integer, HashSet<Integer>> hypernyms;
 20 |     //Vocab vocab;
 21 |     Random random;
 22 |     
 23 |     public LexicalHypernym(String hypeFile, String cohypoFile, String featureFile, Vocab vocab) throws IOException {
 24 |         cohyponyms = readLexical(cohypoFile, vocab);
 25 |         hypernyms = readLexical(hypeFile, vocab);
 26 |         features = readFeatures(featureFile, vocab);
 27 |         //this.vocab = vocab;
 28 |         random = new Random();
 29 |     }
 30 |        
 31 |     public HashMap<Integer, HashSet<Integer>> readLexical(String inputFile, Vocab vocab) throws IOException {
 32 |         HashMap<Integer, HashSet<Integer>> lexical = new HashMap<Integer, HashSet<Integer>>();
 33 |         ArrayList<String> data = IOUtils.readFile(inputFile);
 34 |         for (int i = 0; i < data.size(); i++) {
 35 |             String dataPiece = data.get(i);
 36 |             String elements[] = dataPiece.split("\t");
 37 |             String key = elements[0];
 38 |             int keyIndex = vocab.getWordIndex(key);
 39 |             if (keyIndex == -1) continue;
 40 |             HashSet<Integer> value = new HashSet<Integer>();
 41 |             for (int j = 1; j < elements.length; j++ ) {
 42 |                 int wordIndex = vocab.getWordIndex(elements[j]);;
 43 |                 if (wordIndex == -1) continue;
 44 |                 //value.add(elements[j]);
 45 |                 value.add(wordIndex);
 46 |             }
 47 |             lexical.put(keyIndex, value);
 48 |         }
 49 |         return lexical;
 50 |     }
 51 |     
 52 |     public HashMap<Integer, HashSet<Integer>> readFeatures(String inputFile, Vocab vocab) throws IOException {
 53 |         HashMap<Integer, HashSet<Integer>> features = new HashMap<Integer, HashSet<Integer>>();
 54 |         ArrayList<String> data = IOUtils.readFile(inputFile);
 55 |         for (int i = 0; i < data.size(); i++) {
 56 |             String dataPiece = data.get(i);
 57 |             String elements[] = dataPiece.split("\t");
 58 |             String key = elements[0];
 59 |             int keyIndex = vocab.getWordIndex(key);
 60 |             if (keyIndex == -1) continue;
 61 |             HashSet<Integer> value = new HashSet<Integer>();
 62 |             for (int j = 1; j < elements.length; j++ ) {
 63 |                 int wordIndex = -1;
 64 |                 wordIndex = vocab.getWordIndex(elements[j]);
 65 |                 if (wordIndex == -1) continue;
 66 |                 value.add(wordIndex);
 67 |             }
 68 |             features.put(keyIndex, value);
 69 |         }
 70 |         return features;
 71 |     }
 72 |     /*
 73 |     public HashSet<String> intersectionAnt(String target, String feature) {
 74 |         HashSet<String> intersection = new HashSet<String>();
 75 |         if (antonyms.containsKey(target) && features.containsKey(feature)) {
 76 |             HashSet<String> setTargets = antonyms.get(target);
 77 |             HashSet<String> setFeatures = features.get(feature);
 78 |             intersection = getIntersection(setTargets, setFeatures);                
 79 |         }
 80 |         return intersection;
 81 |     }
 82 |     
 83 |     public HashSet<String> intersectionSyn(String target, String feature) {
 84 |         HashSet<String> intersection = new HashSet<String>();
 85 |         if (synonyms.containsKey(target) && features.containsKey(feature)) {
 86 |             HashSet<String> setTargets = synonyms.get(target);
 87 |             HashSet<String> setFeatures = features.get(feature);
 88 |             intersection = getIntersection(setTargets, setFeatures);
 89 |         }                
 90 |         return intersection;
 91 |     }*/
 92 |     
 93 |     public Set<Integer> intersectionHype(Integer targetIndex, Integer featureIndex) {
 94 |         Set<Integer> intersection = new HashSet<Integer>();
 95 |         if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
 96 |             Set<Integer> setHypes = hypernyms.get(targetIndex);
 97 |             Set<Integer> setFeatures = features.get(featureIndex);
 98 |             intersection = Sets.intersection(setHypes, setFeatures);
 99 |         }                
100 |         return intersection;
101 |     }
102 |     
103 |     public Set<Integer> intersectionCohypo(Integer targetIndex, Integer featureIndex) {
104 |         Set<Integer> intersection = new HashSet<Integer>();
105 |         if (cohyponyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
106 |             Set<Integer> setCohypos = cohyponyms.get(targetIndex);
107 |             Set<Integer> setFeatures = features.get(featureIndex);
108 |             intersection = Sets.intersection(setCohypos, setFeatures);
109 |         }                
110 |         return intersection;
111 |     }
112 |     
113 |     public HashSet<Integer> getIntersection(HashSet<Integer> hs1, HashSet<Integer> hs2) {
114 |         HashSet<Integer> intersection = new HashSet<Integer>();
115 |         for (Integer element: hs1) {
116 |             if (hs2.contains(element)) intersection.add(element);
117 |         }
118 |         return intersection;
119 |     }
120 |     
121 |     public int getRandom(Set<Integer> antonyms) {
122 |         List<Integer> listAnts = new ArrayList<Integer>(antonyms);
123 |         int id = random.nextInt(listAnts.size());
124 |         return listAnts.get(id);
125 |     }
126 |     
127 |     public boolean hasHypeCohypo(Integer targetIndex){
128 |         return hypernyms.containsKey(targetIndex) || cohyponyms.containsKey(targetIndex);
129 |     }
130 |     
131 |     public boolean hasHypernyms(Integer targetIndex){
132 |         return hypernyms.containsKey(targetIndex);
133 |     }
134 |     
135 |     public boolean hasCohyponyms(Integer targetIndex){
136 |         return cohyponyms.containsKey(targetIndex);
137 |     }
138 |        
139 |     public boolean hasFeature(Integer featureIndex) {
140 |         return features.containsKey(featureIndex);
141 |     }
142 |     
143 |     public HashSet<Integer> getFeatures(Integer featureIndex) {
144 |         return features.get(featureIndex);
145 |     }
146 |     
147 |     public HashSet<Integer> getHypernyms(Integer targetIndex) {
148 |         return hypernyms.get(targetIndex);
149 |     }
150 |     
151 |     public HashSet<Integer> getCohyponyms(Integer targetIndex) {
152 |         return cohyponyms.get(targetIndex);
153 |     }
154 | 
155 | }
156 | 


--------------------------------------------------------------------------------
/src/common/wordnet/LexicalResource.java:
--------------------------------------------------------------------------------
  1 | package common.wordnet;
  2 | 
  3 | import common.IOUtils;
  4 | import vocab.Vocab;
  5 | 
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.HashMap;
  9 | import java.util.HashSet;
 10 | import java.util.List;
 11 | import java.util.Random;
 12 | import java.util.Set;
 13 | import com.google.common.collect.Sets;
 14 | 
 15 | 
 16 | public class LexicalResource {
 17 |     HashMap<Integer, HashSet<Integer>> antonyms;
 18 |     HashMap<Integer, HashSet<Integer>> synonyms;
 19 |     HashMap<Integer, HashSet<Integer>> features;
 20 |     //HashMap<Integer, HashSet<Integer>> cohyponyms;
 21 |     //HashMap<Integer, HashSet<Integer>> hypernyms;
 22 |     //Vocab vocab;
 23 |     Random random;
 24 |     
 25 |     public LexicalResource(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException {
 26 |         antonyms = readLexical(antFile, vocab);
 27 |         synonyms = readLexical(synFile, vocab);
 28 |         //cohyponyms = readLexical(cohypoFile, vocab);
 29 |         //hypernyms = readLexical(hypeFile, vocab);
 30 |         features = readFeatures(featureFile, vocab);
 31 |         //this.vocab = vocab;
 32 |         random = new Random();
 33 |     }
 34 |        
 35 |     public HashMap<Integer, HashSet<Integer>> readLexical(String inputFile, Vocab vocab) throws IOException {
 36 |         HashMap<Integer, HashSet<Integer>> lexical = new HashMap<Integer, HashSet<Integer>>();
 37 |         ArrayList<String> data = IOUtils.readFile(inputFile);
 38 |         for (int i = 0; i < data.size(); i++) {
 39 |             String dataPiece = data.get(i);
 40 |             String elements[] = dataPiece.split("\t");
 41 |             String key = elements[0];
 42 |             int keyIndex = vocab.getWordIndex(key);
 43 |             if (keyIndex == -1) continue;
 44 |             HashSet<Integer> value = new HashSet<Integer>();
 45 |             for (int j = 1; j < elements.length; j++ ) {
 46 |                 int wordIndex = vocab.getWordIndex(elements[j]);;
 47 |                 if (wordIndex == -1) continue;
 48 |                 //value.add(elements[j]);
 49 |                 value.add(wordIndex);
 50 |             }
 51 |             lexical.put(keyIndex, value);
 52 |         }
 53 |         return lexical;
 54 |     }
 55 |     
 56 |     public HashMap<Integer, HashSet<Integer>> readFeatures(String inputFile, Vocab vocab) throws IOException {
 57 |         HashMap<Integer, HashSet<Integer>> features = new HashMap<Integer, HashSet<Integer>>();
 58 |         ArrayList<String> data = IOUtils.readFile(inputFile);
 59 |         for (int i = 0; i < data.size(); i++) {
 60 |             String dataPiece = data.get(i);
 61 |             String elements[] = dataPiece.split("\t");
 62 |             String key = elements[0];
 63 |             int keyIndex = vocab.getWordIndex(key);
 64 |             if (keyIndex == -1) continue;
 65 |             HashSet<Integer> value = new HashSet<Integer>();
 66 |             for (int j = 1; j < elements.length; j++ ) {
 67 |                 int wordIndex = -1;
 68 |                 wordIndex = vocab.getWordIndex(elements[j]);
 69 |                 if (wordIndex == -1) continue;
 70 |                 value.add(wordIndex);
 71 |             }
 72 |             features.put(keyIndex, value);
 73 |         }
 74 |         return features;
 75 |     }
 76 |     /*
 77 |     public HashSet<String> intersectionAnt(String target, String feature) {
 78 |         HashSet<String> intersection = new HashSet<String>();
 79 |         if (antonyms.containsKey(target) && features.containsKey(feature)) {
 80 |             HashSet<String> setTargets = antonyms.get(target);
 81 |             HashSet<String> setFeatures = features.get(feature);
 82 |             intersection = getIntersection(setTargets, setFeatures);                
 83 |         }
 84 |         return intersection;
 85 |     }
 86 |     
 87 |     public HashSet<String> intersectionSyn(String target, String feature) {
 88 |         HashSet<String> intersection = new HashSet<String>();
 89 |         if (synonyms.containsKey(target) && features.containsKey(feature)) {
 90 |             HashSet<String> setTargets = synonyms.get(target);
 91 |             HashSet<String> setFeatures = features.get(feature);
 92 |             intersection = getIntersection(setTargets, setFeatures);
 93 |         }                
 94 |         return intersection;
 95 |     }*/
 96 |     
 97 |     public Set<Integer> intersectionAnt(Integer targetIndex, Integer featureIndex) {
 98 |         Set<Integer> intersection = new HashSet<Integer>();
 99 |         if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
100 |             Set<Integer> setTargets = antonyms.get(targetIndex);
101 |             Set<Integer> setFeatures = features.get(featureIndex);
102 |             intersection = Sets.intersection(setTargets, setFeatures);                
103 |         }
104 |         return intersection;
105 |     }
106 |     
107 |     public Set<Integer> intersectionSyn(Integer targetIndex, Integer featureIndex) {
108 |         Set<Integer> intersection = new HashSet<Integer>();
109 |         if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
110 |             Set<Integer> setTargets = synonyms.get(targetIndex);
111 |             Set<Integer> setFeatures = features.get(featureIndex);
112 |             intersection = Sets.intersection(setTargets, setFeatures);
113 |         }                
114 |         return intersection;
115 |     }
116 |     
117 | //    public Set<Integer> intersectionHype(Integer targetIndex, Integer featureIndex) {
118 | //        Set<Integer> intersection = new HashSet<Integer>();
119 | //        if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
120 | //            Set<Integer> setHypes = hypernyms.get(targetIndex);
121 | //            Set<Integer> setFeatures = features.get(featureIndex);
122 | //            intersection = Sets.intersection(setHypes, setFeatures);
123 | //        }                
124 | //        return intersection;
125 | //    }
126 | //    
127 | //    public Set<Integer> intersectionCohypo(Integer targetIndex, Integer featureIndex) {
128 | //        Set<Integer> intersection = new HashSet<Integer>();
129 | //        if (cohyponyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
130 | //            Set<Integer> setCohypos = cohyponyms.get(targetIndex);
131 | //            Set<Integer> setFeatures = features.get(featureIndex);
132 | //            intersection = Sets.intersection(setCohypos, setFeatures);
133 | //        }                
134 | //        return intersection;
135 | //    }
136 |     
137 |     public HashSet<Integer> getIntersection(HashSet<Integer> hs1, HashSet<Integer> hs2) {
138 |         HashSet<Integer> intersection = new HashSet<Integer>();
139 |         for (Integer element: hs1) {
140 |             if (hs2.contains(element)) intersection.add(element);
141 |         }
142 |         return intersection;
143 |     }
144 |     
145 |     public int getRandom(Set<Integer> antonyms) {
146 |         List<Integer> listAnts = new ArrayList<Integer>(antonyms);
147 |         int id = random.nextInt(listAnts.size());
148 |         return listAnts.get(id);
149 |     }
150 |     
151 | //    public boolean hasHypeCohypo(Integer targetIndex){
152 | //        return hypernyms.containsKey(targetIndex) || cohyponyms.containsKey(targetIndex);
153 | //    }
154 | //    
155 | //    public boolean hasHypernyms(Integer targetIndex){
156 | //        return hypernyms.containsKey(targetIndex);
157 | //    }
158 | //    
159 | //    public boolean hasCohyponyms(Integer targetIndex){
160 | //        return cohyponyms.containsKey(targetIndex);
161 | //    }
162 |     
163 |     public boolean hasTarget(Integer targetIndex) {
164 |         return antonyms.containsKey(targetIndex) || synonyms.containsKey(targetIndex);
165 |     }
166 |     
167 |     public boolean hasAntonyms(Integer targetIndex) {
168 |         return antonyms.containsKey(targetIndex);
169 |     }
170 |     
171 |     public boolean hasSynonyms(Integer targetIndex) {
172 |         return synonyms.containsKey(targetIndex);
173 |     }
174 |     
175 |     public boolean hasFeature(Integer featureIndex) {
176 |         return features.containsKey(featureIndex);
177 |     }
178 |     
179 |     public HashSet<Integer> getAntonyms(Integer targetIndex) {
180 |         return antonyms.get(targetIndex);
181 |     }
182 |     
183 |     public HashSet<Integer> getSynonyms(Integer targetIndex) {
184 |         return synonyms.get(targetIndex);
185 |     }
186 |     
187 |     public HashSet<Integer> getFeatures(Integer featureIndex) {
188 |         return features.get(featureIndex);
189 |     }
190 |     
191 | //    public HashSet<Integer> getHypernyms(Integer targetIndex) {
192 | //        return hypernyms.get(targetIndex);
193 | //    }
194 | //    
195 | //    public HashSet<Integer> getCohyponyms(Integer targetIndex) {
196 | //        return cohyponyms.get(targetIndex);
197 | //    }
198 | 
199 | }
200 | 


--------------------------------------------------------------------------------
/src/common/wordnet/LexicalResourceAdj.java:
--------------------------------------------------------------------------------
  1 | package common.wordnet;
  2 | 
  3 | import common.IOUtils;
  4 | import vocab.Vocab;
  5 | 
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.HashMap;
  9 | import java.util.HashSet;
 10 | import java.util.List;
 11 | import java.util.Random;
 12 | import java.util.Set;
 13 | import com.google.common.collect.Sets;
 14 | 
 15 | 
 16 | public class LexicalResourceAdj {
 17 |     HashMap<Integer, HashSet<Integer>> antonyms;
 18 |     HashMap<Integer, HashSet<Integer>> synonyms;
 19 |     HashMap<Integer, HashSet<Integer>> features;
 20 |     Random random;
 21 |     
 22 |     public LexicalResourceAdj(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException {
 23 |         features = readFeatures(featureFile, vocab);
 24 |         antonyms = readLexical(antFile, vocab);
 25 |         synonyms = readLexical(synFile, vocab);
 26 |         //this.vocab = vocab;
 27 |         //System.out.println("The number of antonyms: " + antonyms.size());
 28 |         //System.out.println("The number of synonyms: " + synonyms.size());
 29 |         //System.out.println("The number of contexts: " + features.size());
 30 |         random = new Random();
 31 |     }
 32 |        
 33 |     public HashMap<Integer, HashSet<Integer>> readLexical(String inputFile, Vocab vocab) throws IOException {
 34 |         HashMap<Integer, HashSet<Integer>> lexical = new HashMap<Integer, HashSet<Integer>>();
 35 |         ArrayList<String> data = IOUtils.readFile(inputFile);
 36 |         for (int i = 0; i < data.size(); i++) {
 37 |             String dataPiece = data.get(i);
 38 |             String elements[] = dataPiece.split("\t");
 39 |             String key = elements[0];
 40 |             int keyIndex = vocab.getWordIndex(key);
 41 |             if (keyIndex == -1) continue;
 42 |             HashSet<Integer> value = new HashSet<Integer>();
 43 |             for (int j = 1; j < elements.length; j++ ) {
 44 |                 int wordIndex = vocab.getWordIndex(elements[j]);;
 45 |                 if (wordIndex == -1) continue;
 46 |                 //value.add(elements[j]);
 47 |                 value.add(wordIndex);
 48 |             }
 49 |             lexical.put(keyIndex, value);
 50 |             //random antonym
 51 |             //List<Integer> listAnts = new ArrayList<Integer>(value);
 52 |             //int id = random.nextInt(listAnts.size());
 53 |             //antRandom.put(keyIndex, listAnts.get(id));
 54 |         }
 55 |         return lexical;
 56 |     }
 57 |     
 58 |     public HashMap<Integer, HashSet<Integer>> readFeatures(String inputFile, Vocab vocab) throws IOException {
 59 |         HashMap<Integer, HashSet<Integer>> features = new HashMap<Integer, HashSet<Integer>>();
 60 |         ArrayList<String> data = IOUtils.readFile(inputFile);
 61 |         for (int i = 0; i < data.size(); i++) {
 62 |             String dataPiece = data.get(i);
 63 |             String elements[] = dataPiece.split("\t");
 64 |             String key = elements[0];
 65 |             int keyIndex = vocab.getWordIndex(key);
 66 |             if (keyIndex == -1) continue;
 67 |             HashSet<Integer> value = new HashSet<Integer>();
 68 |             for (int j = 1; j < elements.length; j++ ) {
 69 |                 int wordIndex = -1;
 70 |                 wordIndex = vocab.getWordIndex(elements[j]);
 71 |                 if (wordIndex == -1) continue;
 72 |                 value.add(wordIndex);
 73 |             }
 74 |             features.put(keyIndex, value);
 75 |         }
 76 |         return features;
 77 |     }
 78 |     /*
 79 |     public HashSet<String> intersectionAnt(String target, String feature) {
 80 |         HashSet<String> intersection = new HashSet<String>();
 81 |         if (antonyms.containsKey(target) && features.containsKey(feature)) {
 82 |             HashSet<String> setTargets = antonyms.get(target);
 83 |             HashSet<String> setFeatures = features.get(feature);
 84 |             intersection = getIntersection(setTargets, setFeatures);                
 85 |         }
 86 |         return intersection;
 87 |     }
 88 |     
 89 |     public HashSet<String> intersectionSyn(String target, String feature) {
 90 |         HashSet<String> intersection = new HashSet<String>();
 91 |         if (synonyms.containsKey(target) && features.containsKey(feature)) {
 92 |             HashSet<String> setTargets = synonyms.get(target);
 93 |             HashSet<String> setFeatures = features.get(feature);
 94 |             intersection = getIntersection(setTargets, setFeatures);
 95 |         }                
 96 |         return intersection;
 97 |     }*/
 98 |     
 99 |     public Set<Integer> intersectionAnt(Integer targetIndex, Integer featureIndex) {
100 |         Set<Integer> intersection = new HashSet<Integer>();
101 |         if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
102 |             Set<Integer> setTargets = antonyms.get(targetIndex);
103 |             Set<Integer> setFeatures = features.get(featureIndex);
104 |             intersection = Sets.intersection(setTargets, setFeatures);                
105 |         }
106 |         return intersection;
107 |     }
108 |     
109 |     public Set<Integer> intersectionSyn(Integer targetIndex, Integer featureIndex) {
110 |         Set<Integer> intersection = new HashSet<Integer>();
111 |         if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
112 |             Set<Integer> setTargets = synonyms.get(targetIndex);
113 |             Set<Integer> setFeatures = features.get(featureIndex);
114 |             intersection = Sets.intersection(setTargets, setFeatures);
115 |         }                
116 |         return intersection;
117 |     }
118 |     
119 |     public HashSet<Integer> getIntersection(HashSet<Integer> hs1, HashSet<Integer> hs2) {
120 |         HashSet<Integer> intersection = new HashSet<Integer>();
121 |         for (Integer element: hs1) {
122 |             if (hs2.contains(element)) intersection.add(element);
123 |         }
124 |         return intersection;
125 |     }
126 |     
127 |     public int getRandom(Set<Integer> antonyms) {
128 |         List<Integer> listAnts = new ArrayList<Integer>(antonyms);
129 |         int id = random.nextInt(listAnts.size());
130 |         return listAnts.get(id);
131 |     }
132 |     
133 |     public boolean hasTarget(Integer targetIndex) {
134 |         return antonyms.containsKey(targetIndex) || synonyms.containsKey(targetIndex);
135 |     }
136 |     
137 |     public boolean hasAntonyms(Integer targetIndex) {
138 |         return antonyms.containsKey(targetIndex);
139 |     }
140 |     
141 |     public boolean hasSynonyms(Integer targetIndex) {
142 |         return synonyms.containsKey(targetIndex);
143 |     }
144 |     
145 |     public boolean hasFeature(Integer featureIndex) {
146 |         return features.containsKey(featureIndex);
147 |     }
148 |     
149 |     public HashSet<Integer> getAntonyms(Integer targetIndex) {
150 |         return antonyms.get(targetIndex);
151 |     }
152 |     
153 |     public HashSet<Integer> getSynonyms(Integer targetIndex) {
154 |         return synonyms.get(targetIndex);
155 |     }
156 |     
157 |     public HashSet<Integer> getFeatures(Integer featureIndex) {
158 |         return features.get(featureIndex);
159 |     }
160 | 
161 | }
162 | 


--------------------------------------------------------------------------------
/src/common/wordnet/LexicalResourceNoun.java:
--------------------------------------------------------------------------------
  1 | package common.wordnet;
  2 | 
  3 | import common.IOUtils;
  4 | import vocab.Vocab;
  5 | 
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.HashMap;
  9 | import java.util.HashSet;
 10 | import java.util.List;
 11 | import java.util.Random;
 12 | import java.util.Set;
 13 | import com.google.common.collect.Sets;
 14 | 
 15 | 
 16 | public class LexicalResourceNoun {
 17 |     HashMap<Integer, HashSet<Integer>> antonyms;
 18 |     HashMap<Integer, HashSet<Integer>> synonyms;
 19 |     HashMap<Integer, HashSet<Integer>> features;
 20 |     //HashMap<Integer, HashSet<Integer>> cohyponyms;
 21 |     //HashMap<Integer, HashSet<Integer>> hypernyms;
 22 |     //Vocab vocab;
 23 |     Random random;
 24 |     
 25 |     public LexicalResourceNoun(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException {
 26 |         antonyms = readLexical(antFile, vocab);
 27 |         synonyms = readLexical(synFile, vocab);
 28 |         //cohyponyms = readLexical(cohypoFile, vocab);
 29 |         //hypernyms = readLexical(hypeFile, vocab);
 30 |         features = readFeatures(featureFile, vocab);
 31 |         //this.vocab = vocab;
 32 |         random = new Random();
 33 |     }
 34 |        
 35 |     public HashMap<Integer, HashSet<Integer>> readLexical(String inputFile, Vocab vocab) throws IOException {
 36 |         HashMap<Integer, HashSet<Integer>> lexical = new HashMap<Integer, HashSet<Integer>>();
 37 |         ArrayList<String> data = IOUtils.readFile(inputFile);
 38 |         for (int i = 0; i < data.size(); i++) {
 39 |             String dataPiece = data.get(i);
 40 |             String elements[] = dataPiece.split("\t");
 41 |             String key = elements[0];
 42 |             int keyIndex = vocab.getWordIndex(key);
 43 |             if (keyIndex == -1) continue;
 44 |             HashSet<Integer> value = new HashSet<Integer>();
 45 |             for (int j = 1; j < elements.length; j++ ) {
 46 |                 int wordIndex = vocab.getWordIndex(elements[j]);;
 47 |                 if (wordIndex == -1) continue;
 48 |                 //value.add(elements[j]);
 49 |                 value.add(wordIndex);
 50 |             }
 51 |             lexical.put(keyIndex, value);
 52 |         }
 53 |         return lexical;
 54 |     }
 55 |     
 56 |     public HashMap<Integer, HashSet<Integer>> readFeatures(String inputFile, Vocab vocab) throws IOException {
 57 |         HashMap<Integer, HashSet<Integer>> features = new HashMap<Integer, HashSet<Integer>>();
 58 |         ArrayList<String> data = IOUtils.readFile(inputFile);
 59 |         for (int i = 0; i < data.size(); i++) {
 60 |             String dataPiece = data.get(i);
 61 |             String elements[] = dataPiece.split("\t");
 62 |             String key = elements[0];
 63 |             int keyIndex = vocab.getWordIndex(key);
 64 |             if (keyIndex == -1) continue;
 65 |             HashSet<Integer> value = new HashSet<Integer>();
 66 |             for (int j = 1; j < elements.length; j++ ) {
 67 |                 int wordIndex = -1;
 68 |                 wordIndex = vocab.getWordIndex(elements[j]);
 69 |                 if (wordIndex == -1) continue;
 70 |                 value.add(wordIndex);
 71 |             }
 72 |             features.put(keyIndex, value);
 73 |         }
 74 |         return features;
 75 |     }
 76 |     /*
 77 |     public HashSet<String> intersectionAnt(String target, String feature) {
 78 |         HashSet<String> intersection = new HashSet<String>();
 79 |         if (antonyms.containsKey(target) && features.containsKey(feature)) {
 80 |             HashSet<String> setTargets = antonyms.get(target);
 81 |             HashSet<String> setFeatures = features.get(feature);
 82 |             intersection = getIntersection(setTargets, setFeatures);                
 83 |         }
 84 |         return intersection;
 85 |     }
 86 |     
 87 |     public HashSet<String> intersectionSyn(String target, String feature) {
 88 |         HashSet<String> intersection = new HashSet<String>();
 89 |         if (synonyms.containsKey(target) && features.containsKey(feature)) {
 90 |             HashSet<String> setTargets = synonyms.get(target);
 91 |             HashSet<String> setFeatures = features.get(feature);
 92 |             intersection = getIntersection(setTargets, setFeatures);
 93 |         }                
 94 |         return intersection;
 95 |     }*/
 96 |     
 97 |     public Set<Integer> intersectionAnt(Integer targetIndex, Integer featureIndex) {
 98 |         Set<Integer> intersection = new HashSet<Integer>();
 99 |         if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
100 |             Set<Integer> setTargets = antonyms.get(targetIndex);
101 |             Set<Integer> setFeatures = features.get(featureIndex);
102 |             intersection = Sets.intersection(setTargets, setFeatures);                
103 |         }
104 |         return intersection;
105 |     }
106 |     
107 |     public Set<Integer> intersectionSyn(Integer targetIndex, Integer featureIndex) {
108 |         Set<Integer> intersection = new HashSet<Integer>();
109 |         if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
110 |             Set<Integer> setTargets = synonyms.get(targetIndex);
111 |             Set<Integer> setFeatures = features.get(featureIndex);
112 |             intersection = Sets.intersection(setTargets, setFeatures);
113 |         }                
114 |         return intersection;
115 |     }
116 |     
117 | //    public Set<Integer> intersectionHype(Integer targetIndex, Integer featureIndex) {
118 | //        Set<Integer> intersection = new HashSet<Integer>();
119 | //        if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
120 | //            Set<Integer> setHypes = hypernyms.get(targetIndex);
121 | //            Set<Integer> setFeatures = features.get(featureIndex);
122 | //            intersection = Sets.intersection(setHypes, setFeatures);
123 | //        }                
124 | //        return intersection;
125 | //    }
126 | //    
127 | //    public Set<Integer> intersectionCohypo(Integer targetIndex, Integer featureIndex) {
128 | //        Set<Integer> intersection = new HashSet<Integer>();
129 | //        if (cohyponyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
130 | //            Set<Integer> setCohypos = cohyponyms.get(targetIndex);
131 | //            Set<Integer> setFeatures = features.get(featureIndex);
132 | //            intersection = Sets.intersection(setCohypos, setFeatures);
133 | //        }                
134 | //        return intersection;
135 | //    }
136 |     
137 |     public HashSet<Integer> getIntersection(HashSet<Integer> hs1, HashSet<Integer> hs2) {
138 |         HashSet<Integer> intersection = new HashSet<Integer>();
139 |         for (Integer element: hs1) {
140 |             if (hs2.contains(element)) intersection.add(element);
141 |         }
142 |         return intersection;
143 |     }
144 |     
145 |     public int getRandom(Set<Integer> antonyms) {
146 |         List<Integer> listAnts = new ArrayList<Integer>(antonyms);
147 |         int id = random.nextInt(listAnts.size());
148 |         return listAnts.get(id);
149 |     }
150 |     
151 | //    public boolean hasHypeCohypo(Integer targetIndex){
152 | //        return hypernyms.containsKey(targetIndex) || cohyponyms.containsKey(targetIndex);
153 | //    }
154 | //    
155 | //    public boolean hasHypernyms(Integer targetIndex){
156 | //        return hypernyms.containsKey(targetIndex);
157 | //    }
158 | //    
159 | //    public boolean hasCohyponyms(Integer targetIndex){
160 | //        return cohyponyms.containsKey(targetIndex);
161 | //    }
162 |     
163 |     public boolean hasTarget(Integer targetIndex) {
164 |         return antonyms.containsKey(targetIndex) || synonyms.containsKey(targetIndex);
165 |     }
166 |     
167 |     public boolean hasAntonyms(Integer targetIndex) {
168 |         return antonyms.containsKey(targetIndex);
169 |     }
170 |     
171 |     public boolean hasSynonyms(Integer targetIndex) {
172 |         return synonyms.containsKey(targetIndex);
173 |     }
174 |     
175 |     public boolean hasFeature(Integer featureIndex) {
176 |         return features.containsKey(featureIndex);
177 |     }
178 |     
179 |     public HashSet<Integer> getAntonyms(Integer targetIndex) {
180 |         return antonyms.get(targetIndex);
181 |     }
182 |     
183 |     public HashSet<Integer> getSynonyms(Integer targetIndex) {
184 |         return synonyms.get(targetIndex);
185 |     }
186 |     
187 |     public HashSet<Integer> getFeatures(Integer featureIndex) {
188 |         return features.get(featureIndex);
189 |     }
190 |     
191 | //    public HashSet<Integer> getHypernyms(Integer targetIndex) {
192 | //        return hypernyms.get(targetIndex);
193 | //    }
194 | //    
195 | //    public HashSet<Integer> getCohyponyms(Integer targetIndex) {
196 | //        return cohyponyms.get(targetIndex);
197 | //    }
198 | 
199 | }
200 | 


--------------------------------------------------------------------------------
/src/common/wordnet/LexicalResourceVerb.java:
--------------------------------------------------------------------------------
  1 | package common.wordnet;
  2 | 
  3 | import common.IOUtils;
  4 | import vocab.Vocab;
  5 | 
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.HashMap;
  9 | import java.util.HashSet;
 10 | import java.util.List;
 11 | import java.util.Random;
 12 | import java.util.Set;
 13 | import com.google.common.collect.Sets;
 14 | 
 15 | 
 16 | public class LexicalResourceVerb {
 17 |     HashMap<Integer, HashSet<Integer>> antonyms;
 18 |     HashMap<Integer, HashSet<Integer>> synonyms;
 19 |     HashMap<Integer, HashSet<Integer>> features;
 20 |     //HashMap<Integer, HashSet<Integer>> cohyponyms;
 21 |     //HashMap<Integer, HashSet<Integer>> hypernyms;
 22 |     //Vocab vocab;
 23 |     Random random;
 24 |     
 25 |     public LexicalResourceVerb(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException {
 26 |         antonyms = readLexical(antFile, vocab);
 27 |         synonyms = readLexical(synFile, vocab);
 28 |         features = readFeatures(featureFile, vocab);
 29 |         //cohyponyms = readLexical(cohypoFile, vocab);
 30 |         //hypernyms = readLexical(hypeFile, vocab);
 31 |         //this.vocab = vocab;
 32 |         random = new Random();
 33 |     }
 34 |        
 35 |     public HashMap<Integer, HashSet<Integer>> readLexical(String inputFile, Vocab vocab) throws IOException {
 36 |         HashMap<Integer, HashSet<Integer>> lexical = new HashMap<Integer, HashSet<Integer>>();
 37 |         ArrayList<String> data = IOUtils.readFile(inputFile);
 38 |         for (int i = 0; i < data.size(); i++) {
 39 |             String dataPiece = data.get(i);
 40 |             String elements[] = dataPiece.split("\t");
 41 |             String key = elements[0];
 42 |             int keyIndex = vocab.getWordIndex(key);
 43 |             if (keyIndex == -1) continue;
 44 |             HashSet<Integer> value = new HashSet<Integer>();
 45 |             for (int j = 1; j < elements.length; j++ ) {
 46 |                 int wordIndex = vocab.getWordIndex(elements[j]);;
 47 |                 if (wordIndex == -1) continue;
 48 |                 //value.add(elements[j]);
 49 |                 value.add(wordIndex);
 50 |             }
 51 |             lexical.put(keyIndex, value);
 52 |         }
 53 |         return lexical;
 54 |     }
 55 |     
 56 |     public HashMap<Integer, HashSet<Integer>> readFeatures(String inputFile, Vocab vocab) throws IOException {
 57 |         HashMap<Integer, HashSet<Integer>> features = new HashMap<Integer, HashSet<Integer>>();
 58 |         ArrayList<String> data = IOUtils.readFile(inputFile);
 59 |         for (int i = 0; i < data.size(); i++) {
 60 |             String dataPiece = data.get(i);
 61 |             String elements[] = dataPiece.split("\t");
 62 |             String key = elements[0];
 63 |             int keyIndex = vocab.getWordIndex(key);
 64 |             if (keyIndex == -1) continue;
 65 |             HashSet<Integer> value = new HashSet<Integer>();
 66 |             for (int j = 1; j < elements.length; j++ ) {
 67 |                 int wordIndex = -1;
 68 |                 wordIndex = vocab.getWordIndex(elements[j]);
 69 |                 if (wordIndex == -1) continue;
 70 |                 value.add(wordIndex);
 71 |             }
 72 |             features.put(keyIndex, value);
 73 |         }
 74 |         return features;
 75 |     }
 76 |     /*
 77 |     public HashSet<String> intersectionAnt(String target, String feature) {
 78 |         HashSet<String> intersection = new HashSet<String>();
 79 |         if (antonyms.containsKey(target) && features.containsKey(feature)) {
 80 |             HashSet<String> setTargets = antonyms.get(target);
 81 |             HashSet<String> setFeatures = features.get(feature);
 82 |             intersection = getIntersection(setTargets, setFeatures);                
 83 |         }
 84 |         return intersection;
 85 |     }
 86 |     
 87 |     public HashSet<String> intersectionSyn(String target, String feature) {
 88 |         HashSet<String> intersection = new HashSet<String>();
 89 |         if (synonyms.containsKey(target) && features.containsKey(feature)) {
 90 |             HashSet<String> setTargets = synonyms.get(target);
 91 |             HashSet<String> setFeatures = features.get(feature);
 92 |             intersection = getIntersection(setTargets, setFeatures);
 93 |         }                
 94 |         return intersection;
 95 |     }*/
 96 |     
 97 |     public Set<Integer> intersectionAnt(Integer targetIndex, Integer featureIndex) {
 98 |         Set<Integer> intersection = new HashSet<Integer>();
 99 |         if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
100 |             Set<Integer> setTargets = antonyms.get(targetIndex);
101 |             Set<Integer> setFeatures = features.get(featureIndex);
102 |             intersection = Sets.intersection(setTargets, setFeatures);                
103 |         }
104 |         return intersection;
105 |     }
106 |     
107 |     public Set<Integer> intersectionSyn(Integer targetIndex, Integer featureIndex) {
108 |         Set<Integer> intersection = new HashSet<Integer>();
109 |         if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
110 |             Set<Integer> setTargets = synonyms.get(targetIndex);
111 |             Set<Integer> setFeatures = features.get(featureIndex);
112 |             intersection = Sets.intersection(setTargets, setFeatures);
113 |         }                
114 |         return intersection;
115 |     }
116 |     
117 | //    public Set<Integer> intersectionHype(Integer targetIndex, Integer featureIndex) {
118 | //        Set<Integer> intersection = new HashSet<Integer>();
119 | //        if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
120 | //            Set<Integer> setHypes = hypernyms.get(targetIndex);
121 | //            Set<Integer> setFeatures = features.get(featureIndex);
122 | //            intersection = Sets.intersection(setHypes, setFeatures);
123 | //        }                
124 | //        return intersection;
125 | //    }
126 | //    
127 | //    public Set<Integer> intersectionCohypo(Integer targetIndex, Integer featureIndex) {
128 | //        Set<Integer> intersection = new HashSet<Integer>();
129 | //        if (cohyponyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
130 | //            Set<Integer> setCohypos = cohyponyms.get(targetIndex);
131 | //            Set<Integer> setFeatures = features.get(featureIndex);
132 | //            intersection = Sets.intersection(setCohypos, setFeatures);
133 | //        }                
134 | //        return intersection;
135 | //    }
136 |     
137 |     public HashSet<Integer> getIntersection(HashSet<Integer> hs1, HashSet<Integer> hs2) {
138 |         HashSet<Integer> intersection = new HashSet<Integer>();
139 |         for (Integer element: hs1) {
140 |             if (hs2.contains(element)) intersection.add(element);
141 |         }
142 |         return intersection;
143 |     }
144 |     
145 |     public int getRandom(Set<Integer> antonyms) {
146 |         List<Integer> listAnts = new ArrayList<Integer>(antonyms);
147 |         int id = random.nextInt(listAnts.size());
148 |         return listAnts.get(id);
149 |     }
150 |     
151 | //    public boolean hasHypeCohypo(Integer targetIndex){
152 | //        return hypernyms.containsKey(targetIndex) || cohyponyms.containsKey(targetIndex);
153 | //    }
154 | //    
155 | //    public boolean hasHypernyms(Integer targetIndex){
156 | //        return hypernyms.containsKey(targetIndex);
157 | //    }
158 | //    
159 | //    public boolean hasCohyponyms(Integer targetIndex){
160 | //        return cohyponyms.containsKey(targetIndex);
161 | //    }
162 |     
163 |     public boolean hasTarget(Integer targetIndex) {
164 |         return antonyms.containsKey(targetIndex) || synonyms.containsKey(targetIndex);
165 |     }
166 |     
167 |     public boolean hasAntonyms(Integer targetIndex) {
168 |         return antonyms.containsKey(targetIndex);
169 |     }
170 |     
171 |     public boolean hasSynonyms(Integer targetIndex) {
172 |         return synonyms.containsKey(targetIndex);
173 |     }
174 |     
175 |     public boolean hasFeature(Integer featureIndex) {
176 |         return features.containsKey(featureIndex);
177 |     }
178 |     
179 |     public HashSet<Integer> getAntonyms(Integer targetIndex) {
180 |         return antonyms.get(targetIndex);
181 |     }
182 |     
183 |     public HashSet<Integer> getSynonyms(Integer targetIndex) {
184 |         return synonyms.get(targetIndex);
185 |     }
186 |     
187 |     public HashSet<Integer> getFeatures(Integer featureIndex) {
188 |         return features.get(featureIndex);
189 |     }
190 |     
191 | //    public HashSet<Integer> getHypernyms(Integer targetIndex) {
192 | //        return hypernyms.get(targetIndex);
193 | //    }
194 | //    
195 | //    public HashSet<Integer> getCohyponyms(Integer targetIndex) {
196 | //        return cohyponyms.get(targetIndex);
197 | //    }
198 | 
199 | }
200 | 


--------------------------------------------------------------------------------
/src/common/wordnet/Synset.java:
--------------------------------------------------------------------------------
 1 | package common.wordnet;
 2 | 
 3 | import java.util.ArrayList;
 4 | 
 5 | public class Synset {
 6 | 	String id;
 7 | 	String synsetType;
 8 | 	String[] words;
 9 | 	String antonymSSId;
10 | 	String[] simSSId;
11 | 	String[] hypoSSId;
12 | 	String[] hyperSSId;
13 | 	
14 | 	public Synset(String line) {
15 | 		String[] elements = line.split(" ");
16 | 		id = elements[0];
17 | 		synsetType = elements[2];
18 | 		readWords(elements);
19 | 	}
20 | 	
21 | 	public void readWords(String[] elements) {
22 | 		// read words
23 | 		
24 | 		int size = Integer.parseInt(elements[3],16);
25 | 		words = new String[size];
26 | 		for (int i = 0; i< size; i++) {
27 | 			words[i] = elements[4 + 2 * i].replaceAll("_", "-");
28 | 		}
29 | 		
30 | 		// read info
31 | 		int fieldNum = Integer.parseInt(elements[4 + size * 2],10);
32 | 		
33 | 		ArrayList<String> simList = new ArrayList<String>();
34 | 		ArrayList<String> hypoList = new ArrayList<String>();
35 | 		ArrayList<String> hyperList = new ArrayList<String>();
36 | 		for (int i = 0; i < fieldNum; i++) {
37 | 			String type = elements[4 + size * 2 + 1 + i * 4];
38 | 			String id = elements[4 + size * 2 + 2 + i * 4];
39 | 			String pos = elements[4 + size * 2 + 3 + i * 4];
40 | 			if (type.equals("&") || type.equals("^")) {
41 | 				if (pos.equals("a"))
42 | 					simList.add(id);
43 | 			} else if (type.equals("!")) {
44 | 				antonymSSId = id;
45 | 			} else if (type.equals("@")) {
46 | 			    hyperList.add(id);
47 |             } else if (type.equals("~")) {
48 |                 hypoList.add(id);
49 |             }
50 | 		}
51 | 		simSSId = new String[simList.size()];
52 | 		simSSId = simList.toArray(simSSId);
53 | 		hyperSSId = new String[simList.size()];
54 | 		hyperSSId = hyperList.toArray(hyperSSId);
55 |         hypoSSId = new String[simList.size()];
56 |         hypoSSId = hypoList.toArray(hypoSSId);
57 | 	}
58 |  }
59 | 


--------------------------------------------------------------------------------
/src/common/wordnet/WordNetReader.java:
--------------------------------------------------------------------------------
 1 | package common.wordnet;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.FileReader;
 5 | import java.io.IOException;
 6 | import java.util.ArrayList;
 7 | import java.util.HashMap;
 8 | 
 9 | public class WordNetReader {
10 | 	public static HashMap<String, Synset> readSynsets(String fileName) throws IOException{
11 | 		HashMap<String, Synset> data = new HashMap<String, Synset>();
12 | 		BufferedReader reader = new BufferedReader(new FileReader(fileName));
13 | 		String line = reader.readLine();
14 | 		while (line != null) {
15 | 			if (!line.startsWith(" ")) {
16 | 				Synset synset = new Synset(line);
17 | 				data.put(synset.id, synset);
18 | 			}
19 | 			line = reader.readLine();
20 | 		}
21 | 		reader.close();
22 | 		return data;
23 | 	} 
24 | 	
25 | 	public static HashMap<String, ArrayList<String>> getWord2SynsetIds(HashMap<String, Synset> synsetMap) {
26 | 		HashMap<String, ArrayList<String>> word2SynsetIds = new HashMap<String, ArrayList<String>>();
27 | 		for (String id: synsetMap.keySet()) {
28 | 			Synset synset = synsetMap.get(id);
29 | 			for (String word: synset.words) {
30 | 				if (!word2SynsetIds.containsKey(word)) {
31 | 					word2SynsetIds.put(word, new ArrayList<String>());
32 | 				} 
33 | 				word2SynsetIds.get(word).add(id);
34 | 			}
35 | 		}
36 | 		return word2SynsetIds;
37 | 	}
38 | 	
39 | 	public static void main(String[] args) throws IOException{
40 | 		String adjFile = args[0];
41 | 		HashMap<String, Synset> synsetMap = readSynsets(adjFile);
42 | 		for (String id: synsetMap.keySet()) {
43 | 			Synset synset = synsetMap.get(id);
44 | 			System.out.print(synset.id);
45 | 			for (String word: synset.words) {
46 | 				System.out.print(" " + word);
47 | 			}
48 | 			System.out.println();
49 | 		}
50 | 		
51 | 		HashMap<String, ArrayList<String>> word2SynsetIds = getWord2SynsetIds(synsetMap);
52 | 		for (String word: word2SynsetIds.keySet()) {
53 | 		    System.out.print(word + ": ");
54 | 		    System.out.print(word2SynsetIds.get(word) + "\n");
55 | 		}
56 | 	}
57 | }
58 | 


--------------------------------------------------------------------------------
/src/demo/HyperVecLearning.java:
--------------------------------------------------------------------------------
  1 | package demo;
  2 | 
  3 | import io.sentence.PlainSentenceInputStream;
  4 | import io.word.CombinedWordInputStream;
  5 | import io.word.PushBackWordStream;
  6 | import io.word.WordInputStream;
  7 | import io.sentence.SentenceInputStream;
  8 | 
  9 | import java.io.File;
 10 | import java.io.IOException;
 11 | import java.util.ArrayList;
 12 | 
 13 | import common.wordnet.LexicalHypernym;
 14 | import vocab.Vocab;
 15 | import word2vec.MultiThreadWord2Vec;
 16 | import word2vec.multitask.Hyper2Vec;
 17 | 
 18 | 
 19 | 
 20 | public class HyperVecLearning {
 21 |     public static void main(String[] args) throws IOException{
 22 |         
 23 |         
 24 |         MultiThreadWord2Vec word2vec = null;
 25 |         String configFile = args[0]; 
 26 |         int size = Integer.parseInt(args[1]);
 27 |         int window = Integer.parseInt(args[2]);
 28 |         
 29 |         W2vProperties properties = new W2vProperties(configFile);
 30 |         boolean softmax = Boolean.parseBoolean(properties.getProperty("HierarchialSoftmax"));
 31 |         int negativeSamples = Integer.parseInt(properties.getProperty("NegativeSampling"));
 32 |         double subSampling = Double.parseDouble(properties.getProperty("SubSampling"));
 33 |         String trainDirPath = properties.getProperty("TrainDir");
 34 |         String outputFile = properties.getProperty("WordVectorFile");
 35 |         String vocabFile = properties.getProperty("VocabFile");
 36 |         
 37 |         outputFile = outputFile.replaceAll(".bin", "_" + size + ".bin");
 38 |         
 39 |         File trainDir = new File(trainDirPath);
 40 |         File[] trainFiles = trainDir.listFiles();
 41 |         System.out.println("Starting training using dir " + trainDirPath);
 42 |         System.out.println("Output file: " + outputFile);
 43 | 
 44 |         boolean learnVocab = !(new File(vocabFile)).exists();
 45 |         Vocab vocab = new Vocab(Integer.parseInt(properties.getProperty("MinFrequency")));
 46 |         if (!learnVocab)
 47 |             vocab.loadVocab(vocabFile);// ,minFrequency);
 48 |         else {
 49 |             ArrayList<WordInputStream> wordStreamList = new ArrayList<>();
 50 |             for (File trainFile: trainFiles) {
 51 |                 WordInputStream wordStream = new PushBackWordStream(trainFile.getAbsolutePath(), 200);
 52 |                 wordStreamList.add(wordStream);
 53 |             }
 54 |           
 55 |             CombinedWordInputStream wordStream = new CombinedWordInputStream(wordStreamList);
 56 |             vocab.learnVocabFromTrainStream(wordStream);
 57 |             // save vocabulary
 58 |             vocab.saveVocab(vocabFile);
 59 |         }
 60 |                 
 61 |         word2vec = new Hyper2Vec(size, window, softmax, negativeSamples, subSampling);            
 62 |         Hyper2Vec hypervec = (Hyper2Vec) word2vec;
 63 |         
 64 |         LexicalHypernym hypeNoun = new LexicalHypernym(properties.getProperty("hypeNoun"),
 65 |                                                        properties.getProperty("cohypoNoun"),
 66 |                                                        properties.getProperty("featureNoun"), 
 67 |                                                        vocab);
 68 |         LexicalHypernym hypeVerb = new LexicalHypernym(properties.getProperty("hypeVerb"),
 69 |                                                        properties.getProperty("cohypoVerb"),
 70 |                                                        properties.getProperty("featureVerb"), 
 71 |                                                        vocab);
 72 |         hypervec.setLexicalHypeNoun(hypeNoun);
 73 |         outputFile = outputFile.replaceAll(".bin", "_HypeNoun.bin");
 74 |         hypervec.setLexicalHypeVerb(hypeVerb);
 75 |         outputFile = outputFile.replaceAll(".bin", "_HypeVerb.bin");
 76 |             
 77 | 
 78 |         word2vec.setVocab(vocab);
 79 |         word2vec.initNetwork();
 80 | 
 81 |         System.out.println("Start training");
 82 |         try {
 83 |             ArrayList<SentenceInputStream> inputStreams = new ArrayList<SentenceInputStream>();
 84 |             for (File trainFile: trainFiles) {
 85 |                 SentenceInputStream sentenceInputStream = new PlainSentenceInputStream(
 86 |                     new PushBackWordStream(trainFile.getAbsolutePath(), 200));
 87 |                 inputStreams.add(sentenceInputStream);
 88 |             }
 89 |             
 90 |             word2vec.trainModel(inputStreams); 
 91 |             word2vec.saveVector(outputFile, true);
 92 |             
 93 |             System.out.println("The vocab size: " + vocab.getVocabSize() + " words");
 94 |         } catch (IOException e) {
 95 |             System.exit(1);
 96 |         }
 97 | 
 98 |     }
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/demo/W2vProperties.java:
--------------------------------------------------------------------------------
 1 | package demo;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.FileReader;
 5 | import java.io.IOException;
 6 | import java.util.Properties;
 7 | 
 8 | public class W2vProperties{
 9 |     protected Properties properties;
10 |     public W2vProperties(String configFile) throws IOException{
11 |         
12 | 
13 |         properties = new Properties();
14 |         BufferedReader reader = new BufferedReader(new FileReader(configFile));
15 |         properties.load(reader);
16 |         reader.close();
17 |         
18 |         
19 |         // PROJECT DIR        
20 |         String projectDir = properties.getProperty("ProjectDir");
21 |         
22 |         // TRAIN DIR
23 |         String sTrainDir = properties.getProperty("STrainDirName");
24 |         String sTrainDirPath = projectDir + "/" + sTrainDir;
25 |         properties.setProperty("STrainDir", sTrainDirPath);
26 |         
27 |         String outputDir = projectDir;
28 |         properties.setProperty("OutputDir", outputDir);
29 |         
30 |         // OUTPUT NAME
31 |         String sOutputName = properties.getProperty("SOutputFileTemplate");
32 |         String sOutputFilePath = outputDir + "/" + sOutputName;
33 |         properties.setProperty("SOutputFile", sOutputFilePath);
34 |         
35 |         // VOCAB FILE
36 |         String vocabFileName = properties.getProperty("VocabFileName");
37 |         String vocabFile = outputDir + "/" + vocabFileName;
38 |         properties.setProperty("VocabFile", vocabFile);
39 |         
40 |         // OUTPUT W2V
41 |         
42 |         String wordVectorFileName = properties.getProperty("WordVectorFileName");
43 |         String wordVectorFilePath = outputDir + "/" + wordVectorFileName;
44 |         properties.setProperty("WordVectorFile", wordVectorFilePath);
45 |     }
46 |     
47 |     public String getProperty(String key) {
48 |         return properties.getProperty(key);
49 |     }
50 |     
51 | }
52 | 


--------------------------------------------------------------------------------
/src/io/sentence/PlainSentenceInputStream.java:
--------------------------------------------------------------------------------
 1 | package io.sentence;
 2 | 
 3 | import io.word.Phrase;
 4 | import io.word.WordInputStream;
 5 | 
 6 | import java.io.IOException;
 7 | import java.util.ArrayList;
 8 | 
 9 | import common.DataStructureUtils;
10 | 
11 | import vocab.Vocab;
12 | 
13 | public class PlainSentenceInputStream implements SentenceInputStream {
14 |     public static final int DEFAULT_MAX_SENTENCE_LENGTH = 1000;
15 |     WordInputStream         inputStream;
16 |     long                    wordCount;
17 |     int[]                   sentence;
18 | 
19 |     public PlainSentenceInputStream(WordInputStream inputStream) {
20 |         this.inputStream = inputStream;
21 |         wordCount = 0;
22 |     }
23 | 
24 |     @Override
25 |     public boolean readNextSentence(Vocab vocab) throws IOException {
26 |         ArrayList<Integer> currentSentence = new ArrayList<Integer>();
27 |         while (true) {
28 |             // read the next word & the word index
29 |             String word = "";
30 |             word = inputStream.readWord();
31 | 
32 |             if ("".equals(word))
33 |                 break;
34 |             int wordIndex = vocab.getWordIndex(word);
35 | 
36 |             // if the word is not in the vocabulary, continue
37 |             if (wordIndex == -1)
38 |                 continue;
39 |             else
40 |                 wordCount++;
41 | 
42 |             // end of sentence -> break;
43 |             if (wordIndex == 0) {
44 |                 // System.out.println("end of sentence: " + word);
45 |                 break;
46 |             }
47 | 
48 |             currentSentence.add(wordIndex);
49 |             // break if sentence is too long
50 |             if (currentSentence.size() >= DEFAULT_MAX_SENTENCE_LENGTH)
51 |                 break;
52 | 
53 |         }
54 |         // System.out.println("sentence length: " + sentence.size());
55 |         sentence = DataStructureUtils.intListToArray(currentSentence);
56 |         if (sentence.length == 0 && inputStream.endOfFile())
57 |             return false;
58 |         else
59 |             return true;
60 | 
61 |     }
62 | 
63 |     @Override
64 |     public int[] getCurrentSentence() throws IOException {
65 |         return sentence;
66 |     }
67 | 
68 |     @Override
69 |     public Phrase[] getCurrentPhrases() throws IOException {
70 |         return new Phrase[0];
71 |     }
72 | 
73 |     @Override
74 |     public long getWordCount() {
75 |         return wordCount;
76 |     }
77 | 
78 |     @Override
79 |     public boolean crossDocBoundary() {
80 |         // TODO Auto-generated method stub
81 |         return false;
82 |     }
83 | }
84 | 


--------------------------------------------------------------------------------
/src/io/sentence/SentenceInputStream.java:
--------------------------------------------------------------------------------
 1 | package io.sentence;
 2 | 
 3 | import io.word.Phrase;
 4 | 
 5 | import java.io.IOException;
 6 | 
 7 | import vocab.Vocab;
 8 | 
 9 | public interface SentenceInputStream {
10 |     public boolean readNextSentence(Vocab vocab) throws IOException;
11 | 
12 |     public int[] getCurrentSentence() throws IOException;
13 |     
14 |     public boolean crossDocBoundary();
15 | 
16 |     public Phrase[] getCurrentPhrases() throws IOException;
17 | 
18 |     public long getWordCount();
19 | }
20 | 


--------------------------------------------------------------------------------
/src/io/sentence/SubSamplingSentenceInputStream.java:
--------------------------------------------------------------------------------
  1 | package io.sentence;
  2 | 
  3 | import io.word.Phrase;
  4 | 
  5 | import java.io.IOException;
  6 | import java.util.ArrayList;
  7 | import java.util.Random;
  8 | 
  9 | import common.DataStructureUtils;
 10 | 
 11 | import vocab.Vocab;
 12 | import vocab.VocabEntry;
 13 | 
 14 | public class SubSamplingSentenceInputStream implements SentenceInputStream {
 15 | 
 16 |     SentenceInputStream inputStream;
 17 |     double               frequencyThreshold;
 18 |     int[]               sentence;
 19 |     Phrase[]            phrases;
 20 |     Random              rand = new Random();
 21 | 
 22 |     public SubSamplingSentenceInputStream(SentenceInputStream inputStream,
 23 |             double frequencyThreshold) {
 24 |         this.inputStream = inputStream;
 25 |         this.frequencyThreshold = frequencyThreshold;
 26 |     }
 27 | 
 28 |     protected boolean isSampled(long count, long totalCount) {
 29 |         double randomThreshold = (double) (Math.sqrt(count
 30 |                 / (frequencyThreshold * totalCount)) + 1)
 31 |                 * (frequencyThreshold * totalCount) / count;
 32 |         if (randomThreshold >= rand.nextFloat()) {
 33 |             return true;
 34 |         } else {
 35 |             return false;
 36 |         }
 37 |     }
 38 | 
 39 |     protected void filterSentence(int[] unFilteredSentence,
 40 |             Phrase[] unFilteredPhrases, Vocab vocab) {
 41 |         ArrayList<Integer> filteredIndices = new ArrayList<Integer>();
 42 |         long totalCount = vocab.getTrainWords();
 43 |         int[] newPositions = new int[unFilteredSentence.length];
 44 |         int newPosition = 0;
 45 |         for (int i = 0; i < unFilteredSentence.length; i++) {
 46 |             int vocabEntryIndex = unFilteredSentence[i];
 47 |             if (vocabEntryIndex == -1) {
 48 |                 newPositions[i] = Integer.MIN_VALUE;
 49 |                 continue;
 50 |             }
 51 |             VocabEntry entry = vocab.getEntry(vocabEntryIndex);
 52 |             long count = entry.frequency;
 53 | 
 54 |             if (isSampled(count, totalCount)) {
 55 |                 filteredIndices.add(vocabEntryIndex);
 56 |                 newPositions[i] = newPosition;
 57 |                 newPosition++;
 58 |             }
 59 |             // set those words'positions that are not in vocab to -1
 60 |             else {
 61 |                 newPositions[i] = Integer.MIN_VALUE;
 62 |             }
 63 |         }
 64 | //        System.out.println("\nOld Sentence:");
 65 | //        for (int i = 0; i < unFilteredSentence.length; i++)
 66 | //        {
 67 | //            System.out.print(" "+unFilteredSentence[i]);
 68 | //        }
 69 | //        System.out.println("\nOld phrase:");
 70 | //        for (int i = 0; i < unFilteredPhrases.length; i++)
 71 | //        {
 72 | //            System.out.print("("+unFilteredPhrases[i].startPosition + " " + +unFilteredPhrases[i].endPosition + ") ");
 73 | //        }
 74 | //        System.out.println();
 75 |         sentence = DataStructureUtils.intListToArray(filteredIndices);
 76 | 
 77 |         ArrayList<Phrase> fileterPhraseList = new ArrayList<Phrase>();
 78 |         for (Phrase unFilteredPhrase : unFilteredPhrases) {
 79 |             int phraseType = unFilteredPhrase.phraseType;
 80 |             int startPosition = newPositions[unFilteredPhrase.startPosition];
 81 |             int endPosition = newPositions[unFilteredPhrase.endPosition];
 82 |             // TODO: check if this condition is correct
 83 |             if (endPosition - startPosition == unFilteredPhrase.endPosition - unFilteredPhrase.startPosition) {
 84 |                 Phrase phrase = new Phrase(phraseType, startPosition,
 85 |                         endPosition, unFilteredPhrase.tree);
 86 |                 fileterPhraseList.add(phrase);
 87 |             } 
 88 |             else if (Math.max(startPosition, endPosition) >= 0) {
 89 |                 int maxPosition = Math.max(startPosition, endPosition);
 90 |                 Phrase phrase = new Phrase(phraseType, maxPosition,
 91 |                         maxPosition, unFilteredPhrase.tree);
 92 |                 fileterPhraseList.add(phrase);
 93 |             }
 94 |         }
 95 | //        System.out.println("New pos:");
 96 | //        for (int i = 0; i < newPositions.length; i++)
 97 | //        {
 98 | //            System.out.print(""+i+":"+newPositions[i] + " ");
 99 | //        }
100 | //        System.out.println("\nNew Sentence:");
101 | //        for (int i = 0; i < sentence.length; i++)
102 | //        {
103 | //            System.out.print(" "+sentence[i]);
104 | //        }
105 | //        System.out.println("\nNew phrase:");
106 |         phrases = DataStructureUtils.phraseListToArray(fileterPhraseList);
107 | //        for (int i = 0; i < phrases.length; i++)
108 | //        {
109 | //            System.out.print("("+phrases[i].startPosition + " " + +phrases[i].endPosition + ") ");
110 | //        }
111 | //        System.out.println();
112 |         
113 |     }
114 | 
115 |     @Override
116 |     public boolean readNextSentence(Vocab vocab) throws IOException {
117 |         boolean hasNextSentence = inputStream.readNextSentence(vocab);
118 |         if (hasNextSentence) {
119 |             int[] unFilteredSentence = inputStream.getCurrentSentence();
120 |             Phrase[] unFilteredPhrases = inputStream.getCurrentPhrases();
121 |             filterSentence(unFilteredSentence, unFilteredPhrases, vocab);
122 |         }
123 |         return hasNextSentence;
124 |     }
125 | 
126 |     @Override
127 |     public int[] getCurrentSentence() throws IOException {
128 |         return sentence;
129 |     }
130 | 
131 |     @Override
132 |     public Phrase[] getCurrentPhrases() throws IOException {
133 |         return phrases;
134 |     }
135 | 
136 |     @Override
137 |     public long getWordCount() {
138 |         return inputStream.getWordCount();
139 |     }
140 | 
141 |     @Override
142 |     public boolean crossDocBoundary() {
143 |         // TODO Auto-generated method stub
144 |         return inputStream.crossDocBoundary();
145 |     }
146 | 
147 | }
148 | 


--------------------------------------------------------------------------------
/src/io/sentence/TreeInputStream.java:
--------------------------------------------------------------------------------
 1 | package io.sentence;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import tree.Tree;
 6 | 
 7 | public interface TreeInputStream {
 8 |     // return null if end of file
 9 |     public Tree readTree() throws IOException;
10 |     public long getReadLine();
11 |     public void close() throws IOException;
12 | }
13 | 


--------------------------------------------------------------------------------
/src/io/word/CombinedWordInputStream.java:
--------------------------------------------------------------------------------
 1 | package io.word;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Iterator;
 5 | import java.util.List;
 6 | 
 7 | public class CombinedWordInputStream implements WordInputStream {
 8 |     Iterator<WordInputStream> streamIterator;
 9 |     WordInputStream           currentStream;
10 |     int                       streamCount = 0;
11 | 
12 |     public CombinedWordInputStream(List<WordInputStream> inputStream) {
13 |         streamIterator = inputStream.iterator();
14 |         if (streamIterator.hasNext()) {
15 |             currentStream = streamIterator.next();
16 |             streamCount++;
17 |             System.out.println("read " + streamCount + "th stream");
18 |         } else {
19 |             currentStream = null;
20 |         }
21 |     }
22 | 
23 |     @Override
24 |     public String readWord() throws IOException {
25 |         // TODO Auto-generated method stub
26 |         if (currentStream == null) {
27 |             return "";
28 |         }
29 |         while (true) {
30 |             String word = currentStream.readWord();
31 |             if (!word.equals("")) {
32 |                 return word;
33 |             }
34 |             currentStream.close();
35 |             boolean hasNextStream = false;
36 |             while (streamIterator.hasNext()) {
37 |                 currentStream = streamIterator.next();
38 |                 streamCount++;
39 |                 System.out.println("read " + streamCount + "th stream");
40 |                 if (currentStream == null) {
41 |                     System.out.println("" + streamCount + "th stream is null");
42 |                     continue;
43 |                 } else {
44 |                     System.out.println("" + streamCount
45 |                             + "th stream is not null");
46 |                     hasNextStream = true;
47 |                     break;
48 |                 }
49 |             }
50 |             if (!hasNextStream) {
51 |                 currentStream = null;
52 |                 return "";
53 |             }
54 |         }
55 |     }
56 | 
57 |     @Override
58 |     public boolean endOfFile() {
59 |         // TODO Auto-generated method stub
60 |         if (currentStream == null)
61 |             return true;
62 |         else if (currentStream.endOfFile()) {
63 |             return streamIterator.hasNext();
64 |         } else
65 |             return false;
66 |     }
67 | 
68 |     @Override
69 |     public void close() throws IOException {
70 |         // TODO Auto-generated method stub
71 |         if (currentStream == null)
72 |             return;
73 |         else {
74 |             currentStream.close();
75 |             while (streamIterator.hasNext()) {
76 |                 currentStream = streamIterator.next();
77 |                 if (currentStream != null) {
78 |                     currentStream.close();
79 |                 }
80 |             }
81 |         }
82 |     }
83 | 
84 | }
85 | 


--------------------------------------------------------------------------------
/src/io/word/Phrase.java:
--------------------------------------------------------------------------------
 1 | package io.word;
 2 | 
 3 | import common.WordForm;
 4 | 
 5 | import tree.CcgTree;
 6 | 
 7 | public class Phrase {
 8 |     public int   phraseType;
 9 |     public int   startPosition;
10 |     public int   endPosition;
11 |     public CcgTree tree;
12 | 
13 |     public Phrase(int phraseType, int startPosition, int endPosition,
14 |             CcgTree tree) {
15 |         this.phraseType = phraseType;
16 |         this.startPosition = startPosition;
17 |         this.endPosition = endPosition;
18 |         this.tree = tree;
19 |     }
20 | 
21 |     public String toString() {
22 |         StringBuffer sbResult = new StringBuffer();
23 |         sbResult.append("phrase type:" + phraseType + "\n");
24 |         sbResult.append("start:" + startPosition + "\n");
25 |         sbResult.append("end:" + endPosition + "\n");
26 |         sbResult.append("surface: \'" + tree.getSurfaceString(WordForm.WORD) + "\'\n");
27 |         return sbResult.toString();
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/src/io/word/PushBackWordStream.java:
--------------------------------------------------------------------------------
 1 | package io.word;
 2 | 
 3 | import java.io.BufferedInputStream;
 4 | import java.io.FileInputStream;
 5 | import java.io.IOException;
 6 | import java.io.InputStream;
 7 | import java.io.PushbackInputStream;
 8 | 
 9 | public class PushBackWordStream implements WordInputStream {
10 |     protected PushbackInputStream inputStream;
11 |     protected int                 maxWordLength;
12 |     boolean                       reachedEndOfFile = false;
13 | 
14 |     public PushBackWordStream(String filePath, int maxWordLength)
15 |             throws IOException {
16 |         this.maxWordLength = maxWordLength;
17 |         inputStream = new PushbackInputStream(new BufferedInputStream(
18 |                 new FileInputStream(filePath)));
19 |     }
20 | 
21 |     public PushBackWordStream(InputStream is, int maxWordLength) {
22 |         this.maxWordLength = maxWordLength;
23 |         inputStream = new PushbackInputStream(new BufferedInputStream(is));
24 |     }
25 | 
26 |     @Override
27 |     public String readWord() throws IOException {
28 |         StringBuffer buff = new StringBuffer();
29 |         boolean newString = true;
30 |         char ch;
31 |         while (true) {
32 |             int nextCh = inputStream.read();
33 |             if (nextCh == -1) {
34 |                 reachedEndOfFile = true;
35 |                 break;
36 |             }
37 |             ch = (char) nextCh;
38 |             // for window character
39 |             if (ch == 13)
40 |                 continue;
41 |             if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
42 |                 if (!newString) {
43 |                     if (ch == '\n') {
44 |                         inputStream.unread(ch);
45 |                     }
46 |                     break;
47 |                 }
48 |                 // end of line = end of sentence
49 |                 if (ch == '\n') {
50 |                     return "</s>";
51 |                 } else
52 |                     continue;
53 |             }
54 |             buff.append(ch);
55 |             newString = false;
56 |         }
57 |         String result = buff.toString();
58 |         if (result.length() > maxWordLength) {
59 |             return result.substring(0, maxWordLength);
60 |         } else {
61 |             return result;
62 |         }
63 |     }
64 | 
65 |     @Override
66 |     public void close() throws IOException {
67 |         inputStream.close();
68 |     }
69 | 
70 |     @Override
71 |     public boolean endOfFile() {
72 |         return reachedEndOfFile;
73 |     }
74 | 
75 | }
76 | 


--------------------------------------------------------------------------------
/src/io/word/WordFilter.java:
--------------------------------------------------------------------------------
1 | package io.word;
2 | 
3 | public interface WordFilter {
4 |     public boolean isFiltered(String word);
5 | }
6 | 


--------------------------------------------------------------------------------
/src/io/word/WordInputStream.java:
--------------------------------------------------------------------------------
 1 | package io.word;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | public interface WordInputStream {
 6 |     /**
 7 |      * get the next word from the stream
 8 |      * 
 9 |      * @return A string as the next word
10 |      *         If it's the end of the stream, return ""
11 |      *         If it's the end of a sentence, return "</s>"
12 |      * @throws IOException
13 |      */
14 |     public String readWord() throws IOException;
15 | 
16 |     /**
17 |      * Check if the we reach the end of the stream
18 |      * Seem a bit redundant since we
19 |      * can get this information
20 |      * 
21 |      * @return true: if the end of the stream is reach false: otherwise
22 |      */
23 |     public boolean endOfFile();
24 | 
25 |     /**
26 |      * close the stream
27 |      * 
28 |      * @throws IOException
29 |      */
30 |     public void close() throws IOException;
31 | }
32 | 


--------------------------------------------------------------------------------
/src/neural/function/ActivationFunction.java:
--------------------------------------------------------------------------------
1 | package neural.function;
2 | 
3 | public interface ActivationFunction {
4 |     public double activation(double x);
5 |     public double derivative(double x); 
6 |     public String getName();
7 | }
8 | 


--------------------------------------------------------------------------------
/src/neural/function/Correlation.java:
--------------------------------------------------------------------------------
  1 | package neural.function;
  2 | 
  3 | import java.util.Random;
  4 | 
  5 | import common.MathUtils;
  6 | import common.exception.ValueException;
  7 | 
  8 | public class Correlation {
  9 | //    double[] predicted;
 10 |     String name = "correlation";
 11 |     double[] gold;
 12 |     int length;
 13 |     double aveY;
 14 |     double aveY2;
 15 | //    public train
 16 |     public Correlation(double[] gold) {
 17 | //        this.predicted = predicted;
 18 |         this.gold = gold;
 19 |         precompute();
 20 |     }
 21 |     
 22 |     protected void precompute() {
 23 |         aveY = 0;
 24 |         aveY2 = 0;
 25 |         length = gold.length;
 26 |         
 27 |         for (int i = 0; i < gold.length; i++) {
 28 |             aveY += gold[i];
 29 |             aveY2 += gold[i] * gold[i];
 30 |         }
 31 |         aveY /= length;
 32 |         aveY2 /= length;
 33 |     }
 34 |     
 35 |     public Correlation(double[][] vectors, int[][] pairs) {
 36 |         gold = new double[pairs.length];
 37 |         for (int i = 0; i < pairs.length; i++) {
 38 |             gold[i] = MathUtils.cosine(vectors[pairs[i][0]], vectors[pairs[i][1]]);
 39 |         }
 40 |         precompute();
 41 |     }
 42 |     
 43 |     
 44 |     public double[] derivative(double[] predicted) {
 45 |         if (length != predicted.length) {
 46 |             throw new ValueException("Value must be the same");
 47 |         }
 48 |         double[] result = new double[length];
 49 |         double aveX = 0;
 50 |         double aveX2 = 0;
 51 |         double aveXY =0;
 52 |         for (int i = 0; i < gold.length; i++) {
 53 |             aveX += predicted[i];
 54 |             aveX2 += predicted[i] * predicted[i];
 55 |             aveXY += predicted[i] * gold[i];
 56 |         }
 57 |         aveX /= length;
 58 |         aveX2 /= length;
 59 |         aveXY /= length;
 60 |         double ave2X = aveX * aveX;
 61 |         double ave2Y = aveY * aveY;
 62 |         double covXY = (aveXY - (aveX * aveY));
 63 |         double covX = (aveX2 - ave2X);
 64 |         double covY = (aveY2 - ave2Y);
 65 |         double sCovX = Math.sqrt(covX);
 66 |         double sCovY = Math.sqrt(covY);
 67 |         
 68 |         double correlation =  covXY / (sCovX * sCovY);
 69 |         for (int i = 0; i < length; i++) {
 70 |             result[i] = 1 / (covX * sCovY);
 71 |             result[i] *= (((gold[i] - aveY) * sCovX) - ((covXY / sCovY) * (predicted[i] - aveX))) / length;
 72 |         }
 73 |             
 74 |         System.out.println(name + ": " + correlation);
 75 |         return result;
 76 |     }
 77 |     
 78 |     public double[][] derivative(double[][] vectors, int[][] pairs) {
 79 |         int vocabSize = vectors.length;
 80 |         int vectorSize = vectors[0].length;
 81 |         double[][] result = new double[vocabSize][vectorSize];
 82 |         double[] cosines = new double[gold.length];
 83 |         for (int i = 0; i < pairs.length; i++) {
 84 |             cosines[i] = MathUtils.cosine(vectors[pairs[i][0]], vectors[pairs[i][1]]);
 85 |         }
 86 |         double[] cosDerivative = derivative(cosines);
 87 |         for (int i = 0; i < pairs.length; i++) {
 88 |             int index1 = pairs[i][0];
 89 |             int index2 = pairs[i][1];
 90 |             // TODO: optimize here
 91 |             double[] deltaX1 = MathUtils.cosineDerivative(vectors[index1], vectors[index2]);
 92 |             double[] deltaX2 = MathUtils.cosineDerivative(vectors[index2], vectors[index1]);
 93 |             for (int j = 0; j < vectorSize; j++) {
 94 |                 result[index1][j] += cosDerivative[i] * deltaX1[j];
 95 |                 result[index2][j] += cosDerivative[i] * deltaX2[j];
 96 |             }
 97 |         }
 98 |         
 99 |         return result;
100 |     }
101 |     
102 |     public void setName(String name) {
103 |         this.name = name;
104 |     }
105 |     
106 |     public static void testPearsonDerivative() {
107 |         Random random = new Random();
108 |         int arrayLength = 1000;
109 |         double[] gold = new double[arrayLength];
110 |         double[] predicted = new double[arrayLength];
111 |         for (int i = 0; i < arrayLength; i++) {
112 |             gold[i] = random.nextDouble();
113 |             predicted[i] = random.nextDouble();
114 |         }
115 |         double alpha = 1;
116 |         int iteration = 1000;
117 |         Correlation cor = new Correlation(gold);
118 |         for (int i = 0; i < iteration; i++) {
119 |             double[] derivative = cor.derivative(predicted);
120 |             for (int j = 0; j < derivative.length; j++) {
121 |                 predicted[j] += alpha * derivative[j];
122 |             }
123 |         }
124 |     }
125 |     
126 |     public static void testPearsonCosDerivative() {
127 |         Random random = new Random();
128 |         int arrayLength = 3000;
129 |         double[] gold = new double[arrayLength];
130 |         for (int i = 0; i < arrayLength; i++) {
131 |             gold[i] = random.nextDouble();
132 |         }
133 |         Correlation cor = new Correlation(gold);
134 |         int vectorNum = 1000;
135 |         int[][] pairs = new int[arrayLength][2];
136 |         int index = 0;
137 |         while (index < arrayLength) {
138 |             int i = random.nextInt(vectorNum);
139 |             int j = random.nextInt(vectorNum);
140 |             if (i == j) continue;
141 |             pairs[index][0] = i;
142 |             pairs[index][1] = j;
143 |             index++;
144 |         }
145 |         int vectorSize = 100;
146 |         double[][] vectors = new double[vectorNum][vectorSize]; 
147 |         for (int i = 0; i < vectorNum; i++) {
148 |             for (int j = 0; j < vectorSize; j++) {
149 |                 vectors[i][j] = random.nextDouble();
150 |             }
151 |         }
152 |         
153 |         int iteration = 10000; 
154 |         double alpha = 1;
155 |         for (int iter = 0; iter < iteration; iter++) {
156 |             double[][] delta = cor.derivative(vectors, pairs);
157 |             for (int i = 0; i < vectorNum; i++) {
158 |                 for (int j = 0; j < vectorSize; j++) {
159 |                     vectors[i][j] += alpha * delta[i][j];
160 |                 }
161 |             }
162 |         }
163 |     }
164 |     
165 |     public static void main(String[] args) {
166 | //        testPearsonDerivative();
167 |         testPearsonCosDerivative();
168 |     }
169 |     
170 |     
171 |     
172 | }
173 | 
174 | 


--------------------------------------------------------------------------------
/src/neural/function/Sigmoid.java:
--------------------------------------------------------------------------------
 1 | package neural.function;
 2 | 
 3 | import common.SigmoidTable;
 4 | 
 5 | public class Sigmoid implements ActivationFunction {
 6 |     public static final SigmoidTable sigmoidTable = new SigmoidTable();
 7 | 
 8 |     @Override
 9 |     public double activation(double x) {
10 |         // TODO Auto-generated method stub
11 |         return sigmoidTable.getSigmoid(x);
12 |     }
13 | 
14 |     @Override
15 |     public double derivative(double x) {
16 |         // TODO Auto-generated method stub
17 |         double sigmoid = sigmoidTable.getSigmoid(x);
18 |         return sigmoid * (1 - sigmoid);
19 |     }
20 | 
21 |     @Override
22 |     public String getName() {
23 |         // TODO Auto-generated method stub
24 |         return "sigmoid";
25 |     }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/src/neural/function/Tanh.java:
--------------------------------------------------------------------------------
 1 | package neural.function;
 2 | 
 3 | import common.TanhTable;
 4 | 
 5 | public class Tanh implements ActivationFunction {
 6 |     public static final TanhTable tanhTable = new TanhTable();
 7 | 
 8 |     @Override
 9 |     public double activation(double x) {
10 |         // TODO Auto-generated method stub
11 |         return tanhTable.getTanh(x);
12 |     }
13 | 
14 |     @Override
15 |     public double derivative(double x) {
16 |         // TODO Auto-generated method stub
17 |         double tanh = tanhTable.getTanh(x);
18 |         return 1 - (tanh * tanh);
19 |     }
20 | 
21 |     @Override
22 |     public String getName() {
23 |         // TODO Auto-generated method stub
24 |         return "tanh";
25 |     }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/src/space/AbstractSemanticSpace.java:
--------------------------------------------------------------------------------
 1 | package space;
 2 | 
 3 | import org.ejml.simple.SimpleMatrix;
 4 | 
 5 | import common.SimpleMatrixUtils;
 6 | import common.exception.OutOfVocabularyException;
 7 | 
 8 | public abstract class AbstractSemanticSpace implements SemanticSpace{
 9 | 
10 |     @Override
11 |     public double getSim(String word1, String word2) {
12 |         // TODO Auto-generated method stub
13 |         SimpleMatrix vector1 = getVector(word1);
14 |         SimpleMatrix vector2 = getVector(word2);
15 |         if (vector1 == null) {
16 |             throw new OutOfVocabularyException(word1 +" not found");
17 |         } else if (vector2 == null) {
18 |             throw new OutOfVocabularyException(word2 +" not found");
19 |         }
20 |         return SimpleMatrixUtils.cosine(vector1, vector2);
21 |     }
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/src/space/Neighbor.java:
--------------------------------------------------------------------------------
 1 | package space;
 2 | 
 3 | import java.util.Comparator;
 4 | 
 5 | public class Neighbor {
 6 |     public String word;
 7 |     public double sim;
 8 | 
 9 |     public Neighbor(String word, double sim) {
10 |         this.word = word;
11 |         this.sim = sim;
12 |     }
13 | 
14 |     public static Comparator<Neighbor> NeighborComparator = new Comparator<Neighbor>() {
15 | 
16 |                                                               @Override
17 |                                                               public int compare(
18 |                                                                       Neighbor o1,
19 |                                                                       Neighbor o2) {
20 |                                                                   if (o1.sim > o2.sim) {
21 |                                                                       return -1;
22 |                                                                   } else if (o1.sim == o2.sim) {
23 |                                                                       return 0;
24 |                                                                   } else {
25 |                                                                       return 1;
26 |                                                                   }
27 |                                                               }
28 | 
29 |                                                           };
30 | }
31 | 


--------------------------------------------------------------------------------
/src/space/SemanticSpace.java:
--------------------------------------------------------------------------------
 1 | package space;
 2 | 
 3 | import org.ejml.simple.SimpleMatrix;
 4 | 
 5 | public interface SemanticSpace {
 6 | //    public boolean containsWord(String word);
 7 |     public int getVectorSize();
 8 |     public SimpleMatrix getVector(String word);
 9 |     public double getSim(String word1, String word2);
10 |     public double getDirection(String word1, String word2);
11 |     public Neighbor[] getNeighbors(String word, int noNeighbor);
12 |     public Neighbor[] getNeighbors(SimpleMatrix vector, int noNeighbor, String[] excludedWords);
13 | }
14 | 


--------------------------------------------------------------------------------
/src/space/Similarity.java:
--------------------------------------------------------------------------------
 1 | package space;
 2 | 
 3 | import org.ejml.simple.SimpleMatrix;
 4 | 
 5 | import common.MathUtils;
 6 | import common.SimpleMatrixUtils;
 7 | 
 8 | public class Similarity {
 9 |     public static double cosine(double[] v1, double[] v2) {
10 |         return MathUtils.cosine(v1, v2);
11 |     }
12 |     
13 |     public static double cosine(SimpleMatrix v1, SimpleMatrix v2) {
14 |         return SimpleMatrixUtils.cosine(v1, v2);
15 |     }
16 |     
17 |     public static SimpleMatrix massCosine(SimpleMatrix matrix, SimpleMatrix vector) {
18 |         return SimpleMatrixUtils.massCosine(matrix, vector);
19 |     }
20 | }
21 | 


--------------------------------------------------------------------------------
/src/vocab/HuffmanTree.java:
--------------------------------------------------------------------------------
  1 | package vocab;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.Collections;
  5 | 
  6 | import common.DataStructureUtils;
  7 | 
  8 | public class HuffmanTree {
  9 |     int[]  binaries;
 10 |     int[]  parentNodes;
 11 |     long[] counts;
 12 |     int    vocabSize;
 13 | 
 14 |     /*
 15 |      * Create binary Huffman tree using the word counts intCounts must already
 16 |      * be sorted (descending order)
 17 |      */
 18 |     public HuffmanTree(long[] inCounts) {
 19 | 
 20 |         vocabSize = inCounts.length;
 21 | 
 22 |         /*
 23 |          * counts: the count each node in a tree binaries: the code bit going
 24 |          * from the parent to the current node parentNodes: the direct parent of
 25 |          * each node
 26 |          * 
 27 |          * These arrays are splitted into 2 groups: leaf nodes: 0 -> vocabSize-1
 28 |          * (descending counts) internal nodes: vocabSize -> 2 * vocabSize - 2
 29 |          * (ascending counts)
 30 |          */
 31 |         counts = new long[2 * vocabSize - 1];
 32 |         binaries = new int[2 * vocabSize - 1];
 33 |         parentNodes = new int[2 * vocabSize - 1];
 34 | 
 35 |         // creating a
 36 |         for (int i = 0; i < vocabSize; i++) {
 37 |             counts[i] = inCounts[i];
 38 |         }
 39 |         for (int i = vocabSize; i < vocabSize * 2 - 1; i++) {
 40 |             counts[i] = (int) 1e15;
 41 |         }
 42 | 
 43 |         int pos1 = vocabSize - 1; // traverse in the leaf node indices
 44 |         int pos2 = vocabSize; // traverse in the internal node indices
 45 | 
 46 |         /*
 47 |          * Following algorithm constructs the Huffman tree by creating one
 48 |          * internal node at a time
 49 |          */
 50 | 
 51 |         int min1i, min2i;
 52 |         for (int i = 0; i < vocabSize - 1; i++) {
 53 | 
 54 |             // First, find node with smallest count 'min1'
 55 |             if (pos1 >= 0) {
 56 |                 if (counts[pos1] < counts[pos2]) {
 57 |                     min1i = pos1;
 58 |                     pos1--;
 59 |                 } else {
 60 |                     min1i = pos2;
 61 |                     pos2++;
 62 |                 }
 63 |             } else {
 64 |                 min1i = pos2;
 65 |                 pos2++;
 66 |             }
 67 |             // Then, find node with next smallest count 'min2'
 68 |             if (pos1 >= 0) {
 69 |                 if (counts[pos1] < counts[pos2]) {
 70 |                     min2i = pos1;
 71 |                     pos1--;
 72 |                 } else {
 73 |                     min2i = pos2;
 74 |                     pos2++;
 75 |                 }
 76 |             } else {
 77 |                 min2i = pos2;
 78 |                 pos2++;
 79 |             }
 80 | 
 81 |             // sum the count, create a new node with the sum as its count
 82 |             counts[vocabSize + i] = counts[min1i] + counts[min2i];
 83 |             // update the code & parent information
 84 |             parentNodes[min1i] = vocabSize + i;
 85 |             parentNodes[min2i] = vocabSize + i;
 86 |             binaries[min1i] = 0; // which is default in Java
 87 |             binaries[min2i] = 1;
 88 |         }
 89 |     }
 90 | 
 91 |     /*
 92 |      * retrieve the Huffman code of a index_th input entry (i.e. word in a
 93 |      * vocab)
 94 |      */
 95 |     public String getCode(int index) {
 96 |         int parentIndex = index;
 97 | 
 98 |         StringBuffer code = new StringBuffer();
 99 |         // traverse from the node to the root to get the reversed code
100 |         // reverse and return the code
101 |         while (true) {
102 |             code.append(binaries[parentIndex]);
103 |             parentIndex = parentNodes[parentIndex];
104 |             if (parentIndex > vocabSize * 2 - 2) {
105 |                 System.out.println(parentIndex);
106 |             }
107 |             if (parentIndex == vocabSize * 2 - 2) {
108 |                 break;
109 |             }
110 |         }
111 |         return new StringBuilder(code.toString()).reverse().toString();
112 |     }
113 | 
114 |     /*
115 |      * retrieve the ancestors of a index_th input entry in the Huffman tree
116 |      */
117 |     public int[] getParentIndices(int index) {
118 |         int currentIndex = index;
119 |         ArrayList<Integer> parentIndices = new ArrayList<Integer>();
120 | 
121 |         /*
122 |          * traverse from the node to the root to get the reversed list of parent
123 |          * indices in the internal node list (the original indices subtracted by
124 |          * vocabSize) reverse the list, turn it into an array and return
125 |          */
126 | 
127 |         while (true) {
128 |             int parentIndex = parentNodes[currentIndex];
129 |             parentIndices.add(parentIndex - vocabSize);
130 |             currentIndex = parentIndex;
131 |             if (parentIndex == vocabSize * 2 - 2) {
132 |                 break;
133 |             }
134 |         }
135 |         Collections.reverse(parentIndices);
136 |         return DataStructureUtils.intListToArray(parentIndices);
137 |     }
138 | }
139 | 


--------------------------------------------------------------------------------
/src/vocab/VocabEntry.java:
--------------------------------------------------------------------------------
 1 | package vocab;
 2 | 
 3 | import java.util.Comparator;
 4 | 
 5 | public class VocabEntry {
 6 |     // count the word in the training file
 7 |     public long   frequency;
 8 | 
 9 |     // the ancestors' indices in the huffman tree
10 |     public int[]  ancestors;
11 | 
12 |     // the surface string
13 |     public String word;
14 | 
15 |     // the huffman code
16 |     public String code;
17 | 
18 |     public VocabEntry() {
19 |         word = "";
20 |         frequency = 0;
21 |     }
22 | 
23 |     public VocabEntry(String word, int frequency) {
24 |         this.word = word;
25 |         this.frequency = frequency;
26 |     }
27 | 
28 |     public static Comparator<VocabEntry> VocabEntryFrequencyComparator = new Comparator<VocabEntry>() {
29 | 
30 |                                                                            @Override
31 |                                                                            public int compare(
32 |                                                                                    VocabEntry o1,
33 |                                                                                    VocabEntry o2) {
34 |                                                                                if (o1.frequency > o2.frequency) {
35 |                                                                                    return 1;
36 |                                                                                } else if (o1.frequency < o2.frequency) {
37 |                                                                                    return -1;
38 |                                                                                } else {
39 |                                                                                    return 0;
40 |                                                                                }
41 |                                                                            }
42 | 
43 |                                                                        };
44 | 
45 | }
46 | 


--------------------------------------------------------------------------------
/src/vocab/VocabEntryFilter.java:
--------------------------------------------------------------------------------
1 | package vocab;
2 | 
3 | public interface VocabEntryFilter {
4 |     public boolean isFiltered(VocabEntry entry);
5 | }
6 | 


--------------------------------------------------------------------------------
/src/vocab/filter/MinFrequencyVocabFilter.java:
--------------------------------------------------------------------------------
 1 | package vocab.filter;
 2 | 
 3 | import vocab.VocabEntry;
 4 | import vocab.VocabEntryFilter;
 5 | 
 6 | public class MinFrequencyVocabFilter implements VocabEntryFilter {
 7 |     protected int minFrequency;
 8 | 
 9 |     public MinFrequencyVocabFilter(int minFrequency) {
10 |         this.minFrequency = minFrequency;
11 |     }
12 | 
13 |     @Override
14 |     public boolean isFiltered(VocabEntry entry) {
15 |         // if the frequency of the word is less the minFrequency, return true
16 |         // to filter it
17 |         return entry.frequency < minFrequency;
18 |     }
19 | 
20 | }
21 | 


--------------------------------------------------------------------------------
/src/word2vec/MultiThreadWord2Vec.java:
--------------------------------------------------------------------------------
  1 | package word2vec;
  2 | 
  3 | import io.sentence.SentenceInputStream;
  4 | import io.sentence.SubSamplingSentenceInputStream;
  5 | 
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | 
  9 | import space.RawSemanticSpace;
 10 | import common.correlation.MenCorrelation;
 11 | import common.MeanAveragePrecision;
 12 | 
 13 | /**
 14 |  * Still abstract class for learning words' vectors
 15 |  * Implement some common methods
 16 |  *
 17 |  */
 18 | public abstract class MultiThreadWord2Vec extends AbstractWord2Vec {
 19 |     
 20 |     protected MenCorrelation men;
 21 |     protected MenCorrelation ws;
 22 |     protected MeanAveragePrecision ap;
 23 |     protected MeanAveragePrecision entail;
 24 |     protected MeanAveragePrecision eval;
 25 |     protected RawSemanticSpace outputSpace;
 26 |     protected RawSemanticSpace negSpace;
 27 |     protected long lastWordCount = 0;
 28 |     protected int iteration = 0;
 29 |     protected int epochNum = 1;    
 30 | 
 31 |     public MultiThreadWord2Vec(int projectionLayerSize, int windowSize,
 32 |             boolean hierarchicalSoftmax, int negativeSamples, double subSample) {
 33 |         super(projectionLayerSize, windowSize, hierarchicalSoftmax,
 34 |                 negativeSamples, subSample);
 35 |     }
 36 |     
 37 |     @Override
 38 |     public void trainModel(ArrayList<SentenceInputStream> inputStreams) {
 39 |         wordCount = 0;
 40 |         lastWordCount = 0;
 41 |         trainWords = vocab.getTrainWords();
 42 |         System.out.println("train words: " + trainWords);
 43 |         System.out.println("vocab size: " + vocab.getVocabSize());
 44 |         System.out.println("hidden size: " + projectionLayerSize);
 45 |         System.out.println("first word:" + vocab.getEntry(0).word);
 46 |         System.out.println("last word:" + vocab.getEntry(vocab.getVocabSize() - 1).word);
 47 |         
 48 |         //The number of sentences in the corpus
 49 |         TrainingThread[] threads = new TrainingThread[inputStreams.size()]; 
 50 |         for (int i = 0; i < inputStreams.size(); i++) {
 51 |             SentenceInputStream inputStream = inputStreams.get(i);
 52 |             if (subSample > 0) {
 53 |                 inputStream = new SubSamplingSentenceInputStream(inputStream, subSample);
 54 |             }
 55 |             threads[i] = new TrainingThread(inputStream);
 56 |             threads[i].start();
 57 |         }
 58 |         try {
 59 |             for (TrainingThread thread: threads) {
 60 |                     thread.join();
 61 |             }
 62 |         } catch (InterruptedException e) {
 63 |             // TODO Auto-generated catch block
 64 |             e.printStackTrace();
 65 |         }
 66 |         
 67 |         System.out.println("total word count: " + wordCount);
 68 |     }
 69 | 
 70 |     protected void trainModelThread(SentenceInputStream inputStream) {
 71 |         long oldWordCount = 0;
 72 |         try {
 73 |             while (true) {
 74 | 
 75 |                 // read the whole sentence,
 76 |                 // the output would be the list of the word's indices in the
 77 |                 // dictionary
 78 |                 boolean hasNextSentence = inputStream.readNextSentence(vocab);
 79 |                 if (!hasNextSentence) break;
 80 |                 int[] sentence = inputStream.getCurrentSentence();
 81 |                 // if end of file, finish
 82 |                 if (sentence.length == 0) {
 83 |                     continue;
 84 | //                    if (!hasNextSentence)
 85 | //                        break;
 86 |                 }
 87 | 
 88 |                 // check word count
 89 |                 // update alpha
 90 |                 long newSentenceWordCount = inputStream.getWordCount() - oldWordCount;
 91 |                 oldWordCount = inputStream.getWordCount();
 92 |                 
 93 |                 synchronized (this) {
 94 |                     wordCount = wordCount + newSentenceWordCount;
 95 |                     if (wordCount - lastWordCount >= 10000) {
 96 |                         lastWordCount = wordCount;
 97 |                         iteration++;
 98 |                         // update alpha
 99 |                         // what about thread safe???
100 | 
101 |                         alpha = starting_alpha
102 |                                 * (1 - (double) wordCount / (trainWords + 1));
103 |                         if (alpha < starting_alpha * 0.0001) {
104 |                             alpha = starting_alpha * 0.0001;
105 |                         }
106 |                         if (iteration % 10 == 0) {
107 |                             System.out.println("Trained: " + wordCount + " words, " + "Training rate: " + alpha);
108 |                             //System.out.println("Training rate: " + alpha);
109 |                         }
110 |                         
111 |                     }
112 |                 }
113 | 
114 |                 trainSentence(sentence);
115 |             }
116 |         } catch (IOException e) {
117 |             e.printStackTrace();
118 |             System.exit(1);
119 |         }
120 |     }
121 | 
122 |     
123 |     public void printStatistics() {
124 |     }
125 | 
126 |     
127 |     public abstract void trainSentence(int[] sentence);
128 |     
129 |     protected class TrainingThread extends Thread {
130 |         SentenceInputStream inputStream; //put each sentence of the corpus
131 |         
132 |         public TrainingThread(SentenceInputStream inputStream) {
133 |             this.inputStream = inputStream;
134 |         }
135 |         
136 |         public void run() {
137 |             trainModelThread(inputStream);
138 |         }
139 |     }
140 |     
141 | }
142 | 


--------------------------------------------------------------------------------
/src/word2vec/UniGram.java:
--------------------------------------------------------------------------------
 1 | package word2vec;
 2 | 
 3 | import java.util.Random;
 4 | 
 5 | import vocab.Vocab;
 6 | 
 7 | public class UniGram {
 8 |     public static final int DEFAULT_TABLE_SIZE = 100000000;
 9 |     protected int           randomTablesize;
10 |     protected int[]         randomTable;
11 |     private Random          random;
12 | 
13 |     public UniGram(Vocab vocab, int tableSize) {
14 |         this.randomTablesize = tableSize;
15 |         initUnigramTable(vocab);
16 |         random = new Random();
17 |     }
18 | 
19 |     public UniGram(Vocab vocab) {
20 |         this(vocab, DEFAULT_TABLE_SIZE);
21 |     }
22 | 
23 |     /**
24 |      * Create an unigram table to randomly generate a word. The probability of
25 |      * generating a word corresponds to its frequency^3/4
26 |      */
27 |     protected void initUnigramTable(Vocab vocab) {
28 |         long trainWordsPow = 0;
29 |         double sumPow;
30 |         double power = (double) 0.75;
31 |         int vocabSize = vocab.getVocabSize();
32 |         randomTable = new int[randomTablesize];
33 |         
34 |         // trainWordsPow = sum (frequency ^ 3/4)
35 |         for (int i = 0; i < vocabSize; i++) {
36 |             trainWordsPow += Math.pow(vocab.getEntry(i).frequency, power);
37 |         }
38 |         int index = 0;
39 |         sumPow = (double) Math.pow(vocab.getEntry(index).frequency, power)
40 |                 / trainWordsPow;
41 | 
42 |         // fill up the uni-gram table with words from the vocabulary
43 |         // the number of times a word appear is in proportion with its
44 |         // frequency^3/4
45 |         for (int i = 0; i < randomTablesize; i++) {
46 |             randomTable[i] = index;
47 |             if (i / (double) randomTablesize > sumPow) {
48 |                 index++;
49 |                 if (index < vocabSize) {
50 |                     sumPow += Math.pow(vocab.getEntry(index).frequency, power)
51 |                             / trainWordsPow;
52 |                 } else {
53 |                     System.out.println("what does it mean here");
54 |                 }
55 |             }
56 |             if (index >= vocabSize)
57 |                 index = vocabSize - 1;
58 |         }
59 |     }
60 | 
61 |     public int randomWordIndex() {
62 |         int randomInt = random.nextInt(randomTablesize);
63 |         return randomTable[randomInt];
64 |     }
65 | 
66 | }
67 | 


--------------------------------------------------------------------------------