├── .classpath
├── .gitignore
├── .metadata
│ ├── .lock
│ ├── .log
│ ├── .mylyn
│ │ └── repositories.xml.zip
│ ├── .plugins
│ │ ├── org.eclipse.core.resources
│ │ │ ├── .root
│ │ │ │ ├── .indexes
│ │ │ │ │ ├── history.version
│ │ │ │ │ ├── properties.index
│ │ │ │ │ └── properties.version
│ │ │ │ └── 2.tree
│ │ │ └── .safetable
│ │ │   └── org.eclipse.core.resources
│ │ ├── org.eclipse.core.runtime
│ │ │ └── .settings
│ │ │   ├── org.eclipse.core.resources.prefs
│ │ │   ├── org.eclipse.e4.ui.css.swt.theme.prefs
│ │ │   ├── org.eclipse.e4.ui.workbench.renderers.swt.prefs
│ │ │   ├── org.eclipse.jdt.ui.prefs
│ │ │   ├── org.eclipse.m2e.discovery.prefs
│ │ │   ├── org.eclipse.mylyn.context.core.prefs
│ │ │   ├── org.eclipse.mylyn.monitor.ui.prefs
│ │ │   ├── org.eclipse.mylyn.tasks.ui.prefs
│ │ │   ├── org.eclipse.team.ui.prefs
│ │ │   ├── org.eclipse.ui.editors.prefs
│ │ │   ├── org.eclipse.ui.ide.prefs
│ │ │   ├── org.eclipse.ui.prefs
│ │ │   ├── org.eclipse.ui.workbench.prefs
│ │ │   └── org.python.pydev.prefs
│ │ ├── org.eclipse.e4.workbench
│ │ │ └── workbench.xmi
│ │ ├── org.eclipse.epp.logging.aeri.ui
│ │ │ ├── history
│ │ │ │ ├── _0.fdt
│ │ │ │ ├── _0.fdx
│ │ │ │ ├── _0.fnm
│ │ │ │ ├── _0.frq
│ │ │ │ ├── _0.nrm
│ │ │ │ ├── _0.tii
│ │ │ │ ├── _0.tis
│ │ │ │ ├── segments.gen
│ │ │ │ └── segments_1
│ │ │ ├── remote-index
│ │ │ │ ├── _2.fdt
│ │ │ │ ├── _2.fdx
│ │ │ │ ├── _2.fnm
│ │ │ │ ├── _2.frq
│ │ │ │ ├── _2.nrm
│ │ │ │ ├── _2.prx
│ │ │ │ ├── _2.tii
│ │ │ │ ├── _2.tis
│ │ │ │ ├── segments.gen
│ │ │ │ └── segments_3
│ │ │ └── server-config.json
│ │ ├── org.eclipse.jdt.core
│ │ │ ├── assumedExternalFilesCache
│ │ │ ├── externalFilesCache
│ │ │ ├── nonChainingJarsCache
│ │ │ └── variablesAndContainers.dat
│ │ ├── org.eclipse.jdt.ui
│ │ │ ├── OpenTypeHistory.xml
│ │ │ ├── QualifiedTypeNameHistory.xml
│ │ │ └── dialog_settings.xml
│ │ ├── org.eclipse.m2e.logback.configuration
│ │ │ ├── 0.log
│ │ │ └── logback.1.6.2.20150902-0002.xml
│ │ ├── org.eclipse.oomph.setup.ui
│ │ │ └── dialog_settings.xml
│ │ ├── org.eclipse.oomph.setup
│ │ │ └── workspace.setup
│ │ ├── org.eclipse.ui.ide
│ │ │ └── dialog_settings.xml
│ │ └── org.eclipse.ui.workbench
│ │   ├── dialog_settings.xml
│ │   └── workingsets.xml
│ └── version.ini
├── .project
├── .pydevproject
├── .settings
│ ├── org.eclipse.jdt.core.prefs
│ ├── org.eclipse.jdt.ui.prefs
│ └── org.eclipse.m2e.core.prefs
├── HyperVec.jar
├── README.md
├── code_mapping_across_languages
│ ├── AP_evaluation_code
│ │ ├── common.py
│ │ ├── test_default.py
│ │ └── test_norm.py
│ ├── alignment_files
│ │ ├── de_en.align
│ │ └── it_en.align
│ ├── convert_w2vTXT_to_w2vBIN.py
│ ├── credits_to_CLIC_trento.txt
│ ├── mappingcode
│ │ ├── __init__.py
│ │ ├── demo.sh~
│ │ ├── learn_mat.sh
│ │ ├── space.py
│ │ ├── space.pyc
│ │ ├── test_tm.py
│ │ ├── test_tm2.py
│ │ ├── test_tm_pred.py
│ │ ├── train_tm.py
│ │ ├── translate_tm.py
│ │ ├── utils.py
│ │ └── utils.pyc
│ ├── perform_mapping.sh
│ └── vocabulary file
│   ├── german_voc_wikipedia.txt.gz
│   └── italian_voc_wikipedia.txt.gz
├── config.cfg
├── create_features.py
├── datasets_across_languages
│ ├── eval_DE
│ │ ├── noun_hyp_vs_ant.txt
│ │ ├── noun_hyp_vs_syn.txt
│ │ └── noun_hyp_vs_synant.txt
│ └── eval_IT
│   ├── noun_hyp_vs-ant.txt
│   ├── noun_hyp_vs-syn-ant.txt
│   └── noun_hyp_vs-syn.txt
├── datasets_classification
│ ├── ABIBLESS.txt
│ ├── AWBLESS.txt
│ ├── BLESS.txt
│ ├── eval-bless.jar
│ ├── eval-dir.jar
│ └── readme_how_to.txt
├── evaluation_scripts
│ ├── common.py
│ └── corrEval.py
├── get-pretrainedHyperVecEmbeddings
│ └── download_embeddings.sh
├── hypernymy_resources
│ ├── cohyponym_n.txt.gz
│ ├── cohyponym_v.txt.gz
│ ├── hypernym_n.txt.gz
│ └── hypernym_v.txt.gz
├── pom.xml
└── src
  ├── common
  │ ├── DataStructureUtils.java
  │ ├── IOUtils.java
  │ ├── MathUtils.java
  │ ├── MeanAveragePrecision.java
  │ ├── SigmoidTable.java
  │ ├── SimpleMatrixUtils.java
  │ ├── TanhTable.java
  │ ├── WordForm.java
  │ ├── correlation
  │ │ ├── AreaUnderCurve.java
  │ │ └── MenCorrelation.java
  │ ├── exception
  │ │ ├── OutOfVocabularyException.java
  │ │ └── ValueException.java
  │ └── wordnet
  │   ├── LexicalHypernym.java
  │   ├── LexicalResource.java
  │   ├── LexicalResourceAdj.java
  │   ├── LexicalResourceNoun.java
  │   ├── LexicalResourceVerb.java
  │   ├── Synset.java
  │   ├── WordNetAdj.java
  │   ├── WordNetNoun.java
  │   ├── WordNetReader.java
  │   └── WordNetVerb.java
  ├── demo
  │ ├── HyperVecLearning.java
  │ └── W2vProperties.java
  ├── io
  │ ├── sentence
  │ │ ├── PlainSentenceInputStream.java
  │ │ ├── SentenceInputStream.java
  │ │ ├── SubSamplingSentenceInputStream.java
  │ │ └── TreeInputStream.java
  │ └── word
  │   ├── CombinedWordInputStream.java
  │   ├── Phrase.java
  │   ├── PushBackWordStream.java
  │   ├── WordFilter.java
  │   └── WordInputStream.java
  ├── neural
  │ └── function
  │   ├── ActivationFunction.java
  │   ├── Correlation.java
  │   ├── Sigmoid.java
  │   └── Tanh.java
  ├── space
  │ ├── AbstractSemanticSpace.java
  │ ├── Neighbor.java
  │ ├── RawSemanticSpace.java
  │ ├── SemanticSpace.java
  │ └── Similarity.java
  ├── tree
  │ ├── CcgTree.java
  │ └── Tree.java
  ├── vocab
  │ ├── HuffmanTree.java
  │ ├── Vocab.java
  │ ├── VocabEntry.java
  │ ├── VocabEntryFilter.java
  │ └── filter
  │   └── MinFrequencyVocabFilter.java
  └── word2vec
    ├── AbstractWord2Vec.java
    ├── MultiThreadWord2Vec.java
    ├── UniGram.java
    └── multitask
      └── Hyper2Vec.java
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 | /target
3 | .attach_pid*
4 |
--------------------------------------------------------------------------------
/.metadata/.lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.lock
--------------------------------------------------------------------------------
/.metadata/.log:
--------------------------------------------------------------------------------
1 | !SESSION 2016-01-11 12:36:53.838 -----------------------------------------------
2 | eclipse.buildId=4.5.1.M20150904-0015
3 | java.version=1.7.0_79
4 | java.vendor=Oracle Corporation
5 | BootLoader constants: OS=macosx, ARCH=x86_64, WS=cocoa, NL=en_US
6 | Framework arguments: -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation
7 | Command-line arguments: -os macosx -ws cocoa -arch x86_64 -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation
8 |
9 | !ENTRY org.eclipse.core.net 1 0 2016-01-11 12:36:55.177
10 | !MESSAGE System property http.nonProxyHosts has been set to local|*.local|169.254/16|*.169.254/16 by an external source. This value will be overwritten using the values from the preferences
11 |
12 | !ENTRY org.eclipse.jface 2 0 2016-01-11 12:37:55.985
13 | !MESSAGE Keybinding conflicts occurred. They may interfere with normal accelerator operation.
14 | !SUBENTRY 1 org.eclipse.jface 2 0 2016-01-11 12:37:55.985
15 | !MESSAGE A conflict occurred for ALT+COMMAND+R:
16 | Binding(ALT+COMMAND+R,
17 | ParameterizedCommand(Command(org.python.pydev.debug.setnext,Set Next Statement,
18 | ,
19 | Category(org.python.pydev.ui.category.run,PyDev - Run,Python run category,true),
20 | org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@1ee8d4b6,
21 | ,,true),null),
22 | org.eclipse.ui.defaultAcceleratorConfiguration,
23 | org.eclipse.ui.contexts.window,,,system)
24 | Binding(ALT+COMMAND+R,
25 | ParameterizedCommand(Command(org.eclipse.jdt.ui.edit.text.java.rename.element,Rename - Refactoring ,
26 | Rename the selected element,
27 | Category(org.eclipse.jdt.ui.category.refactoring,Refactor - Java,Java Refactoring Actions,true),
28 | org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@47e50894,
29 | ,,true),null),
30 | org.eclipse.ui.defaultAcceleratorConfiguration,
31 | org.eclipse.ui.contexts.window,,cocoa,system)
32 | !SESSION 2016-03-23 14:56:47.781 -----------------------------------------------
33 | eclipse.buildId=4.5.1.M20150904-0015
34 | java.version=1.7.0_79
35 | java.vendor=Oracle Corporation
36 | BootLoader constants: OS=macosx, ARCH=x86_64, WS=cocoa, NL=en_US
37 | Framework arguments: -product org.eclipse.epp.package.java.product -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation
38 | Command-line arguments: -os macosx -ws cocoa -arch x86_64 -product org.eclipse.epp.package.java.product -data /Volumes/Data/Doctorate/Implementation/w2vcomp -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation
39 |
40 | !ENTRY org.eclipse.core.net 1 0 2016-03-23 14:56:48.939
41 | !MESSAGE System property http.nonProxyHosts has been set to local|*.local|169.254/16|*.169.254/16 by an external source. This value will be overwritten using the values from the preferences
42 |
43 | !ENTRY org.eclipse.jface 2 0 2016-03-23 14:56:52.297
44 | !MESSAGE Keybinding conflicts occurred. They may interfere with normal accelerator operation.
45 | !SUBENTRY 1 org.eclipse.jface 2 0 2016-03-23 14:56:52.297
46 | !MESSAGE A conflict occurred for ALT+COMMAND+R:
47 | Binding(ALT+COMMAND+R,
48 | ParameterizedCommand(Command(org.python.pydev.debug.setnext,Set Next Statement,
49 | ,
50 | Category(org.python.pydev.ui.category.run,PyDev - Run,Python run category,true),
51 | org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@33b88372,
52 | ,,true),null),
53 | org.eclipse.ui.defaultAcceleratorConfiguration,
54 | org.eclipse.ui.contexts.window,,,system)
55 | Binding(ALT+COMMAND+R,
56 | ParameterizedCommand(Command(org.eclipse.jdt.ui.edit.text.java.rename.element,Rename - Refactoring ,
57 | Rename the selected element,
58 | Category(org.eclipse.jdt.ui.category.refactoring,Refactor - Java,Java Refactoring Actions,true),
59 | org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@16bdcbe5,
60 | ,,true),null),
61 | org.eclipse.ui.defaultAcceleratorConfiguration,
62 | org.eclipse.ui.contexts.window,,cocoa,system)
63 |
--------------------------------------------------------------------------------
/.metadata/.mylyn/repositories.xml.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.mylyn/repositories.xml.zip
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.resources/.root/.indexes/history.version:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.resources/.root/.indexes/properties.index:
--------------------------------------------------------------------------------
1 | / org.eclipse.jdt.core stateVersionNumber 28
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.resources/.root/.indexes/properties.version:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.resources/.root/2.tree:
--------------------------------------------------------------------------------
1 | org.eclipse.jdt.core
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.resources/.safetable/org.eclipse.core.resources:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.core.resources/.safetable/org.eclipse.core.resources
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | version=1
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.e4.ui.css.swt.theme.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | themeid=org.eclipse.e4.ui.css.theme.e4_default
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.e4.ui.workbench.renderers.swt.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | enableMRU=true
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.jdt.ui.prefs:
--------------------------------------------------------------------------------
1 | content_assist_proposals_background=255,255,255
2 | content_assist_proposals_foreground=0,0,0
3 | eclipse.preferences.version=1
4 | fontPropagated=true
5 | org.eclipse.jdt.internal.ui.navigator.layout=2
6 | org.eclipse.jdt.ui.editor.tab.width=
7 | org.eclipse.jdt.ui.formatterprofiles.version=12
8 | org.eclipse.jdt.ui.javadoclocations.migrated=true
9 | org.eclipse.jface.textfont=1|Monaco|13.0|0|COCOA|1|Monaco;
10 | proposalOrderMigrated=true
11 | sourceHoverBackgroundColor=236,235,236
12 | spelling_locale_initialized=true
13 | tabWidthPropagated=true
14 | useAnnotationsPrefPage=true
15 | useQuickDiffPrefPage=true
16 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.m2e.discovery.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.m2e.discovery.pref.projects=
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.mylyn.context.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | mylyn.attention.migrated=true
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.mylyn.monitor.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.mylyn.monitor.activity.tracking.enabled.checked=true
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.mylyn.tasks.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | migrated.task.repositories.secure.store=true
3 | org.eclipse.mylyn.tasks.ui.filters.nonmatching=true
4 | org.eclipse.mylyn.tasks.ui.filters.nonmatching.encouraged=true
5 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.team.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.team.ui.first_time=false
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.editors.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | lineNumberRuler=true
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.ide.prefs:
--------------------------------------------------------------------------------
1 | TASKS_FILTERS_MIGRATE=true
2 | eclipse.preferences.version=1
3 | platformState=1450704678997
4 | quickStart=false
5 | tipsAndTricks=true
6 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | showIntro=false
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.workbench.prefs:
--------------------------------------------------------------------------------
1 | //org.eclipse.ui.commands/state/org.eclipse.ui.navigator.resources.nested.changeProjectPresentation/org.eclipse.ui.commands.radioState=false
2 | ColorsAndFontsPreferencePage.expandedCategories=Torg.eclipse.ui.workbenchMisc
3 | ColorsAndFontsPreferencePage.selectedElement=Forg.eclipse.jface.textfont
4 | ENABLED_DECORATORS=org.eclipse.m2e.core.mavenVersionDecorator\:true,org.eclipse.buildship.ui.gradledecorator\:true,org.eclipse.egit.ui.internal.decorators.GitLightweightDecorator\:true,org.eclipse.jdt.ui.override.decorator\:true,org.eclipse.jdt.ui.interface.decorator\:true,org.eclipse.jdt.ui.buildpath.decorator\:true,org.eclipse.m2e.core.maven2decorator\:true,org.eclipse.mylyn.context.ui.decorator.interest\:true,org.eclipse.mylyn.tasks.ui.decorators.task\:true,org.eclipse.mylyn.team.ui.changeset.decorator\:true,org.eclipse.ui.LinkedResourceDecorator\:true,org.eclipse.ui.SymlinkDecorator\:true,org.eclipse.ui.VirtualResourceDecorator\:true,org.eclipse.ui.ContentTypeDecorator\:true,org.eclipse.ui.ResourceFilterDecorator\:false,org.python.pydev.navigator.decorator.problemsLabelDecorator\:true,
5 | PLUGINS_NOT_ACTIVATED_ON_STARTUP=org.eclipse.m2e.discovery;
6 | eclipse.preferences.version=1
7 | org.eclipse.jface.textfont=1|Monaco|13.0|0|COCOA|1|Monaco;
8 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.python.pydev.prefs:
--------------------------------------------------------------------------------
1 | INTERPRETERS_CHECKED_ONCE=true
2 | eclipse.preferences.version=1
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fdt:
--------------------------------------------------------------------------------
1 | 0.6
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fdx:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fnm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fnm
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.frq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.frq
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.nrm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.nrm
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tii
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tis:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tis
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments.gen
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments_1
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdt
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdx
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fnm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fnm
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.frq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.frq
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.nrm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.nrm
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tii
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tis:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tis
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments.gen
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments_3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments_3
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/server-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "v1",
3 | "title": "Eclipse.org Error Reporting Server",
4 | "description": "Automated Error Reporting for eclipse.org",
5 | "timestamp": 1458741415931,
6 | "ttl": 20160,
7 | "helpUrl": "https://wiki.eclipse.org/EPP/Logging",
8 | "feedbackUrl": "https://docs.google.com/a/codetrails.com/forms/d/1wd9AzydLv_TMa7ZBXHO7zQIhZjZCJRNMed-6J4fVNsc/viewform",
9 | "aboutUrl": "https://dev.eclipse.org/recommenders/community/confess/#/about",
10 | "submitUrl": "https://dev.eclipse.org/recommenders/community/confess/0.6/reports/",
11 | "maxReportSize": 5242880,
12 | "problemsUrl": "https://www.eclipse.org/downloads/download.php?r\u003d1\u0026file\u003d/technology/epp/logging/problems.zip",
13 | "problemsTtl": 20160,
14 | "queryUrl": "https://dev.eclipse.org/recommenders/community/confess/0.6/query/",
15 | "connectTimeout": 10000,
16 | "socketTimeout": 100000,
17 | "acceptedProducts": [
18 | "org.eclipse.*"
19 | ],
20 | "acceptedPlugins": [
21 | "org.eclipse.*",
22 | "org.apache.log4j.*",
23 | "com.codetrails.*"
24 | ],
25 | "acceptedPackages": [
26 | "org.eclipse.*",
27 | "org.apache.*",
28 | "java.*",
29 | "javax.*",
30 | "javafx.*",
31 | "sun.*",
32 | "com.sun.*",
33 | "com.codetrails.*",
34 | "com.google.*",
35 | "org.osgi.*",
36 | "ch.qos.*",
37 | "org.slf4j.*"
38 | ],
39 | "acceptOtherPackages": true,
40 | "acceptUiFreezes": true,
41 | "ignoredStatuses": [
42 | "org.eclipse.equinox.p2.*::",
43 | "org.eclipse.epp.mpc.ui:java.io.IOException:",
44 | "org.eclipse.epp.mpc.ui:java.net.SocketTimeoutException:",
45 | "org.eclipse.oomph.setup.core:$org.apache.http.ConnectionClosedException:",
46 | "org.eclipse.ui::Conflicting handlers for*",
47 | "org.eclipse.jface:java.io.IOException:Unable to resolve plug-in*",
48 | "org.eclipse.core.runtime::Invalid input url*",
49 | "org.eclipse.core.filesystem::Could not move*",
50 | "org.eclipse.core.filesystem::Could not delete*",
51 | "org.eclipse.pde.core::The current target platform contains errors*",
52 | ":org.eclipse.equinox.security.storage.StorageException:",
53 | ":org.eclipse.ecf.filetransfer.*:",
54 | ":java.net.*:"
55 | ],
56 | "problemsZipLastDownloadTimestamp": 1458741427690
57 | }
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.core/assumedExternalFilesCache:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.core/externalFilesCache:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.core/nonChainingJarsCache:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.core/variablesAndContainers.dat:
--------------------------------------------------------------------------------
1 | JRE_SRC M2_REPO
2 | JUNIT_HOME JRE_SRCROOT JRE_LIB JUNIT_SRC_HOME
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.ui/OpenTypeHistory.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.ui/QualifiedTypeNameHistory.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.jdt.ui/dialog_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
6 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.m2e.logback.configuration/0.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.m2e.logback.configuration/0.log
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.m2e.logback.configuration/logback.1.6.2.20150902-0002.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | %date [%thread] %-5level %logger{35} - %msg%n
5 |
6 |
7 | OFF
8 |
9 |
10 |
11 |
12 | ${org.eclipse.m2e.log.dir}/0.log
13 |
14 | ${org.eclipse.m2e.log.dir}/%i.log
15 | 1
16 | 10
17 |
18 |
19 | 100MB
20 |
21 |
22 | %date [%thread] %-5level %logger{35} - %msg%n
23 |
24 |
25 |
26 |
27 |
28 | WARN
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.oomph.setup.ui/dialog_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
12 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.oomph.setup/workspace.setup:
--------------------------------------------------------------------------------
1 |
2 |
7 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.ui.ide/dialog_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.ui.workbench/dialog_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
17 |
18 |
--------------------------------------------------------------------------------
/.metadata/.plugins/org.eclipse.ui.workbench/workingsets.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.metadata/version.ini:
--------------------------------------------------------------------------------
1 | #Wed Mar 23 14:56:49 CET 2016
2 | org.eclipse.core.runtime=2
3 | org.eclipse.platform=4.5.1.v20150904-0015
4 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | HyperVec
4 |
5 |
6 |
7 |
8 |
9 | org.python.pydev.PyDevBuilder
10 |
11 |
12 |
13 |
14 | org.eclipse.jdt.core.javabuilder
15 |
16 |
17 |
18 |
19 | org.eclipse.m2e.core.maven2Builder
20 |
21 |
22 |
23 |
24 |
25 | org.eclipse.m2e.core.maven2Nature
26 | org.eclipse.jdt.core.javanature
27 | org.python.pydev.pythonNature
28 |
29 |
30 |
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 |
2 |
3 | Default
4 | python 2.7
5 |
6 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.ui.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | formatter_profile=_Nghia
3 | formatter_settings_version=12
4 | org.eclipse.jdt.ui.exception.name=e
5 | org.eclipse.jdt.ui.gettersetter.use.is=true
6 | org.eclipse.jdt.ui.keywordthis=false
7 | org.eclipse.jdt.ui.overrideannotation=true
8 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/HyperVec.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/HyperVec.jar
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## HyperVec
2 | Hierarchical Embeddings for Hypernymy Detection and Directionality
3 |
4 | ### Prerequisites
5 | - [spaCy](https://spacy.io) for parsing (version 2.0.11)
6 | - a plain-text corpus, e.g. a Wikipedia dump
7 |
8 | ### Preprocess
9 | - Create the feature files:
10 |
11 | ```python create_features.py -input corpus-file.txt -output output-file-name -pos pos_tag```
12 |
13 | where `pos_tag` is either `NN` (for noun features) or `VB` (for verb features)
14 |
15 | ### Configuration
16 | See `config.cfg` to set the arguments for the model.
17 |
18 | ### Training embeddings
19 | ```java -jar HyperVec.jar config.cfg vector-size window-size```
20 |
21 | For example, to train embeddings with 100 dimensions and a window size of 5:
22 |
23 | ```java -jar HyperVec.jar config.cfg 100 5```
24 |
25 | ### Pretrained (hypervec) embeddings
26 | The embeddings used in our paper can be downloaded with the script `get-pretrainedHyperVecEmbeddings/download_embeddings.sh`. Note that the script downloads 9 files and concatenates them into a single file (`hypervec.txt.gz`). The format is the default word2vec text format: the first line carries the header information, and each following line contains a word followed by its whitespace-separated vector.
27 |
28 | Information about the embeddings: created from the ENCOW14A corpus (14.5bn tokens), 100 dimensions, a symmetric window of 5, 15 negative samples, a learning rate of 0.025, and a threshold of 0.05. The resulting vocabulary contains about 2.7m words.
29 |
30 | ### Example usage: evaluation on BLESS, BIBLESS and AWBLESS
31 | To reproduce our experiments from Table 3, use the code in `datasets_classification/`,
32 | assuming your vector file is located in the same folder and named `hypervec.txt.gz`.
33 | `java -jar eval-dir.jar hypervec.txt.gz` (Evaluate directionality on `BLESS.txt` using hyperscore)
34 | `java -jar eval-bless.jar hypervec.txt.gz 2 1000` (Evaluate classification on `BIBLESS.txt, AWBLESS.txt` using 2% of the training data and 1000 random iterations)
35 |
36 |
37 | ### Citation info
38 | If you use the code or the created feature norms, please [cite our paper (BibTeX)](http://www2.ims.uni-stuttgart.de/bibliographie/entry/2811b00e1bbd503adf28648ddb737132dc67a091/). The paper is available here: [PDF](http://www.aclweb.org/anthology/D17-1022), and the EMNLP poster here: [Poster](http://www.ims.uni-stuttgart.de/institut/mitarbeiter/koepermn/publications/poster_EMNLP2017.pdf).
39 |
--------------------------------------------------------------------------------
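
For reference, a minimal sketch of reading the concatenated `hypervec.txt.gz` in the word2vec text format the README describes. This is illustrative only, not repository code: the function name `load_text_embeddings` is an assumption, and `common.py` below implements the repository's fuller loader.

```python
import gzip

import numpy as np

def load_text_embeddings(path):
    # word2vec text format: first line is "<vocab_size> <dim>"; every
    # following line is a word plus its whitespace-separated components
    embs = {}
    with gzip.open(path, 'rb') as f:
        vocab_size, dim = map(int, f.readline().decode('utf8').split())
        for line in f:
            parts = line.decode('utf8').rstrip().split(' ')
            embs[parts[0]] = np.array(parts[1:], dtype=np.float32)
    return embs

embs = load_text_embeddings('hypervec.txt.gz')
```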
/code_mapping_across_languages/AP_evaluation_code/common.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numpy import fromstring, dtype
3 | from numpy.linalg import norm
4 |
5 | def smart_open(fname, mode='rb'):
6 | if fname.endswith('.gz'):
7 | import gzip
8 | return gzip.open(fname, mode)
9 | elif fname.endswith('.bz2'):
10 | import bz2
11 | return bz2.BZ2File(fname, mode)
12 | else:
13 | return open(fname, mode)
14 |
15 | def load_vecs(binary_file, binary=1):
16 | vecs = []
17 | vocab = []
18 | if binary==1:
19 | with smart_open(binary_file, 'rb') as f:
20 | header = to_unicode(f.readline())
21 | vocab_size, vector_size = map(int, header.split())
22 | binary_len = dtype(np.float32).itemsize * vector_size
23 | for _ in range(vocab_size):
24 | word = []
25 | while True:
26 | ch = f.read(1)
27 | if ch == b' ':
28 | break
29 | if ch != b'\n':
30 | word.append(ch)
31 | word = to_unicode(b''.join(word))
32 | vocab.append(word)
33 | vec = fromstring(f.read(binary_len), dtype=np.float32)
34 | vecs.append(vec)
35 | else:
36 | with smart_open(binary_file, 'rb') as f:
37 | header = to_unicode(f.readline())
38 | if len(header.split()) == 2: vocab_size, vector_size = map(int, header.split())
39 | elif len(header.split()) > 2:
40 | parts = header.rstrip().split(" ")
41 | word, vec = parts[0], list(map(np.float32, parts[1:]))
42 | vocab.append(to_unicode(word))
43 | vecs.append(vec)
44 | for _, line in enumerate(f):
45 | parts = to_unicode(line.rstrip()).split(" ")
46 | word, vec = parts[0], list(map(np.float32, parts[1:]))
47 | vocab.append(to_unicode(word))
48 | vecs.append(vec)
49 | #embs_dim = len(vecs[1])
50 | #UNKNOWN_WORD = np.random.uniform(-0.25,0.25,embs_dim)
51 | #vecs = np.vstack((UNKNOWN_WORD, vecs))
52 | #vocab = ['#UNKNOWN#'] + list(vocab)
53 | #words = {word:idx for idx,word in enumerate(vocab)}
54 |
55 | return vecs, vocab
56 |
57 | def to_utf8(text, errors='strict', encoding='utf8'):
58 | """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8."""
59 | if isinstance(text, unicode):
60 | return text.encode('utf8')
61 | # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
62 | else:
63 | return unicode(text, encoding, errors=errors).encode('utf8')
64 |
65 | def to_unicode(text, encoding='utf8', errors='strict'):
66 | """Convert a string (bytestring in `encoding` or unicode), to unicode."""
67 | if isinstance(text, unicode):
68 | return text
69 | else:
70 | return unicode(text, encoding=encoding, errors=errors)
--------------------------------------------------------------------------------
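
The `load_vecs` loader above returns parallel lists of vectors and words; the AP evaluation scripts turn these into a word-to-vector dictionary, mirroring `build_data` in `test_default.py` below. A minimal usage sketch (the file name `embeddings.bin` is a placeholder):

```python
import common

# binary=1 parses the word2vec binary format; binary=0 parses the text format
vecs, words = common.load_vecs('embeddings.bin', binary=1)
embs = {word: vecs[idx] for idx, word in enumerate(words)}
```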
/code_mapping_across_languages/AP_evaluation_code/test_default.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | from sklearn.metrics import average_precision_score
4 | from numpy.linalg import norm
5 | import common
6 |
7 | def cosine_sim(u, v):
8 | return np.dot(u,v)/(norm(u)*norm(v))
9 |
10 | def computeAP(targets, preds):
11 | paired = zip(preds, targets)
12 | sorted_paired = sorted(paired, key=lambda x:x[0], reverse=True)
13 | preds, targets = zip(*sorted_paired)
14 | preds, targets = list(preds), list(targets)
15 |
16 | ap = 0.0
17 | retrievedCounter = 0.0
18 | relevantCounter = 0.0
19 |
20 | for i in range(len(targets)):
21 | retrievedCounter += 1
22 | if int(targets[i]) == 1:
23 | relevantCounter += 1
24 | ap += relevantCounter / retrievedCounter
25 | ap = ap / relevantCounter if relevantCounter else 0.0  # guard against no relevant items
26 | return ap
27 |
28 | def _filter(word):
29 | word = word.split('-')
30 | if len(word) > 2:
31 | f_word = '-'.join(word[:-1])
32 | else:
33 | f_word = word[0]
34 | return f_word
35 |
36 | def load_dataset(dataset_file):
37 | dataset = []
38 | with open(dataset_file, 'r') as fin:
39 | for line in fin:
40 | left, right, label = line.strip().split('\t')
41 | dataset.append((left, right, int(label)))
42 | return dataset
43 |
44 | def compute_similarity(dataset, embs):
45 | data = []
46 | for (left, right, label) in dataset:
47 | if left in embs and right in embs:
48 | #direct = norm(embs[right]) / norm(embs[left])
49 | score = cosine_sim(embs[left], embs[right]) #* direct
50 | data.append((left, right, label, score))
51 | else:
52 | continue
53 | return data
54 |
55 | def build_data(dataset_file, embeddings_file):
56 | vecs, words = common.load_vecs(embeddings_file, binary=1) #TODO: set binary=0 to read text file
57 | embs = {word:vecs[idx] for idx,word in enumerate(words)}
58 | dataset = load_dataset(dataset_file)
59 | data = compute_similarity(dataset, embs)
60 |
61 | return data
62 |
63 | def ap_evaluation(data, cutoff=-1):
64 |
65 | data = sorted(data, key=lambda line:line[-1], reverse=True)
66 | targets, scores = [], []
67 | for (left, right, label, score) in data:
68 | targets.append(label)
69 | scores.append(score)
70 | if cutoff > 0:
71 | ap_score = average_precision_score(targets[:cutoff], scores[:cutoff])
72 | #ap_score = computeAP(targets, scores)
73 | print 'AP at %d cutoff: %f' %(cutoff, ap_score)
74 | else:
75 | ap_score = average_precision_score(targets, scores)
76 | #ap_score = computeAP(targets, scores)
77 | print 'AP score: %f' %ap_score
78 |
79 | return ap_score
80 |
81 | if __name__=='__main__':
82 | dataset_file = sys.argv[1]
83 | embeddings_file = sys.argv[2]
84 | data = build_data(dataset_file, embeddings_file)
85 | ap_evaluation(data)
86 |
87 |
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/AP_evaluation_code/test_norm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | from sklearn.metrics import average_precision_score
4 | from numpy.linalg import norm
5 | import common
6 |
7 | def cosine_sim(u, v):
8 | return np.dot(u,v)/(norm(u)*norm(v))
9 |
10 | def computeAP(targets, preds):
11 | paired = zip(preds, targets)
12 | sorted_paired = sorted(paired, key=lambda x:x[0], reverse=True)
13 | preds, targets = zip(*sorted_paired)
14 | preds, targets = list(preds), list(targets)
15 |
16 | ap = 0.0
17 | retrievedCounter = 0.0
18 | relevantCounter = 0.0
19 |
20 | for i in range(len(targets)):
21 | retrievedCounter += 1
22 | if int(targets[i]) == 1:
23 | relevantCounter += 1
24 | ap += relevantCounter / retrievedCounter
25 | ap = ap / relevantCounter if relevantCounter else 0.0  # guard against no relevant items
26 | return ap
27 |
28 | def _filter(word):
29 | word = word.split('-')
30 | if len(word) > 2:
31 | f_word = '-'.join(word[:-1])
32 | else:
33 | f_word = word[0]
34 | return f_word
35 |
36 | def load_dataset(dataset_file):
37 | dataset = []
38 | with open(dataset_file, 'r') as fin:
39 | for line in fin:
40 | left, right, label = line.strip().split('\t')
41 | dataset.append((left, right, int(label)))
42 | return dataset
43 |
44 | def compute_similarity(dataset, embs):
45 | data = []
46 | for (left, right, label) in dataset:
47 | if left in embs and right in embs:
48 | direct = norm(embs[right]) / norm(embs[left])
49 | score = cosine_sim(embs[left], embs[right]) * direct
50 | data.append((left, right, label, score))
51 | else:
52 | continue
53 | return data
54 |
55 | def build_data(dataset_file, embeddings_file):
56 | vecs, words = common.load_vecs(embeddings_file, binary=1) #TODO: set binary=0 to read text file
57 | embs = {word:vecs[idx] for idx,word in enumerate(words)}
58 | dataset = load_dataset(dataset_file)
59 | data = compute_similarity(dataset, embs)
60 |
61 | return data
62 |
63 | def ap_evaluation(data, cutoff=-1):
64 |
65 | data = sorted(data, key=lambda line:line[-1], reverse=True)
66 | targets, scores = [], []
67 | for (left, right, label, score) in data:
68 | targets.append(label)
69 | scores.append(score)
70 | if cutoff > 0:
71 | ap_score = average_precision_score(targets[:cutoff], scores[:cutoff])
72 | #ap_score = computeAP(targets, scores)
73 | print 'AP at %d cutoff: %f' %(cutoff, ap_score)
74 | else:
75 | ap_score = average_precision_score(targets, scores)
76 | #ap_score = computeAP(targets, scores)
77 | print 'AP score: %f' %ap_score
78 |
79 | return ap_score
80 |
81 | if __name__=='__main__':
82 | dataset_file = sys.argv[1]
83 | embeddings_file = sys.argv[2]
84 | data = build_data(dataset_file, embeddings_file)
85 | ap_evaluation(data)
86 |
87 |
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
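
The score in `compute_similarity` above weights cosine similarity by the ratio of the two vector norms, so a pair scores higher when read from the lower-norm word to the higher-norm word; under HyperVec training, hypernyms tend to receive the larger norms. A toy illustration of the same arithmetic (the vectors are made up):

```python
import numpy as np
from numpy.linalg import norm

hypo = np.array([0.3, 0.4])   # toy hyponym vector, norm 0.5
hyper = np.array([0.9, 1.2])  # toy hypernym vector, norm 1.5

cos = np.dot(hypo, hyper) / (norm(hypo) * norm(hyper))  # 1.0 (parallel)
print(cos * norm(hyper) / norm(hypo))  # hypo -> hyper: 3.0
print(cos * norm(hypo) / norm(hyper))  # hyper -> hypo: ~0.33
```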
/code_mapping_across_languages/convert_w2vTXT_to_w2vBIN.py:
--------------------------------------------------------------------------------
1 | from gensim.models import word2vec
2 | import sys
3 |
4 | # Script that converts a word2vec text file into the word2vec binary format
5 | print("Input file: %s" % sys.argv[1])
6 | model = word2vec.Word2Vec.load_word2vec_format(str(sys.argv[1]),binary=False)
7 | model.save_word2vec_format(str(sys.argv[1])+'.bin',binary=True)
8 |
9 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/credits_to_CLIC_trento.txt:
--------------------------------------------------------------------------------
1 | A huge part of this code is taken from an implementation that used to be available at http://clic.cimec.unitn.it and was also used for the paper 'Improving zero-shot learning by mitigating the hubness problem' by Georgiana Dinu, Angeliki Lazaridou and Marco Baroni: https://arxiv.org/pdf/1412.6568.pdf
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/demo.sh~:
--------------------------------------------------------------------------------
1 | echo "Training..."
2 |
3 | python train_tm.py -o tm data/OPUS_en_it_europarl_train_5K.txt data/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl data/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl
4 |
5 |
6 | echo "Testing standard NN retrieval (baseline)"
7 |
8 | python -c 5000 test_tm.py tm.pkl data/OPUS_en_it_europarl_test.txt data/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl data/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl
9 |
10 |
11 |
12 | echo "Testing GC retrieval with 5000 aditional elements"
13 |
14 | python -c 5000 test_tm.py tm.pkl data/OPUS_en_it_europarl_test.txt data/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl data/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl
15 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/learn_mat.sh:
--------------------------------------------------------------------------------
1 | echo "Training..."
2 |
3 |
4 | for i in 0 1 2 3 4 5 6 7 8 9
5 | do
6 | python ../train_tm.py -o trainmat_${i} align.train.-${i} encow5.ppmi.train-${i} GNet_img_avg.train.-${i} &&
7 | python ../train_tm.py -o testmat_${i} align.test.-${i} encow5.ppmi.test-${i} GNet_img_avg.test.-${i}
8 | done;
9 |
10 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/space.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class Space(object):
4 |
5 | def __init__(self, matrix_, id2row_):
6 |
7 | self.mat = matrix_
8 | self.id2row = id2row_
9 | self.create_row2id()
10 |
11 | def create_row2id(self):
12 | self.row2id = {}
13 | for idx, word in enumerate(self.id2row):
14 | if word in self.row2id:
15 | raise ValueError("Found duplicate word: %s" % (word))
16 | self.row2id[word] = idx
17 |
18 |
19 | @classmethod
20 | def build(cls, fname, lexicon=None):
21 |
22 | #if lexicon is provided, only data occurring in the lexicon is loaded
23 | id2row = []
24 | def filter_lines(f):
25 | for i,line in enumerate(f):
26 | word = line.split()[0]
27 | if i != 0 and (lexicon is None or word in lexicon):
28 | id2row.append(word)
29 | yield line
30 |
31 | #get the number of columns
32 | with open(fname) as f:
33 | f.readline()
34 | ncols = len(f.readline().split())
35 |
36 | with open(fname) as f:
37 | m = np.matrix(np.loadtxt(filter_lines(f),
38 | comments=None, usecols=range(1,ncols)))
39 |
40 | return Space(m, id2row)
41 |
42 | def normalize(self):
43 | row_norms = np.sqrt(np.multiply(self.mat, self.mat).sum(1))
44 | row_norms = row_norms.astype(np.double)
45 | row_norms[row_norms != 0] = np.array(1.0/row_norms[row_norms != 0]).flatten()
46 | self.mat = np.multiply(self.mat, row_norms)
47 |
48 |
49 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/space.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/mappingcode/space.pyc
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/test_tm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import getopt
3 | import numpy as np
4 | import collections
5 | import random
6 | from space import Space
7 | from utils import read_dict, apply_tm, score, get_valid_data
8 |
9 | def usage(errno=0):
10 | print >>sys.stderr,\
11 | """
12 | Given a translation matrix, test data (words and their translations) and
13 | source and target language vectors, it returns translations of source test
14 | words and computes Top N accuracy.
15 |
16 | Usage:
17 | python test_tm.py [options] trans_matrix test_data source_vecs target_vecs
18 | \n\
19 | Options:
20 | -o --output : file prefix. It prints the vectors obtained after
21 | the translation matrix is applied (.vecs.txt and .wds.txt).
22 | Optional. Default is ./translated_vecs
23 | -c --correction : Number of additional elements (ADDITIONAL TO TEST DATA)
24 | to be used with Global Correction (GC) strategy.
25 | Optional. By default, baseline retrieval is run.
26 |
27 | -h --help : help
28 |
29 | Arguments:
30 | trans_matrix: , translation matrix
31 | test_data: , list of source-target word pairs (space separated words,
32 | one word pair per line)
33 | source_vecs: , vectors in source language, Space-separated, with string
34 | identifier as first column (dim+1 columns, where dim is the
35 | dimensionality of the space)
36 | target_vecs: , vectors in target language
37 |
38 |
39 | Example:
40 | 1) Retrieve translations with standard nearest neighbour retrieval
41 |
42 | python test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
43 |
44 | 2) "Corrected" retrieval (GC). Use additional 2000 source space elements to
45 | correct for hubs (words that appear as the nearest neighbours of many points))
46 |
47 | python -c 2000 test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
48 |
49 | """
50 | sys.exit(errno)
51 |
52 |
53 | def main(sys_argv):
54 |
55 | try:
56 | opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
57 | ["help", "output=", "correction="])
58 | except getopt.GetoptError, err:
59 | print str(err)
60 | usage()
61 | sys.exit(1)
62 |
63 | out_file = "./translated_vecs"
64 | additional = None
65 | for opt, val in opts:
66 | if opt in ("-o", "--output"):
67 | out_file = val
68 | elif opt in ("-c", "--correction"):
69 | try:
70 | additional = int(val)
71 | except ValueError:
72 | usage(1)
73 | elif opt in ("-h", "--help"):
74 | usage(0)
75 | else:
76 | usage(1)
77 |
78 | if len(argv) == 4:
79 | tm_file = argv[0]
80 | test_file = argv[1]
81 | source_file = argv[2]
82 | target_file = argv[3]
83 |
84 | else:
85 | print >>sys.stderr, "Wrong number of arguments"
86 | usage(1)
87 |
88 | print "Loading the translation matrix"
89 | tm = np.loadtxt(tm_file)
90 |
91 | print "Reading the test data"
92 | test_data = read_dict(test_file)
93 |
94 | #in the _source_ space, we only need to load vectors for the words in test.
95 | #semantic spaces may contain additional words, ALL words in the _target_
96 | #space are used as the search space
97 | source_words, _ = zip(*test_data)
98 | source_words = set(source_words)
99 |
100 | print "Reading: %s" % source_file
101 | if not additional:
102 | source_sp = Space.build(source_file, source_words)
103 | else:
104 | #read all the words in the space
105 | lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str,
106 | comments=None, usecols=(0,)).flatten())
107 | #the max number of additional+test elements is bounded by the size
108 | #of the lexicon
109 | additional = min(additional, len(lexicon) - len(source_words))
110 | #we sample additional elements that are not already in source_words
111 | random.seed(100)
112 | lexicon = random.sample(list(lexicon.difference(source_words)), additional)
113 |
114 | #load the source space
115 | source_sp = Space.build(source_file, source_words.union(set(lexicon)))
116 |
117 | source_sp.normalize()
118 |
119 | print "Reading: %s" % target_file
120 | target_sp = Space.build(target_file)
121 | target_sp.normalize()
122 |
123 | print "Translating" #translates all the elements loaded in the source space
124 | mapped_source_sp = apply_tm(source_sp, tm)
125 |
126 | print "Retrieving translations"
127 | test_data = get_valid_data(source_sp, target_sp, test_data)
128 |
129 | #turn test data into a dictionary (a word can have multiple translations)
130 | gold = collections.defaultdict(set)
131 | for k, v in test_data:
132 | gold[k].add(v)
133 |
134 | score(mapped_source_sp, target_sp, gold, additional)
135 |
136 | print "Printing mapped vectors: %s" % out_file
137 | np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
138 | np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
139 |
140 | if __name__ == '__main__':
141 | main(sys.argv)
142 |
143 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/test_tm2.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import getopt
3 | import numpy as np
4 | import collections
5 | import random
6 | from space import Space
7 | from utils import read_dict, apply_tm, score, get_valid_data
8 |
9 | def usage(errno=0):
10 | print >>sys.stderr,\
11 | """
12 | Given a translation matrix, test data (words and their translations) and
13 | source and target language vectors, it returns translations of source test
14 | words and computes Top N accuracy.
15 |
16 | Usage:
17 | python test_tm.py [options] trans_matrix test_data source_vecs target_vecs
18 | \n\
19 | Options:
20 | -o --output : file prefix. It prints the vectors obtained after
21 | the translation matrix is applied (.vecs.txt and .wds.txt).
22 | Optional. Default is ./translated_vecs
23 | -c --correction : Number of additional elements (ADDITIONAL TO TEST DATA)
24 | to be used with Global Correction (GC) strategy.
25 | Optional. By default, baseline retrieval is run.
26 |
27 | -h --help : help
28 |
29 | Arguments:
30 | trans_matrix: , translation matrix
31 | test_data: , list of source-target word pairs (space separated words,
32 | one word pair per line)
33 | source_vecs: , vectors in source language, Space-separated, with string
34 | identifier as first column (dim+1 columns, where dim is the
35 | dimensionality of the space)
36 | target_vecs: , vectors in target language
37 |
38 |
39 | Example:
40 | 1) Retrieve translations with standard nearest neighbour retrieval
41 |
42 | python test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
43 |
44 | 2) "Corrected" retrieval (GC). Use additional 2000 source space elements to
45 | correct for hubs (words that appear as the nearest neighbours of many points))
46 |
47 | python -c 2000 test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
48 |
49 | """
50 | sys.exit(errno)
51 |
52 |
53 | def main(sys_argv):
54 |
55 | try:
56 | opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
57 | ["help", "output=", "correction="])
58 | except getopt.GetoptError, err:
59 | print str(err)
60 | usage()
61 | sys.exit(1)
62 |
63 | out_file = "./translated_vecs2"
64 | additional = None
65 | for opt, val in opts:
66 | if opt in ("-o", "--output"):
67 | out_file = val
68 | elif opt in ("-c", "--correction"):
69 | try:
70 | additional = int(val)
71 | except ValueError:
72 | usage(1)
73 | elif opt in ("-h", "--help"):
74 | usage(0)
75 | else:
76 | usage(1)
77 |
78 | if len(argv) == 4:
79 | tm_file = argv[0]
80 | test_file = argv[1]
81 | source_file = argv[2]
82 | target_file = argv[3]
83 |
84 | else:
85 | print >>sys.stderr, "Wrong number of arguments"
86 | usage(1)
87 |
88 | print "Loading the translation matrix"
89 | tm = np.loadtxt(tm_file)
90 |
91 | print "Reading the test data"
92 | test_data = read_dict(test_file)
93 |
94 | #in the _source_ space, we only need to load vectors for the words in test.
95 | #semantic spaces may contain additional words, ALL words in the _target_
96 | #space are used as the search space
97 | source_words, _ = zip(*test_data)
98 | source_words = set(source_words)
99 |
100 | print "Reading: %s" % source_file
101 | if not additional:
102 | source_sp = Space.build(source_file, source_words)
103 | else:
104 | #read all the words in the space
105 | lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str,
106 | comments=None, usecols=(0,)).flatten())
107 | #the max number of additional+test elements is bounded by the size
108 | #of the lexicon
109 | additional = min(additional, len(lexicon) - len(source_words))
110 | #we sample additional elements that are not already in source_words
111 | random.seed(100)
112 | lexicon = random.sample(list(lexicon.difference(source_words)), additional)
113 |
114 | #load the source space
115 | source_sp = Space.build(source_file, source_words.union(set(lexicon)))
116 |
117 | source_sp.normalize()
118 |
119 | print "Reading: %s" % target_file
120 | target_sp = Space.build(target_file)
121 | target_sp.normalize()
122 |
123 | print "Translating" #translates all the elements loaded in the source space
124 | mapped_source_sp = apply_tm(source_sp, tm)
125 |
126 | print "Retrieving translations"
127 | test_data = get_valid_data(source_sp, target_sp, test_data)
128 |
129 | #turn test data into a dictionary (a word can have multiple translations)
130 | gold = collections.defaultdict(set)
131 | for k, v in test_data:
132 | gold[k].add(v)
133 |
134 | score(mapped_source_sp, target_sp, gold, additional)
135 |
136 | print "Printing mapped vectors: %s" % out_file
137 | np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
138 | np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
139 |
140 | if __name__ == '__main__':
141 | main(sys.argv)
142 |
143 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/test_tm_pred.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import getopt
3 | import numpy as np
4 | import collections
5 | import random
6 | from space import Space
7 | from utils import read_dict, apply_tm, score, get_valid_data
8 |
9 | def usage(errno=0):
10 | print >>sys.stderr,\
11 | """
12 | Given a translation matrix, test data (words and their translations) and
13 | source and target language vectors, it maps the source test words into the
14 | target space and prints the mapped vectors (accuracy scoring is disabled here).
15 |
16 | Usage:
17 | python test_tm_pred.py [options] trans_matrix test_data source_vecs target_vecs
18 | \n\
19 | Options:
20 | -o --output : file prefix. It prints the vectors obtained after
21 | the translation matrix is applied (.vecs.txt and .wds.txt).
22 | Optional. Default is ./translated_vecs
23 | -c --correction : Number of additional elements (ADDITIONAL TO TEST DATA)
24 | to be used with Global Correction (GC) strategy.
25 | Optional. By default, baseline retrieval is run.
26 |
27 | -h --help : help
28 |
29 | Arguments:
30 | trans_matrix: translation matrix
31 | test_data: list of source-target word pairs (space separated words,
32 | one word pair per line)
33 | source_vecs: vectors in source language. Space-separated, with string
34 | identifier as first column (dim+1 columns, where dim is the
35 | dimensionality of the space)
36 | target_vecs: vectors in target language
37 |
38 |
39 | Example:
40 | 1) Retrieve translations with standard nearest neighbour retrieval
41 |
42 | python test_tm_pred.py tm.txt test_data.txt ENspace.txt ITspace.txt
43 |
44 | 2) "Corrected" retrieval (GC). Use 2000 additional source space elements to
45 | correct for hubs (words that appear as the nearest neighbours of many points)
46 |
47 | python test_tm_pred.py -c 2000 tm.txt test_data.txt ENspace.txt ITspace.txt
48 |
49 | """
50 | sys.exit(errno)
51 |
52 |
53 | def main(sys_argv):
54 |
55 | try:
56 | opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
57 | ["help", "output=", "correction="])
58 | except getopt.GetoptError, err:
59 | print str(err)
60 | usage()
61 | sys.exit(1)
62 |
63 | out_file = "./translated_vecs"
64 | additional = None
65 | for opt, val in opts:
66 | if opt in ("-o", "--output"):
67 | out_file = val
68 | elif opt in ("-c", "--correction"):
69 | try:
70 | additional = int(val)
71 | except ValueError:
72 | usage(1)
73 | elif opt in ("-h", "--help"):
74 | usage(0)
75 | else:
76 | usage(1)
77 |
78 | if len(argv) == 4:
79 | tm_file = argv[0]
80 | test_file = argv[1]
81 | source_file = argv[2]
82 | target_file = argv[3]
83 |
84 | else:
85 | print >>sys.stderr, "Wrong number of arguments"
86 | usage(1)
87 |
88 | print "Loading the translation matrix"
89 | tm = np.loadtxt(tm_file)
90 |
91 | print "Reading the test data"
92 | test_data = read_dict(test_file)
93 |
94 | #in the _source_ space, we only need to load vectors for the words in test.
95 | #semantic spaces may contain additional words, ALL words in the _target_
96 | #space are used as the search space
97 | source_words, _ = zip(*test_data)
98 | source_words = set(source_words)
99 |
100 | print "Reading: %s" % source_file
101 | if not additional:
102 | source_sp = Space.build(source_file, source_words)
103 | else:
104 | #read all the words in the space
105 | lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str,
106 | comments=None, usecols=(0,)).flatten())
107 | #the max number of additional+test elements is bounded by the size
108 | #of the lexicon
109 | additional = min(additional, len(lexicon) - len(source_words))
110 | #we sample additional elements that are not already in source_words
111 | random.seed(100)
112 | lexicon = random.sample(list(lexicon.difference(source_words)), additional)
113 |
114 | #load the source space
115 | source_sp = Space.build(source_file, source_words.union(set(lexicon)))
116 |
117 | source_sp.normalize()
118 |
119 | print "Reading: %s" % target_file
120 | target_sp = Space.build(target_file)
121 | target_sp.normalize()
122 |
123 | print "Translating" #translates all the elements loaded in the source space
124 | mapped_source_sp = apply_tm(source_sp, tm)
125 |
126 | print "Retrieving translations"
127 | test_data = get_valid_data(source_sp, target_sp, test_data)
128 |
129 | #turn test data into a dictionary (a word can have multiple translations)
130 | #gold = collections.defaultdict(set)
131 | #for k, v in test_data:
132 | # gold[k].add(v)
133 |
134 | #score(mapped_source_sp, target_sp, gold, additional)
135 |
136 | print "Printing mapped vectors: %s" % out_file
137 | np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
138 | np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
139 |
140 | if __name__ == '__main__':
141 | main(sys.argv)
142 |
143 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/train_tm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import getopt
3 | import numpy as np
4 | from space import Space
5 | from utils import read_dict, train_tm
6 |
7 | def usage(errno=0):
8 | print >>sys.stderr,\
9 | """
10 | Given train data (pairs of words and their translation), source language and
11 | target language vectors, it outputs a translation matrix between source and
12 | target spaces.
13 |
14 | Usage:
15 | python train_tm.py [options] train_data source_vecs target_vecs
16 | \n\
17 | Options:
18 | -o --output : output file prefix. Optional. Default is ./tm
19 | -h --help : help
20 |
21 | Arguments:
22 | train_data: train dictionary, list of word pairs (space separated words,
23 | one word pair per line)
24 | source_vecs: vectors in source language. Space-separated, with string
25 | identifier as first column (dim+1 columns, where dim is the dimensionality
26 | of the space)
27 | target_vecs: vectors in target language
28 |
29 |
30 | Example:
31 | python train_tm.py train_data.txt ENspace.txt ITspace.txt
32 |
33 | """
34 | sys.exit(errno)
35 |
36 |
37 | def main(sys_argv):
38 |
39 | try:
40 | opts, argv = getopt.getopt(sys_argv[1:], "ho:",
41 | ["help", "output="])
42 | except getopt.GetoptError, err:
43 | print str(err)
44 | usage()
45 | sys.exit(1)
46 |
47 | out_file = "./tm"
48 | for opt, val in opts:
49 | if opt in ("-o", "--output"):
50 | out_file = val
51 | elif opt in ("-h", "--help"):
52 | usage(0)
53 | else:
54 | usage(1)
55 |
56 | if len(argv) == 3:
57 | source_file = argv[1]
58 | target_file = argv[2]
59 | dict_file = argv[0]
60 | else:
61 | print >>sys.stderr, "Wrong number of arguments"
62 | usage(1)
63 |
64 |
65 | print "Reading the training data"
66 | train_data = read_dict(dict_file)
67 |
68 | #we only need to load the vectors for the words in the training data
69 | #semantic spaces contain additional words
70 | source_words, target_words = zip(*train_data)
71 |
72 | print "Reading: %s" % source_file
73 | source_sp = Space.build(source_file, set(source_words))
74 | source_sp.normalize()
75 |
76 | print "Reading: %s" % target_file
77 | target_sp = Space.build(target_file, set(target_words))
78 | target_sp.normalize()
79 |
80 | print "Learning the translation matrix"
81 | tm = train_tm(source_sp, target_sp, train_data)
82 |
83 | print "Printing the translation matrix"
84 | np.savetxt("%s.txt" % out_file, tm)
85 |
86 |
87 | if __name__ == '__main__':
88 | main(sys.argv)
89 |
90 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/translate_tm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import getopt
3 | import numpy as np
4 | import collections
5 | import random
6 | from space import Space
7 | from utils import read_dict, apply_tm, score, get_valid_data
8 |
9 | def usage(errno=0):
10 | print >>sys.stderr,\
11 | """
12 | Given a translation matrix, test data (words and their translations) and
13 | source and target language vectors, it maps the source test words into the
14 | target space and prints the mapped vectors (no accuracy scoring).
15 |
16 | Usage:
17 | python translate_tm.py [options] trans_matrix test_data source_vecs target_vecs
18 | \n\
19 | Options:
20 | -o --output : file prefix. It prints the vectors obtained after
21 | the translation matrix is applied (.vecs.txt and .wds.txt).
22 | Optional. Default is ./translated_vecs
23 | -c --correction : Number of additional elements (ADDITIONAL TO TEST DATA)
24 | to be used with Global Correction (GC) strategy.
25 | Optional. By default, baseline retrieval is run.
26 |
27 | -h --help : help
28 |
29 | Arguments:
30 | trans_matrix: translation matrix
31 | test_data: list of source-target word pairs (space separated words,
32 | one word pair per line)
33 | source_vecs: vectors in source language. Space-separated, with string
34 | identifier as first column (dim+1 columns, where dim is the
35 | dimensionality of the space)
36 | target_vecs: vectors in target language
37 |
38 |
39 | Example:
40 | 1) Retrieve translations with standard nearest neighbour retrieval
41 |
42 | python translate_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt
43 |
44 | 2) "Corrected" retrieval (GC). Use 2000 additional source space elements to
45 | correct for hubs (words that appear as the nearest neighbours of many points)
46 |
47 | python translate_tm.py -c 2000 tm.txt test_data.txt ENspace.txt ITspace.txt
48 |
49 | """
50 | sys.exit(errno)
51 |
52 |
53 | def main(sys_argv):
54 |
55 | try:
56 | opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
57 | ["help", "output=", "correction="])
58 | except getopt.GetoptError, err:
59 | print str(err)
60 | usage()
61 | sys.exit(1)
62 |
63 | out_file = "./translated_vecs"
64 | additional = None
65 | for opt, val in opts:
66 | if opt in ("-o", "--output"):
67 | out_file = val
68 | elif opt in ("-c", "--correction"):
69 | try:
70 | additional = int(val)
71 | except ValueError:
72 | usage(1)
73 | elif opt in ("-h", "--help"):
74 | usage(0)
75 | else:
76 | usage(1)
77 |
78 | if len(argv) == 4:
79 | tm_file = argv[0]
80 | test_file = argv[1]
81 | source_file = argv[2]
82 | target_file = argv[3]
83 |
84 | else:
85 | print >>sys.stderr, "Wrong number of arguments"
86 | usage(1)
87 |
88 | print "Loading the translation matrix"
89 | tm = np.loadtxt(tm_file)
90 |
91 | print "Reading the test data"
92 | test_data = read_dict(test_file)
93 |
94 | #in the _source_ space, we only need to load vectors for the words in test.
95 | #semantic spaces may contain additional words, ALL words in the _target_
96 | #space are used as the search space
97 | source_words, _ = zip(*test_data)
98 | source_words = set(source_words)
99 |
100 | print "Reading: %s" % source_file
101 | if not additional:
102 | source_sp = Space.build(source_file, source_words)
103 | else:
104 | #read all the words in the space
105 | lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str,
106 | comments=None, usecols=(0,)).flatten())
107 | #the max number of additional+test elements is bounded by the size
108 | #of the lexicon
109 | additional = min(additional, len(lexicon) - len(source_words))
110 | #we sample additional elements that are not already in source_words
111 | random.seed(100)
112 | lexicon = random.sample(list(lexicon.difference(source_words)), additional)
113 |
114 | #load the source space
115 | source_sp = Space.build(source_file, source_words.union(set(lexicon)))
116 |
117 | source_sp.normalize()
118 |
119 | print "Reading: %s" % target_file
120 | target_sp = Space.build(target_file)
121 | target_sp.normalize()
122 |
123 | print "Translating" #translates all the elements loaded in the source space
124 | mapped_source_sp = apply_tm(source_sp, tm)
125 |
126 |
127 | print "Printing mapped vectors: %s" % out_file
128 | np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
129 | np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
130 |
131 | if __name__ == '__main__':
132 | main(sys.argv)
133 |
134 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import collections
3 | from space import Space
4 |
5 |
6 | def prec_at(ranks, cut):
7 | return len([r for r in ranks if r <= cut])/float(len(ranks))
8 |
9 | def get_rank(nn, gold):
10 | for idx,word in enumerate(nn):
11 | if word in gold:
12 | return idx + 1
13 | return idx + 1
14 |
15 |
16 | def read_dict(dict_file):
17 | return [tuple(line.strip().split()) for line in file(dict_file)]
18 |
19 |
20 | def apply_tm(sp, tm):
21 |
22 | print "Applying the translation matrix, size of data: %d" % sp.mat.shape[0]
23 | return Space(sp.mat*tm, sp.id2row)
24 |
25 | def get_valid_data(sp1, sp2, data):
26 | return [(el1, el2) for el1,el2 in data if
27 | el1 in sp1.row2id and el2 in sp2.row2id]
28 |
29 | def train_tm(sp1, sp2, data):
30 |
31 | data = get_valid_data(sp1, sp2, data)
32 | print "Training using: %d word pairs" % len(data)
33 |
34 | els1, els2 = zip(*data)
35 | m1 = sp1.mat[[sp1.row2id[el] for el in els1],:]
36 | m2 = sp2.mat[[sp2.row2id[el] for el in els2],:]
37 |
38 | tm = np.linalg.lstsq(m1, m2, -1)[0]
39 |
40 | return tm
41 |
42 |
43 | def score(sp1, sp2, gold, additional):
44 |
45 | sp1.normalize()
46 |
47 | print "Computing cosines and sorting target space elements"
48 | sim_mat = -sp2.mat*sp1.mat.T
49 |
50 | if additional:
51 | #for each element, computes its rank in the ranked list of
52 | #similarities. sorting done on the opposite axis (inverse querying)
53 | srtd_idx = np.argsort(np.argsort(sim_mat, axis=1), axis=1)
54 |
55 | #for each element, the resulting rank is combined with cosine scores.
56 | #the effect will be of breaking the ties, because cosines are smaller
57 | #than 1. sorting done on the standard axis (regular NN querying)
58 | srtd_idx = np.argsort(srtd_idx + sim_mat, axis=0)
59 | else:
60 | srtd_idx = np.argsort(sim_mat, axis=0)
61 |
62 | ranks = []
63 | for i,el1 in enumerate(gold.keys()):
64 |
65 | sp1_idx = sp1.row2id[el1]
66 |
67 | #print the top 5 translations
68 | translations = []
69 | for j in range(5):
70 | sp2_idx = srtd_idx[j, sp1_idx]
71 | word, score = sp2.id2row[sp2_idx], -sim_mat[sp2_idx, sp1_idx]
72 | translations.append("\t\t%s:%.3f" % (word, score))
73 |
74 | translations = "\n".join(translations)
75 |
76 | #get the rank of the (highest-ranked) translation
77 | rnk = get_rank(srtd_idx[:,sp1_idx].A.ravel(),
78 | [sp2.row2id[el] for el in gold[el1]])
79 | ranks.append(rnk)
80 |
81 | print ("\nId: %d Source: %s \n\tTranslation:\n%s \n\tGold: %s \n\tRank: %d" %
82 | (len(ranks), el1, translations, gold[el1], rnk))
83 |
84 | print "Corrected: %s" % str(additional)
85 | if additional:
86 | print "Total extra elements, Test(%d) + Additional:%d" % (len(gold.keys()),
87 | sp1.mat.shape[0])
88 | for k in [1,5,10]:
89 | print "Prec@%d: %.3f" % (k, prec_at(ranks, k))
90 |
91 |
--------------------------------------------------------------------------------
/code_mapping_across_languages/mappingcode/utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/mappingcode/utils.pyc
--------------------------------------------------------------------------------
/code_mapping_across_languages/perform_mapping.sh:
--------------------------------------------------------------------------------
1 | MAIN="/mount/arbeitsdaten29/corpora/waterloo/img/en_vec/zeroShot/"
2 | CODE="mappingcode/"
3 |
4 |
5 | EN="hypercos.txt" #<- English Vectors (plain text w2v format)
6 | DE="de_cow_vecs.txt" #<- Source Language Vectors DE/IT
7 | A="zero_full.align" # Alignment file format word-source TAB word-target (EN)
8 | AV="fullvoc_de.txt" # <- Vocabulary file of the source language (used to predict every word in Source -> Target)
9 | OUT="out/" #<- Output folder
10 |
11 | python ${CODE}train_tm.py -o TM1 ${A} ${DE} ${EN}; # Learn the mapping Matrix
12 | python ${CODE}test_tm_pred.py TM1.txt ${AV} ${DE} ${EN}; # Apply the Mapping Matrix
13 | paste -d" " translated_vecs.wds.txt translated_vecs.vecs.txt > ${OUT}output-vecs-tmp.txt # this is just formatting (overwrite, don't append)
14 | rm -f translated_vecs*; # remove temporary files
15 | head -1 ${DE} > HEAD.txt; # grab the word2vec header line
16 | cat HEAD.txt ${OUT}output-vecs-tmp.txt > ${OUT}output-vecs.txt;
17 | rm -f HEAD.txt;
18 | rm -f ${OUT}output-vecs-tmp.txt
19 | rm -f TM1.txt; # train_tm.py saved the matrix as TM1.txt
20 | #gzip ${OUT}output-vecs.txt # <- final new file!
21 |
22 |
23 | # Now we can convert the vectors into binary word2vec format using the script convert_w2vTXT_to_w2vBIN.py
24 | python convert_w2vTXT_to_w2vBIN.py ${OUT}output-vecs.txt # (will create) output-vecs.txt.bin
25 |
26 | # Now we can evaluate the binary embeddings
27 | # Using hyperscore: python AP_evaluation_code/test_norm.py
28 | # Using default cosine: python AP_evaluation_code/test_default.py
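
convert_w2vTXT_to_w2vBIN.py itself is not reproduced in this dump. As a rough sketch of what such a conversion involves (the function name and details here are illustrative assumptions, not the repo's code), the word2vec binary format read by evaluation_scripts/common.py can be written like this:

    import numpy as np

    def txt_to_w2v_bin(txt_path, bin_path):  # hypothetical helper, not from the repo
        with open(txt_path) as fin:
            vocab_size, dim = map(int, fin.readline().split())  # "vocab_size dim" header
            with open(bin_path, 'wb') as fout:
                fout.write(('%d %d\n' % (vocab_size, dim)).encode('utf8'))
                for line in fin:
                    parts = line.rstrip().split(' ')
                    vec = np.array(parts[1:], dtype=np.float32)
                    # word, a space, then dim raw float32 values, then a newline
                    fout.write(parts[0].encode('utf8') + b' ' + vec.tobytes() + b'\n')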
--------------------------------------------------------------------------------
/code_mapping_across_languages/vocabulary file/german_voc_wikipedia.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/vocabulary file/german_voc_wikipedia.txt.gz
--------------------------------------------------------------------------------
/code_mapping_across_languages/vocabulary file/italian_voc_wikipedia.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/vocabulary file/italian_voc_wikipedia.txt.gz
--------------------------------------------------------------------------------
/config.cfg:
--------------------------------------------------------------------------------
1 | ProjectDir = /mount/arbeitsdaten34/projekte/slu/KimAnh/HypeEmb
2 |
3 | TrainDir = /mount/arbeitsdaten34/projekte/slu/KimAnh/AntSynDistinction/corpus
4 | VocabFileName = wiki_en.vocab
5 |
6 | WordVectorFileName = hypervec.bin
7 |
8 | hypeNoun = /projekte/semrel/Users/kim-anh/hypernyms/hypernym_n.txt
9 | hypeVerb = /projekte/semrel/Users/kim-anh/hypernyms/hypernym_v.txt
10 | cohypoNoun = /projekte/semrel/Users/kim-anh/hypernyms/cohyponym_n.txt
11 | cohypoVerb = /projekte/semrel/Users/kim-anh/hypernyms/cohyponym_v.txt
12 |
13 | featureNoun = /mount/arbeitsdaten34/projekte/slu/KimAnh/AntSynDistinction/lexical-contrast/wiki_en_features.noun
14 | featureVerb = /mount/arbeitsdaten34/projekte/slu/KimAnh/AntSynDistinction/lexical-contrast/wiki_en_features.verb
15 |
16 | HierarchialSoftmax = false
17 | NegativeSampling = 15
18 | SubSampling = 1e-5
19 | MinFrequency = 50
20 |
--------------------------------------------------------------------------------
/create_features.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import spacy
3 | from spacy.lang.en import English
4 | import gzip
5 | from collections import Counter, defaultdict
6 | import six.moves.cPickle as pickle
7 | from itertools import count
8 |
9 | def main():
10 | """
11 | Extracts windowed co-occurrence feature files from the corpus
12 | Usage: python create_features.py -input corpus -output output-file-name -pos POS
13 | -pos: POS-tag prefix to extract features for (e.g. NN for nouns, VB for verbs)
14 | """
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('-input', type=str)
17 | parser.add_argument('-output', type=str)
18 | parser.add_argument('-pos', type=str)
19 | args = parser.parse_args()
20 |
21 | nlp = English()
22 | nlp.add_pipe(nlp.create_pipe('sentencizer'))
23 |
24 | window_size = 5
25 | dfeatures = defaultdict(set)
26 |
27 | output_dir = '/mount/arbeitsdaten34/projekte/slu/KimAnh/Corpora/'
28 |
29 | vocab_to_id = defaultdict(count(0).next)
30 |
31 | with gzip.open(args.input,'rb') as fin:
32 | para_num = 0
33 | # Read each paragraph in corpus
34 | for paragraph in fin:
35 | # Check empty paragraph
36 | paragraph = paragraph.strip()
37 | if len(paragraph) == 0: continue
38 | para_num += 1
39 | print 'Processing para: %d' %para_num
40 | # Parse each sentence
41 | parsed_para = nlp(unicode(paragraph))
42 | for sent in parsed_para.sents:
43 | features = process_one_sentence(sent, args.pos, window_size, vocab_to_id)
44 | for kk, vv in features.iteritems(): dfeatures[kk] |= vv # merge the sets; dict.update would overwrite
45 |
46 | id_to_vocab = {idx:word for word,idx in vocab_to_id.iteritems()}
47 | save_file(dfeatures, id_to_vocab, args.output)
48 |
49 | print 'Parsing corpus done....!'
50 |
51 | def save_file(dfeatures, id_to_vocab, outfile):
52 | with gzip.open(outfile, 'w') as fout:
53 | for kk,vv in dfeatures.iteritems():
54 | contexts = [id_to_vocab[idx] for idx in list(vv)]
55 | fout.write(id_to_vocab[kk].encode('utf8'))
56 | for word in contexts:
57 | fout.write('\t' + word.encode('utf8'))
58 | fout.write('\n')
59 | print 'Saved file!'
60 |
61 | def process_one_sentence(sent, pos, window_size, vocab_to_id):
62 | features = defaultdict(set)
63 |
64 | for idx,token in enumerate(sent):
65 | if token.tag_[:2] == pos and len(token.string.strip()) > 2:
66 | for idw in range(idx-window_size, idx+window_size+1): # symmetric window
67 | if idw != idx and idw >= 0 and idw < len(sent):
68 | features[vocab_to_id[sent[idx].text]].add(vocab_to_id[sent[idw].text]) # key on token strings, not Token objects
69 |
70 | return features
71 |
72 | if __name__=='__main__':
73 | main()
74 |
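
A spaCy-free toy illustration of the windowed co-occurrence features that process_one_sentence collects (made-up sentence, window_size=2 instead of 5, words instead of vocabulary ids):

    from collections import defaultdict

    sent = "the tall tree shades the small house".split()
    window_size = 2
    features = defaultdict(set)
    for idx, word in enumerate(sent):
        for idw in range(idx - window_size, idx + window_size + 1):
            if idw != idx and 0 <= idw < len(sent):   # stay inside the sentence
                features[word].add(sent[idw])
    print(sorted(features['tree']))                   # ['shades', 'tall', 'the']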
--------------------------------------------------------------------------------
/datasets_across_languages/eval_DE/noun_hyp_vs_ant.txt:
--------------------------------------------------------------------------------
1 | Pflanze Lebewesen 1
2 | Anrufbeantworter Gerät 1
3 | Rakete Flugobjekt 1
4 | Jagd Freizeitbeschäftigung 1
5 | Persönlichkeit Eigenschaft 1
6 | Restaurierung Pflege 1
7 | Vizekönig Adeliger 1
8 | Bein Körperteil 1
9 | Bauabschnitt Bau 1
10 | Hoheit Herrscher 1
11 | Mittelklasse Klasse 1
12 | Schwergewicht Gewichtsklasse 1
13 | Bauabschnitt Etappe 1
14 | Recherche Tätigkeit 1
15 | MEZ Zeitzone 1
16 | Ekel Gefühl 1
17 | Trio Gruppe 1
18 | Kirchengemeinde Gruppe 1
19 | Fisch Tier 1
20 | Modell Ding 1
21 | Bock Säugetier 1
22 | Agentur Firma 1
23 | Gesamtbelastung Belastung 1
24 | Mittelklasse Gesellschaftsschicht 1
25 | Bremer Norddeutscher 1
26 | Kern Pflanzenbestandteil 1
27 | Looping Figur 1
28 | Bahnsteig Verkehr 1
29 | Komplex Gebäude 1
30 | Stuhl Möbel 1
31 | Bremer Person 1
32 | Sirene Alarmgerät 1
33 | Nutzer Kunde 1
34 | Hoheit Titel 1
35 | Torf Material 1
36 | Bedienung Restaurant 1
37 | Lieblichkeit Attraktivität 1
38 | Bibel Buch 1
39 | Bein Gliedmaße 1
40 | Los Glücksspiel 1
41 | Wache Schutz 1
42 | Persönlichkeit Person 1
43 | Bahnsteig Haltestelle 1
44 | Breite Dimension 1
45 | Bremer Deutscher 1
46 | Nobelpreis Preis 1
47 | Professor Gelehrter 1
48 | Gegenargument Diskussion 1
49 | Zeit Dimension 1
50 | Nutzer Person 1
51 | Großmeister Meister 1
52 | Wache Tätigkeit 1
53 | Killer Krimineller 1
54 | Schleife Gebinde 1
55 | Alb Gebirgsmassiv 1
56 | Bedienung Dienstleistung 1
57 | Vizekönig Herrscher 1
58 | Arzt Beruf 1
59 | Bock Tier 1
60 | Zivilgesellschaft Gesellschaft 1
61 | Ablehnung Reaktion 1
62 | Lieblichkeit Eigenschaft 1
63 | Gerät Ding 1
64 | Professor Lehrperson 1
65 | Bundesland Region 1
66 | Eiweiß Nährstoff 1
67 | Natter Tier 1
68 | Restaurierung Modernisierung 1
69 | Breite Maß 1
70 | Katze Tier 1
71 | Bahnsteig Bahnhof 1
72 | Glaubenssatz Satz 1
73 | Katze Haustier 1
74 | Signatur Kennzeichen 1
75 | Eiweiß Ei 1
76 | Hexe Märchengestalt 1
77 | Agentur Unternehmen 1
78 | Propagandist Politiker 1
79 | Signatur Schrift 1
80 | Entscheidungskompetenz Kompetenz 1
81 | Verhandlung Gespräch 1
82 | Temperatur Maßeinheit 1
83 | Wandzeitung Zeitung 1
84 | Gatter Begrenzung 1
85 | Auflösung Ende 1
86 | Nutzer Mensch 1
87 | Bauer Beruf 1
88 | Ausnahmefall Fall 1
89 | Rübe Gemüse 1
90 | Funk Kommunikation 1
91 | Pension Einkommen 1
92 | Pflanze Natur 1
93 | Atomwaffe Waffe 1
94 | Ablehnung Verhalten 1
95 | Buchhalterin Beruf 1
96 | Individualismus Wertesystem 1
97 | MEZ Zeit 1
98 | Signatur Überschrift 0
99 | Kriminalität Sicherheit 0
100 | Bein Arm 0
101 | Ablehnung Annahme 0
102 | Komplex Einfachheit 0
103 | MEZ UTC 0
104 | Schulbuch Roman 0
105 | Ausnahmefall Regelfall 0
106 | Fluch Segen 0
107 | Killer Opfer 0
108 | Verhandlung Bestimmung 0
109 | Breite Enge 0
110 | Ausnahmefall Normalfall 0
111 | Individualismus Kollektivismus 0
112 | Hexe Fee 0
113 | Ablehnung Zustimmung 0
114 | Kriminalität Friedlichkeit 0
115 | Komplex einfach 0
116 | Lieblichkeit Bitterkeit 0
117 | Bauer Stadtbewohner 0
118 | Devise Landeswährung 0
119 | Bremer Hamburger 0
120 | Breite Länge 0
121 | Regression Fortschritt 0
122 | Torf Sand 0
123 | Kaufhaus Tante-Emma-Laden 0
124 | Gegenargument Fürargument 0
125 | Pfeffer Zucker 0
126 | Natter Viper 0
127 | Hoheit Untertan 0
128 | Hoheit Bürger 0
129 | Deutlichkeit Unklarheit 0
130 | Gesamtbelastung Einzelbelastung 0
131 | Großmeister Lehrling 0
132 | Kadett Offizier 0
133 | Restaurierung Verfall 0
134 | Gerät Mensch 0
135 | Bahnsteig Bahngleis 0
136 | Bedienung Selbstbedienung 0
137 | Fisch Fleisch 0
138 | Eiweiß Kohlenhydrat 0
139 | Gesamtbelastung Teilbelastung 0
140 | Ekel Zuneigung 0
141 | Materialsammlung Einzelstück 0
142 | VHS Universität 0
143 | Auslöser Bremser 0
144 | Funk Kabel 0
145 | Explosion Implosion 0
146 | Nutzer Entwickler 0
147 | Rakete U-Boot 0
148 | Buchhalterin Buchhalter 0
149 | Bibel Koran 0
150 | Stuhl Tisch 0
151 | Alb Flachland 0
152 | Alb Tal 0
153 | Auflösung Gründung 0
154 | Pfeffer Salz 0
155 | Zeit Raum 0
156 | Katalysator Motor 0
157 | Kern Hülle 0
158 | Ekel Anziehung 0
159 | Bock Geiß 0
160 | Regression Progression 0
161 | Betrieb Ruhe 0
162 | Restaurierung Zerstörung 0
163 | Wache Einbrecher 0
164 | Pflanze Tier 0
165 | Natter Vogel 0
166 | Arzt Patient 0
167 | Klage Zufriedenheit 0
168 | Mittelklasse Oberklasse 0
169 | Bremer Münchner 0
170 | Los Niete 0
171 | Verjüngung Alterung 0
172 | Nutzer Hersteller 0
173 | Zeit Unendlichkeit 0
174 | Ausnahmefall Regel 0
175 | Sirene Stille 0
176 | Lieblichkeit Hässlichkeit 0
177 | Hexe Zauberer 0
178 | Entscheidungskompetenz Entscheidungsunfähigkeit 0
179 | VHS DVD 0
180 | Modell Realität 0
181 | Mittelklasse Oberschicht 0
182 | Trio Solo 0
183 | Bedienung Gast 0
184 | Schwergewicht Leichtgewicht 0
185 | Deutlichkeit Undeutlichkeit 0
186 | Abschneiden Ankleben 0
187 | Verhandlung Entscheidung 0
188 | Gatter Freiheit 0
189 | Katze Hund 0
190 | Bahnsteig Bushaltestelle 0
191 | Bauer Industrieller 0
192 | Betrieb Stillstand 0
193 |
--------------------------------------------------------------------------------
/datasets_across_languages/eval_DE/noun_hyp_vs_syn.txt:
--------------------------------------------------------------------------------
1 | Pflanze Lebewesen 1
2 | Anrufbeantworter Gerät 1
3 | Rakete Flugobjekt 1
4 | Jagd Freizeitbeschäftigung 1
5 | Persönlichkeit Eigenschaft 1
6 | Restaurierung Pflege 1
7 | Vizekönig Adeliger 1
8 | Bein Körperteil 1
9 | Bauabschnitt Bau 1
10 | Hoheit Herrscher 1
11 | Mittelklasse Klasse 1
12 | Schwergewicht Gewichtsklasse 1
13 | Bauabschnitt Etappe 1
14 | Recherche Tätigkeit 1
15 | MEZ Zeitzone 1
16 | Ekel Gefühl 1
17 | Trio Gruppe 1
18 | Kirchengemeinde Gruppe 1
19 | Fisch Tier 1
20 | Modell Ding 1
21 | Bock Säugetier 1
22 | Agentur Firma 1
23 | Gesamtbelastung Belastung 1
24 | Mittelklasse Gesellschaftsschicht 1
25 | Bremer Norddeutscher 1
26 | Kern Pflanzenbestandteil 1
27 | Looping Figur 1
28 | Bahnsteig Verkehr 1
29 | Komplex Gebäude 1
30 | Stuhl Möbel 1
31 | Bremer Person 1
32 | Sirene Alarmgerät 1
33 | Nutzer Kunde 1
34 | Hoheit Titel 1
35 | Torf Material 1
36 | Bedienung Restaurant 1
37 | Lieblichkeit Attraktivität 1
38 | Bibel Buch 1
39 | Bein Gliedmaße 1
40 | Los Glücksspiel 1
41 | Wache Schutz 1
42 | Persönlichkeit Person 1
43 | Bahnsteig Haltestelle 1
44 | Breite Dimension 1
45 | Bremer Deutscher 1
46 | Nobelpreis Preis 1
47 | Professor Gelehrter 1
48 | Gegenargument Diskussion 1
49 | Zeit Dimension 1
50 | Nutzer Person 1
51 | Großmeister Meister 1
52 | Wache Tätigkeit 1
53 | Killer Krimineller 1
54 | Schleife Gebinde 1
55 | Alb Gebirgsmassiv 1
56 | Bedienung Dienstleistung 1
57 | Vizekönig Herrscher 1
58 | Arzt Beruf 1
59 | Bock Tier 1
60 | Zivilgesellschaft Gesellschaft 1
61 | Ablehnung Reaktion 1
62 | Lieblichkeit Eigenschaft 1
63 | Gerät Ding 1
64 | Professor Lehrperson 1
65 | Bundesland Region 1
66 | Eiweiß Nährstoff 1
67 | Natter Tier 1
68 | Restaurierung Modernisierung 1
69 | Breite Maß 1
70 | Katze Tier 1
71 | Bahnsteig Bahnhof 1
72 | Glaubenssatz Satz 1
73 | Katze Haustier 1
74 | Signatur Kennzeichen 1
75 | Eiweiß Ei 1
76 | Hexe Märchengestalt 1
77 | Agentur Unternehmen 1
78 | Propagandist Politiker 1
79 | Signatur Schrift 1
80 | Entscheidungskompetenz Kompetenz 1
81 | Verhandlung Gespräch 1
82 | Temperatur Maßeinheit 1
83 | Wandzeitung Zeitung 1
84 | Gatter Begrenzung 1
85 | Auflösung Ende 1
86 | Nutzer Mensch 1
87 | Bauer Beruf 1
88 | Ausnahmefall Fall 1
89 | Rübe Gemüse 1
90 | Funk Kommunikation 1
91 | Pension Einkommen 1
92 | Pflanze Natur 1
93 | Atomwaffe Waffe 1
94 | Ablehnung Verhalten 1
95 | Buchhalterin Beruf 1
96 | Individualismus Wertesystem 1
97 | MEZ Zeit 1
98 | Wandzeitung Aushang 0
99 | Propagandist Verkaufsförderer 0
100 | Schulbuch Lehrbuch 0
101 | Hoheit König 0
102 | Sirene Martinshorn 0
103 | Ausnahmefall Sonderfall 0
104 | Lieblichkeit Anmut 0
105 | VHS Videokassette 0
106 | Kadett Anfänger 0
107 | Gatter Umzäunung 0
108 | Devise Wahlspruch 0
109 | Fisch Meeresbewohner 0
110 | Pfeffer Würze 0
111 | Zivilgesellschaft Volk 0
112 | Anrufbeantworter AB 0
113 | Katze Stubentiger 0
114 | Professor Dozent 0
115 | Lieblichkeit Liebreiz 0
116 | Kirchengemeinde Glaubensanhänger 0
117 | Wache Aufsicht 0
118 | Gegenargument Widerspruch 0
119 | Temperatur Wärmegrad 0
120 | Bauer Landwirt 0
121 | Restaurierung Instandsetzung 0
122 | Großmeister Sachkundiger 0
123 | Bein Gliedmaß 0
124 | Numerus Nummer 0
125 | Pension Gästehaus 0
126 | Agentur Geschäftsstelle 0
127 | Funk Radio 0
128 | Abschneiden Abtrennen 0
129 | Kondom Pariser 0
130 | Trio Dreiergruppe 0
131 | Rübe Karotte 0
132 | Rakete Geschoss 0
133 | Zeile Linie 0
134 | Kaufhaus Einkaufszentrum 0
135 | Komplex Störung 0
136 | Trio drei 0
137 | Glaubenssatz Lehre 0
138 | Looping Salto 0
139 | Explosion Ausbruch 0
140 | Kondom Präservativ 0
141 | Anrufbeantworter Mailbox 0
142 | Rübe Wurzel 0
143 | VHS Volkshochschule 0
144 | Ablehnung Abweisung 0
145 | Looping Überschlag 0
146 | Kriminalität Delinquenz 0
147 | Signatur Unterschrift 0
148 | Verhandlung Besprechung 0
149 | Klage Anschuldigung 0
150 | Pflanze Gewächs 0
151 | Atomwaffe Kernwaffe 0
152 | Bauer Farmer 0
153 | Zeit Dauer 0
154 | Fisch Wassertier 0
155 | Umgebung Umland 0
156 | Bauabschnitt Bausektion 0
157 | Verhandlung Prozess 0
158 | Individualismus Eigenheit 0
159 | Rübe Möhre 0
160 | Anrufbeantworter Telefonbeantworter 0
161 | Zeit Epoche 0
162 | Invasion Einfall 0
163 | Jagd Hetze 0
164 | Mittelklasse Mittelschicht 0
165 | Breite Weite 0
166 | Schleife Schlaufe 0
167 | Schwergewicht Schwerpunkt 0
168 | Arzt Doktor 0
169 | Katze Samtpfote 0
170 | Bahnsteig Perron 0
171 | Bundesland Gliedstaat 0
172 | Zivilgesellschaft Bürgergesellschaft 0
173 | Bock Hammel 0
174 | Gesamtbelastung Totalbelastung 0
175 | Gerät Apparat 0
176 | Hoheit Majestät 0
177 | Zeile Reihe 0
178 | Mittelklasse Mittelstand 0
179 | Katalysator Beschleuniger 0
180 | Looping Schleife 0
181 | Explosion Sprengung 0
182 | Kaufhaus Warenhaus 0
183 | Professor Hochschullehrer 0
184 | Blatt Zettel 0
185 | Schulbuch Schullektüre 0
186 | Temperatur Wärme 0
187 | Verjüngung Verengung 0
188 |
--------------------------------------------------------------------------------
/datasets_across_languages/eval_DE/noun_hyp_vs_synant.txt:
--------------------------------------------------------------------------------
1 | Pflanze Lebewesen 1
2 | Anrufbeantworter Gerät 1
3 | Rakete Flugobjekt 1
4 | Jagd Freizeitbeschäftigung 1
5 | Persönlichkeit Eigenschaft 1
6 | Restaurierung Pflege 1
7 | Vizekönig Adeliger 1
8 | Bein Körperteil 1
9 | Bauabschnitt Bau 1
10 | Hoheit Herrscher 1
11 | Mittelklasse Klasse 1
12 | Schwergewicht Gewichtsklasse 1
13 | Bauabschnitt Etappe 1
14 | Recherche Tätigkeit 1
15 | MEZ Zeitzone 1
16 | Ekel Gefühl 1
17 | Trio Gruppe 1
18 | Kirchengemeinde Gruppe 1
19 | Fisch Tier 1
20 | Modell Ding 1
21 | Bock Säugetier 1
22 | Agentur Firma 1
23 | Gesamtbelastung Belastung 1
24 | Mittelklasse Gesellschaftsschicht 1
25 | Bremer Norddeutscher 1
26 | Kern Pflanzenbestandteil 1
27 | Looping Figur 1
28 | Bahnsteig Verkehr 1
29 | Komplex Gebäude 1
30 | Stuhl Möbel 1
31 | Bremer Person 1
32 | Sirene Alarmgerät 1
33 | Nutzer Kunde 1
34 | Hoheit Titel 1
35 | Torf Material 1
36 | Bedienung Restaurant 1
37 | Lieblichkeit Attraktivität 1
38 | Bibel Buch 1
39 | Bein Gliedmaße 1
40 | Los Glücksspiel 1
41 | Wache Schutz 1
42 | Persönlichkeit Person 1
43 | Bahnsteig Haltestelle 1
44 | Breite Dimension 1
45 | Bremer Deutscher 1
46 | Nobelpreis Preis 1
47 | Professor Gelehrter 1
48 | Gegenargument Diskussion 1
49 | Zeit Dimension 1
50 | Nutzer Person 1
51 | Großmeister Meister 1
52 | Wache Tätigkeit 1
53 | Killer Krimineller 1
54 | Schleife Gebinde 1
55 | Alb Gebirgsmassiv 1
56 | Bedienung Dienstleistung 1
57 | Vizekönig Herrscher 1
58 | Arzt Beruf 1
59 | Bock Tier 1
60 | Zivilgesellschaft Gesellschaft 1
61 | Ablehnung Reaktion 1
62 | Lieblichkeit Eigenschaft 1
63 | Gerät Ding 1
64 | Professor Lehrperson 1
65 | Bundesland Region 1
66 | Eiweiß Nährstoff 1
67 | Natter Tier 1
68 | Restaurierung Modernisierung 1
69 | Breite Maß 1
70 | Katze Tier 1
71 | Bahnsteig Bahnhof 1
72 | Glaubenssatz Satz 1
73 | Katze Haustier 1
74 | Signatur Kennzeichen 1
75 | Eiweiß Ei 1
76 | Hexe Märchengestalt 1
77 | Agentur Unternehmen 1
78 | Propagandist Politiker 1
79 | Signatur Schrift 1
80 | Entscheidungskompetenz Kompetenz 1
81 | Verhandlung Gespräch 1
82 | Temperatur Maßeinheit 1
83 | Wandzeitung Zeitung 1
84 | Gatter Begrenzung 1
85 | Auflösung Ende 1
86 | Nutzer Mensch 1
87 | Bauer Beruf 1
88 | Ausnahmefall Fall 1
89 | Rübe Gemüse 1
90 | Funk Kommunikation 1
91 | Pension Einkommen 1
92 | Pflanze Natur 1
93 | Atomwaffe Waffe 1
94 | Ablehnung Verhalten 1
95 | Buchhalterin Beruf 1
96 | Individualismus Wertesystem 1
97 | MEZ Zeit 1
98 | Signatur Überschrift 0
99 | Kriminalität Sicherheit 0
100 | Bein Arm 0
101 | Ablehnung Annahme 0
102 | Komplex Einfachheit 0
103 | MEZ UTC 0
104 | Schulbuch Roman 0
105 | Ausnahmefall Regelfall 0
106 | Fluch Segen 0
107 | Killer Opfer 0
108 | Verhandlung Bestimmung 0
109 | Breite Enge 0
110 | Ausnahmefall Normalfall 0
111 | Individualismus Kollektivismus 0
112 | Hexe Fee 0
113 | Ablehnung Zustimmung 0
114 | Kriminalität Friedlichkeit 0
115 | Komplex einfach 0
116 | Lieblichkeit Bitterkeit 0
117 | Bauer Stadtbewohner 0
118 | Devise Landeswährung 0
119 | Bremer Hamburger 0
120 | Breite Länge 0
121 | Regression Fortschritt 0
122 | Torf Sand 0
123 | Kaufhaus Tante-Emma-Laden 0
124 | Gegenargument Fürargument 0
125 | Pfeffer Zucker 0
126 | Natter Viper 0
127 | Hoheit Untertan 0
128 | Hoheit Bürger 0
129 | Deutlichkeit Unklarheit 0
130 | Gesamtbelastung Einzelbelastung 0
131 | Großmeister Lehrling 0
132 | Kadett Offizier 0
133 | Restaurierung Verfall 0
134 | Gerät Mensch 0
135 | Bahnsteig Bahngleis 0
136 | Bedienung Selbstbedienung 0
137 | Fisch Fleisch 0
138 | Eiweiß Kohlenhydrat 0
139 | Gesamtbelastung Teilbelastung 0
140 | Ekel Zuneigung 0
141 | Materialsammlung Einzelstück 0
142 | VHS Universität 0
143 | Auslöser Bremser 0
144 | Funk Kabel 0
145 | Explosion Implosion 0
146 | Nutzer Entwickler 0
147 | Rakete U-Boot 0
148 | Buchhalterin Buchhalter 0
149 | Bibel Koran 0
150 | Stuhl Tisch 0
151 | Alb Flachland 0
152 | Alb Tal 0
153 | Auflösung Gründung 0
154 | Pfeffer Salz 0
155 | Zeit Raum 0
156 | Katalysator Motor 0
157 | Kern Hülle 0
158 | Ekel Anziehung 0
159 | Bock Geiß 0
160 | Regression Progression 0
161 | Betrieb Ruhe 0
162 | Restaurierung Zerstörung 0
163 | Wache Einbrecher 0
164 | Pflanze Tier 0
165 | Natter Vogel 0
166 | Arzt Patient 0
167 | Klage Zufriedenheit 0
168 | Mittelklasse Oberklasse 0
169 | Bremer Münchner 0
170 | Los Niete 0
171 | Verjüngung Alterung 0
172 | Nutzer Hersteller 0
173 | Zeit Unendlichkeit 0
174 | Ausnahmefall Regel 0
175 | Sirene Stille 0
176 | Lieblichkeit Hässlichkeit 0
177 | Hexe Zauberer 0
178 | Entscheidungskompetenz Entscheidungsunfähigkeit 0
179 | VHS DVD 0
180 | Modell Realität 0
181 | Mittelklasse Oberschicht 0
182 | Trio Solo 0
183 | Bedienung Gast 0
184 | Schwergewicht Leichtgewicht 0
185 | Deutlichkeit Undeutlichkeit 0
186 | Abschneiden Ankleben 0
187 | Verhandlung Entscheidung 0
188 | Gatter Freiheit 0
189 | Katze Hund 0
190 | Bahnsteig Bushaltestelle 0
191 | Bauer Industrieller 0
192 | Betrieb Stillstand 0
193 | Wandzeitung Aushang 0
194 | Propagandist Verkaufsförderer 0
195 | Schulbuch Lehrbuch 0
196 | Hoheit König 0
197 | Sirene Martinshorn 0
198 | Ausnahmefall Sonderfall 0
199 | Lieblichkeit Anmut 0
200 | VHS Videokassette 0
201 | Kadett Anfänger 0
202 | Gatter Umzäunung 0
203 | Devise Wahlspruch 0
204 | Fisch Meeresbewohner 0
205 | Pfeffer Würze 0
206 | Zivilgesellschaft Volk 0
207 | Anrufbeantworter AB 0
208 | Katze Stubentiger 0
209 | Professor Dozent 0
210 | Lieblichkeit Liebreiz 0
211 | Kirchengemeinde Glaubensanhänger 0
212 | Wache Aufsicht 0
213 | Gegenargument Widerspruch 0
214 | Temperatur Wärmegrad 0
215 | Bauer Landwirt 0
216 | Restaurierung Instandsetzung 0
217 | Großmeister Sachkundiger 0
218 | Bein Gliedmaß 0
219 | Numerus Nummer 0
220 | Pension Gästehaus 0
221 | Agentur Geschäftsstelle 0
222 | Funk Radio 0
223 | Abschneiden Abtrennen 0
224 | Kondom Pariser 0
225 | Trio Dreiergruppe 0
226 | Rübe Karotte 0
227 | Rakete Geschoss 0
228 | Zeile Linie 0
229 | Kaufhaus Einkaufszentrum 0
230 | Komplex Störung 0
231 | Trio drei 0
232 | Glaubenssatz Lehre 0
233 | Looping Salto 0
234 | Explosion Ausbruch 0
235 | Kondom Präservativ 0
236 | Anrufbeantworter Mailbox 0
237 | Rübe Wurzel 0
238 | VHS Volkshochschule 0
239 | Ablehnung Abweisung 0
240 | Looping Überschlag 0
241 | Kriminalität Delinquenz 0
242 | Signatur Unterschrift 0
243 | Verhandlung Besprechung 0
244 | Klage Anschuldigung 0
245 | Pflanze Gewächs 0
246 | Atomwaffe Kernwaffe 0
247 | Bauer Farmer 0
248 | Zeit Dauer 0
249 | Fisch Wassertier 0
250 | Umgebung Umland 0
251 | Bauabschnitt Bausektion 0
252 | Verhandlung Prozess 0
253 | Individualismus Eigenheit 0
254 | Rübe Möhre 0
255 | Anrufbeantworter Telefonbeantworter 0
256 | Zeit Epoche 0
257 | Invasion Einfall 0
258 | Jagd Hetze 0
259 | Mittelklasse Mittelschicht 0
260 | Breite Weite 0
261 | Schleife Schlaufe 0
262 | Schwergewicht Schwerpunkt 0
263 | Arzt Doktor 0
264 | Katze Samtpfote 0
265 | Bahnsteig Perron 0
266 | Bundesland Gliedstaat 0
267 | Zivilgesellschaft Bürgergesellschaft 0
268 | Bock Hammel 0
269 | Gesamtbelastung Totalbelastung 0
270 | Gerät Apparat 0
271 | Hoheit Majestät 0
272 | Zeile Reihe 0
273 | Mittelklasse Mittelstand 0
274 | Katalysator Beschleuniger 0
275 | Looping Schleife 0
276 | Explosion Sprengung 0
277 | Kaufhaus Warenhaus 0
278 | Professor Hochschullehrer 0
279 | Blatt Zettel 0
280 | Schulbuch Schullektüre 0
281 | Temperatur Wärme 0
282 | Verjüngung Verengung 0
283 |
--------------------------------------------------------------------------------
/datasets_classification/eval-bless.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/datasets_classification/eval-bless.jar
--------------------------------------------------------------------------------
/datasets_classification/eval-dir.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/datasets_classification/eval-dir.jar
--------------------------------------------------------------------------------
/datasets_classification/readme_how_to.txt:
--------------------------------------------------------------------------------
1 |
2 | ## Evaluate Bless
3 | # 1] Vector file hypercos_wiki.txt.gz = word2vec format (txt), gzipped.
4 | # 2] Percentage used for training (in the paper we use 2%)
5 | # 3] Number of iterations (in the paper we use 1000)
6 | java -jar eval-bless.jar hypercos_wiki.txt.gz 2 1000
7 |
--------------------------------------------------------------------------------
/evaluation_scripts/common.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numpy import fromstring, dtype
3 |
4 | def smart_open(fname, mode='rb'):
5 | if fname.endswith('.gz'):
6 | import gzip
7 | return gzip.open(fname, mode)
8 | elif fname.endswith('.bz2'):
9 | import bz2
10 | return bz2.BZ2File(fname, mode)
11 | else:
12 | return open(fname, mode)
13 |
14 | def load_vecs(binary_file, binary=1):
15 | vecs = []
16 | vocab = []
17 | if binary==1:
18 | with smart_open(binary_file, 'rb') as f:
19 | header = to_unicode(f.readline())
20 | vocab_size, vector_size = map(int, header.split())
21 | binary_len = dtype(np.float32).itemsize * vector_size
22 | for _ in range(vocab_size):
23 | word = []
24 | while True:
25 | ch = f.read(1)
26 | if ch == b' ':
27 | break
28 | if ch != b'\n':
29 | word.append(ch)
30 | word = to_unicode(b''.join(word))
31 | vocab.append(word)
32 | vec = fromstring(f.read(binary_len), dtype=np.float32)
33 | vecs.append(vec)
34 | else:
35 | with smart_open(binary_file, 'rb') as f:
36 | header = to_unicode(f.readline())
37 | if len(header.split()) == 2: vocab_size, vector_size = map(int, header.split())
38 | elif len(header.split()) > 2:
39 | parts = header.rstrip().split(" ")
40 | word, vec = parts[0], list(map(np.float32, parts[1:]))
41 | vocab.append(to_unicode(word))
42 | vecs.append(vec)
43 | for _, line in enumerate(f):
44 | parts = to_unicode(line.rstrip()).split(" ")
45 | word, vec = parts[0], list(map(np.float32, parts[1:]))
46 | vocab.append(to_unicode(word))
47 | vecs.append(vec)
48 |
49 | #embs_dim = len(vecs[1])
50 | #UNKNOWN_WORD = np.random.uniform(-0.25,0.25,embs_dim)
51 | #vecs = np.vstack((UNKNOWN_WORD, vecs))
52 | #vocab = ['#UNKNOWN#'] + list(vocab)
53 | #words = {word:idx for idx,word in enumerate(vocab)}
54 |
55 | return vecs, vocab
56 |
57 | def to_utf8(text, errors='strict', encoding='utf8'):
58 | """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8."""
59 | if isinstance(text, unicode):
60 | return text.encode('utf8')
61 | # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
62 | else:
63 | return unicode(text, encoding, errors=errors).encode('utf8')
64 |
65 | def to_unicode(text, encoding='utf8', errors='strict'):
66 | """Convert a string (bytestring in `encoding` or unicode), to unicode."""
67 | if isinstance(text, unicode):
68 | return text
69 | else:
70 | return unicode(text, encoding=encoding, errors=errors)
--------------------------------------------------------------------------------
/evaluation_scripts/corrEval.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | from scipy.stats import spearmanr
4 | from numpy.linalg import norm
5 | import common
6 |
7 | def cosine(u, v):
8 | return np.dot(u,v)/(norm(u)*norm(v))
9 |
10 | def hyper_score(u,v):
11 | sim = np.dot(u,v)/(norm(u)*norm(v))
12 | direct = norm(v)/norm(u)
13 | return sim*direct
14 |
15 | def load_data(embeddings_file, dataset_file, mode='cosine'):
16 | golds, scores = [], []
17 | unseen = 0
18 | with open(dataset_file, 'r') as fin:
19 | data = [line.strip().split(' ') for line in fin]
20 | vecs, words = common.load_vecs(embeddings_file, binary=1)
21 | embs = {word:vec for word,vec in zip(words,vecs)}
22 | for rec in data:
23 | if rec[0] in embs and rec[1] in embs:
24 | golds.append(float(rec[5]))
25 | if mode=='hyper':
26 | grade = hyper_score(embs[rec[0]], embs[rec[1]])
27 | scores.append(grade)
28 | elif mode=='cosine':
29 | grade = cosine(embs[rec[0]], embs[rec[1]])
30 | scores.append(grade)
31 | else:
32 | unseen += 1
33 | print 'unseen-words: %d' %unseen
34 | return golds, scores
35 |
36 | if __name__=='__main__':
37 | embeddings_file = sys.argv[1]
38 | dataset_file = sys.argv[2]
39 | mode = sys.argv[3] # either 'cosine' or 'hyper'
40 | golds, scores = load_data(embeddings_file, dataset_file, mode)
41 | rho = spearmanr(golds, scores)[0]
42 | print 'Spearman correlation: %f' %rho
43 |
44 |
45 |
46 |
47 |
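
A quick numeric illustration of how hyper_score above turns the symmetric cosine into a directional measure via the norm ratio (toy vectors, not taken from any embedding file):

    import numpy as np
    from numpy.linalg import norm

    u = np.array([1.0, 0.0])                   # stand-in hyponym vector
    v = np.array([2.0, 0.0])                   # stand-in hypernym vector, larger norm
    cos = np.dot(u, v) / (norm(u) * norm(v))   # 1.0 either way: cosine is symmetric
    print(cos * norm(v) / norm(u))             # 2.0 for (u, v)
    print(cos * norm(u) / norm(v))             # 0.5 for (v, u): direction now matters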
--------------------------------------------------------------------------------
/get-pretrainedHyperVecEmbeddings/download_embeddings.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # downloads the pretrained HyperVec embeddings from the IMS homepage
3 | for id in 0 1 2 3 4 5 6 7 8 9
4 | do
5 | wget http://www.ims.uni-stuttgart.de/documents/ressourcen/experiment-daten/hypvec_embd/hyp_p${id}.gz
6 | done
7 | cat hyp_p0.gz hyp_p1.gz hyp_p2.gz hyp_p3.gz hyp_p4.gz hyp_p5.gz hyp_p6.gz hyp_p7.gz hyp_p8.gz hyp_p9.gz > hypervec.txt.gz
8 | # rm -f hyp_p*.gz # OPTIONAL: remove the downloaded part files
9 | # gunzip hypervec.txt.gz # OPTIONAL: unzip embeddings to plain text
10 |
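
Note: the cat above yields a valid archive because concatenated gzip files form a multi-member gzip stream, which gunzip (and, for instance, Python's gzip module) decompresses end to end. A quick check after downloading:

    import gzip

    with gzip.open('hypervec.txt.gz') as f:
        print(f.readline())   # first line of the joined embedding file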
--------------------------------------------------------------------------------
/hypernymy_resources/cohyponym_n.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/cohyponym_n.txt.gz
--------------------------------------------------------------------------------
/hypernymy_resources/cohyponym_v.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/cohyponym_v.txt.gz
--------------------------------------------------------------------------------
/hypernymy_resources/hypernym_n.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/hypernym_n.txt.gz
--------------------------------------------------------------------------------
/hypernymy_resources/hypernym_v.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/hypernym_v.txt.gz
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0">
 2 |   <modelVersion>4.0.0</modelVersion>
 3 |   <version>0.0.1</version>
 4 |   <build>
 5 |     <sourceDirectory>src</sourceDirectory>
 6 |     <plugins>
 7 |       <plugin>
 8 |         <artifactId>maven-compiler-plugin</artifactId>
 9 |         <version>3.1</version>
10 |         <configuration>
11 |           <source>1.7</source>
12 |           <target>1.7</target>
13 |         </configuration>
14 |       </plugin>
15 |       <plugin>
16 |         <artifactId>maven-assembly-plugin</artifactId>
17 |         <configuration>
18 |           <descriptorRefs>
19 |             <descriptorRef>jar-with-dependencies</descriptorRef>
20 |           </descriptorRefs>
21 |         </configuration>
22 |       </plugin>
23 |     </plugins>
24 |   </build>
25 |   <dependencies>
26 |     <dependency>
27 |       <groupId>junit</groupId>
28 |       <artifactId>junit</artifactId>
29 |       <version>4.7</version>
30 |     </dependency>
31 |     <dependency>
32 |       <groupId>com.googlecode.efficient-java-matrix-library</groupId>
33 |       <artifactId>ejml</artifactId>
34 |       <version>0.25</version>
35 |     </dependency>
36 |     <dependency>
37 |       <groupId>org.apache.commons</groupId>
38 |       <artifactId>commons-math3</artifactId>
39 |       <version>3.3</version>
40 |     </dependency>
41 |     <dependency>
42 |       <groupId>commons-lang</groupId>
43 |       <artifactId>commons-lang</artifactId>
44 |       <version>2.3</version>
45 |     </dependency>
46 |     <dependency>
47 |       <groupId>edu.stanford.nlp</groupId>
48 |       <artifactId>stanford-corenlp</artifactId>
49 |       <version>3.4</version>
50 |     </dependency>
51 |     <dependency>
52 |       <groupId>de.erichseifert.gral</groupId>
53 |       <artifactId>gral-core</artifactId>
54 |       <version>0.10</version>
55 |     </dependency>
56 |     <dependency>
57 |       <groupId>edu.berkeley.compbio</groupId>
58 |       <artifactId>jlibsvm</artifactId>
59 |       <version>0.911</version>
60 |     </dependency>
61 |   </dependencies>
62 |   <repositories>
63 |     <repository>
64 |       <id>erichseifert.de</id>
65 |       <url>http://mvn.erichseifert.de/maven2</url>
66 |     </repository>
67 |     <repository>
68 |       <id>dev.davidsoergel.com releases</id>
69 |       <url>http://dev.davidsoergel.com/nexus/content/repositories/releases</url>
70 |       <snapshots>
71 |         <enabled>false</enabled>
72 |       </snapshots>
73 |     </repository>
74 |     <repository>
75 |       <id>dev.davidsoergel.com snapshots</id>
76 |       <url>http://dev.davidsoergel.com/nexus/content/repositories/snapshots</url>
77 |       <releases>
78 |         <enabled>false</enabled>
79 |       </releases>
80 |     </repository>
81 |   </repositories>
82 |   <groupId>SemRel</groupId>
83 |   <artifactId>HyperVec</artifactId>
84 | </project>
--------------------------------------------------------------------------------
/src/common/DataStructureUtils.java:
--------------------------------------------------------------------------------
1 | package common;
2 |
3 | import io.word.Phrase;
4 |
5 | import java.util.ArrayList;
6 | import java.util.HashMap;
7 | import java.util.HashSet;
8 | import java.util.List;
9 |
10 | /**
11 | * This class provides a set of utility methods to turn one data structure
12 | * into another
13 | *
14 | */
15 | public class DataStructureUtils {
16 |
17 | /**
18 | * This template method turns an array into a HashSet of the same type
19 | */
20 | public static <T> HashSet<T> arrayToSet(T[] inputArray) {
21 | HashSet<T> result = new HashSet<T>();
22 | if (inputArray != null) {
23 | for (int i = 0; i < inputArray.length; i++) {
24 | result.add(inputArray[i]);
25 | }
26 | }
27 | return result;
28 | }
29 |
30 | /**
31 | * This template method turns an array into a HashMap that maps an element
32 | * of the array to its index
33 | */
34 | public static <T> HashMap<T, Integer> arrayToMap(T[] inputArray) {
35 | HashMap<T, Integer> result = new HashMap<T, Integer>();
36 | if (inputArray != null) {
37 | for (int i = 0; i < inputArray.length; i++) {
38 | result.put(inputArray[i], i);
39 | }
40 | }
41 | return result;
42 | }
43 |
44 | /**
45 | * This template method turns an array into an (Array)List of the same type
46 | */
47 | public static <T> ArrayList<T> arrayToList(T[] inputArray) {
48 | ArrayList<T> result = new ArrayList<T>();
49 | if (inputArray != null) {
50 | for (int i = 0; i < inputArray.length; i++) {
51 | result.add(inputArray[i]);
52 | }
53 | }
54 | return result;
55 | }
56 |
57 | /*
58 | * The following set of methods turn a list into an array of the same type
59 | * The Java compiler cannot initialize an array without knowing the type of
60 | * the elements. Therefore, one cannot generalize with a template method
61 | */
62 |
63 | public static double[][] arrayListTo2dArray(List<double[]> list) {
64 | double[][] array = new double[list.size()][list.get(0).length];
65 | list.toArray(array);
66 | return array;
67 | }
68 |
69 | /**
70 | * This template method turns a list into a HashMap that maps an element
71 | * of the list to its index
72 | */
73 | public static <T> HashMap<T, Integer> listToMap(List<T> inputArray) {
74 | HashMap<T, Integer> result = new HashMap<T, Integer>();
75 | if (inputArray != null) {
76 | for (int i = 0; i < inputArray.size(); i++) {
77 | result.put(inputArray.get(i), i);
78 | }
79 | }
80 | return result;
81 | }
82 |
83 | public static String[] stringListToArray(List<String> list) {
84 | String[] array = new String[list.size()];
85 | list.toArray(array);
86 | return array;
87 | }
88 |
89 | public static Phrase[] phraseListToArray(List<Phrase> list) {
90 | Phrase[] array = new Phrase[list.size()];
91 | list.toArray(array);
92 | return array;
93 | }
94 |
95 | public static int[] intListToArray(List<Integer> list) {
96 | int[] array = new int[list.size()];
97 | int i = 0;
98 | for (Integer element : list) {
99 | array[i] = element;
100 | i++;
101 | }
102 | return array;
103 | }
104 |
105 | /**
106 | * Search through a small int array for a given value
107 | * @param array
108 | * @param key
109 | * @return the index of the first element to have a value equal to the key
110 | */
111 | public static int searchSmallIntArray(int[] array, int key) {
112 | for (int i = 0; i < array.length; i++) {
113 | if (array[i] == key)
114 | return i;
115 | }
116 | return -1;
117 | }
118 |
119 | }
120 |
--------------------------------------------------------------------------------
/src/common/MathUtils.java:
--------------------------------------------------------------------------------
1 | package common;
2 |
3 | import java.util.Random;
4 |
5 | /**
6 | * This class contains a set of utility methods for simple maths
7 | * (maybe should be replaced with utility methods for the SimpleMatrix class)
8 | *
9 | */
10 | public class MathUtils {
11 | private static Random rand = new Random();
12 | // TODO: use some linear algebra method
13 |
14 | /**
15 | * Cosine of two vectors
16 | * @param v1: 1st vector
17 | * @param v2: 2nd vector
18 | * @return cosine value
19 | */
20 | public static double cosine(double[] v1, double[] v2) {
21 | double length1 = length(v1);
22 | double length2 = length(v2);
23 | if (length1 == 0 || length2 == 0) return 0.0;
24 | else return dot(v1, v2) / (length1 * length2);
25 | }
26 |
27 | /**
28 | * Length of a vector
29 | * @param v: input vector
30 | * @return length
31 | */
32 | public static double length(double[] v) {
33 | double norm = dot(v, v);
34 | return Math.sqrt(norm);
35 | }
36 |
37 | /**
38 | * Dot product of two vectors
39 | * @param v1 first vector
40 | * @param v2 second vector
41 | * @return dot product
42 | */
43 | public static double dot(double[] v1, double[] v2) {
44 | double result = 0;
45 | for (int i = 0; i < v1.length; i++) {
46 | result += v1[i] * v2[i];
47 | }
48 | return result;
49 | }
50 |
51 | /**
52 | * sigmoid function
53 | * @param x input value
54 | * @return sigmoid(x)
55 | */
56 | public static double sigmoid(double x) {
57 | // TODO: understand why they turn the formula like this (e^x faster
58 | // than e^-x ? Rounding error?)
59 | return 1 - (double) (1.0 / (1.0 + Math.exp(x)));
60 | }
61 |
62 | /**
63 | * tanh function
64 | */
65 | public static double tanh(double x) {
66 | return 1 - (double) (2.0 / (1.0 + Math.exp(2 * x)));
67 | }
68 |
69 | public static boolean isSampled(long count, long totalCount, double frequencyThreshold) {
70 | double randomThreshold = (double) (Math.sqrt(count
71 | / (frequencyThreshold * totalCount)) + 1)
72 | * (frequencyThreshold * totalCount) / count;
73 | if (randomThreshold >= rand.nextFloat()) {
74 | return true;
75 | } else {
76 | return false;
77 | }
78 | }
79 |
80 | public static double[] cosineDerivative(double[] x, double[] a) {
81 | double lengthX = length(x);
82 | double lengthA = length(a);
83 | double dotP = dot(x, a);
84 | double rToScaleA = 1 / (lengthX * lengthA);
85 | double rToScaleX = dotP / (lengthA * lengthX * lengthX * lengthX);
86 | double[] result = new double[x.length];
87 | for (int i = 0; i < x.length; i++) {
88 | result[i] = a[i] * rToScaleA - x[i] * rToScaleX;
89 | }
90 | return result;
91 | }
92 | }
93 |
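
A numeric sanity check of the cosineDerivative formula above, sketched in Python so the analytic gradient can be compared against central finite differences:

    import numpy as np

    def cosine_grad(x, a):   # mirrors cosineDerivative: a/(|x||a|) - x * (x.a)/(|a||x|^3)
        nx, na, d = np.linalg.norm(x), np.linalg.norm(a), np.dot(x, a)
        return a / (nx * na) - x * d / (na * nx ** 3)

    x, a, eps = np.random.rand(5), np.random.rand(5), 1e-6
    def cos(x):
        return np.dot(x, a) / (np.linalg.norm(x) * np.linalg.norm(a))
    num = np.array([(cos(x + eps * e) - cos(x - eps * e)) / (2 * eps) for e in np.eye(5)])
    print(np.allclose(num, cosine_grad(x, a), atol=1e-5))   # True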
--------------------------------------------------------------------------------
/src/common/MeanAveragePrecision.java:
--------------------------------------------------------------------------------
1 | package common;
2 |
3 | import space.SemanticSpace;
4 | import java.util.ArrayList;
5 | import java.util.Arrays;
6 | import java.util.Comparator;
7 |
8 | import common.IOUtils;
9 |
10 | public class MeanAveragePrecision {
11 | String[][] wordPairs;
12 | double[] golds;
13 |
14 | public MeanAveragePrecision(String dataset) {
15 | readDataset(dataset);
16 | }
17 |
18 | public MeanAveragePrecision(String[][] wordPairs, double[] golds) {
19 | this.wordPairs = wordPairs;
20 | this.golds = golds;
21 | }
22 |
23 | public void readDataset(String dataset) {
24 | ArrayList<String> data = IOUtils.readFile(dataset);
25 | golds = new double[data.size()];
26 | wordPairs = new String[data.size()][2];
27 | for (int i = 0; i < data.size(); i++) {
28 | String dataPiece = data.get(i);
29 | String elements[] = dataPiece.split("\t");
30 | wordPairs[i][0] = elements[0];
31 | wordPairs[i][1] = elements[1];
32 | golds[i] = Double.parseDouble(elements[2]);
33 | //golds[i] = Double.parseDouble(elements[3]);
34 | }
35 | }
36 |
37 | public double evaluateMAP(SemanticSpace space) {
38 | final double[] predicts = new double[golds.length];
39 | for (int i = 0; i < golds.length; i++) {
40 | predicts[i] = space.getSim(wordPairs[i][0], wordPairs[i][1])
41 | * space.getDirection(wordPairs[i][0], wordPairs[i][1]);
42 | }
43 | Integer[] idxs = new Integer[golds.length];
44 | for(int i = 0; i < golds.length; i++) idxs[i] = i;
45 | Arrays.sort(idxs, new Comparator<Integer>() {
46 | public int compare(Integer o1, Integer o2){
47 | return Double.compare(predicts[o2], predicts[o1]);
48 | }
49 | });
50 | double[] sortedGolds = new double[golds.length];
51 | for(int i = 0; i < golds.length; i++) sortedGolds[i] = golds[idxs[i]];
52 |
53 | double map = computeMAP(sortedGolds);
54 | return map;
55 | }
56 |
57 | public double computeMAP(double[] sortedGolds) {
58 | double ap = 0.0;
59 | double retrievedCounter = 0;
60 | double relevantCounter = 0;
61 |
62 | for (int i = 0; i < sortedGolds.length; i++) {
63 | retrievedCounter++;
64 | if (sortedGolds[i] == 1.0) {
65 | relevantCounter++;
66 | ap += relevantCounter / retrievedCounter;
67 | }
68 | }
69 | ap = (relevantCounter > 0) ? ap / relevantCounter : 0.0; // guard: no relevant items
70 | return ap;
71 | }
72 |
73 |
74 | }
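
computeMAP assumes the gold labels (1.0 = relevant, 0.0 = not relevant) are already ordered by descending predicted score, and averages the precision at each relevant rank. A small worked sketch with hypothetical values:

    // golds ordered by descending prediction: 1, 0, 1, 0, 1
    // AP = (1/1 + 2/3 + 3/5) / 3 ≈ 0.7556
    MeanAveragePrecision map =
            new MeanAveragePrecision(new String[0][0], new double[0]);
    System.out.println(map.computeMAP(new double[] {1.0, 0.0, 1.0, 0.0, 1.0}));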
--------------------------------------------------------------------------------
/src/common/SigmoidTable.java:
--------------------------------------------------------------------------------
1 | package common;
2 |
3 | /**
4 | * An instance of this class pre-computes values for the sigmoid function.
5 | * Its main purpose is to increase the speed of the program (or so people say :P)
6 | * since e^-x takes longer than mult/add.
7 | *
8 | */
9 | public class SigmoidTable {
10 |
11 | // Default parameters for the table
12 | public static final double DEFAULT_MAX_X = 6;
13 | public static final int DEFAULT_SIGMOID_TABLE_SIZE = 10000000;
14 |
15 | /*
16 | * This sigmoidTable holds the precomputed sigmoid values of variables in the range
17 | * [-maxX, maxX]
18 | * tableSize decides the interval between two consecutive values that we
19 | * compute the sigmoid function for, i.e. the precision of the returned
20 | * sigmoid values
21 | */
22 | private double[] sigmoidTable;
23 | private double maxX;
24 | private int tableSize;
25 |
26 |
27 | public SigmoidTable(int tableSize, double maxX) {
28 | this.tableSize = tableSize;
29 | this.maxX = maxX;
30 | initTable();
31 | }
32 |
33 | /**
34 | * Default constructor
35 | * Initialize with default values
36 | */
37 | public SigmoidTable() {
38 | this(DEFAULT_SIGMOID_TABLE_SIZE, DEFAULT_MAX_X);
39 | }
40 |
41 | /**
42 | * Initialize the precomputed sigmoid table.
43 | * The table consists of "tableSize" precomputed values of the sigmoid
44 | * function for input values from -maxX to maxX (the difference between two
45 | * consecutive input values is 2 * maxX / (tableSize - 1)).
46 | */
47 | public void initTable() {
48 | sigmoidTable = new double[tableSize];
49 | double step = (2 * maxX) / (tableSize - 1);
50 | for (int i = 0; i < tableSize; i++) { // fill every entry, including x = maxX
51 | double x = -maxX + i * step;
52 | sigmoidTable[i] = MathUtils.sigmoid(x);
53 | }
54 | }
55 |
56 | /**
57 | * Get the sigmoid function for x from the pre-computed table
58 | */
59 | public double getSigmoid(double x) {
60 | if (x > maxX)
61 | return 1;
62 | else if (x < -maxX)
63 | return 0;
64 | else {
65 | int index = (int) Math.round((x + maxX) / (2 * maxX) * (tableSize - 1));
66 | return sigmoidTable[index];
67 | }
68 | // double result = MathUtils.sigmoid(x);
69 | // return result;
70 | }
71 |
72 | }
73 |
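
A short sketch (hypothetical values) of the trade-off the table makes: with the default parameters the spacing between precomputed inputs is 2 * 6 / (10^7 - 1) ≈ 1.2e-6, so lookups are accurate to roughly that order, and out-of-range inputs saturate:

    SigmoidTable table = new SigmoidTable();     // [-6, 6], 10^7 entries
    System.out.println(table.getSigmoid(0.5));   // ≈ 0.6224593... (table lookup)
    System.out.println(MathUtils.sigmoid(0.5));  // exact value, for comparison
    System.out.println(table.getSigmoid(100));   // 1.0 (saturated)
    System.out.println(table.getSigmoid(-100));  // 0.0 (saturated)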
--------------------------------------------------------------------------------
/src/common/TanhTable.java:
--------------------------------------------------------------------------------
1 | package common;
2 |
3 | /**
4 | * An instance of this class pre-computes values for the tanh function.
5 | * Its main purpose is to increase the speed of the program (or so people say :P)
6 | * since e^-x takes longer than mult/add.
7 | *
8 | */
9 | public class TanhTable {
10 |
11 | // Default parameters for the table
12 | public static final double DEFAULT_MAX_X = 6;
13 | public static final int DEFAULT_TANH_TABLE_SIZE = 10000000;
14 |
15 | /*
16 | * This tanhTable holds the precomputed tanh values of variables in the range
17 | * [-maxX, maxX]
18 | * tableSize decides the interval between two consecutive values that we
19 | * compute the tanh function for, i.e. the precision of the returned
20 | * tanh values
21 | */
22 | private double[] tanhTable;
23 | private double maxX;
24 | private int tableSize;
25 |
26 |
27 | public TanhTable(int tableSize, double maxX) {
28 | this.tableSize = tableSize;
29 | this.maxX = maxX;
30 | initTable();
31 | }
32 |
33 | /**
34 | * Default constructor
35 | * Initialize with default values
36 | */
37 | public TanhTable() {
38 | this(DEFAULT_TANH_TABLE_SIZE, DEFAULT_MAX_X);
39 | }
40 |
41 | /**
42 | * Initialize the precomputed tanh table.
43 | * The table consists of "tableSize" precomputed values of the tanh
44 | * function for input values from -maxX to maxX (the difference between two
45 | * consecutive input values is 2 * maxX / (tableSize - 1)).
46 | */
47 | public void initTable() {
48 | tanhTable = new double[tableSize];
49 | double step = (2 * maxX) / (tableSize - 1);
50 | for (int i = 0; i < tableSize; i++) { // fill every entry, including x = maxX
51 | double x = -maxX + i * step;
52 | tanhTable[i] = MathUtils.tanh(x);
53 | }
54 | }
55 |
56 | /**
57 | * Get the tanh function for x from the pre-computed table
58 | */
59 | public double getTanh(double x) {
60 | if (x > maxX)
61 | return 1;
62 | else if (x < -maxX)
63 | return -1;
64 | else {
65 | // NOTE: the table lookup is currently disabled in favour of the
66 | // slower but exact direct computation:
67 | // int index = (int) Math.round((x + maxX) / (2 * maxX) * (tableSize - 1));
68 | // return tanhTable[index];
69 | return MathUtils.tanh(x);
70 | }
71 | }
80 |
81 | }
82 |
--------------------------------------------------------------------------------
/src/common/WordForm.java:
--------------------------------------------------------------------------------
1 | package common;
2 |
3 | /**
4 | * Constant values for word format
5 | *
6 | */
7 | public class WordForm {
8 | public static final int WORD = 1;
9 | public static final int LEMMA = 2;
10 | public static final int WORD_POS = 3;
11 | public static final int LEMMA_POS = 4;
12 | }
13 |
--------------------------------------------------------------------------------
/src/common/correlation/AreaUnderCurve.java:
--------------------------------------------------------------------------------
1 | package common.correlation;
2 |
3 | import java.util.Arrays;
4 |
5 | import common.exception.ValueException;
6 |
7 | public class AreaUnderCurve {
8 | public static double computeAUC(double[] golds, double[] predicted) { // area under the ROC curve
9 | int positive = 0;
10 | for (double score: golds) {
11 | if (score == 1) positive++;
12 | }
13 | int negative = golds.length - positive;
14 |
15 | int total_count = golds.length;
16 | Point[] point_set = new Point[total_count];
17 | for (int i = 0; i < golds.length; i++) {
18 | if (!(golds[i]==1) && !(golds[i] == 0)) {
19 | throw new ValueException("For evaluating AUC, gold scores are required to be 0 or 1.");
20 | }
21 | point_set[i] = new Point(golds[i], predicted[i]);
22 | }
23 |
24 | Arrays.sort(point_set);
25 |
26 | double xi = 1.0;
27 | double yi = 1.0;
28 | double xi_old = 1.0;
29 | double true_positive = positive;
30 | double false_positive = negative;
31 | double auc = 0;
32 |
33 | for (int i = 0; i < total_count; i++) {
34 | if (point_set[i].gold == 1) {
35 | true_positive -= 1;
36 | yi = true_positive / positive;
37 | } else {
38 | false_positive -= 1;
39 | xi = false_positive / negative;
40 | auc += (xi_old - xi) * yi;
41 | xi_old = xi;
42 | }
43 | }
44 | return auc;
45 | }
46 |
47 |
48 | static class Point implements Comparable<Point> {
49 | double gold;
50 | double score;
51 | public Point(double gold, double score) {
52 | this.gold = gold;
53 | this.score = score;
54 | }
55 | @Override
56 | public int compareTo(Point o) {
57 | // sort points by ascending prediction score
58 | if (this.score > o.score) return 1;
59 | if (this.score < o.score) return -1;
60 | return 0;
61 | }
62 | }
63 | }
64 |
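
A quick sketch (hypothetical scores) of the expected behaviour: ranking all positives above all negatives yields AUC 1.0, the reverse ranking 0.0, and any gold score other than 0 or 1 raises a ValueException:

    double[] golds = {1, 1, 0, 0};
    System.out.println(AreaUnderCurve.computeAUC(golds, new double[] {0.9, 0.8, 0.2, 0.1})); // 1.0
    System.out.println(AreaUnderCurve.computeAUC(golds, new double[] {0.1, 0.2, 0.8, 0.9})); // 0.0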
--------------------------------------------------------------------------------
/src/common/correlation/MenCorrelation.java:
--------------------------------------------------------------------------------
1 | package common.correlation;
2 |
3 | import java.util.ArrayList;
4 |
5 | import org.apache.commons.math3.stat.correlation.PearsonsCorrelation;
6 | import org.apache.commons.math3.stat.correlation.SpearmansCorrelation;
7 |
8 | import space.SemanticSpace;
9 |
10 | import common.IOUtils;
11 |
12 |
13 | /**
14 | * This class can be used to evaluate a word vector space by computing the
15 | * correlation between the cosine similarities of the words' vectors and
16 | * their gold-standard similarities (typically based on human judgment).
17 | * The name is kind of misleading since we can use datasets other than MEN.
18 | *
19 | */
20 |
21 | public class MenCorrelation {
22 | String[][] wordPairs;
23 | double[] golds;
24 | PearsonsCorrelation pearson;
25 | SpearmansCorrelation spearman;
26 | String name = "";
27 |
28 | /**
29 | * Initialize with the path to the dataset file
30 | * @param dataset
31 | */
32 | public MenCorrelation(String dataset) {
33 | pearson = new PearsonsCorrelation();
34 | spearman = new SpearmansCorrelation();
35 | readDataset(dataset);
36 | }
37 |
38 |
39 | public MenCorrelation(String[][] wordPairs, double[] golds) {
40 | pearson = new PearsonsCorrelation();
41 | spearman = new SpearmansCorrelation();
42 | this.wordPairs = wordPairs;
43 | this.golds = golds;
44 | }
45 |
46 |
47 | /**
48 | * Read the word pairs and the gold standard from the dataset
49 | * @param dataset
50 | */
51 | public void readDataset(String dataset) {
52 | ArrayList<String> data = IOUtils.readFile(dataset);
53 | golds = new double[data.size()];
54 | wordPairs = new String[data.size()][2];
55 | for (int i = 0; i < data.size(); i++) {
56 | String dataPiece = data.get(i);
57 | String elements[] = dataPiece.split("\t");
58 | wordPairs[i][0] = elements[0];
59 | wordPairs[i][1] = elements[1];
60 | golds[i] = Double.parseDouble(elements[2]);
61 | //golds[i] = Double.parseDouble(elements[3]);
62 | }
63 | }
64 |
65 | /**
66 | * Compute the Pearson correlation of the predicted values against the gold
67 | * standard
68 | * @param predicts
69 | * @return
70 | */
71 | public double pearsonCorrelation(double[] predicts) {
72 | return pearson.correlation(golds, predicts);
73 | }
74 |
75 | /**
76 | * Compute the Spearman correlation of the predicted values against the gold
77 | * standard
78 | * @param predicts
79 | * @return
80 | */
81 | public double spearmanCorrelation(double[] predicts) {
82 | return spearman.correlation(golds, predicts);
83 | }
84 |
85 |
86 | /**
87 | * Evaluate the space using the Pearson correlation
88 | * @param space
89 | * @return
90 | */
91 | public double evaluateSpacePearson(SemanticSpace space) {
92 | double[] predicts = new double[golds.length];
93 | for (int i = 0; i < golds.length; i++) {
94 | predicts[i] = space.getSim(wordPairs[i][0], wordPairs[i][1]);
97 | }
98 | return pearson.correlation(golds, predicts);
99 | }
100 |
101 |
102 | /**
103 | * Evaluate the space using the Spearman correlation
104 | * @param space
105 | * @return
106 | */
107 | public double evaluateSpaceSpearman(SemanticSpace space) {
108 | double[] predicts = new double[golds.length];
109 | for (int i = 0; i < golds.length; i++) {
110 | predicts[i] = space.getSim(wordPairs[i][0], wordPairs[i][1]);
111 | }
112 | return spearman.correlation(golds, predicts);
113 | }
114 |
115 |
116 | /**
117 | * @return the gold standard (human judgments of the similarities)
118 | */
119 | public double[] getGolds() {
120 | return golds;
121 | }
122 |
123 | public void setName(String name) {
124 | this.name = name;
125 | }
126 |
127 | public String getName() {
128 | return this.name;
129 | }
130 |
131 | public String[][] getWordPairs() {
132 | return this.wordPairs;
133 | }
134 |
135 | public static void main(String[] args) {
136 | }
137 | }
138 |
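
The in-memory constructor makes the class usable without a dataset file or a SemanticSpace. A sketch with made-up word pairs and scores:

    String[][] pairs = {{"car", "automobile"}, {"car", "banana"}, {"cup", "mug"}};
    double[] golds = {9.5, 0.5, 8.0};
    MenCorrelation men = new MenCorrelation(pairs, golds);
    double[] predicts = {0.91, 0.07, 0.83};  // e.g. cosines from some space
    System.out.println(men.pearsonCorrelation(predicts));
    System.out.println(men.spearmanCorrelation(predicts)); // 1.0: identical ranking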
--------------------------------------------------------------------------------
/src/common/exception/OutOfVocabularyException.java:
--------------------------------------------------------------------------------
1 | package common.exception;
2 |
3 | public class OutOfVocabularyException extends RuntimeException {
4 |
5 | /**
6 | *
7 | */
8 | private static final long serialVersionUID = 1L;
9 |
10 | public OutOfVocabularyException(String msg) {
11 | super(msg);
12 | }
13 |
14 | }
15 |
--------------------------------------------------------------------------------
/src/common/exception/ValueException.java:
--------------------------------------------------------------------------------
1 | package common.exception;
2 |
3 | public class ValueException extends RuntimeException {
4 |
5 | /**
6 | *
7 | */
8 | private static final long serialVersionUID = 1L;
9 | public ValueException(String msg) {
10 | super(msg);
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/src/common/wordnet/LexicalHypernym.java:
--------------------------------------------------------------------------------
1 | package common.wordnet;
2 |
3 | import common.IOUtils;
4 | import vocab.Vocab;
5 |
6 | import java.io.IOException;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 | import java.util.HashSet;
10 | import java.util.List;
11 | import java.util.Random;
12 | import java.util.Set;
13 | import com.google.common.collect.Sets;
14 |
15 |
16 | public class LexicalHypernym {
17 | HashMap<Integer, HashSet<Integer>> features;
18 | HashMap<Integer, HashSet<Integer>> cohyponyms;
19 | HashMap<Integer, HashSet<Integer>> hypernyms;
20 | //Vocab vocab;
21 | Random random;
22 |
23 | public LexicalHypernym(String hypeFile, String cohypoFile, String featureFile, Vocab vocab) throws IOException {
24 | cohyponyms = readLexical(cohypoFile, vocab);
25 | hypernyms = readLexical(hypeFile, vocab);
26 | features = readFeatures(featureFile, vocab);
27 | //this.vocab = vocab;
28 | random = new Random();
29 | }
30 |
31 | public HashMap<Integer, HashSet<Integer>> readLexical(String inputFile, Vocab vocab) throws IOException {
32 | HashMap<Integer, HashSet<Integer>> lexical = new HashMap<Integer, HashSet<Integer>>();
33 | ArrayList<String> data = IOUtils.readFile(inputFile);
34 | for (int i = 0; i < data.size(); i++) {
35 | String dataPiece = data.get(i);
36 | String elements[] = dataPiece.split("\t");
37 | String key = elements[0];
38 | int keyIndex = vocab.getWordIndex(key);
39 | if (keyIndex == -1) continue;
40 | HashSet<Integer> value = new HashSet<Integer>();
41 | for (int j = 1; j < elements.length; j++ ) {
42 | int wordIndex = vocab.getWordIndex(elements[j]);
43 | if (wordIndex == -1) continue;
44 | //value.add(elements[j]);
45 | value.add(wordIndex);
46 | }
47 | lexical.put(keyIndex, value);
48 | }
49 | return lexical;
50 | }
51 |
52 | public HashMap<Integer, HashSet<Integer>> readFeatures(String inputFile, Vocab vocab) throws IOException {
53 | HashMap<Integer, HashSet<Integer>> features = new HashMap<Integer, HashSet<Integer>>();
54 | ArrayList<String> data = IOUtils.readFile(inputFile);
55 | for (int i = 0; i < data.size(); i++) {
56 | String dataPiece = data.get(i);
57 | String elements[] = dataPiece.split("\t");
58 | String key = elements[0];
59 | int keyIndex = vocab.getWordIndex(key);
60 | if (keyIndex == -1) continue;
61 | HashSet<Integer> value = new HashSet<Integer>();
62 | for (int j = 1; j < elements.length; j++ ) {
63 | int wordIndex = -1;
64 | wordIndex = vocab.getWordIndex(elements[j]);
65 | if (wordIndex == -1) continue;
66 | value.add(wordIndex);
67 | }
68 | features.put(keyIndex, value);
69 | }
70 | return features;
71 | }
92 |
93 | public Set<Integer> intersectionHype(Integer targetIndex, Integer featureIndex) {
94 | Set<Integer> intersection = new HashSet<Integer>();
95 | if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
96 | Set<Integer> setHypes = hypernyms.get(targetIndex);
97 | Set<Integer> setFeatures = features.get(featureIndex);
98 | intersection = Sets.intersection(setHypes, setFeatures);
99 | }
100 | return intersection;
101 | }
102 |
103 | public Set<Integer> intersectionCohypo(Integer targetIndex, Integer featureIndex) {
104 | Set<Integer> intersection = new HashSet<Integer>();
105 | if (cohyponyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
106 | Set<Integer> setCohypos = cohyponyms.get(targetIndex);
107 | Set<Integer> setFeatures = features.get(featureIndex);
108 | intersection = Sets.intersection(setCohypos, setFeatures);
109 | }
110 | return intersection;
111 | }
112 |
113 | public HashSet<Integer> getIntersection(HashSet<Integer> hs1, HashSet<Integer> hs2) {
114 | HashSet<Integer> intersection = new HashSet<Integer>();
115 | for (Integer element: hs1) {
116 | if (hs2.contains(element)) intersection.add(element);
117 | }
118 | return intersection;
119 | }
120 |
121 | public int getRandom(Set<Integer> antonyms) {
122 | List<Integer> listAnts = new ArrayList<Integer>(antonyms);
123 | int id = random.nextInt(listAnts.size());
124 | return listAnts.get(id);
125 | }
126 |
127 | public boolean hasHypeCohypo(Integer targetIndex){
128 | return hypernyms.containsKey(targetIndex) || cohyponyms.containsKey(targetIndex);
129 | }
130 |
131 | public boolean hasHypernyms(Integer targetIndex){
132 | return hypernyms.containsKey(targetIndex);
133 | }
134 |
135 | public boolean hasCohyponyms(Integer targetIndex){
136 | return cohyponyms.containsKey(targetIndex);
137 | }
138 |
139 | public boolean hasFeature(Integer featureIndex) {
140 | return features.containsKey(featureIndex);
141 | }
142 |
143 | public HashSet<Integer> getFeatures(Integer featureIndex) {
144 | return features.get(featureIndex);
145 | }
146 |
147 | public HashSet<Integer> getHypernyms(Integer targetIndex) {
148 | return hypernyms.get(targetIndex);
149 | }
150 |
151 | public HashSet<Integer> getCohyponyms(Integer targetIndex) {
152 | return cohyponyms.get(targetIndex);
153 | }
154 |
155 | }
156 |
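
Judging from readLexical and readFeatures, the resource files are plain tab-separated text: a target word followed by its related words (hypernyms, co-hyponyms, or context features), one target per line; any word missing from the Vocab is silently skipped. A hypothetical hypernym file (columns separated by tabs):

    dog	animal	mammal	canine
    oak	tree	plant
    car	vehicle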
--------------------------------------------------------------------------------
/src/common/wordnet/LexicalResource.java:
--------------------------------------------------------------------------------
1 | package common.wordnet;
2 |
3 | import common.IOUtils;
4 | import vocab.Vocab;
5 |
6 | import java.io.IOException;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 | import java.util.HashSet;
10 | import java.util.List;
11 | import java.util.Random;
12 | import java.util.Set;
13 | import com.google.common.collect.Sets;
14 |
15 |
16 | public class LexicalResource {
17 | HashMap<Integer, HashSet<Integer>> antonyms;
18 | HashMap<Integer, HashSet<Integer>> synonyms;
19 | HashMap<Integer, HashSet<Integer>> features;
20 | //HashMap<Integer, HashSet<Integer>> cohyponyms;
21 | //HashMap<Integer, HashSet<Integer>> hypernyms;
22 | //Vocab vocab;
23 | Random random;
24 |
25 | public LexicalResource(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException {
26 | antonyms = readLexical(antFile, vocab);
27 | synonyms = readLexical(synFile, vocab);
28 | //cohyponyms = readLexical(cohypoFile, vocab);
29 | //hypernyms = readLexical(hypeFile, vocab);
30 | features = readFeatures(featureFile, vocab);
31 | //this.vocab = vocab;
32 | random = new Random();
33 | }
34 |
35 | public HashMap<Integer, HashSet<Integer>> readLexical(String inputFile, Vocab vocab) throws IOException {
36 | HashMap<Integer, HashSet<Integer>> lexical = new HashMap<Integer, HashSet<Integer>>();
37 | ArrayList<String> data = IOUtils.readFile(inputFile);
38 | for (int i = 0; i < data.size(); i++) {
39 | String dataPiece = data.get(i);
40 | String elements[] = dataPiece.split("\t");
41 | String key = elements[0];
42 | int keyIndex = vocab.getWordIndex(key);
43 | if (keyIndex == -1) continue;
44 | HashSet<Integer> value = new HashSet<Integer>();
45 | for (int j = 1; j < elements.length; j++ ) {
46 | int wordIndex = vocab.getWordIndex(elements[j]);
47 | if (wordIndex == -1) continue;
48 | //value.add(elements[j]);
49 | value.add(wordIndex);
50 | }
51 | lexical.put(keyIndex, value);
52 | }
53 | return lexical;
54 | }
55 |
56 | public HashMap<Integer, HashSet<Integer>> readFeatures(String inputFile, Vocab vocab) throws IOException {
57 | HashMap<Integer, HashSet<Integer>> features = new HashMap<Integer, HashSet<Integer>>();
58 | ArrayList<String> data = IOUtils.readFile(inputFile);
59 | for (int i = 0; i < data.size(); i++) {
60 | String dataPiece = data.get(i);
61 | String elements[] = dataPiece.split("\t");
62 | String key = elements[0];
63 | int keyIndex = vocab.getWordIndex(key);
64 | if (keyIndex == -1) continue;
65 | HashSet<Integer> value = new HashSet<Integer>();
66 | for (int j = 1; j < elements.length; j++ ) {
67 | int wordIndex = -1;
68 | wordIndex = vocab.getWordIndex(elements[j]);
69 | if (wordIndex == -1) continue;
70 | value.add(wordIndex);
71 | }
72 | features.put(keyIndex, value);
73 | }
74 | return features;
75 | }
96 |
97 | public Set<Integer> intersectionAnt(Integer targetIndex, Integer featureIndex) {
98 | Set<Integer> intersection = new HashSet<Integer>();
99 | if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
100 | Set<Integer> setTargets = antonyms.get(targetIndex);
101 | Set<Integer> setFeatures = features.get(featureIndex);
102 | intersection = Sets.intersection(setTargets, setFeatures);
103 | }
104 | return intersection;
105 | }
106 |
107 | public Set<Integer> intersectionSyn(Integer targetIndex, Integer featureIndex) {
108 | Set<Integer> intersection = new HashSet<Integer>();
109 | if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
110 | Set<Integer> setTargets = synonyms.get(targetIndex);
111 | Set<Integer> setFeatures = features.get(featureIndex);
112 | intersection = Sets.intersection(setTargets, setFeatures);
113 | }
114 | return intersection;
115 | }
116 |
117 | // public Set intersectionHype(Integer targetIndex, Integer featureIndex) {
118 | // Set intersection = new HashSet();
119 | // if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
120 | // Set setHypes = hypernyms.get(targetIndex);
121 | // Set setFeatures = features.get(featureIndex);
122 | // intersection = Sets.intersection(setHypes, setFeatures);
123 | // }
124 | // return intersection;
125 | // }
126 | //
127 | // public Set intersectionCohypo(Integer targetIndex, Integer featureIndex) {
128 | // Set intersection = new HashSet();
129 | // if (cohyponyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
130 | // Set setCohypos = cohyponyms.get(targetIndex);
131 | // Set setFeatures = features.get(featureIndex);
132 | // intersection = Sets.intersection(setCohypos, setFeatures);
133 | // }
134 | // return intersection;
135 | // }
136 |
137 | public HashSet<Integer> getIntersection(HashSet<Integer> hs1, HashSet<Integer> hs2) {
138 | HashSet<Integer> intersection = new HashSet<Integer>();
139 | for (Integer element: hs1) {
140 | if (hs2.contains(element)) intersection.add(element);
141 | }
142 | return intersection;
143 | }
144 |
145 | public int getRandom(Set<Integer> antonyms) {
146 | List<Integer> listAnts = new ArrayList<Integer>(antonyms);
147 | int id = random.nextInt(listAnts.size());
148 | return listAnts.get(id);
149 | }
150 |
151 | // public boolean hasHypeCohypo(Integer targetIndex){
152 | // return hypernyms.containsKey(targetIndex) || cohyponyms.containsKey(targetIndex);
153 | // }
154 | //
155 | // public boolean hasHypernyms(Integer targetIndex){
156 | // return hypernyms.containsKey(targetIndex);
157 | // }
158 | //
159 | // public boolean hasCohyponyms(Integer targetIndex){
160 | // return cohyponyms.containsKey(targetIndex);
161 | // }
162 |
163 | public boolean hasTarget(Integer targetIndex) {
164 | return antonyms.containsKey(targetIndex) || synonyms.containsKey(targetIndex);
165 | }
166 |
167 | public boolean hasAntonyms(Integer targetIndex) {
168 | return antonyms.containsKey(targetIndex);
169 | }
170 |
171 | public boolean hasSynonyms(Integer targetIndex) {
172 | return synonyms.containsKey(targetIndex);
173 | }
174 |
175 | public boolean hasFeature(Integer featureIndex) {
176 | return features.containsKey(featureIndex);
177 | }
178 |
179 | public HashSet<Integer> getAntonyms(Integer targetIndex) {
180 | return antonyms.get(targetIndex);
181 | }
182 |
183 | public HashSet<Integer> getSynonyms(Integer targetIndex) {
184 | return synonyms.get(targetIndex);
185 | }
186 |
187 | public HashSet<Integer> getFeatures(Integer featureIndex) {
188 | return features.get(featureIndex);
189 | }
190 |
191 | // public HashSet getHypernyms(Integer targetIndex) {
192 | // return hypernyms.get(targetIndex);
193 | // }
194 | //
195 | // public HashSet getCohyponyms(Integer targetIndex) {
196 | // return cohyponyms.get(targetIndex);
197 | // }
198 |
199 | }
200 |
--------------------------------------------------------------------------------
/src/common/wordnet/LexicalResourceAdj.java:
--------------------------------------------------------------------------------
1 | package common.wordnet;
2 |
3 | import common.IOUtils;
4 | import vocab.Vocab;
5 |
6 | import java.io.IOException;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 | import java.util.HashSet;
10 | import java.util.List;
11 | import java.util.Random;
12 | import java.util.Set;
13 | import com.google.common.collect.Sets;
14 |
15 |
16 | public class LexicalResourceAdj {
17 | HashMap<Integer, HashSet<Integer>> antonyms;
18 | HashMap<Integer, HashSet<Integer>> synonyms;
19 | HashMap<Integer, HashSet<Integer>> features;
20 | Random random;
21 |
22 | public LexicalResourceAdj(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException {
23 | features = readFeatures(featureFile, vocab);
24 | antonyms = readLexical(antFile, vocab);
25 | synonyms = readLexical(synFile, vocab);
26 | //this.vocab = vocab;
27 | //System.out.println("The number of antonyms: " + antonyms.size());
28 | //System.out.println("The number of synonyms: " + synonyms.size());
29 | //System.out.println("The number of contexts: " + features.size());
30 | random = new Random();
31 | }
32 |
33 | public HashMap<Integer, HashSet<Integer>> readLexical(String inputFile, Vocab vocab) throws IOException {
34 | HashMap<Integer, HashSet<Integer>> lexical = new HashMap<Integer, HashSet<Integer>>();
35 | ArrayList<String> data = IOUtils.readFile(inputFile);
36 | for (int i = 0; i < data.size(); i++) {
37 | String dataPiece = data.get(i);
38 | String elements[] = dataPiece.split("\t");
39 | String key = elements[0];
40 | int keyIndex = vocab.getWordIndex(key);
41 | if (keyIndex == -1) continue;
42 | HashSet<Integer> value = new HashSet<Integer>();
43 | for (int j = 1; j < elements.length; j++ ) {
44 | int wordIndex = vocab.getWordIndex(elements[j]);
45 | if (wordIndex == -1) continue;
46 | //value.add(elements[j]);
47 | value.add(wordIndex);
48 | }
49 | lexical.put(keyIndex, value);
50 | //random antonym
51 | //List listAnts = new ArrayList(value);
52 | //int id = random.nextInt(listAnts.size());
53 | //antRandom.put(keyIndex, listAnts.get(id));
54 | }
55 | return lexical;
56 | }
57 |
58 | public HashMap<Integer, HashSet<Integer>> readFeatures(String inputFile, Vocab vocab) throws IOException {
59 | HashMap<Integer, HashSet<Integer>> features = new HashMap<Integer, HashSet<Integer>>();
60 | ArrayList<String> data = IOUtils.readFile(inputFile);
61 | for (int i = 0; i < data.size(); i++) {
62 | String dataPiece = data.get(i);
63 | String elements[] = dataPiece.split("\t");
64 | String key = elements[0];
65 | int keyIndex = vocab.getWordIndex(key);
66 | if (keyIndex == -1) continue;
67 | HashSet<Integer> value = new HashSet<Integer>();
68 | for (int j = 1; j < elements.length; j++ ) {
69 | int wordIndex = -1;
70 | wordIndex = vocab.getWordIndex(elements[j]);
71 | if (wordIndex == -1) continue;
72 | value.add(wordIndex);
73 | }
74 | features.put(keyIndex, value);
75 | }
76 | return features;
77 | }
98 |
99 | public Set<Integer> intersectionAnt(Integer targetIndex, Integer featureIndex) {
100 | Set<Integer> intersection = new HashSet<Integer>();
101 | if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
102 | Set<Integer> setTargets = antonyms.get(targetIndex);
103 | Set<Integer> setFeatures = features.get(featureIndex);
104 | intersection = Sets.intersection(setTargets, setFeatures);
105 | }
106 | return intersection;
107 | }
108 |
109 | public Set<Integer> intersectionSyn(Integer targetIndex, Integer featureIndex) {
110 | Set<Integer> intersection = new HashSet<Integer>();
111 | if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
112 | Set<Integer> setTargets = synonyms.get(targetIndex);
113 | Set<Integer> setFeatures = features.get(featureIndex);
114 | intersection = Sets.intersection(setTargets, setFeatures);
115 | }
116 | return intersection;
117 | }
118 |
119 | public HashSet<Integer> getIntersection(HashSet<Integer> hs1, HashSet<Integer> hs2) {
120 | HashSet<Integer> intersection = new HashSet<Integer>();
121 | for (Integer element: hs1) {
122 | if (hs2.contains(element)) intersection.add(element);
123 | }
124 | return intersection;
125 | }
126 |
127 | public int getRandom(Set<Integer> antonyms) {
128 | List<Integer> listAnts = new ArrayList<Integer>(antonyms);
129 | int id = random.nextInt(listAnts.size());
130 | return listAnts.get(id);
131 | }
132 |
133 | public boolean hasTarget(Integer targetIndex) {
134 | return antonyms.containsKey(targetIndex) || synonyms.containsKey(targetIndex);
135 | }
136 |
137 | public boolean hasAntonyms(Integer targetIndex) {
138 | return antonyms.containsKey(targetIndex);
139 | }
140 |
141 | public boolean hasSynonyms(Integer targetIndex) {
142 | return synonyms.containsKey(targetIndex);
143 | }
144 |
145 | public boolean hasFeature(Integer featureIndex) {
146 | return features.containsKey(featureIndex);
147 | }
148 |
149 | public HashSet<Integer> getAntonyms(Integer targetIndex) {
150 | return antonyms.get(targetIndex);
151 | }
152 |
153 | public HashSet<Integer> getSynonyms(Integer targetIndex) {
154 | return synonyms.get(targetIndex);
155 | }
156 |
157 | public HashSet<Integer> getFeatures(Integer featureIndex) {
158 | return features.get(featureIndex);
159 | }
160 |
161 | }
162 |
--------------------------------------------------------------------------------
/src/common/wordnet/LexicalResourceNoun.java:
--------------------------------------------------------------------------------
1 | package common.wordnet;
2 |
3 | import common.IOUtils;
4 | import vocab.Vocab;
5 |
6 | import java.io.IOException;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 | import java.util.HashSet;
10 | import java.util.List;
11 | import java.util.Random;
12 | import java.util.Set;
13 | import com.google.common.collect.Sets;
14 |
15 |
16 | public class LexicalResourceNoun {
17 | HashMap<Integer, HashSet<Integer>> antonyms;
18 | HashMap<Integer, HashSet<Integer>> synonyms;
19 | HashMap<Integer, HashSet<Integer>> features;
20 | //HashMap<Integer, HashSet<Integer>> cohyponyms;
21 | //HashMap<Integer, HashSet<Integer>> hypernyms;
22 | //Vocab vocab;
23 | Random random;
24 |
25 | public LexicalResourceNoun(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException {
26 | antonyms = readLexical(antFile, vocab);
27 | synonyms = readLexical(synFile, vocab);
28 | //cohyponyms = readLexical(cohypoFile, vocab);
29 | //hypernyms = readLexical(hypeFile, vocab);
30 | features = readFeatures(featureFile, vocab);
31 | //this.vocab = vocab;
32 | random = new Random();
33 | }
34 |
35 | public HashMap<Integer, HashSet<Integer>> readLexical(String inputFile, Vocab vocab) throws IOException {
36 | HashMap<Integer, HashSet<Integer>> lexical = new HashMap<Integer, HashSet<Integer>>();
37 | ArrayList<String> data = IOUtils.readFile(inputFile);
38 | for (int i = 0; i < data.size(); i++) {
39 | String dataPiece = data.get(i);
40 | String elements[] = dataPiece.split("\t");
41 | String key = elements[0];
42 | int keyIndex = vocab.getWordIndex(key);
43 | if (keyIndex == -1) continue;
44 | HashSet<Integer> value = new HashSet<Integer>();
45 | for (int j = 1; j < elements.length; j++ ) {
46 | int wordIndex = vocab.getWordIndex(elements[j]);
47 | if (wordIndex == -1) continue;
48 | //value.add(elements[j]);
49 | value.add(wordIndex);
50 | }
51 | lexical.put(keyIndex, value);
52 | }
53 | return lexical;
54 | }
55 |
56 | public HashMap<Integer, HashSet<Integer>> readFeatures(String inputFile, Vocab vocab) throws IOException {
57 | HashMap<Integer, HashSet<Integer>> features = new HashMap<Integer, HashSet<Integer>>();
58 | ArrayList<String> data = IOUtils.readFile(inputFile);
59 | for (int i = 0; i < data.size(); i++) {
60 | String dataPiece = data.get(i);
61 | String elements[] = dataPiece.split("\t");
62 | String key = elements[0];
63 | int keyIndex = vocab.getWordIndex(key);
64 | if (keyIndex == -1) continue;
65 | HashSet<Integer> value = new HashSet<Integer>();
66 | for (int j = 1; j < elements.length; j++ ) {
67 | int wordIndex = -1;
68 | wordIndex = vocab.getWordIndex(elements[j]);
69 | if (wordIndex == -1) continue;
70 | value.add(wordIndex);
71 | }
72 | features.put(keyIndex, value);
73 | }
74 | return features;
75 | }
96 |
97 | public Set<Integer> intersectionAnt(Integer targetIndex, Integer featureIndex) {
98 | Set<Integer> intersection = new HashSet<Integer>();
99 | if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
100 | Set<Integer> setTargets = antonyms.get(targetIndex);
101 | Set<Integer> setFeatures = features.get(featureIndex);
102 | intersection = Sets.intersection(setTargets, setFeatures);
103 | }
104 | return intersection;
105 | }
106 |
107 | public Set<Integer> intersectionSyn(Integer targetIndex, Integer featureIndex) {
108 | Set<Integer> intersection = new HashSet<Integer>();
109 | if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
110 | Set<Integer> setTargets = synonyms.get(targetIndex);
111 | Set<Integer> setFeatures = features.get(featureIndex);
112 | intersection = Sets.intersection(setTargets, setFeatures);
113 | }
114 | return intersection;
115 | }
116 |
117 | // public Set intersectionHype(Integer targetIndex, Integer featureIndex) {
118 | // Set intersection = new HashSet();
119 | // if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) {
120 | // Set setHypes = hypernyms.get(targetIndex);
121 | // Set setFeatures = features.get(featureIndex);
122 | // intersection = Sets.intersection(setHypes, setFeatures);
123 | // }
124 | // return intersection;
125 | // }
126 | //
127 | // public Set