├── .classpath
├── .gitignore
├── .metadata
│   ├── .lock
│   ├── .log
│   ├── .mylyn
│   │   └── repositories.xml.zip
│   ├── .plugins
│   │   ├── org.eclipse.core.resources
│   │   │   ├── .root
│   │   │   │   ├── .indexes
│   │   │   │   │   ├── history.version
│   │   │   │   │   ├── properties.index
│   │   │   │   │   └── properties.version
│   │   │   │   └── 2.tree
│   │   │   └── .safetable
│   │   │       └── org.eclipse.core.resources
│   │   ├── org.eclipse.core.runtime
│   │   │   └── .settings
│   │   │       ├── org.eclipse.core.resources.prefs
│   │   │       ├── org.eclipse.e4.ui.css.swt.theme.prefs
│   │   │       ├── org.eclipse.e4.ui.workbench.renderers.swt.prefs
│   │   │       ├── org.eclipse.jdt.ui.prefs
│   │   │       ├── org.eclipse.m2e.discovery.prefs
│   │   │       ├── org.eclipse.mylyn.context.core.prefs
│   │   │       ├── org.eclipse.mylyn.monitor.ui.prefs
│   │   │       ├── org.eclipse.mylyn.tasks.ui.prefs
│   │   │       ├── org.eclipse.team.ui.prefs
│   │   │       ├── org.eclipse.ui.editors.prefs
│   │   │       ├── org.eclipse.ui.ide.prefs
│   │   │       ├── org.eclipse.ui.prefs
│   │   │       ├── org.eclipse.ui.workbench.prefs
│   │   │       └── org.python.pydev.prefs
│   │   ├── org.eclipse.e4.workbench
│   │   │   └── workbench.xmi
│   │   ├── org.eclipse.epp.logging.aeri.ui
│   │   │   ├── history
│   │   │   │   ├── _0.fdt
│   │   │   │   ├── _0.fdx
│   │   │   │   ├── _0.fnm
│   │   │   │   ├── _0.frq
│   │   │   │   ├── _0.nrm
│   │   │   │   ├── _0.tii
│   │   │   │   ├── _0.tis
│   │   │   │   ├── segments.gen
│   │   │   │   └── segments_1
│   │   │   ├── remote-index
│   │   │   │   ├── _2.fdt
│   │   │   │   ├── _2.fdx
│   │   │   │   ├── _2.fnm
│   │   │   │   ├── _2.frq
│   │   │   │   ├── _2.nrm
│   │   │   │   ├── _2.prx
│   │   │   │   ├── _2.tii
│   │   │   │   ├── _2.tis
│   │   │   │   ├── segments.gen
│   │   │   │   └── segments_3
│   │   │   └── server-config.json
│   │   ├── org.eclipse.jdt.core
│   │   │   ├── assumedExternalFilesCache
│   │   │   ├── externalFilesCache
│   │   │   ├── nonChainingJarsCache
│   │   │   └── variablesAndContainers.dat
│   │   ├── org.eclipse.jdt.ui
│   │   │   ├── OpenTypeHistory.xml
│   │   │   ├── QualifiedTypeNameHistory.xml
│   │   │   └── dialog_settings.xml
│   │   ├── org.eclipse.m2e.logback.configuration
│   │   │   ├── 0.log
│   │   │   └── logback.1.6.2.20150902-0002.xml
│   │   ├── org.eclipse.oomph.setup.ui
│   │   │   └── dialog_settings.xml
│   │   ├── org.eclipse.oomph.setup
│   │   │   └── workspace.setup
│   │   ├── org.eclipse.ui.ide
│   │   │   └── dialog_settings.xml
│   │   └── org.eclipse.ui.workbench
│   │       ├── dialog_settings.xml
│   │       └── workingsets.xml
│   └── version.ini
├── .project
├── .pydevproject
├── .settings
│   ├── org.eclipse.jdt.core.prefs
│   ├── org.eclipse.jdt.ui.prefs
│   └── org.eclipse.m2e.core.prefs
├── HyperVec.jar
├── README.md
├── code_mapping_across_languages
│   ├── AP_evaluation_code
│   │   ├── common.py
│   │   ├── test_default.py
│   │   └── test_norm.py
│   ├── alignment_files
│   │   ├── de_en.align
│   │   └── it_en.align
│   ├── convert_w2vTXT_to_w2vBIN.py
│   ├── credits_to_CLIC_trento.txt
│   ├── mappingcode
│   │   ├── __init__.py
│   │   ├── demo.sh~
│   │   ├── learn_mat.sh
│   │   ├── space.py
│   │   ├── space.pyc
│   │   ├── test_tm.py
│   │   ├── test_tm2.py
│   │   ├── test_tm_pred.py
│   │   ├── train_tm.py
│   │   ├── translate_tm.py
│   │   ├── utils.py
│   │   └── utils.pyc
│   ├── perform_mapping.sh
│   └── vocabulary file
│       ├── german_voc_wikipedia.txt.gz
│       └── italian_voc_wikipedia.txt.gz
├── config.cfg
├── create_features.py
├── datasets_across_languages
│   ├── eval_DE
│   │   ├── noun_hyp_vs_ant.txt
│   │   ├── noun_hyp_vs_syn.txt
│   │   └── noun_hyp_vs_synant.txt
│   └── eval_IT
│       ├── noun_hyp_vs-ant.txt
│       ├── noun_hyp_vs-syn-ant.txt
│       └── noun_hyp_vs-syn.txt
├── datasets_classification
│   ├── ABIBLESS.txt
│   ├── AWBLESS.txt
│   ├── BLESS.txt
│   ├── eval-bless.jar
│   ├── eval-dir.jar
│   └── readme_how_to.txt
├── evaluation_scripts
│   ├── common.py
│   └── corrEval.py
├── get-pretrainedHyperVecEmbeddings
│   └── download_embeddings.sh
├── hypernymy_resources
│   ├── cohyponym_n.txt.gz
│   ├── cohyponym_v.txt.gz
│   ├── hypernym_n.txt.gz
│   └── hypernym_v.txt.gz
├── pom.xml
└── src
    ├── common
    │   ├── DataStructureUtils.java
    │   ├── IOUtils.java
    │   ├── MathUtils.java
    │   ├── MeanAveragePrecision.java
    │   ├── SigmoidTable.java
    │   ├── SimpleMatrixUtils.java
    │   ├── TanhTable.java
    │   ├── WordForm.java
    │   ├── correlation
    │   │   ├── AreaUnderCurve.java
    │   │   └── MenCorrelation.java
    │   ├── exception
    │   │   ├── OutOfVocabularyException.java
    │   │   └── ValueException.java
    │   └── wordnet
    │       ├── LexicalHypernym.java
    │       ├── LexicalResource.java
    │       ├── LexicalResourceAdj.java
    │       ├── LexicalResourceNoun.java
    │       ├── LexicalResourceVerb.java
    │       ├── Synset.java
    │       ├── WordNetAdj.java
    │       ├── WordNetNoun.java
    │       ├── WordNetReader.java
    │       └── WordNetVerb.java
    ├── demo
    │   ├── HyperVecLearning.java
    │   └── W2vProperties.java
    ├── io
    │   ├── sentence
    │   │   ├── PlainSentenceInputStream.java
    │   │   ├── SentenceInputStream.java
    │   │   ├── SubSamplingSentenceInputStream.java
    │   │   └── TreeInputStream.java
    │   └── word
    │       ├── CombinedWordInputStream.java
    │       ├── Phrase.java
    │       ├── PushBackWordStream.java
    │       ├── WordFilter.java
    │       └── WordInputStream.java
    ├── neural
    │   └── function
    │       ├── ActivationFunction.java
    │       ├── Correlation.java
    │       ├── Sigmoid.java
    │       └── Tanh.java
    ├── space
    │   ├── AbstractSemanticSpace.java
    │   ├── Neighbor.java
    │   ├── RawSemanticSpace.java
    │   ├── SemanticSpace.java
    │   └── Similarity.java
    ├── tree
    │   ├── CcgTree.java
    │   └── Tree.java
    ├── vocab
    │   ├── HuffmanTree.java
    │   ├── Vocab.java
    │   ├── VocabEntry.java
    │   ├── VocabEntryFilter.java
    │   └── filter
    │       └── MinFrequencyVocabFilter.java
    └── word2vec
        ├── AbstractWord2Vec.java
        ├── MultiThreadWord2Vec.java
        ├── UniGram.java
        └── multitask
            └── Hyper2Vec.java
/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /target 3 | .attach_pid* -------------------------------------------------------------------------------- /.metadata/.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.lock -------------------------------------------------------------------------------- /.metadata/.log: -------------------------------------------------------------------------------- 1 | !SESSION 2016-01-11 12:36:53.838 ----------------------------------------------- 2 | eclipse.buildId=4.5.1.M20150904-0015 3 | java.version=1.7.0_79 4 | java.vendor=Oracle Corporation 5 | BootLoader constants: OS=macosx, ARCH=x86_64, WS=cocoa, NL=en_US 6 | Framework arguments: -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation 7 | Command-line arguments: -os macosx -ws cocoa -arch x86_64 -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation 8 | 9 | !ENTRY org.eclipse.core.net 1 0 2016-01-11 12:36:55.177 10 | !MESSAGE System property http.nonProxyHosts has been set to local|*.local|169.254/16|*.169.254/16 by an external source. This value will be overwritten using the values from the preferences 11 | 12 | !ENTRY org.eclipse.jface 2 0 2016-01-11 12:37:55.985 13 | !MESSAGE Keybinding conflicts occurred. They may interfere with normal accelerator operation.
14 | !SUBENTRY 1 org.eclipse.jface 2 0 2016-01-11 12:37:55.985 15 | !MESSAGE A conflict occurred for ALT+COMMAND+R: 16 | Binding(ALT+COMMAND+R, 17 | ParameterizedCommand(Command(org.python.pydev.debug.setnext,Set Next Statement, 18 | , 19 | Category(org.python.pydev.ui.category.run,PyDev - Run,Python run category,true), 20 | org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@1ee8d4b6, 21 | ,,true),null), 22 | org.eclipse.ui.defaultAcceleratorConfiguration, 23 | org.eclipse.ui.contexts.window,,,system) 24 | Binding(ALT+COMMAND+R, 25 | ParameterizedCommand(Command(org.eclipse.jdt.ui.edit.text.java.rename.element,Rename - Refactoring , 26 | Rename the selected element, 27 | Category(org.eclipse.jdt.ui.category.refactoring,Refactor - Java,Java Refactoring Actions,true), 28 | org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@47e50894, 29 | ,,true),null), 30 | org.eclipse.ui.defaultAcceleratorConfiguration, 31 | org.eclipse.ui.contexts.window,,cocoa,system) 32 | !SESSION 2016-03-23 14:56:47.781 ----------------------------------------------- 33 | eclipse.buildId=4.5.1.M20150904-0015 34 | java.version=1.7.0_79 35 | java.vendor=Oracle Corporation 36 | BootLoader constants: OS=macosx, ARCH=x86_64, WS=cocoa, NL=en_US 37 | Framework arguments: -product org.eclipse.epp.package.java.product -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation 38 | Command-line arguments: -os macosx -ws cocoa -arch x86_64 -product org.eclipse.epp.package.java.product -data /Volumes/Data/Doctorate/Implementation/w2vcomp -product org.eclipse.epp.package.java.product -keyring /Users/anhnk/.eclipse_keyring -showlocation 39 | 40 | !ENTRY org.eclipse.core.net 1 0 2016-03-23 14:56:48.939 41 | !MESSAGE System property http.nonProxyHosts has been set to local|*.local|169.254/16|*.169.254/16 by an external source. This value will be overwritten using the values from the preferences 42 | 43 | !ENTRY org.eclipse.jface 2 0 2016-03-23 14:56:52.297 44 | !MESSAGE Keybinding conflicts occurred. They may interfere with normal accelerator operation. 
45 | !SUBENTRY 1 org.eclipse.jface 2 0 2016-03-23 14:56:52.297 46 | !MESSAGE A conflict occurred for ALT+COMMAND+R: 47 | Binding(ALT+COMMAND+R, 48 | ParameterizedCommand(Command(org.python.pydev.debug.setnext,Set Next Statement, 49 | , 50 | Category(org.python.pydev.ui.category.run,PyDev - Run,Python run category,true), 51 | org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@33b88372, 52 | ,,true),null), 53 | org.eclipse.ui.defaultAcceleratorConfiguration, 54 | org.eclipse.ui.contexts.window,,,system) 55 | Binding(ALT+COMMAND+R, 56 | ParameterizedCommand(Command(org.eclipse.jdt.ui.edit.text.java.rename.element,Rename - Refactoring , 57 | Rename the selected element, 58 | Category(org.eclipse.jdt.ui.category.refactoring,Refactor - Java,Java Refactoring Actions,true), 59 | org.eclipse.ui.internal.WorkbenchHandlerServiceHandler@16bdcbe5, 60 | ,,true),null), 61 | org.eclipse.ui.defaultAcceleratorConfiguration, 62 | org.eclipse.ui.contexts.window,,cocoa,system) 63 | -------------------------------------------------------------------------------- /.metadata/.mylyn/repositories.xml.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.mylyn/repositories.xml.zip -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.resources/.root/.indexes/history.version: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.resources/.root/.indexes/properties.index: -------------------------------------------------------------------------------- 1 | /org.eclipse.jdt.corestateVersionNumber28 -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.resources/.root/.indexes/properties.version: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.resources/.root/2.tree: -------------------------------------------------------------------------------- 1 | org.eclipse.jdt.core -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.resources/.safetable/org.eclipse.core.resources: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.core.resources/.safetable/org.eclipse.core.resources -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | version=1 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.e4.ui.css.swt.theme.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | themeid=org.eclipse.e4.ui.css.theme.e4_default 3 | -------------------------------------------------------------------------------- 
/.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.e4.ui.workbench.renderers.swt.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | enableMRU=true 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | content_assist_proposals_background=255,255,255 2 | content_assist_proposals_foreground=0,0,0 3 | eclipse.preferences.version=1 4 | fontPropagated=true 5 | org.eclipse.jdt.internal.ui.navigator.layout=2 6 | org.eclipse.jdt.ui.editor.tab.width= 7 | org.eclipse.jdt.ui.formatterprofiles.version=12 8 | org.eclipse.jdt.ui.javadoclocations.migrated=true 9 | org.eclipse.jface.textfont=1|Monaco|13.0|0|COCOA|1|Monaco; 10 | proposalOrderMigrated=true 11 | sourceHoverBackgroundColor=236,235,236 12 | spelling_locale_initialized=true 13 | tabWidthPropagated=true 14 | useAnnotationsPrefPage=true 15 | useQuickDiffPrefPage=true 16 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.m2e.discovery.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.m2e.discovery.pref.projects= 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.mylyn.context.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | mylyn.attention.migrated=true 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.mylyn.monitor.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.mylyn.monitor.activity.tracking.enabled.checked=true 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.mylyn.tasks.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | migrated.task.repositories.secure.store=true 3 | org.eclipse.mylyn.tasks.ui.filters.nonmatching=true 4 | org.eclipse.mylyn.tasks.ui.filters.nonmatching.encouraged=true 5 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.team.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.team.ui.first_time=false 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.editors.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | lineNumberRuler=true 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.ide.prefs: -------------------------------------------------------------------------------- 1 | TASKS_FILTERS_MIGRATE=true 2 | eclipse.preferences.version=1 3 | 
platformState=1450704678997 4 | quickStart=false 5 | tipsAndTricks=true 6 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | showIntro=false 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.eclipse.ui.workbench.prefs: -------------------------------------------------------------------------------- 1 | //org.eclipse.ui.commands/state/org.eclipse.ui.navigator.resources.nested.changeProjectPresentation/org.eclipse.ui.commands.radioState=false 2 | ColorsAndFontsPreferencePage.expandedCategories=Torg.eclipse.ui.workbenchMisc 3 | ColorsAndFontsPreferencePage.selectedElement=Forg.eclipse.jface.textfont 4 | ENABLED_DECORATORS=org.eclipse.m2e.core.mavenVersionDecorator\:true,org.eclipse.buildship.ui.gradledecorator\:true,org.eclipse.egit.ui.internal.decorators.GitLightweightDecorator\:true,org.eclipse.jdt.ui.override.decorator\:true,org.eclipse.jdt.ui.interface.decorator\:true,org.eclipse.jdt.ui.buildpath.decorator\:true,org.eclipse.m2e.core.maven2decorator\:true,org.eclipse.mylyn.context.ui.decorator.interest\:true,org.eclipse.mylyn.tasks.ui.decorators.task\:true,org.eclipse.mylyn.team.ui.changeset.decorator\:true,org.eclipse.ui.LinkedResourceDecorator\:true,org.eclipse.ui.SymlinkDecorator\:true,org.eclipse.ui.VirtualResourceDecorator\:true,org.eclipse.ui.ContentTypeDecorator\:true,org.eclipse.ui.ResourceFilterDecorator\:false,org.python.pydev.navigator.decorator.problemsLabelDecorator\:true, 5 | PLUGINS_NOT_ACTIVATED_ON_STARTUP=org.eclipse.m2e.discovery; 6 | eclipse.preferences.version=1 7 | org.eclipse.jface.textfont=1|Monaco|13.0|0|COCOA|1|Monaco; 8 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.core.runtime/.settings/org.python.pydev.prefs: -------------------------------------------------------------------------------- 1 | INTERPRETERS_CHECKED_ONCE=true 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fdt: -------------------------------------------------------------------------------- 1 | 0.6 -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fdx: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fnm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.fnm -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.frq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.frq -------------------------------------------------------------------------------- 
/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.nrm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.nrm -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tii -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/_0.tis -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments.gen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments.gen -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/history/segments_1 -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdt -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fdx -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fnm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.fnm -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.frq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.frq 
-------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.nrm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.nrm -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tii -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/_2.tis -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments.gen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments.gen -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/remote-index/segments_3 -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.epp.logging.aeri.ui/server-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "v1", 3 | "title": "Eclipse.org Error Reporting Server", 4 | "description": "Automated Error Reporting for eclipse.org", 5 | "timestamp": 1458741415931, 6 | "ttl": 20160, 7 | "helpUrl": "https://wiki.eclipse.org/EPP/Logging", 8 | "feedbackUrl": "https://docs.google.com/a/codetrails.com/forms/d/1wd9AzydLv_TMa7ZBXHO7zQIhZjZCJRNMed-6J4fVNsc/viewform", 9 | "aboutUrl": "https://dev.eclipse.org/recommenders/community/confess/#/about", 10 | "submitUrl": "https://dev.eclipse.org/recommenders/community/confess/0.6/reports/", 11 | "maxReportSize": 5242880, 12 | "problemsUrl": "https://www.eclipse.org/downloads/download.php?r\u003d1\u0026file\u003d/technology/epp/logging/problems.zip", 13 | "problemsTtl": 20160, 14 | "queryUrl": "https://dev.eclipse.org/recommenders/community/confess/0.6/query/", 15 | "connectTimeout": 10000, 16 | "socketTimeout": 100000, 17 | "acceptedProducts": [ 18 | "org.eclipse.*" 19 | ], 20 | "acceptedPlugins": [ 21 | "org.eclipse.*", 22 | "org.apache.log4j.*", 23 | "com.codetrails.*" 24 | ], 25 | "acceptedPackages": [ 26 | "org.eclipse.*", 27 | "org.apache.*", 28 | "java.*", 29 | "javax.*", 30 | "javafx.*", 31 | "sun.*", 32 | "com.sun.*", 33 | "com.codetrails.*", 34 | "com.google.*", 35 | "org.osgi.*", 36 | "ch.qos.*", 37 | "org.slf4j.*" 38 | ], 39 | 
"acceptOtherPackages": true, 40 | "acceptUiFreezes": true, 41 | "ignoredStatuses": [ 42 | "org.eclipse.equinox.p2.*::", 43 | "org.eclipse.epp.mpc.ui:java.io.IOException:", 44 | "org.eclipse.epp.mpc.ui:java.net.SocketTimeoutException:", 45 | "org.eclipse.oomph.setup.core:$org.apache.http.ConnectionClosedException:", 46 | "org.eclipse.ui::Conflicting handlers for*", 47 | "org.eclipse.jface:java.io.IOException:Unable to resolve plug-in*", 48 | "org.eclipse.core.runtime::Invalid input url*", 49 | "org.eclipse.core.filesystem::Could not move*", 50 | "org.eclipse.core.filesystem::Could not delete*", 51 | "org.eclipse.pde.core::The current target platform contains errors*", 52 | ":org.eclipse.equinox.security.storage.StorageException:", 53 | ":org.eclipse.ecf.filetransfer.*:", 54 | ":java.net.*:" 55 | ], 56 | "problemsZipLastDownloadTimestamp": 1458741427690 57 | } -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.jdt.core/assumedExternalFilesCache: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.jdt.core/externalFilesCache: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.jdt.core/nonChainingJarsCache: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.jdt.core/variablesAndContainers.dat: -------------------------------------------------------------------------------- 1 | JRE_SRCM2_REPO 2 | JUNIT_HOME JRE_SRCROOTJRE_LIBJUNIT_SRC_HOME -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.jdt.ui/OpenTypeHistory.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.jdt.ui/QualifiedTypeNameHistory.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.jdt.ui/dialog_settings.xml: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 |
6 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.m2e.logback.configuration/0.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/.metadata/.plugins/org.eclipse.m2e.logback.configuration/0.log -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.m2e.logback.configuration/logback.1.6.2.20150902-0002.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %date [%thread] %-5level %logger{35} - %msg%n 5 | 6 | 7 | OFF 8 | 9 | 10 | 11 | 12 | ${org.eclipse.m2e.log.dir}/0.log 13 | 14 | ${org.eclipse.m2e.log.dir}/%i.log 15 | 1 16 | 10 17 | 18 | 19 | 100MB 20 | 21 | 22 | %date [%thread] %-5level %logger{35} - %msg%n 23 | 24 | 25 | 26 | 27 | 28 | WARN 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.oomph.setup.ui/dialog_settings.xml: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.oomph.setup/workspace.setup: -------------------------------------------------------------------------------- 1 | 2 | 7 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.ui.ide/dialog_settings.xml: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.ui.workbench/dialog_settings.xml: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | -------------------------------------------------------------------------------- /.metadata/.plugins/org.eclipse.ui.workbench/workingsets.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.metadata/version.ini: -------------------------------------------------------------------------------- 1 | #Wed Mar 23 14:56:49 CET 2016 2 | org.eclipse.core.runtime=2 3 | org.eclipse.platform=4.5.1.v20150904-0015 -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | HyperVec 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.jdt.core.javabuilder 15 | 16 | 17 | 18 | 19 | org.eclipse.m2e.core.maven2Builder 20 | 21 | 22 | 23 | 24 | 25 | org.eclipse.m2e.core.maven2Nature 26 | org.eclipse.jdt.core.javanature 27 | org.python.pydev.pythonNature 28 | 29 | 30 | -------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | Default 4 | python 2.7 5 | 6 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | formatter_profile=_Nghia 3 | formatter_settings_version=12 4 | org.eclipse.jdt.ui.exception.name=e 5 | org.eclipse.jdt.ui.gettersetter.use.is=true 6 | org.eclipse.jdt.ui.keywordthis=false 7 | org.eclipse.jdt.ui.overrideannotation=true 8 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /HyperVec.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/HyperVec.jar -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## HyperVec 2 | Hierarchical Embeddings for Hypernymy Detection and Directionality 3 | 4 | ### Prerequisites 5 | - [spaCy](https://spacy.io): for parsing, version 2.0.11 6 | - a corpus, e.g. a plain-text Wikipedia dump 7 | 8 | ### Preprocessing 9 | - Create the feature files: 10 | 11 | ```python create_features.py -input corpus-file.txt -output output-file-name -pos pos_tag``` 12 | 13 | where pos_tag is either NN (for the noun features) or VB (for the verb features) 14 | 15 | ### Configuration 16 | See config.cfg to set the arguments for the model. 17 | 18 | ### Training embeddings 19 | ```java -jar HyperVec.jar config.cfg vector-size window-size``` 20 | 21 | For example, to train embeddings with 100 dimensions and a window size of 5: 22 | 23 | ```java -jar HyperVec.jar config.cfg 100 5``` 24 | 25 | ### Pretrained (hypervec) embeddings 26 | The embeddings used in our paper can be downloaded by using the script in `get-pretrainedHyperVecEmbeddings/download_embeddings.sh`. Note that the script downloads 9 files and concatenates them into a single file (`hypervec.txt.gz`). The format is the default word2vec text format: the first line is a header, and every other line is a word followed by its whitespace-separated vector (a loading sketch follows below).
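To sanity-check the downloaded file, it can be loaded with gensim, which this repository already uses in `code_mapping_across_languages/convert_w2vTXT_to_w2vBIN.py`. A minimal sketch, assuming the old gensim 0.x/1.x API from that script (newer gensim versions expose the same loader as `gensim.models.KeyedVectors.load_word2vec_format`); the probe words are arbitrary examples, not prescribed by the repository:

```
from gensim.models import word2vec

# text-format word2vec file: header line, then one "word v1 v2 ... v100" line per word;
# gensim decompresses the .gz transparently
model = word2vec.Word2Vec.load_word2vec_format('hypervec.txt.gz', binary=False)

# arbitrary probe pair; hypernymy-specific scoring (the hyperscore) is done by the eval jars below
print(model.similarity('animal', 'dog'))
```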
27 | 28 | Information about the embeddings: created using the ENCOW14A corpus (14.5bn tokens), 100 dimensions, symmetric window of 5, 15 negative samples, 0.025 learning rate, threshold set to 0.05. The resulting vocabulary contains about 2.7m words. 29 | 30 | ### Example usage: Evaluating BLESS, BIBLESS and AWBLESS 31 | To reproduce our experiments from Table 3, use the code in `datasets_classification/`, 32 | assuming your vector file is located in the same folder and named `hypervec.txt.gz`. 33 | `java -jar eval-dir.jar hypervec.txt.gz` (evaluates directionality on `BLESS.txt` using the hyperscore) 34 | `java -jar eval-bless.jar hypervec.txt.gz 2 1000` (evaluates classification on `BIBLESS.txt, AWBLESS.txt` using 2% of the training data and 1000 random iterations) 35 | 36 | 37 | ### Citation info 38 | If you use the code or the created feature norms, please [cite our paper (Bibtex)](http://www2.ims.uni-stuttgart.de/bibliographie/entry/2811b00e1bbd503adf28648ddb737132dc67a091/). The paper can be found here: [PDF](http://www.aclweb.org/anthology/D17-1022); the poster from EMNLP can be found here: [Poster](http://www.ims.uni-stuttgart.de/institut/mitarbeiter/koepermn/publications/poster_EMNLP2017.pdf) 39 | -------------------------------------------------------------------------------- /code_mapping_across_languages/AP_evaluation_code/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import fromstring, dtype 3 | from numpy.linalg import norm 4 | 5 | def smart_open(fname, mode='rb'): 6 | if fname.endswith('.gz'): 7 | import gzip 8 | return gzip.open(fname, mode) 9 | elif fname.endswith('.bz2'): 10 | import bz2 11 | return bz2.BZ2File(fname, mode) 12 | else: 13 | return open(fname, mode) 14 | 15 | def load_vecs(binary_file, binary=1): 16 | vecs = [] 17 | vocab = [] 18 | if binary==1: 19 | with smart_open(binary_file, 'rb') as f: 20 | header = to_unicode(f.readline()) 21 | vocab_size, vector_size = map(int, header.split()) 22 | binary_len = dtype(np.float32).itemsize * vector_size 23 | for _ in range(vocab_size): 24 | word = [] 25 | while True: 26 | ch = f.read(1) 27 | if ch == b' ': 28 | break 29 | if ch != b'\n': 30 | word.append(ch) 31 | word = to_unicode(b''.join(word)) 32 | vocab.append(word) 33 | vec = fromstring(f.read(binary_len), dtype=np.float32) 34 | vecs.append(vec) 35 | else: 36 | with smart_open(binary_file, 'rb') as f: 37 | header = to_unicode(f.readline()) 38 | if len(header.split()) == 2: vocab_size, vector_size = map(int, header.split()) 39 | elif len(header.split()) > 2: 40 | parts = header.rstrip().split(" ") 41 | word, vec = parts[0], list(map(np.float32, parts[1:])) 42 | vocab.append(to_unicode(word)) 43 | vecs.append(vec) 44 | for _, line in enumerate(f): 45 | parts = to_unicode(line.rstrip()).split(" ") 46 | word, vec = parts[0], list(map(np.float32, parts[1:])) 47 | vocab.append(to_unicode(word)) 48 | vecs.append(vec) 49 | #embs_dim = len(vecs[1]) 50 | #UNKNOWN_WORD = np.random.uniform(-0.25,0.25,embs_dim) 51 | #vecs = np.vstack((UNKNOWN_WORD, vecs)) 52 | #vocab = ['#UNKNOWN#'] + list(vocab) 53 | #words = {word:idx for idx,word in enumerate(vocab)} 54 | 55 | return vecs, vocab 56 | 57 | def to_utf8(text, errors='strict', encoding='utf8'): 58 | """Convert a
string (unicode or bytestring in `encoding`), to bytestring in utf8.""" 59 | if isinstance(text, unicode): 60 | return text.encode('utf8') 61 | # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 62 | else: 63 | return unicode(text, encoding, errors=errors).encode('utf8') 64 | 65 | def to_unicode(text, encoding='utf8', errors='strict'): 66 | """Convert a string (bytestring in `encoding` or unicode), to unicode.""" 67 | if isinstance(text, unicode): 68 | return text 69 | else: 70 | return unicode(text, encoding=encoding, errors=errors) -------------------------------------------------------------------------------- /code_mapping_across_languages/AP_evaluation_code/test_default.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | from sklearn.metrics import average_precision_score 4 | from numpy.linalg import norm 5 | import common 6 | 7 | def cosine_sim(u, v): 8 | return np.dot(u,v)/(norm(u)*norm(v)) 9 | 10 | def computeAP(targets, preds): 11 | paired = zip(preds, targets) 12 | sorted_paired = sorted(paired, key=lambda x:x[0], reverse=True) 13 | preds, targets = zip(*sorted_paired) 14 | preds, targets = list(preds), list(targets) 15 | 16 | ap = 0.0 17 | retrievedCounter = 0.0; 18 | relevantCounter = 0.0; 19 | 20 | for i in range(len(targets)): 21 | retrievedCounter += 1 22 | if int(targets[i]) == 1: 23 | relevantCounter += 1 24 | ap += relevantCounter / retrievedCounter 25 | ap /= relevantCounter 26 | return ap 27 | 28 | def _filter(word): 29 | word = word.split('-') 30 | if len(word) > 2: 31 | f_word = '-'.join(word[:-1]) 32 | else: 33 | f_word = word[0] 34 | return f_word 35 | 36 | def load_dataset(dataset_file): 37 | dataset = [] 38 | with open(dataset_file, 'r') as fin: 39 | for line in fin: 40 | left, right, label = line.strip().split('\t') 41 | dataset.append((left, right, int(label))) 42 | return dataset 43 | 44 | def compute_similarity(dataset, embs): 45 | data = [] 46 | for (left, right, label) in dataset: 47 | if left in embs and right in embs: 48 | #direct = norm(embs[right]) / norm(embs[left]) 49 | score = cosine_sim(embs[left], embs[right]) #* direct 50 | data.append((left, right, label, score)) 51 | else: 52 | continue 53 | return data 54 | 55 | def build_data(dataset_file, embeddings_file): 56 | vecs, words = common.load_vecs(embeddings_file, binary=1) #TODO: set binary=0 to read text file 57 | embs = {word:vecs[idx] for idx,word in enumerate(words)} 58 | dataset = load_dataset(dataset_file) 59 | data = compute_similarity(dataset, embs) 60 | 61 | return data 62 | 63 | def ap_evaluation(data, cutoff=-1): 64 | 65 | data = sorted(data, key=lambda line:line[-1], reverse=True) 66 | targets, scores = [], [] 67 | for (left, right, label, score) in data: 68 | targets.append(label) 69 | scores.append(score) 70 | if cutoff > 0: 71 | ap_score = average_precision_score(targets[:cutoff], scores[:cutoff]) 72 | #ap_score = computeAP(targets, scores) 73 | print 'AP at %d cutoff: %f' %(cutoff, ap_score) 74 | else: 75 | ap_score = average_precision_score(targets, scores) 76 | #ap_score = computeAP(targets, scores) 77 | print 'AP score: %f' %ap_score 78 | 79 | return ap_score 80 | 81 | if __name__=='__main__': 82 | dataset_file = sys.argv[1] 83 | embeddings_file = sys.argv[2] 84 | data = build_data(dataset_file, embeddings_file) 85 | ap_evaluation(data) 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- 
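As a reference point for `computeAP` in the script above: average precision is the mean, over the relevant items, of the precision at each relevant item's rank. A minimal self-contained check on a hand-picked toy ranking (the labels are illustrative, not from the repository's data), which agrees with both `computeAP` and sklearn's `average_precision_score`:

```
# labels of a ranked list, best score first; 1 = relevant (e.g. a true hypernymy pair)
targets = [1, 0, 1, 0]

# AP = mean over relevant ranks of (relevant_so_far / rank) = (1/1 + 2/3) / 2 = 0.8333
relevant, ap = 0.0, 0.0
for rank, label in enumerate(targets, start=1):
    if label == 1:
        relevant += 1
        ap += relevant / rank
print(ap / relevant)  # 0.833333...
```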
/code_mapping_across_languages/AP_evaluation_code/test_norm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | from sklearn.metrics import average_precision_score 4 | from numpy.linalg import norm 5 | import common 6 | 7 | def cosine_sim(u, v): 8 | return np.dot(u,v)/(norm(u)*norm(v)) 9 | 10 | def computeAP(targets, preds): 11 | paired = zip(preds, targets) 12 | sorted_paired = sorted(paired, key=lambda x:x[0], reverse=True) 13 | preds, targets = zip(*sorted_paired) 14 | preds, targets = list(preds), list(targets) 15 | 16 | ap = 0.0 17 | retrievedCounter = 0.0; 18 | relevantCounter = 0.0; 19 | 20 | for i in range(len(targets)): 21 | retrievedCounter += 1 22 | if int(targets[i]) == 1: 23 | relevantCounter += 1 24 | ap += relevantCounter / retrievedCounter 25 | ap /= relevantCounter 26 | return ap 27 | 28 | def _filter(word): 29 | word = word.split('-') 30 | if len(word) > 2: 31 | f_word = '-'.join(word[:-1]) 32 | else: 33 | f_word = word[0] 34 | return f_word 35 | 36 | def load_dataset(dataset_file): 37 | dataset = [] 38 | with open(dataset_file, 'r') as fin: 39 | for line in fin: 40 | left, right, label = line.strip().split('\t') 41 | dataset.append((left, right, int(label))) 42 | return dataset 43 | 44 | def compute_similarity(dataset, embs): 45 | data = [] 46 | for (left, right, label) in dataset: 47 | if left in embs and right in embs: 48 | direct = norm(embs[right]) / norm(embs[left]) 49 | score = cosine_sim(embs[left], embs[right]) * direct 50 | data.append((left, right, label, score)) 51 | else: 52 | continue 53 | return data 54 | 55 | def build_data(dataset_file, embeddings_file): 56 | vecs, words = common.load_vecs(embeddings_file, binary=1) #TODO: set binary=0 to read text file 57 | embs = {word:vecs[idx] for idx,word in enumerate(words)} 58 | dataset = load_dataset(dataset_file) 59 | data = compute_similarity(dataset, embs) 60 | 61 | return data 62 | 63 | def ap_evaluation(data, cutoff=-1): 64 | 65 | data = sorted(data, key=lambda line:line[-1], reverse=True) 66 | targets, scores = [], [] 67 | for (left, right, label, score) in data: 68 | targets.append(label) 69 | scores.append(score) 70 | if cutoff > 0: 71 | ap_score = average_precision_score(targets[:cutoff], scores[:cutoff]) 72 | #ap_score = computeAP(targets, scores) 73 | print 'AP at %d cutoff: %f' %(cutoff, ap_score) 74 | else: 75 | ap_score = average_precision_score(targets, scores) 76 | #ap_score = computeAP(targets, scores) 77 | print 'AP score: %f' %ap_score 78 | 79 | return ap_score 80 | 81 | if __name__=='__main__': 82 | dataset_file = sys.argv[1] 83 | embeddings_file = sys.argv[2] 84 | data = build_data(dataset_file, embeddings_file) 85 | ap_evaluation(data) 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /code_mapping_across_languages/convert_w2vTXT_to_w2vBIN.py: -------------------------------------------------------------------------------- 1 | from gensim.models import word2vec 2 | import sys 3 | 4 | # Script that converts word2vec txtfile into word2vec binary 5 | print ("Script name: %s" % str(sys.argv[1])) 6 | model = word2vec.Word2Vec.load_word2vec_format(str(sys.argv[1]),binary=False) 7 | model.save_word2vec_format(str(sys.argv[1])+'.bin',binary=True) 8 | 9 | -------------------------------------------------------------------------------- /code_mapping_across_languages/credits_to_CLIC_trento.txt: 
-------------------------------------------------------------------------------- 1 | A huge part of this code is taken from an implementation that used to be available at http://clic.cimec.unitn.it and was also used for the paper 'Improving zero-shot learning by mitigating the hubness problem' by Georgiana Dinu, Angeliki Lazaridou, Marco Baroni https://arxiv.org/pdf/1412.6568.pdf. -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/demo.sh~: -------------------------------------------------------------------------------- 1 | echo "Training..." 2 | 3 | python train_tm.py -o tm data/OPUS_en_it_europarl_train_5K.txt data/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl data/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl 4 | 5 | 6 | echo "Testing standard NN retrieval (baseline)" 7 | 8 | python test_tm.py tm.pkl data/OPUS_en_it_europarl_test.txt data/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl data/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl 9 | 10 | 11 | 12 | echo "Testing GC retrieval with 5000 additional elements" 13 | 14 | python test_tm.py -c 5000 tm.pkl data/OPUS_en_it_europarl_test.txt data/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl data/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.pkl 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/learn_mat.sh: -------------------------------------------------------------------------------- 1 | echo "Training..."
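# (added note: the loop below is a ten-fold run: for each fold i it learns one
# mapping matrix from the pre-split train files and one from the test files;
# the *-0 .. *-9 / *.-0 .. *.-9 suffixes are the fold indices)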
2 | 3 | 4 | for i in 0 1 2 3 4 5 6 7 8 9 5 | do 6 | python ../train_tm.py -o trainmat_${i} align.train.-${i} encow5.ppmi.train-${i} GNet_img_avg.train.-${i} && 7 | python ../train_tm.py -o testmat_${i} align.test.-${i} encow5.ppmi.test-${i} GNet_img_avg.test.-${i} 8 | done; 9 | 10 | -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/space.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Space(object): 4 | 5 | def __init__(self, matrix_, id2row_): 6 | 7 | self.mat = matrix_ 8 | self.id2row = id2row_ 9 | self.create_row2id() 10 | 11 | def create_row2id(self): 12 | self.row2id = {} 13 | for idx, word in enumerate(self.id2row): 14 | if word in self.row2id: 15 | raise ValueError("Found duplicate word: %s" % (word)) 16 | self.row2id[word] = idx 17 | 18 | 19 | @classmethod 20 | def build(cls, fname, lexicon=None): 21 | 22 | #if lexicon is provided, only data occurring in the lexicon is loaded 23 | id2row = [] 24 | def filter_lines(f): 25 | for i,line in enumerate(f): 26 | word = line.split()[0] 27 | if i != 0 and (lexicon is None or word in lexicon): 28 | id2row.append(word) 29 | yield line 30 | 31 | #get the number of columns 32 | with open(fname) as f: 33 | f.readline() 34 | ncols = len(f.readline().split()) 35 | 36 | with open(fname) as f: 37 | m = np.matrix(np.loadtxt(filter_lines(f), 38 | comments=None, usecols=range(1,ncols))) 39 | 40 | return Space(m, id2row) 41 | 42 | def normalize(self): 43 | row_norms = np.sqrt(np.multiply(self.mat, self.mat).sum(1)) 44 | row_norms = row_norms.astype(np.double) 45 | row_norms[row_norms != 0] = np.array(1.0/row_norms[row_norms != 0]).flatten() 46 | self.mat = np.multiply(self.mat, row_norms) 47 | 48 | 49 | -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/space.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/mappingcode/space.pyc -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/test_tm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import getopt 3 | import numpy as np 4 | import collections 5 | import random 6 | from space import Space 7 | from utils import read_dict, apply_tm, score, get_valid_data 8 | 9 | def usage(errno=0): 10 | print >>sys.stderr,\ 11 | """ 12 | Given a translation matrix, test data (words and their translations) and 13 | source and target language vectors, it returns translations of source test 14 | words and computes Top N accuracy. 15 | 16 | Usage: 17 | python test_tm.py [options] trans_matrix test_data source_vecs target_vecs 18 | \n\ 19 | Options: 20 | -o --output : file prefix. It prints the vectors obtained after 21 | the translation matrix is applied (.vecs.txt and .wds.txt). 22 | Optional. Default is ./translated_vecs 23 | -c --correction : Number of additional elements (ADDITIONAL TO TEST DATA) 24 | to be used with Global Correction (GC) strategy. 25 | Optional. Default, baseline retrieval is run. 
26 | 27 | -h --help : help 28 | 29 | Arguments: 30 | trans_matrix: , translation matrix 31 | test_data: , list of source-target word pairs (space separated words, 32 | one word pair per line) 33 | source_vecs: , vectors in source language, space-separated, with string 34 | identifier as first column (dim+1 columns, where dim is the 35 | dimensionality of the space) 36 | target_vecs: , vectors in target language 37 | 38 | 39 | Example: 40 | 1) Retrieve translations with standard nearest neighbour retrieval 41 | 42 | python test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt 43 | 44 | 2) "Corrected" retrieval (GC). Use 2000 additional source space elements to 45 | correct for hubs (words that appear as the nearest neighbours of many points) 46 | 47 | python test_tm.py -c 2000 tm.txt test_data.txt ENspace.txt ITspace.txt 48 | 49 | """ 50 | sys.exit(errno) 51 | 52 | 53 | def main(sys_argv): 54 | 55 | try: 56 | opts, argv = getopt.getopt(sys_argv[1:], "ho:c:", 57 | ["help", "output=", "correction="]) 58 | except getopt.GetoptError, err: 59 | print str(err) 60 | usage() 61 | sys.exit(1) 62 | 63 | out_file = "./translated_vecs" 64 | additional = None 65 | for opt, val in opts: 66 | if opt in ("-o", "--output"): 67 | out_file = val 68 | elif opt in ("-c", "--correction"): 69 | try: 70 | additional = int(val) 71 | except ValueError: 72 | usage(1) 73 | elif opt in ("-h", "--help"): 74 | usage(0) 75 | else: 76 | usage(1) 77 | 78 | if len(argv) == 4: 79 | tm_file = argv[0] 80 | test_file = argv[1] 81 | source_file = argv[2] 82 | target_file = argv[3] 83 | 84 | else: 85 | usage(1) 86 | 87 | 88 | print "Loading the translation matrix" 89 | tm = np.loadtxt(tm_file) 90 | 91 | print "Reading the test data" 92 | test_data = read_dict(test_file) 93 | 94 | #in the _source_ space, we only need to load vectors for the words in test.
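# (added note: the additional-lexicon sampling below implements the Global
# Correction strategy referenced in the usage text and credited to Dinu et al.
# in credits_to_CLIC_trento.txt: ranking translations against extra source
# words penalizes target-space hubs, i.e. words that appear as the nearest
# neighbours of many mapped vectors)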
95 | #semantic spaces may contain additional words, ALL words in the _target_ 96 | #space are used as the search space 97 | source_words, _ = zip(*test_data) 98 | source_words = set(source_words) 99 | 100 | print "Reading: %s" % source_file 101 | if not additional: 102 | source_sp = Space.build(source_file, source_words) 103 | else: 104 | #read all the words in the space 105 | lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, 106 | comments=None, usecols=(0,)).flatten()) 107 | #the max number of additional+test elements is bounded by the size 108 | #of the lexicon 109 | additional = min(additional, len(lexicon) - len(source_words)) 110 | #we sample additional elements that are not already in source_words 111 | random.seed(100) 112 | lexicon = random.sample(list(lexicon.difference(source_words)), additional) 113 | 114 | #load the source space 115 | source_sp = Space.build(source_file, source_words.union(set(lexicon))) 116 | 117 | source_sp.normalize() 118 | 119 | print "Reading: %s" % target_file 120 | target_sp = Space.build(target_file) 121 | target_sp.normalize() 122 | 123 | print "Translating" #translates all the elements loaded in the source space 124 | mapped_source_sp = apply_tm(source_sp, tm) 125 | 126 | print "Retrieving translations" 127 | test_data = get_valid_data(source_sp, target_sp, test_data) 128 | 129 | #turn test data into a dictionary (a word can have mutiple translation) 130 | gold = collections.defaultdict(set) 131 | for k, v in test_data: 132 | gold[k].add(v) 133 | 134 | score(mapped_source_sp, target_sp, gold, additional) 135 | 136 | print "Printing mapped vectors: %s" % out_file 137 | np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat) 138 | np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s") 139 | 140 | if __name__ == '__main__': 141 | main(sys.argv) 142 | 143 | -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/test_tm2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import getopt 3 | import numpy as np 4 | import collections 5 | import random 6 | from space import Space 7 | from utils import read_dict, apply_tm, score, get_valid_data 8 | 9 | def usage(errno=0): 10 | print >>sys.stderr,\ 11 | """ 12 | Given a translation matrix, test data (words and their translations) and 13 | source and target language vectors, it returns translations of source test 14 | words and computes Top N accuracy. 15 | 16 | Usage: 17 | python test_tm.py [options] trans_matrix test_data source_vecs target_vecs 18 | \n\ 19 | Options: 20 | -o --output : file prefix. It prints the vectors obtained after 21 | the translation matrix is applied (.vecs.txt and .wds.txt). 22 | Optional. Default is ./translated_vecs 23 | -c --correction : Number of additional elements (ADDITIONAL TO TEST DATA) 24 | to be used with Global Correction (GC) strategy. 25 | Optional. Default, baseline retrieval is run. 
26 | 27 | -h --help : help 28 | 29 | Arguments: 30 | trans_matrix: , translation matrix 31 | test_data: , list of source-target word pairs (space separated words, 32 | one word pair per line) 33 | source_vecs: , vectors in source language, space-separated, with string 34 | identifier as first column (dim+1 columns, where dim is the 35 | dimensionality of the space) 36 | target_vecs: , vectors in target language 37 | 38 | 39 | Example: 40 | 1) Retrieve translations with standard nearest neighbour retrieval 41 | 42 | python test_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt 43 | 44 | 2) "Corrected" retrieval (GC). Use 2000 additional source space elements to 45 | correct for hubs (words that appear as the nearest neighbours of many points) 46 | 47 | python test_tm.py -c 2000 tm.txt test_data.txt ENspace.txt ITspace.txt 48 | 49 | """ 50 | sys.exit(errno) 51 | 52 | 53 | def main(sys_argv): 54 | 55 | try: 56 | opts, argv = getopt.getopt(sys_argv[1:], "ho:c:", 57 | ["help", "output=", "correction="]) 58 | except getopt.GetoptError, err: 59 | print str(err) 60 | usage() 61 | sys.exit(1) 62 | 63 | out_file = "./translated_vecs2" 64 | additional = None 65 | for opt, val in opts: 66 | if opt in ("-o", "--output"): 67 | out_file = val 68 | elif opt in ("-c", "--correction"): 69 | try: 70 | additional = int(val) 71 | except ValueError: 72 | usage(1) 73 | elif opt in ("-h", "--help"): 74 | usage(0) 75 | else: 76 | usage(1) 77 | 78 | if len(argv) == 4: 79 | tm_file = argv[0] 80 | test_file = argv[1] 81 | source_file = argv[2] 82 | target_file = argv[3] 83 | 84 | else: 85 | usage(1) 86 | 87 | 88 | print "Loading the translation matrix" 89 | tm = np.loadtxt(tm_file) 90 | 91 | print "Reading the test data" 92 | test_data = read_dict(test_file) 93 | 94 | #in the _source_ space, we only need to load vectors for the words in test.
95 | #semantic spaces may contain additional words, ALL words in the _target_ 96 | #space are used as the search space 97 | source_words, _ = zip(*test_data) 98 | source_words = set(source_words) 99 | 100 | print "Reading: %s" % source_file 101 | if not additional: 102 | source_sp = Space.build(source_file, source_words) 103 | else: 104 | #read all the words in the space 105 | lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, 106 | comments=None, usecols=(0,)).flatten()) 107 | #the max number of additional+test elements is bounded by the size 108 | #of the lexicon 109 | additional = min(additional, len(lexicon) - len(source_words)) 110 | #we sample additional elements that are not already in source_words 111 | random.seed(100) 112 | lexicon = random.sample(list(lexicon.difference(source_words)), additional) 113 | 114 | #load the source space 115 | source_sp = Space.build(source_file, source_words.union(set(lexicon))) 116 | 117 | source_sp.normalize() 118 | 119 | print "Reading: %s" % target_file 120 | target_sp = Space.build(target_file) 121 | target_sp.normalize() 122 | 123 | print "Translating" #translates all the elements loaded in the source space 124 | mapped_source_sp = apply_tm(source_sp, tm) 125 | 126 | print "Retrieving translations" 127 | test_data = get_valid_data(source_sp, target_sp, test_data) 128 | 129 | #turn test data into a dictionary (a word can have multiple translations) 130 | gold = collections.defaultdict(set) 131 | for k, v in test_data: 132 | gold[k].add(v) 133 | 134 | score(mapped_source_sp, target_sp, gold, additional) 135 | 136 | print "Printing mapped vectors: %s" % out_file 137 | np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat) 138 | np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s") 139 | 140 | if __name__ == '__main__': 141 | main(sys.argv) 142 | 143 | -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/test_tm_pred.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import getopt 3 | import numpy as np 4 | import collections 5 | import random 6 | from space import Space 7 | from utils import read_dict, apply_tm, score, get_valid_data 8 | 9 | def usage(errno=0): 10 | print >>sys.stderr,\ 11 | """ 12 | Given a translation matrix, test data (words and their translations) and 13 | source and target language vectors, it returns translations of source test 14 | words (the Top N accuracy computation is commented out in this variant). 15 | 16 | Usage: 17 | python test_tm_pred.py [options] trans_matrix test_data source_vecs target_vecs 18 | \n\ 19 | Options: 20 | -o --output : file prefix. It prints the vectors obtained after 21 | the translation matrix is applied (.vecs.txt and .wds.txt). 22 | Optional. Default is ./translated_vecs 23 | -c --correction : Number of additional elements (ADDITIONAL TO TEST DATA) 24 | to be used with the Global Correction (GC) strategy. 25 | Optional. By default, baseline retrieval is run.
26 | 27 | -h --help : help 28 | 29 | Arguments: 30 | trans_matrix: translation matrix 31 | test_data: list of source-target word pairs (space separated words, 32 | one word pair per line) 33 | source_vecs: vectors in source language, space-separated, with string 34 | identifier as first column (dim+1 columns, where dim is the 35 | dimensionality of the space) 36 | target_vecs: vectors in target language 37 | 38 | 39 | Example: 40 | 1) Retrieve translations with standard nearest neighbour retrieval 41 | 42 | python test_tm_pred.py tm.txt test_data.txt ENspace.txt ITspace.txt 43 | 44 | 2) "Corrected" retrieval (GC). Use additional 2000 source space elements to 45 | correct for hubs (words that appear as the nearest neighbours of many points) 46 | 47 | python test_tm_pred.py -c 2000 tm.txt test_data.txt ENspace.txt ITspace.txt 48 | 49 | """ 50 | sys.exit(errno) 51 | 52 | 53 | def main(sys_argv): 54 | 55 | try: 56 | opts, argv = getopt.getopt(sys_argv[1:], "ho:c:", 57 | ["help", "output=", "correction="]) 58 | except getopt.GetoptError, err: 59 | print str(err) 60 | usage() 61 | sys.exit(1) 62 | 63 | out_file = "./translated_vecs" 64 | additional = None 65 | for opt, val in opts: 66 | if opt in ("-o", "--output"): 67 | out_file = val 68 | elif opt in ("-c", "--correction"): 69 | try: 70 | additional = int(val) 71 | except ValueError: 72 | usage(1) 73 | elif opt in ("-h", "--help"): 74 | usage(0) 75 | else: 76 | usage(1) 77 | 78 | if len(argv) == 4: 79 | tm_file = argv[0] 80 | test_file = argv[1] 81 | source_file = argv[2] 82 | target_file = argv[3] 83 | 84 | else: 85 | print "Wrong number of arguments" 86 | usage(1) 87 | 88 | print "Loading the translation matrix" 89 | tm = np.loadtxt(tm_file) 90 | 91 | print "Reading the test data" 92 | test_data = read_dict(test_file) 93 | 94 | #in the _source_ space, we only need to load vectors for the words in test.
95 | #semantic spaces may contain additional words, ALL words in the _target_ 96 | #space are used as the search space 97 | source_words, _ = zip(*test_data) 98 | source_words = set(source_words) 99 | 100 | print "Reading: %s" % source_file 101 | if not additional: 102 | source_sp = Space.build(source_file, source_words) 103 | else: 104 | #read all the words in the space 105 | lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, 106 | comments=None, usecols=(0,)).flatten()) 107 | #the max number of additional+test elements is bounded by the size 108 | #of the lexicon 109 | additional = min(additional, len(lexicon) - len(source_words)) 110 | #we sample additional elements that are not already in source_words 111 | random.seed(100) 112 | lexicon = random.sample(list(lexicon.difference(source_words)), additional) 113 | 114 | #load the source space 115 | source_sp = Space.build(source_file, source_words.union(set(lexicon))) 116 | 117 | source_sp.normalize() 118 | 119 | print "Reading: %s" % target_file 120 | target_sp = Space.build(target_file) 121 | target_sp.normalize() 122 | 123 | print "Translating" #translates all the elements loaded in the source space 124 | mapped_source_sp = apply_tm(source_sp, tm) 125 | 126 | print "Retrieving translations" 127 | test_data = get_valid_data(source_sp, target_sp, test_data) 128 | 129 | #turn test data into a dictionary (a word can have multiple translations) 130 | #gold = collections.defaultdict(set) 131 | #for k, v in test_data: 132 | # gold[k].add(v) 133 | 134 | #score(mapped_source_sp, target_sp, gold, additional) 135 | 136 | print "Printing mapped vectors: %s" % out_file 137 | np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat) 138 | np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s") 139 | 140 | if __name__ == '__main__': 141 | main(sys.argv) 142 | 143 | -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/train_tm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import getopt 3 | import numpy as np 4 | from space import Space 5 | from utils import read_dict, train_tm 6 | 7 | def usage(errno=0): 8 | print >>sys.stderr,\ 9 | """ 10 | Given train data (pairs of words and their translation), source language and 11 | target language vectors, it outputs a translation matrix between source and 12 | target spaces. 13 | 14 | Usage: 15 | python train_tm.py [options] train_data source_vecs target_vecs 16 | \n\ 17 | Options: 18 | -o --output : output file prefix. Optional. Default is ./tm 19 | -h --help : help 20 | 21 | Arguments: 22 | train_data: train dictionary, list of word pairs (space separated words, 23 | one word pair per line) 24 | source_vecs: vectors in source language.
Space-separated, with string 25 | identifier as first column (dim+1 columns, where dim is the dimensionality 26 | of the space) 27 | target_vecs: vectors in target language 28 | 29 | 30 | Example: 31 | python train_tm.py train_data.txt ENspace.pkl ITspace.pkl 32 | 33 | """ 34 | sys.exit(errno) 35 | 36 | 37 | def main(sys_argv): 38 | 39 | try: 40 | opts, argv = getopt.getopt(sys_argv[1:], "ho:", 41 | ["help", "output="]) 42 | except getopt.GetoptError, err: 43 | print str(err) 44 | usage() 45 | sys.exit(1) 46 | 47 | out_file = "./tm" 48 | for opt, val in opts: 49 | if opt in ("-o", "--output"): 50 | out_file = val 51 | elif opt in ("-h", "--help"): 52 | usage(0) 53 | else: 54 | usage(1) 55 | 56 | if len(argv) == 3: 57 | source_file = argv[1] 58 | target_file = argv[2] 59 | dict_file = argv[0] 60 | else: 61 | print "Wrong number of arguments" 62 | usage(1) 63 | 64 | 65 | print "Reading the training data" 66 | train_data = read_dict(dict_file) 67 | 68 | #we only need to load the vectors for the words in the training data 69 | #semantic spaces contain additional words 70 | source_words, target_words = zip(*train_data) 71 | 72 | print "Reading: %s" % source_file 73 | source_sp = Space.build(source_file, set(source_words)) 74 | source_sp.normalize() 75 | 76 | print "Reading: %s" % target_file 77 | target_sp = Space.build(target_file, set(target_words)) 78 | target_sp.normalize() 79 | 80 | print "Learning the translation matrix" 81 | tm = train_tm(source_sp, target_sp, train_data) 82 | 83 | print "Printing the translation matrix" 84 | np.savetxt("%s.txt" % out_file, tm) 85 | 86 | 87 | if __name__ == '__main__': 88 | main(sys.argv) 89 | 90 | -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/translate_tm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import getopt 3 | import numpy as np 4 | import collections 5 | import random 6 | from space import Space 7 | from utils import read_dict, apply_tm, score, get_valid_data 8 | 9 | def usage(errno=0): 10 | print >>sys.stderr,\ 11 | """ 12 | Given a translation matrix, test data (words and their translations) and 13 | source and target language vectors, it returns translations of source test 14 | words (this variant only writes the mapped vectors; no accuracy is computed). 15 | 16 | Usage: 17 | python translate_tm.py [options] trans_matrix test_data source_vecs target_vecs 18 | \n\ 19 | Options: 20 | -o --output : file prefix. It prints the vectors obtained after 21 | the translation matrix is applied (.vecs.txt and .wds.txt). 22 | Optional. Default is ./translated_vecs 23 | -c --correction : Number of additional elements (ADDITIONAL TO TEST DATA) 24 | to be used with the Global Correction (GC) strategy. 25 | Optional. By default, baseline retrieval is run. 26 | 27 | -h --help : help 28 | 29 | Arguments: 30 | trans_matrix: translation matrix 31 | test_data: list of source-target word pairs (space separated words, 32 | one word pair per line) 33 | source_vecs: vectors in source language, space-separated, with string 34 | identifier as first column (dim+1 columns, where dim is the 35 | dimensionality of the space) 36 | target_vecs: vectors in target language 37 | 38 | 39 | Example: 40 | 1) Retrieve translations with standard nearest neighbour retrieval 41 | 42 | python translate_tm.py tm.txt test_data.txt ENspace.txt ITspace.txt 43 | 44 | 2) "Corrected" retrieval (GC).
Use additional 2000 source space elements to 45 | correct for hubs (words that appear as the nearest neighbours of many points) 46 | 47 | python translate_tm.py -c 2000 tm.txt test_data.txt ENspace.txt ITspace.txt 48 | 49 | """ 50 | sys.exit(errno) 51 | 52 | 53 | def main(sys_argv): 54 | 55 | try: 56 | opts, argv = getopt.getopt(sys_argv[1:], "ho:c:", 57 | ["help", "output=", "correction="]) 58 | except getopt.GetoptError, err: 59 | print str(err) 60 | usage() 61 | sys.exit(1) 62 | 63 | out_file = "./translated_vecs" 64 | additional = None 65 | for opt, val in opts: 66 | if opt in ("-o", "--output"): 67 | out_file = val 68 | elif opt in ("-c", "--correction"): 69 | try: 70 | additional = int(val) 71 | except ValueError: 72 | usage(1) 73 | elif opt in ("-h", "--help"): 74 | usage(0) 75 | else: 76 | usage(1) 77 | 78 | if len(argv) == 4: 79 | tm_file = argv[0] 80 | test_file = argv[1] 81 | source_file = argv[2] 82 | target_file = argv[3] 83 | 84 | else: 85 | print "Wrong number of arguments" 86 | usage(1) 87 | 88 | print "Loading the translation matrix" 89 | tm = np.loadtxt(tm_file) 90 | 91 | print "Reading the test data" 92 | test_data = read_dict(test_file) 93 | 94 | #in the _source_ space, we only need to load vectors for the words in test. 95 | #semantic spaces may contain additional words, ALL words in the _target_ 96 | #space are used as the search space 97 | source_words, _ = zip(*test_data) 98 | source_words = set(source_words) 99 | 100 | print "Reading: %s" % source_file 101 | if not additional: 102 | source_sp = Space.build(source_file, source_words) 103 | else: 104 | #read all the words in the space 105 | lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, 106 | comments=None, usecols=(0,)).flatten()) 107 | #the max number of additional+test elements is bounded by the size 108 | #of the lexicon 109 | additional = min(additional, len(lexicon) - len(source_words)) 110 | #we sample additional elements that are not already in source_words 111 | random.seed(100) 112 | lexicon = random.sample(list(lexicon.difference(source_words)), additional) 113 | 114 | #load the source space 115 | source_sp = Space.build(source_file, source_words.union(set(lexicon))) 116 | 117 | source_sp.normalize() 118 | 119 | print "Reading: %s" % target_file 120 | target_sp = Space.build(target_file) 121 | target_sp.normalize() 122 | 123 | print "Translating" #translates all the elements loaded in the source space 124 | mapped_source_sp = apply_tm(source_sp, tm) 125 | 126 | 127 | print "Printing mapped vectors: %s" % out_file 128 | np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat) 129 | np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s") 130 | 131 | if __name__ == '__main__': 132 | main(sys.argv) 133 | 134 | -------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import collections 3 | from space import Space 4 | 5 | 6 | def prec_at(ranks, cut): 7 | return len([r for r in ranks if r <= cut])/float(len(ranks)) 8 | 9 | def get_rank(nn, gold): 10 | for idx,word in enumerate(nn): 11 | if word in gold: 12 | return idx + 1 13 | return idx + 1 # no gold translation found: fall back to the worst possible rank (list length) 14 | 15 | 16 | def read_dict(dict_file): 17 | return [tuple(line.strip().split()) for line in file(dict_file)] 18 | 19 | 20 | def apply_tm(sp, tm): 21 | 22 | print "Applying the translation matrix, size of data: %d" % sp.mat.shape[0] 23 | return Space(sp.mat*tm, sp.id2row) 24 | 25 | def get_valid_data(sp1, sp2, data): 26 | return [(el1, el2) for el1,el2 in data if 27 | el1 in sp1.row2id and el2 in sp2.row2id] 28 | 29 | def train_tm(sp1, sp2, data): 30 | 31 | data = get_valid_data(sp1, sp2, data) 32 | print "Training using: %d word pairs" % len(data) 33 | 34 | els1, els2 = zip(*data) 35 | m1 = sp1.mat[[sp1.row2id[el] for el in els1],:] 36 | m2 = sp2.mat[[sp2.row2id[el] for el in els2],:] 37 | 38 | tm = np.linalg.lstsq(m1, m2, -1)[0] 39 | 40 | return tm 41 | 42 | 43 | def score(sp1, sp2, gold, additional): 44 | 45 | sp1.normalize() 46 | 47 | print "Computing cosines and sorting target space elements" 48 | sim_mat = -sp2.mat*sp1.mat.T 49 | 50 | if additional: 51 | #for each element, computes its rank in the ranked list of 52 | #similarities. sorting done on the opposite axis (inverse querying) 53 | srtd_idx = np.argsort(np.argsort(sim_mat, axis=1), axis=1) 54 | 55 | #for each element, the resulting rank is combined with cosine scores. 56 | #the effect will be of breaking the ties, because cosines are smaller 57 | #than 1. sorting done on the standard axis (regular NN querying) 58 | srtd_idx = np.argsort(srtd_idx + sim_mat, axis=0) 59 | else: 60 | srtd_idx = np.argsort(sim_mat, axis=0) 61 | 62 | ranks = [] 63 | for i,el1 in enumerate(gold.keys()): 64 | 65 | sp1_idx = sp1.row2id[el1] 66 | 67 | #print the top 5 translations 68 | translations = [] 69 | for j in range(5): 70 | sp2_idx = srtd_idx[j, sp1_idx] 71 | word, score = sp2.id2row[sp2_idx], -sim_mat[sp2_idx, sp1_idx] 72 | translations.append("\t\t%s:%.3f" % (word, score)) 73 | 74 | translations = "\n".join(translations) 75 | 76 | #get the rank of the (highest-ranked) translation 77 | rnk = get_rank(srtd_idx[:,sp1_idx].A.ravel(), 78 | [sp2.row2id[el] for el in gold[el1]]) 79 | ranks.append(rnk) 80 | 81 | print ("\nId: %d Source: %s \n\tTranslation:\n%s \n\tGold: %s \n\tRank: %d" % 82 | (len(ranks), el1, translations, gold[el1], rnk)) 83 | 84 | print "Corrected: %s" % str(additional) 85 | if additional: 86 | print "Total elements in the source space, Test (%d) + additional: %d" % (len(gold.keys()), 87 | sp1.mat.shape[0]) 88 | for k in [1,5,10]: 89 | print "Prec@%d: %.3f" % (k, prec_at(ranks, k)) 90 | 91 |
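The Global Correction (GC) branch of score() above works as follows: the inner double argsort ranks, for every target element, how close it is to each of the loaded source queries (axis=1), and the outer argsort then re-sorts targets per query by that rank plus the negated cosine (axis=0). This demotes hubs, i.e. target vectors that would otherwise surface as the nearest neighbour of many queries. Below is a minimal, self-contained sketch of that trick on a made-up toy similarity matrix (the values are chosen only for illustration and are not from this repository):

import numpy as np

# negated cosine similarities, laid out as in score(): rows = targets, columns = queries
# (negated so that np.argsort, which sorts ascending, puts the best match first)
neg_sim = -np.array([[0.9, 0.1, 0.2],
                     [0.1, 0.8, 0.1],
                     [0.7, 0.7, 0.7]])  # last row is a hub: fairly close to every query

baseline = np.argsort(neg_sim, axis=0)                      # plain NN retrieval: the hub wins query 2
inv_rank = np.argsort(np.argsort(neg_sim, axis=1), axis=1)  # per target, the rank of each query
corrected = np.argsort(inv_rank + neg_sim, axis=0)          # GC: rank + cosine (cosine < 1 breaks ties)
print baseline[0], corrected[0]                             # top-1 target per query: [0 1 2] vs. [0 1 0]

Because the hub already ranks the other queries ahead of query 2, its combined score worsens there, and a non-hub target is retrieved instead.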
-------------------------------------------------------------------------------- /code_mapping_across_languages/mappingcode/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/mappingcode/utils.pyc -------------------------------------------------------------------------------- /code_mapping_across_languages/perform_mapping.sh: -------------------------------------------------------------------------------- 1 | MAIN="/mount/arbeitsdaten29/corpora/waterloo/img/en_vec/zeroShot/" 2 | CODE="mappingcode/" 3 | 4 | 5 | EN="hypercos.txt" #<- English Vectors (plain text w2v format) 6 | DE="de_cow_vecs.txt" #<- Source Language Vectors DE/IT 7 | A="zero_full.align" # Alignment file format word-source TAB word-target (EN) 8 | AV="fullvoc_de.txt" # <- Vocabulary file of the source language (used to predict every word in Source -> Target) 9 | OUT="out/" #<- Output folder 10 | 11 | python ${CODE}train_tm.py -o TM1 ${A} ${DE} ${EN}; # Learn the mapping Matrix 12 | python ${CODE}test_tm_pred.py TM1.txt ${AV} ${DE} ${EN}; # Apply the Mapping Matrix 13 | paste -d" " translated_vecs.wds.txt translated_vecs.vecs.txt >> ${OUT}output-vecs-tmp.txt # this is just formatting 14 | rm -f translated_vecs*; #
remove temporary files 15 | less ${DE} | head -1 > HEAD.txt; 16 | cat HEAD.txt ${OUT}output-vecs-tmp.txt > ${OUT}output-vecs.txt; 17 | rm -f HEAD.txt; 18 | rm -f ${OUT}output-vecs-tmp.txt 19 | rm -f TM1.txt; # train_tm.py appended the .txt suffix 20 | #gzip ${OUT}output-vecs.txt # <- final new file! 21 | 22 | 23 | # Now we can convert the vectors into binary vectors using the script convert_w2vTXT_to_w2vBIN.py 24 | python convert_w2vTXT_to_w2vBIN.py ${OUT}output-vecs.txt # (will create) output-vecs.txt.bin 25 | 26 | # Now we can evaluate the binary embeddings 27 | # Using hyperscore: python AP_evaluation_code/test_norm.py 28 | # Using default cosine: python AP_evaluation_code/test_default.py -------------------------------------------------------------------------------- /code_mapping_across_languages/vocabulary file/german_voc_wikipedia.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/vocabulary file/german_voc_wikipedia.txt.gz -------------------------------------------------------------------------------- /code_mapping_across_languages/vocabulary file/italian_voc_wikipedia.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/code_mapping_across_languages/vocabulary file/italian_voc_wikipedia.txt.gz -------------------------------------------------------------------------------- /config.cfg: -------------------------------------------------------------------------------- 1 | ProjectDir = /mount/arbeitsdaten34/projekte/slu/KimAnh/HypeEmb 2 | 3 | TrainDir = /mount/arbeitsdaten34/projekte/slu/KimAnh/AntSynDistinction/corpus 4 | VocabFileName = wiki_en.vocab 5 | 6 | WordVectorFileName = hypervec.bin 7 | 8 | hypeNoun = /projekte/semrel/Users/kim-anh/hypernyms/hypernym_n.txt 9 | hypeVerb = /projekte/semrel/Users/kim-anh/hypernyms/hypernym_v.txt 10 | cohypoNoun = /projekte/semrel/Users/kim-anh/hypernyms/cohyponym_n.txt 11 | cohypoVerb = /projekte/semrel/Users/kim-anh/hypernyms/cohyponym_v.txt 12 | 13 | featureNoun = /mount/arbeitsdaten34/projekte/slu/KimAnh/AntSynDistinction/lexical-contrast/wiki_en_features.noun 14 | featureVerb = /mount/arbeitsdaten34/projekte/slu/KimAnh/AntSynDistinction/lexical-contrast/wiki_en_features.verb 15 | 16 | HierarchialSoftmax = false 17 | NegativeSampling = 15 18 | SubSampling = 1e-5 19 | MinFrequency = 50 20 | -------------------------------------------------------------------------------- /create_features.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import spacy 3 | from spacy.lang.en import English 4 | import gzip 5 | from collections import Counter, defaultdict 6 | import six.moves.cPickle as pickle 7 | from itertools import count 8 | 9 | def main(): 10 | """ 11 | Extracts the feature files from the corpus 12 | Usage: python create_features.py -input corpus -output output-file-name -pos POS 13 | -pos: POS-tag prefix of the target words, e.g. NN (nouns) or VB (verbs) 14 | """ 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-input', type=str) 17 | parser.add_argument('-output', type=str) 18 | parser.add_argument('-pos', type=str) 19 | args = parser.parse_args() 20 | 21 | nlp = English() 22 | nlp.add_pipe(nlp.create_pipe('sentencizer')) 23 | 24 | window_size = 5 25 | dfeatures = defaultdict(set) 26 | 27 | output_dir = '/mount/arbeitsdaten34/projekte/slu/KimAnh/Corpora/' 28 | 29 | vocab_to_id =
defaultdict(count(0).next) 30 | 31 | with gzip.open(args.input,'rb') as fin: 32 | para_num = 0 33 | # Read each paragraph in corpus 34 | for paragraph in fin: 35 | # Check empty paragraph 36 | paragraph = paragraph.strip() 37 | if len(paragraph) == 0: continue 38 | para_num += 1 39 | print 'Processing para: %d' %para_num 40 | # Parse each sentence 41 | parsed_para = nlp(unicode(paragraph)) 42 | for sent in parsed_para.sents: 43 | features = process_one_sentence(sent, args.pos, window_size, vocab_to_id) 44 | for kk,vv in features.iteritems(): dfeatures[kk].update(vv) # merge context sets; dict.update() would overwrite earlier sentences 45 | 46 | id_to_vocab = {idx:word for word,idx in vocab_to_id.iteritems()} 47 | save_file(dfeatures, id_to_vocab, args.output) 48 | 49 | print 'Parsing corpus done....!' 50 | 51 | def save_file(dfeatures, id_to_vocab, outfile): 52 | with gzip.open(outfile, 'w') as fout: 53 | for kk,vv in dfeatures.iteritems(): # iterate over (word id, context ids) pairs, not over keys alone 54 | contexts = [id_to_vocab[idx] for idx in list(vv)] 55 | fout.write(str(id_to_vocab[kk])) 56 | for word in contexts: 57 | fout.write('\t' + str(word)) 58 | fout.write('\n') 59 | print 'Saved file!' 60 | 61 | def process_one_sentence(sent, pos, window_size, vocab_to_id): 62 | features = defaultdict(set) 63 | 64 | for idx,token in enumerate(sent): 65 | if token.tag_[:2] == pos and len(token.string.strip()) > 2: 66 | for idw in range(idx-window_size, idx+window_size+1): # symmetric window of +/- window_size 67 | if idw != idx and idw >= 0 and idw < len(sent): 68 | features[vocab_to_id[sent[idx].text]].add(vocab_to_id[sent[idw].text]) # key by the surface string, not the per-occurrence Token object 69 | 70 | return features 71 | 72 | if __name__=='__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /datasets_across_languages/eval_DE/noun_hyp_vs_ant.txt: -------------------------------------------------------------------------------- 1 | Pflanze Lebewesen 1 2 | Anrufbeantworter Gerät 1 3 | Rakete Flugobjekt 1 4 | Jagd Freizeitbeschäftigung 1 5 | Persönlichkeit Eigenschaft 1 6 | Restaurierung Pflege 1 7 | Vizekönig Adeliger 1 8 | Bein Körperteil 1 9 | Bauabschnitt Bau 1 10 | Hoheit Herrscher 1 11 | Mittelklasse Klasse 1 12 | Schwergewicht Gewichtsklasse 1 13 | Bauabschnitt Etappe 1 14 | Recherche Tätigkeit 1 15 | MEZ Zeitzone 1 16 | Ekel Gefühl 1 17 | Trio Gruppe 1 18 | Kirchengemeinde Gruppe 1 19 | Fisch Tier 1 20 | Modell Ding 1 21 | Bock Säugetier 1 22 | Agentur Firma 1 23 | Gesamtbelastung Belastung 1 24 | Mittelklasse Gesellschaftsschicht 1 25 | Bremer Norddeutscher 1 26 | Kern Pflanzenbestandteil 1 27 | Looping Figur 1 28 | Bahnsteig Verkehr 1 29 | Komplex Gebäude 1 30 | Stuhl Möbel 1 31 | Bremer Person 1 32 | Sirene Alarmgerät 1 33 | Nutzer Kunde 1 34 | Hoheit Titel 1 35 | Torf Material 1 36 | Bedienung Restaurant 1 37 | Lieblichkeit Attraktivität 1 38 | Bibel Buch 1 39 | Bein Gliedmaße 1 40 | Los Glücksspiel 1 41 | Wache Schutz 1 42 | Persönlichkeit Person 1 43 | Bahnsteig Haltestelle 1 44 | Breite Dimension 1 45 | Bremer Deutscher 1 46 | Nobelpreis Preis 1 47 | Professor Gelehrter 1 48 | Gegenargument Diskussion 1 49 | Zeit Dimension 1 50 | Nutzer Person 1 51 | Großmeister Meister 1 52 | Wache Tätigkeit 1 53 | Killer Krimineller 1 54 | Schleife Gebinde 1 55 | Alb Gebirgsmassiv 1 56 | Bedienung Dienstleistung 1 57 | Vizekönig Herrscher 1 58 | Arzt Beruf 1 59 | Bock Tier 1 60 | Zivilgesellschaft Gesellschaft 1 61 | Ablehnung Reaktion 1 62 | Lieblichkeit Eigenschaft 1 63 | Gerät Ding 1 64 | Professor Lehrperson 1 65 | Bundesland Region 1 66 | Eiweiß Nährstoff 1 67 | Natter Tier 1 68 | Restaurierung Modernisierung 1 69 | Breite Maß 1 70 | Katze Tier 1 71 | Bahnsteig Bahnhof 1 72 | Glaubenssatz Satz 1 73 | Katze Haustier
1 74 | Signatur Kennzeichen 1 75 | Eiweiß Ei 1 76 | Hexe Märchengestalt 1 77 | Agentur Unternehmen 1 78 | Propagandist Politiker 1 79 | Signatur Schrift 1 80 | Entscheidungskompetenz Kompetenz 1 81 | Verhandlung Gespräch 1 82 | Temperatur Maßeinheit 1 83 | Wandzeitung Zeitung 1 84 | Gatter Begrenzung 1 85 | Auflösung Ende 1 86 | Nutzer Mensch 1 87 | Bauer Beruf 1 88 | Ausnahmefall Fall 1 89 | Rübe Gemüse 1 90 | Funk Kommunikation 1 91 | Pension Einkommen 1 92 | Pflanze Natur 1 93 | Atomwaffe Waffe 1 94 | Ablehnung Verhalten 1 95 | Buchhalterin Beruf 1 96 | Individualismus Wertesystem 1 97 | MEZ Zeit 1 98 | Signatur Überschrift 0 99 | Kriminalität Sicherheit 0 100 | Bein Arm 0 101 | Ablehnung Annahme 0 102 | Komplex Einfachheit 0 103 | MEZ UTC 0 104 | Schulbuch Roman 0 105 | Ausnahmefall Regelfall 0 106 | Fluch Segen 0 107 | Killer Opfer 0 108 | Verhandlung Bestimmung 0 109 | Breite Enge 0 110 | Ausnahmefall Normalfall 0 111 | Individualismus Kollektivismus 0 112 | Hexe Fee 0 113 | Ablehnung Zustimmung 0 114 | Kriminalität Friedlichkeit 0 115 | Komplex einfach 0 116 | Lieblichkeit Bitterkeit 0 117 | Bauer Stadtbewohner 0 118 | Devise Landeswährung 0 119 | Bremer Hamburger 0 120 | Breite Länge 0 121 | Regression Fortschritt 0 122 | Torf Sand 0 123 | Kaufhaus Tante-Emma-Laden 0 124 | Gegenargument Fürargument 0 125 | Pfeffer Zucker 0 126 | Natter Viper 0 127 | Hoheit Untertan 0 128 | Hoheit Bürger 0 129 | Deutlichkeit Unklarheit 0 130 | Gesamtbelastung Einzelbelastung 0 131 | Großmeister Lehrling 0 132 | Kadett Offizier 0 133 | Restaurierung Verfall 0 134 | Gerät Mensch 0 135 | Bahnsteig Bahngleis 0 136 | Bedienung Selbstbedienung 0 137 | Fisch Fleisch 0 138 | Eiweiß Kohlenhydrat 0 139 | Gesamtbelastung Teilbelastung 0 140 | Ekel Zuneigung 0 141 | Materialsammlung Einzelstück 0 142 | VHS Universität 0 143 | Auslöser Bremser 0 144 | Funk Kabel 0 145 | Explosion Implosion 0 146 | Nutzer Entwickler 0 147 | Rakete U-Boot 0 148 | Buchhalterin Buchhalter 0 149 | Bibel Koran 0 150 | Stuhl Tisch 0 151 | Alb Flachland 0 152 | Alb Tal 0 153 | Auflösung Gründung 0 154 | Pfeffer Salz 0 155 | Zeit Raum 0 156 | Katalysator Motor 0 157 | Kern Hülle 0 158 | Ekel Anziehung 0 159 | Bock Geiß 0 160 | Regression Progression 0 161 | Betrieb Ruhe 0 162 | Restaurierung Zerstörung 0 163 | Wache Einbrecher 0 164 | Pflanze Tier 0 165 | Natter Vogel 0 166 | Arzt Patient 0 167 | Klage Zufriedenheit 0 168 | Mittelklasse Oberklasse 0 169 | Bremer Münchner 0 170 | Los Niete 0 171 | Verjüngung Alterung 0 172 | Nutzer Hersteller 0 173 | Zeit Unendlichkeit 0 174 | Ausnahmefall Regel 0 175 | Sirene Stille 0 176 | Lieblichkeit Hässlichkeit 0 177 | Hexe Zauberer 0 178 | Entscheidungskompetenz Entscheidungsunfähigkeit 0 179 | VHS DVD 0 180 | Modell Realität 0 181 | Mittelklasse Oberschicht 0 182 | Trio Solo 0 183 | Bedienung Gast 0 184 | Schwergewicht Leichtgewicht 0 185 | Deutlichkeit Undeutlichkeit 0 186 | Abschneiden Ankleben 0 187 | Verhandlung Entscheidung 0 188 | Gatter Freiheit 0 189 | Katze Hund 0 190 | Bahnsteig Bushaltestelle 0 191 | Bauer Industrieller 0 192 | Betrieb Stillstand 0 193 | -------------------------------------------------------------------------------- /datasets_across_languages/eval_DE/noun_hyp_vs_syn.txt: -------------------------------------------------------------------------------- 1 | Pflanze Lebewesen 1 2 | Anrufbeantworter Gerät 1 3 | Rakete Flugobjekt 1 4 | Jagd Freizeitbeschäftigung 1 5 | Persönlichkeit Eigenschaft 1 6 | Restaurierung Pflege 1 7 | Vizekönig Adeliger 1 8 | Bein Körperteil 1 9 | 
Bauabschnitt Bau 1 10 | Hoheit Herrscher 1 11 | Mittelklasse Klasse 1 12 | Schwergewicht Gewichtsklasse 1 13 | Bauabschnitt Etappe 1 14 | Recherche Tätigkeit 1 15 | MEZ Zeitzone 1 16 | Ekel Gefühl 1 17 | Trio Gruppe 1 18 | Kirchengemeinde Gruppe 1 19 | Fisch Tier 1 20 | Modell Ding 1 21 | Bock Säugetier 1 22 | Agentur Firma 1 23 | Gesamtbelastung Belastung 1 24 | Mittelklasse Gesellschaftsschicht 1 25 | Bremer Norddeutscher 1 26 | Kern Pflanzenbestandteil 1 27 | Looping Figur 1 28 | Bahnsteig Verkehr 1 29 | Komplex Gebäude 1 30 | Stuhl Möbel 1 31 | Bremer Person 1 32 | Sirene Alarmgerät 1 33 | Nutzer Kunde 1 34 | Hoheit Titel 1 35 | Torf Material 1 36 | Bedienung Restaurant 1 37 | Lieblichkeit Attraktivität 1 38 | Bibel Buch 1 39 | Bein Gliedmaße 1 40 | Los Glücksspiel 1 41 | Wache Schutz 1 42 | Persönlichkeit Person 1 43 | Bahnsteig Haltestelle 1 44 | Breite Dimension 1 45 | Bremer Deutscher 1 46 | Nobelpreis Preis 1 47 | Professor Gelehrter 1 48 | Gegenargument Diskussion 1 49 | Zeit Dimension 1 50 | Nutzer Person 1 51 | Großmeister Meister 1 52 | Wache Tätigkeit 1 53 | Killer Krimineller 1 54 | Schleife Gebinde 1 55 | Alb Gebirgsmassiv 1 56 | Bedienung Dienstleistung 1 57 | Vizekönig Herrscher 1 58 | Arzt Beruf 1 59 | Bock Tier 1 60 | Zivilgesellschaft Gesellschaft 1 61 | Ablehnung Reaktion 1 62 | Lieblichkeit Eigenschaft 1 63 | Gerät Ding 1 64 | Professor Lehrperson 1 65 | Bundesland Region 1 66 | Eiweiß Nährstoff 1 67 | Natter Tier 1 68 | Restaurierung Modernisierung 1 69 | Breite Maß 1 70 | Katze Tier 1 71 | Bahnsteig Bahnhof 1 72 | Glaubenssatz Satz 1 73 | Katze Haustier 1 74 | Signatur Kennzeichen 1 75 | Eiweiß Ei 1 76 | Hexe Märchengestalt 1 77 | Agentur Unternehmen 1 78 | Propagandist Politiker 1 79 | Signatur Schrift 1 80 | Entscheidungskompetenz Kompetenz 1 81 | Verhandlung Gespräch 1 82 | Temperatur Maßeinheit 1 83 | Wandzeitung Zeitung 1 84 | Gatter Begrenzung 1 85 | Auflösung Ende 1 86 | Nutzer Mensch 1 87 | Bauer Beruf 1 88 | Ausnahmefall Fall 1 89 | Rübe Gemüse 1 90 | Funk Kommunikation 1 91 | Pension Einkommen 1 92 | Pflanze Natur 1 93 | Atomwaffe Waffe 1 94 | Ablehnung Verhalten 1 95 | Buchhalterin Beruf 1 96 | Individualismus Wertesystem 1 97 | MEZ Zeit 1 98 | Wandzeitung Aushang 0 99 | Propagandist Verkaufsförderer 0 100 | Schulbuch Lehrbuch 0 101 | Hoheit König 0 102 | Sirene Martinshorn 0 103 | Ausnahmefall Sonderfall 0 104 | Lieblichkeit Anmut 0 105 | VHS Videokassette 0 106 | Kadett Anfänger 0 107 | Gatter Umzäunung 0 108 | Devise Wahlspruch 0 109 | Fisch Meeresbewohner 0 110 | Pfeffer Würze 0 111 | Zivilgesellschaft Volk 0 112 | Anrufbeantworter AB 0 113 | Katze Stubentiger 0 114 | Professor Dozent 0 115 | Lieblichkeit Liebreiz 0 116 | Kirchengemeinde Glaubensanhänger 0 117 | Wache Aufsicht 0 118 | Gegenargument Widerspruch 0 119 | Temperatur Wärmegrad 0 120 | Bauer Landwirt 0 121 | Restaurierung Instandsetzung 0 122 | Großmeister Sachkundiger 0 123 | Bein Gliedmaß 0 124 | Numerus Nummer 0 125 | Pension Gästehaus 0 126 | Agentur Geschäftsstelle 0 127 | Funk Radio 0 128 | Abschneiden Abtrennen 0 129 | Kondom Pariser 0 130 | Trio Dreiergruppe 0 131 | Rübe Karotte 0 132 | Rakete Geschoss 0 133 | Zeile Linie 0 134 | Kaufhaus Einkaufszentrum 0 135 | Komplex Störung 0 136 | Trio drei 0 137 | Glaubenssatz Lehre 0 138 | Looping Salto 0 139 | Explosion Ausbruch 0 140 | Kondom Präservativ 0 141 | Anrufbeantworter Mailbox 0 142 | Rübe Wurzel 0 143 | VHS Volkshochschule 0 144 | Ablehnung Abweisung 0 145 | Looping Überschlag 0 146 | Kriminalität Delinquenz 0 147 | Signatur 
Unterschrift 0 148 | Verhandlung Besprechung 0 149 | Klage Anschuldigung 0 150 | Pflanze Gewächs 0 151 | Atomwaffe Kernwaffe 0 152 | Bauer Farmer 0 153 | Zeit Dauer 0 154 | Fisch Wassertier 0 155 | Umgebung Umland 0 156 | Bauabschnitt Bausektion 0 157 | Verhandlung Prozess 0 158 | Individualismus Eigenheit 0 159 | Rübe Möhre 0 160 | Anrufbeantworter Telefonbeantworter 0 161 | Zeit Epoche 0 162 | Invasion Einfall 0 163 | Jagd Hetze 0 164 | Mittelklasse Mittelschicht 0 165 | Breite Weite 0 166 | Schleife Schlaufe 0 167 | Schwergewicht Schwerpunkt 0 168 | Arzt Doktor 0 169 | Katze Samtpfote 0 170 | Bahnsteig Perron 0 171 | Bundesland Gliedstaat 0 172 | Zivilgesellschaft Bürgergesellschaft 0 173 | Bock Hammel 0 174 | Gesamtbelastung Totalbelastung 0 175 | Gerät Apparat 0 176 | Hoheit Majestät 0 177 | Zeile Reihe 0 178 | Mittelklasse Mittelstand 0 179 | Katalysator Beschleuniger 0 180 | Looping Schleife 0 181 | Explosion Sprengung 0 182 | Kaufhaus Warenhaus 0 183 | Professor Hochschullehrer 0 184 | Blatt Zettel 0 185 | Schulbuch Schullektüre 0 186 | Temperatur Wärme 0 187 | Verjüngung Verengung 0 188 | -------------------------------------------------------------------------------- /datasets_across_languages/eval_DE/noun_hyp_vs_synant.txt: -------------------------------------------------------------------------------- 1 | Pflanze Lebewesen 1 2 | Anrufbeantworter Gerät 1 3 | Rakete Flugobjekt 1 4 | Jagd Freizeitbeschäftigung 1 5 | Persönlichkeit Eigenschaft 1 6 | Restaurierung Pflege 1 7 | Vizekönig Adeliger 1 8 | Bein Körperteil 1 9 | Bauabschnitt Bau 1 10 | Hoheit Herrscher 1 11 | Mittelklasse Klasse 1 12 | Schwergewicht Gewichtsklasse 1 13 | Bauabschnitt Etappe 1 14 | Recherche Tätigkeit 1 15 | MEZ Zeitzone 1 16 | Ekel Gefühl 1 17 | Trio Gruppe 1 18 | Kirchengemeinde Gruppe 1 19 | Fisch Tier 1 20 | Modell Ding 1 21 | Bock Säugetier 1 22 | Agentur Firma 1 23 | Gesamtbelastung Belastung 1 24 | Mittelklasse Gesellschaftsschicht 1 25 | Bremer Norddeutscher 1 26 | Kern Pflanzenbestandteil 1 27 | Looping Figur 1 28 | Bahnsteig Verkehr 1 29 | Komplex Gebäude 1 30 | Stuhl Möbel 1 31 | Bremer Person 1 32 | Sirene Alarmgerät 1 33 | Nutzer Kunde 1 34 | Hoheit Titel 1 35 | Torf Material 1 36 | Bedienung Restaurant 1 37 | Lieblichkeit Attraktivität 1 38 | Bibel Buch 1 39 | Bein Gliedmaße 1 40 | Los Glücksspiel 1 41 | Wache Schutz 1 42 | Persönlichkeit Person 1 43 | Bahnsteig Haltestelle 1 44 | Breite Dimension 1 45 | Bremer Deutscher 1 46 | Nobelpreis Preis 1 47 | Professor Gelehrter 1 48 | Gegenargument Diskussion 1 49 | Zeit Dimension 1 50 | Nutzer Person 1 51 | Großmeister Meister 1 52 | Wache Tätigkeit 1 53 | Killer Krimineller 1 54 | Schleife Gebinde 1 55 | Alb Gebirgsmassiv 1 56 | Bedienung Dienstleistung 1 57 | Vizekönig Herrscher 1 58 | Arzt Beruf 1 59 | Bock Tier 1 60 | Zivilgesellschaft Gesellschaft 1 61 | Ablehnung Reaktion 1 62 | Lieblichkeit Eigenschaft 1 63 | Gerät Ding 1 64 | Professor Lehrperson 1 65 | Bundesland Region 1 66 | Eiweiß Nährstoff 1 67 | Natter Tier 1 68 | Restaurierung Modernisierung 1 69 | Breite Maß 1 70 | Katze Tier 1 71 | Bahnsteig Bahnhof 1 72 | Glaubenssatz Satz 1 73 | Katze Haustier 1 74 | Signatur Kennzeichen 1 75 | Eiweiß Ei 1 76 | Hexe Märchengestalt 1 77 | Agentur Unternehmen 1 78 | Propagandist Politiker 1 79 | Signatur Schrift 1 80 | Entscheidungskompetenz Kompetenz 1 81 | Verhandlung Gespräch 1 82 | Temperatur Maßeinheit 1 83 | Wandzeitung Zeitung 1 84 | Gatter Begrenzung 1 85 | Auflösung Ende 1 86 | Nutzer Mensch 1 87 | Bauer Beruf 1 88 | Ausnahmefall Fall 1 
89 | Rübe Gemüse 1 90 | Funk Kommunikation 1 91 | Pension Einkommen 1 92 | Pflanze Natur 1 93 | Atomwaffe Waffe 1 94 | Ablehnung Verhalten 1 95 | Buchhalterin Beruf 1 96 | Individualismus Wertesystem 1 97 | MEZ Zeit 1 98 | Signatur Überschrift 0 99 | Kriminalität Sicherheit 0 100 | Bein Arm 0 101 | Ablehnung Annahme 0 102 | Komplex Einfachheit 0 103 | MEZ UTC 0 104 | Schulbuch Roman 0 105 | Ausnahmefall Regelfall 0 106 | Fluch Segen 0 107 | Killer Opfer 0 108 | Verhandlung Bestimmung 0 109 | Breite Enge 0 110 | Ausnahmefall Normalfall 0 111 | Individualismus Kollektivismus 0 112 | Hexe Fee 0 113 | Ablehnung Zustimmung 0 114 | Kriminalität Friedlichkeit 0 115 | Komplex einfach 0 116 | Lieblichkeit Bitterkeit 0 117 | Bauer Stadtbewohner 0 118 | Devise Landeswährung 0 119 | Bremer Hamburger 0 120 | Breite Länge 0 121 | Regression Fortschritt 0 122 | Torf Sand 0 123 | Kaufhaus Tante-Emma-Laden 0 124 | Gegenargument Fürargument 0 125 | Pfeffer Zucker 0 126 | Natter Viper 0 127 | Hoheit Untertan 0 128 | Hoheit Bürger 0 129 | Deutlichkeit Unklarheit 0 130 | Gesamtbelastung Einzelbelastung 0 131 | Großmeister Lehrling 0 132 | Kadett Offizier 0 133 | Restaurierung Verfall 0 134 | Gerät Mensch 0 135 | Bahnsteig Bahngleis 0 136 | Bedienung Selbstbedienung 0 137 | Fisch Fleisch 0 138 | Eiweiß Kohlenhydrat 0 139 | Gesamtbelastung Teilbelastung 0 140 | Ekel Zuneigung 0 141 | Materialsammlung Einzelstück 0 142 | VHS Universität 0 143 | Auslöser Bremser 0 144 | Funk Kabel 0 145 | Explosion Implosion 0 146 | Nutzer Entwickler 0 147 | Rakete U-Boot 0 148 | Buchhalterin Buchhalter 0 149 | Bibel Koran 0 150 | Stuhl Tisch 0 151 | Alb Flachland 0 152 | Alb Tal 0 153 | Auflösung Gründung 0 154 | Pfeffer Salz 0 155 | Zeit Raum 0 156 | Katalysator Motor 0 157 | Kern Hülle 0 158 | Ekel Anziehung 0 159 | Bock Geiß 0 160 | Regression Progression 0 161 | Betrieb Ruhe 0 162 | Restaurierung Zerstörung 0 163 | Wache Einbrecher 0 164 | Pflanze Tier 0 165 | Natter Vogel 0 166 | Arzt Patient 0 167 | Klage Zufriedenheit 0 168 | Mittelklasse Oberklasse 0 169 | Bremer Münchner 0 170 | Los Niete 0 171 | Verjüngung Alterung 0 172 | Nutzer Hersteller 0 173 | Zeit Unendlichkeit 0 174 | Ausnahmefall Regel 0 175 | Sirene Stille 0 176 | Lieblichkeit Hässlichkeit 0 177 | Hexe Zauberer 0 178 | Entscheidungskompetenz Entscheidungsunfähigkeit 0 179 | VHS DVD 0 180 | Modell Realität 0 181 | Mittelklasse Oberschicht 0 182 | Trio Solo 0 183 | Bedienung Gast 0 184 | Schwergewicht Leichtgewicht 0 185 | Deutlichkeit Undeutlichkeit 0 186 | Abschneiden Ankleben 0 187 | Verhandlung Entscheidung 0 188 | Gatter Freiheit 0 189 | Katze Hund 0 190 | Bahnsteig Bushaltestelle 0 191 | Bauer Industrieller 0 192 | Betrieb Stillstand 0 193 | Wandzeitung Aushang 0 194 | Propagandist Verkaufsförderer 0 195 | Schulbuch Lehrbuch 0 196 | Hoheit König 0 197 | Sirene Martinshorn 0 198 | Ausnahmefall Sonderfall 0 199 | Lieblichkeit Anmut 0 200 | VHS Videokassette 0 201 | Kadett Anfänger 0 202 | Gatter Umzäunung 0 203 | Devise Wahlspruch 0 204 | Fisch Meeresbewohner 0 205 | Pfeffer Würze 0 206 | Zivilgesellschaft Volk 0 207 | Anrufbeantworter AB 0 208 | Katze Stubentiger 0 209 | Professor Dozent 0 210 | Lieblichkeit Liebreiz 0 211 | Kirchengemeinde Glaubensanhänger 0 212 | Wache Aufsicht 0 213 | Gegenargument Widerspruch 0 214 | Temperatur Wärmegrad 0 215 | Bauer Landwirt 0 216 | Restaurierung Instandsetzung 0 217 | Großmeister Sachkundiger 0 218 | Bein Gliedmaß 0 219 | Numerus Nummer 0 220 | Pension Gästehaus 0 221 | Agentur Geschäftsstelle 0 222 | Funk Radio 0 223 
| Abschneiden Abtrennen 0 224 | Kondom Pariser 0 225 | Trio Dreiergruppe 0 226 | Rübe Karotte 0 227 | Rakete Geschoss 0 228 | Zeile Linie 0 229 | Kaufhaus Einkaufszentrum 0 230 | Komplex Störung 0 231 | Trio drei 0 232 | Glaubenssatz Lehre 0 233 | Looping Salto 0 234 | Explosion Ausbruch 0 235 | Kondom Präservativ 0 236 | Anrufbeantworter Mailbox 0 237 | Rübe Wurzel 0 238 | VHS Volkshochschule 0 239 | Ablehnung Abweisung 0 240 | Looping Überschlag 0 241 | Kriminalität Delinquenz 0 242 | Signatur Unterschrift 0 243 | Verhandlung Besprechung 0 244 | Klage Anschuldigung 0 245 | Pflanze Gewächs 0 246 | Atomwaffe Kernwaffe 0 247 | Bauer Farmer 0 248 | Zeit Dauer 0 249 | Fisch Wassertier 0 250 | Umgebung Umland 0 251 | Bauabschnitt Bausektion 0 252 | Verhandlung Prozess 0 253 | Individualismus Eigenheit 0 254 | Rübe Möhre 0 255 | Anrufbeantworter Telefonbeantworter 0 256 | Zeit Epoche 0 257 | Invasion Einfall 0 258 | Jagd Hetze 0 259 | Mittelklasse Mittelschicht 0 260 | Breite Weite 0 261 | Schleife Schlaufe 0 262 | Schwergewicht Schwerpunkt 0 263 | Arzt Doktor 0 264 | Katze Samtpfote 0 265 | Bahnsteig Perron 0 266 | Bundesland Gliedstaat 0 267 | Zivilgesellschaft Bürgergesellschaft 0 268 | Bock Hammel 0 269 | Gesamtbelastung Totalbelastung 0 270 | Gerät Apparat 0 271 | Hoheit Majestät 0 272 | Zeile Reihe 0 273 | Mittelklasse Mittelstand 0 274 | Katalysator Beschleuniger 0 275 | Looping Schleife 0 276 | Explosion Sprengung 0 277 | Kaufhaus Warenhaus 0 278 | Professor Hochschullehrer 0 279 | Blatt Zettel 0 280 | Schulbuch Schullektüre 0 281 | Temperatur Wärme 0 282 | Verjüngung Verengung 0 283 | -------------------------------------------------------------------------------- /datasets_classification/eval-bless.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/datasets_classification/eval-bless.jar -------------------------------------------------------------------------------- /datasets_classification/eval-dir.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/datasets_classification/eval-dir.jar -------------------------------------------------------------------------------- /datasets_classification/readme_how_to.txt: -------------------------------------------------------------------------------- 1 | 2 | ## Evaluate Bless 3 | # 1] Vector file hypercos_wiki.txt.gz = word2vec format (txt) gzipped.
4 | # 2] Percentage used for training (in the paper we use 2%) 5 | # 3] Number of iterations (in the paper we use 1000) 6 | java -jar eval-bless.jar hypercos_wiki.txt.gz 2 1000 7 | -------------------------------------------------------------------------------- /evaluation_scripts/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import fromstring, dtype 3 | 4 | def smart_open(fname, mode='rb'): 5 | if fname.endswith('.gz'): 6 | import gzip 7 | return gzip.open(fname, mode) 8 | elif fname.endswith('.bz2'): 9 | import bz2 10 | return bz2.BZ2File(fname, mode) 11 | else: 12 | return open(fname, mode) 13 | 14 | def load_vecs(binary_file, binary=1): 15 | vecs = [] 16 | vocab = [] 17 | if binary==1: 18 | with smart_open(binary_file, 'rb') as f: 19 | header = to_unicode(f.readline()) 20 | vocab_size, vector_size = map(int, header.split()) 21 | binary_len = dtype(np.float32).itemsize * vector_size 22 | for _ in range(vocab_size): 23 | word = [] 24 | while True: 25 | ch = f.read(1) 26 | if ch == b' ': 27 | break 28 | if ch != b'\n': 29 | word.append(ch) 30 | word = to_unicode(b''.join(word)) 31 | vocab.append(word) 32 | vec = fromstring(f.read(binary_len), dtype=np.float32) 33 | vecs.append(vec) 34 | else: 35 | with smart_open(binary_file, 'rb') as f: 36 | header = to_unicode(f.readline()) 37 | if len(header.split()) == 2: vocab_size, vector_size = map(int, header.split()) 38 | elif len(header.split()) > 2: 39 | parts = header.rstrip().split(" ") 40 | word, vec = parts[0], list(map(np.float32, parts[1:])) 41 | vocab.append(to_unicode(word)) 42 | vecs.append(vec) 43 | for _, line in enumerate(f): 44 | parts = to_unicode(line.rstrip()).split(" ") 45 | word, vec = parts[0], list(map(np.float32, parts[1:])) 46 | vocab.append(to_unicode(word)) 47 | vecs.append(vec) 48 | 49 | #embs_dim = len(vecs[1]) 50 | #UNKNOWN_WORD = np.random.uniform(-0.25,0.25,embs_dim) 51 | #vecs = np.vstack((UNKNOWN_WORD, vecs)) 52 | #vocab = ['#UNKNOWN#'] + list(vocab) 53 | #words = {word:idx for idx,word in enumerate(vocab)} 54 | 55 | return vecs, vocab 56 | 57 | def to_utf8(text, errors='strict', encoding='utf8'): 58 | """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8.""" 59 | if isinstance(text, unicode): 60 | return text.encode('utf8') 61 | # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 62 | else: 63 | return unicode(text, encoding, errors=errors).encode('utf8') 64 | 65 | def to_unicode(text, encoding='utf8', errors='strict'): 66 | """Convert a string (bytestring in `encoding` or unicode), to unicode.""" 67 | if isinstance(text, unicode): 68 | return text 69 | else: 70 | return unicode(text, encoding=encoding, errors=errors) -------------------------------------------------------------------------------- /evaluation_scripts/corrEval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from scipy.stats import spearmanr 4 | from numpy.linalg import norm 5 | import common 6 | 7 | def cosine(u, v): 8 | return np.dot(u,v)/(norm(u)*norm(v)) 9 | 10 | def hyper_score(u,v): 11 | sim = np.dot(u,v)/(norm(u)*norm(v)) 12 | direct = norm(v)/norm(u) 13 | return sim*direct 14 | 15 | def load_data(embeddings_file, dataset_file, mode='cosine'): 16 | golds, scores = [], [] 17 | unseen = 0 18 | with open(dataset_file, 'r') as fin: 19 | data = [line.strip().split(' ') for line in fin] 20 | vecs, words = common.load_vecs(embeddings_file, 
binary=1) 21 | embs = {word:vec for word,vec in zip(words,vecs)} 22 | for rec in data: 23 | if rec[0] in embs and rec[1] in embs: 24 | golds.append(float(rec[5])) 25 | if mode=='hyper': 26 | grade = hyper_score(embs[rec[0]], embs[rec[1]]) 27 | scores.append(grade) 28 | elif mode=='cosine': 29 | grade = cosine(embs[rec[0]], embs[rec[1]]) 30 | scores.append(grade) 31 | else: 32 | unseen += 1 33 | print 'unseen-words: %d' %unseen 34 | return golds, scores 35 | 36 | if __name__=='__main__': 37 | embeddings_file = sys.argv[1] 38 | dataset_file = sys.argv[2] 39 | mode = sys.argv[3] # either 'cosine' or 'hyper' 40 | golds, scores = load_data(embeddings_file, dataset_file, mode) 41 | rho = spearmanr(golds, scores)[0] 42 | print 'Spearman correlation: %f' %rho 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /get-pretrainedHyperVecEmbeddings/download_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # downloads hypvec_embeddings from IMS homepage 3 | for id in 0 1 2 3 4 5 6 7 8 9 4 | do 5 | wget http://www.ims.uni-stuttgart.de/documents/ressourcen/experiment-daten/hypvec_embd/hyp_p${id}.gz 6 | done 7 | cat hyp_p0.gz hyp_p1.gz hyp_p2.gz hyp_p3.gz hyp_p4.gz hyp_p5.gz hyp_p6.gz hyp_p7.gz hyp_p8.gz hyp_p9.gz > hypervec.txt.gz 8 | # rm -f hyp_p*.gz # OPTIONAL -remove files 9 | # gunzip hypervec.txt.gz # OPTIONAL unzip embeddings to plain text 10 | -------------------------------------------------------------------------------- /hypernymy_resources/cohyponym_n.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/cohyponym_n.txt.gz -------------------------------------------------------------------------------- /hypernymy_resources/cohyponym_v.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/cohyponym_v.txt.gz -------------------------------------------------------------------------------- /hypernymy_resources/hypernym_n.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/hypernym_n.txt.gz -------------------------------------------------------------------------------- /hypernymy_resources/hypernym_v.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenkh/HyperVec/878d7b39f2953ed0567d61ca5d45c0163ba7078c/hypernymy_resources/hypernym_v.txt.gz -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | <project> 2 | <modelVersion>4.0.0</modelVersion> 3 | <version>0.0.1</version> 4 | <build> 5 | <sourceDirectory>src</sourceDirectory> 6 | <plugins> 7 | <plugin> 8 | <artifactId>maven-compiler-plugin</artifactId> 9 | <version>3.1</version> 10 | <configuration> 11 | <source>1.7</source> 12 | <target>1.7</target> 13 | </configuration> 14 | </plugin> 15 | <plugin> 16 | <artifactId>maven-assembly-plugin</artifactId> 17 | <configuration> 18 | <descriptorRefs> 19 | <descriptorRef>jar-with-dependencies</descriptorRef> 20 | </descriptorRefs> 21 | </configuration> 22 | </plugin> 23 | </plugins> 24 | </build> 25 | <dependencies> 26 | <dependency> 27 | <groupId>junit</groupId> 28 | <artifactId>junit</artifactId> 29 | <version>4.7</version> 30 | </dependency> 31 | <dependency> 32 | <groupId>com.googlecode.efficient-java-matrix-library</groupId> 33 | <artifactId>ejml</artifactId> 34 | <version>0.25</version> 35 | </dependency> 36 | <dependency> 37 | <groupId>org.apache.commons</groupId> 38 | <artifactId>commons-math3</artifactId> 39 | <version>3.3</version> 40 | </dependency> 41 | <dependency> 42 | <groupId>commons-lang</groupId> 43 | <artifactId>commons-lang</artifactId> 44 | <version>2.3</version> 45 | </dependency> 46 | <dependency> 47 | <groupId>edu.stanford.nlp</groupId> 48 | <artifactId>stanford-corenlp</artifactId> 49 | <version>3.4</version> 50 | </dependency> 51 | <dependency> 52 | <groupId>de.erichseifert.gral</groupId> 53 | <artifactId>gral-core</artifactId> 54 | <version>0.10</version> 55 | </dependency> 56 | <dependency> 57 | <groupId>edu.berkeley.compbio</groupId> 58 | <artifactId>jlibsvm</artifactId> 59 | <version>0.911</version> 60 | </dependency> 61 | </dependencies> 62 | <repositories> 63 | <repository> 64 | <id>erichseifert.de</id> 65 | <url>http://mvn.erichseifert.de/maven2</url> 66 | </repository> 67 | <repository> 68 | <id>dev.davidsoergel.com releases</id> 69 | <url>http://dev.davidsoergel.com/nexus/content/repositories/releases</url> 70 | <snapshots> 71 | <enabled>false</enabled> 72 | </snapshots> 73 | </repository> 74 | <repository> 75 | <id>dev.davidsoergel.com snapshots</id> 76 | <url>http://dev.davidsoergel.com/nexus/content/repositories/snapshots</url> 77 | <releases> 78 | <enabled>false</enabled> 79 | </releases> 80 | </repository> 81 | </repositories> 82 | <groupId>SemRel</groupId> 83 | <artifactId>HyperVec</artifactId> 84 | </project> -------------------------------------------------------------------------------- /src/common/DataStructureUtils.java: -------------------------------------------------------------------------------- 1 | package common; 2 | 3 | import io.word.Phrase; 4 | 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.HashSet; 8 | import java.util.List; 9 | 10 | /** 11 | * This class provides a set of utility methods to turn one data structure into 12 | * another 13 | * 14 | */ 15 | public class DataStructureUtils { 16 | 17 | /** 18 | * This template method turns an array into a HashSet of the same type 19 | */ 20 | public static <T> HashSet<T> arrayToSet(T[] inputArray) { 21 | HashSet<T> result = new HashSet<T>(); 22 | if (inputArray != null) { 23 | for (int i = 0; i < inputArray.length; i++) { 24 | result.add(inputArray[i]); 25 | } 26 | } 27 | return result; 28 | } 29 | 30 | /** 31 | * This template method turns an array into a HashMap that maps an element 32 | * of the array to its index 33 | */ 34 | public static <T> HashMap<T, Integer> arrayToMap(T[] inputArray) { 35 | HashMap<T, Integer> result = new HashMap<T, Integer>(); 36 | if (inputArray != null) { 37 | for (int i = 0; i < inputArray.length; i++) { 38 | result.put(inputArray[i], i); 39 | } 40 | } 41 | return result; 42 | } 43 | 44 | /** 45 | * This template method turns an array into an (Array)List of the same type 46 | */ 47 | public static <T> ArrayList<T> arrayToList(T[] inputArray) { 48 | ArrayList<T> result = new ArrayList<T>(); 49 | if (inputArray != null) { 50 | for (int i = 0; i < inputArray.length; i++) { 51 | result.add(inputArray[i]); 52 | } 53 | } 54 | return result; 55 | } 56 | 57 | /* 58 | * The following set of methods turn a list into an array of the same type 59 | * The Java compiler cannot initialize an array without knowing the type of 60 | the elements.
Therefore, one cannot generalize with a template method 61 | */ 62 | 63 | public static double[][] arrayListTo2dArray(List<double[]> list) { 64 | double[][] array = new double[list.size()][list.get(0).length]; 65 | list.toArray(array); 66 | return array; 67 | } 68 | 69 | /** 70 | * This template method turns a list into a HashMap that maps an element 71 | * of the list to its index 72 | */ 73 | public static <T> HashMap<T, Integer> listToMap(List<T> inputArray) { 74 | HashMap<T, Integer> result = new HashMap<T, Integer>(); 75 | if (inputArray != null) { 76 | for (int i = 0; i < inputArray.size(); i++) { 77 | result.put(inputArray.get(i), i); 78 | } 79 | } 80 | return result; 81 | } 82 | 83 | public static String[] stringListToArray(List<String> list) { 84 | String[] array = new String[list.size()]; 85 | list.toArray(array); 86 | return array; 87 | } 88 | 89 | public static Phrase[] phraseListToArray(List<Phrase> list) { 90 | Phrase[] array = new Phrase[list.size()]; 91 | list.toArray(array); 92 | return array; 93 | } 94 | 95 | public static int[] intListToArray(List<Integer> list) { 96 | int[] array = new int[list.size()]; 97 | int i = 0; 98 | for (Integer element : list) { 99 | array[i] = element; 100 | i++; 101 | } 102 | return array; 103 | } 104 | 105 | /** 106 | * Search through a small int array for a given value 107 | * @param array 108 | * @param key 109 | * @return the index of the first element to have a value equal to the key 110 | */ 111 | public static int searchSmallIntArray(int[] array, int key) { 112 | for (int i = 0; i < array.length; i++) { 113 | if (array[i] == key) 114 | return i; 115 | } 116 | return -1; 117 | } 118 | 119 | } 120 | -------------------------------------------------------------------------------- /src/common/MathUtils.java: -------------------------------------------------------------------------------- 1 | package common; 2 | 3 | import java.util.Random; 4 | 5 | /** 6 | * This class contains a set of utility methods for simple maths 7 | * (maybe should be replaced with utility methods for the SimpleMatrix class) 8 | * 9 | */ 10 | public class MathUtils { 11 | private static Random rand = new Random(); 12 | // TODO: use some linear algebra method 13 | 14 | /** 15 | * Cosine of two vectors 16 | * @param v1: 1st vector 17 | * @param v2: 2nd vector 18 | * @return cosine value 19 | */ 20 | public static double cosine(double[] v1, double[] v2) { 21 | double length1 = length(v1); 22 | double length2 = length(v2); 23 | if (length1 == 0 || length2 == 0) return 0.0; 24 | else return dot(v1, v2) / (length1 * length2); 25 | } 26 | 27 | /** 28 | * Length of a vector 29 | * @param v: input vector 30 | * @return length 31 | */ 32 | public static double length(double[] v) { 33 | double norm = dot(v, v); 34 | return Math.sqrt(norm); 35 | } 36 | 37 | /** 38 | * Dot product of two vectors 39 | * @param v1 first vector 40 | * @param v2 second vector 41 | * @return dot product 42 | */ 43 | public static double dot(double[] v1, double[] v2) { 44 | double result = 0; 45 | for (int i = 0; i < v1.length; i++) { 46 | result += v1[i] * v2[i]; 47 | } 48 | return result; 49 | } 50 | 51 | /** 52 | * sigmoid function 53 | * @param x input value 54 | * @return sigmoid(x) 55 | */ 56 | public static double sigmoid(double x) { 57 | // TODO: understand why they turn the formula like this (e^x faster 58 | than e^-x ? Rounding error?) // note: 1 - 1/(1 + e^x) == 1/(1 + e^-x), i.e. the standard sigmoid, just rewritten
59 | return 1 - (double) (1.0 / (1.0 + Math.exp(x))); 60 | } 61 | 62 | /** 63 | * tanh function 64 | */ 65 | public static double tanh(double x) { 66 | return 1 - (double) (2.0 / (1.0 + Math.exp(2 * x))); 67 | } 68 | 69 | public static boolean isSampled(long count, long totalCount, double frequencyThreshold) { 70 | double randomThreshold = (double) (Math.sqrt(count 71 | / (frequencyThreshold * totalCount)) + 1) 72 | * (frequencyThreshold * totalCount) / count; 73 | if (randomThreshold >= rand.nextFloat()) { 74 | return true; 75 | } else { 76 | return false; 77 | } 78 | } 79 | 80 | public static double[] cosineDerivative(double[] x, double[] a) { 81 | double lengthX = length(x); 82 | double lengthA = length(a); 83 | double dotP = dot(x, a); 84 | double rToScaleA = 1 / (lengthX * lengthA); 85 | double rToScaleX = dotP / (lengthA * lengthX * lengthX * lengthX); 86 | double[] result = new double[x.length]; 87 | for (int i = 0; i < x.length; i++) { 88 | result[i] = a[i] * rToScaleA - x[i] * rToScaleX; 89 | } 90 | return result; 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/common/MeanAveragePrecision.java: -------------------------------------------------------------------------------- 1 | package common; 2 | 3 | import space.SemanticSpace; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.Comparator; 7 | 8 | import common.IOUtils; 9 | 10 | public class MeanAveragePrecision { 11 | String[][] wordPairs; 12 | double[] golds; 13 | 14 | public MeanAveragePrecision(String dataset) { 15 | readDataset(dataset); 16 | } 17 | 18 | public MeanAveragePrecision(String[][] wordPairs, double[] golds) { 19 | this.wordPairs = wordPairs; 20 | this.golds = golds; 21 | } 22 | 23 | public void readDataset(String dataset) { 24 | ArrayList<String> data = IOUtils.readFile(dataset); 25 | golds = new double[data.size()]; 26 | wordPairs = new String[data.size()][2]; 27 | for (int i = 0; i < data.size(); i++) { 28 | String dataPiece = data.get(i); 29 | String elements[] = dataPiece.split("\t"); 30 | wordPairs[i][0] = elements[0]; 31 | wordPairs[i][1] = elements[1]; 32 | golds[i] = Double.parseDouble(elements[2]); 33 | //golds[i] = Double.parseDouble(elements[3]); 34 | } 35 | } 36 | 37 | public double evaluateMAP(SemanticSpace space) { 38 | final double[] predicts = new double[golds.length]; 39 | for (int i = 0; i < golds.length; i++) { 40 | predicts[i] = space.getSim(wordPairs[i][0], wordPairs[i][1]) 41 | * space.getDirection(wordPairs[i][0], wordPairs[i][1]); 42 | } 43 | Integer[] idxs = new Integer[golds.length]; 44 | for(int i = 0; i < golds.length; i++) idxs[i] = i; 45 | Arrays.sort(idxs, new Comparator<Integer>(){ 46 | public int compare(Integer o1, Integer o2){ 47 | return Double.compare(predicts[o2], predicts[o1]); 48 | } 49 | }); 50 | double[] sorted_preds = new double[golds.length]; 51 | for(int i = 0; i < golds.length; i++) sorted_preds[i] = golds[idxs[i]]; // gold labels, reordered by descending prediction score 52 | 53 | double map = computeMAP(sorted_preds); 54 | return map; 55 | } 56 | 57 | public double computeMAP(double[] sorted_preds) { // argument holds gold labels sorted by prediction 58 | double ap = 0.0; 59 | double retrievedCounter = 0; 60 | double relevantCounter = 0; 61 | 62 | for (int i = 0; i < sorted_preds.length; i++) { 63 | retrievedCounter++; 64 | if (sorted_preds[i] == 1.0) { 65 | relevantCounter++; 66 | ap += relevantCounter / retrievedCounter; 67 | } 68 | } 69 | ap /= relevantCounter; 70 | return ap; 71 | } 72 | 73 | 74 | }
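To see what evaluateMAP() and computeMAP() above compute, here is a minimal Python sketch of the same average-precision calculation on made-up toy labels and scores (the names and values are illustrative only, not from this repository): pairs are ranked by descending predicted score, and precision is accumulated at every gold-positive rank.

golds = [1.0, 0.0, 1.0, 0.0]   # toy gold labels: 1.0 = the relation holds
preds = [0.9, 0.8, 0.4, 0.1]   # toy predictions (sim * direction in the Java code)

# reorder the gold labels by descending prediction, as evaluateMAP does via its index sort
ranked = [g for _, g in sorted(zip(preds, golds), reverse=True)]

relevant, ap = 0, 0.0
for k, g in enumerate(ranked, start=1):   # k = number of retrieved pairs so far
    if g == 1.0:
        relevant += 1
        ap += relevant / float(k)         # precision at this relevant position
print 'AP: %.3f' % (ap / relevant)        # (1/1 + 2/3) / 2 = 0.833

The Java version does exactly this with an Integer index array sorted by a Comparator over the predictions, followed by a single pass over the reordered gold labels.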
/src/common/SigmoidTable.java:
--------------------------------------------------------------------------------
1 | package common;
2 | 
3 | /**
4 |  * An instance of this class pre-computes values for the sigmoid function.
5 |  * Its main purpose is to increase the speed of the program (or so people say :P)
6 |  * since e^-x takes longer than mult/add
7 |  *
8 |  */
9 | public class SigmoidTable {
10 | 
11 |     // Default parameters for the table
12 |     public static final double DEFAULT_MAX_X = 6;
13 |     public static final int DEFAULT_SIGMOID_TABLE_SIZE = 10000000;
14 | 
15 |     /*
16 |      * This sigmoidTable holds the precomputed sigmoid values of variables in the range
17 |      * [-maxX, maxX]
18 |      * tableSize decides the interval between two consecutive values that we
19 |      * compute the sigmoid function for, i.e. the precision of the returned
20 |      * sigmoid values
21 |      */
22 |     private double[] sigmoidTable;
23 |     private double maxX;
24 |     private int tableSize;
25 | 
26 | 
27 |     public SigmoidTable(int tableSize, double maxX) {
28 |         this.tableSize = tableSize;
29 |         this.maxX = maxX;
30 |         initTable();
31 |     }
32 | 
33 |     /**
34 |      * Default constructor
35 |      * Initialize with default values
36 |      */
37 |     public SigmoidTable() {
38 |         this(DEFAULT_SIGMOID_TABLE_SIZE, DEFAULT_MAX_X);
39 |     }
40 | 
41 |     /**
42 |      * Initialize the precomputed sigmoid table.
43 |      * The table consists of "tableSize" precomputed values of the sigmoid
44 |      * function for input values from -maxX to maxX (the difference between two
45 |      * consecutive input values is: 2 * maxX / (tableSize - 1))
46 |      */
47 |     public void initTable() {
48 |         sigmoidTable = new double[tableSize];
49 |         double step = (2 * maxX) / (tableSize - 1);
50 |         for (int i = 0; i < tableSize; i++) {   // fill every entry, including x = maxX
51 |             double x = -maxX + i * step;
52 |             sigmoidTable[i] = MathUtils.sigmoid(x);
53 |         }
54 |     }
55 | 
56 |     /**
57 |      * Get the sigmoid function for x from the pre-computed table
58 |      */
59 |     public double getSigmoid(double x) {
60 |         if (x > maxX)
61 |             return 1;
62 |         else if (x < -maxX)
63 |             return 0;
64 |         else {
65 |             int index = (int) Math.round((x + maxX) / (2 * maxX) * (tableSize - 1));
66 |             return sigmoidTable[index];
67 |         }
68 |         // double result = MathUtils.sigmoid(x);
69 |         // return result;
70 |     }
71 | 
72 | }
73 | 
--------------------------------------------------------------------------------
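A short usage sketch under the default table parameters (maxX = 6 and ten million entries, so neighbouring entries are about 1.2e-6 apart in x and the lookup error is negligible for training purposes); a fragment, assumed to run inside some main method:

SigmoidTable table = new SigmoidTable();        // defaults: maxX = 6, 10^7 entries
System.out.println(table.getSigmoid(1.5));      // ~0.81757, read from the table
System.out.println(MathUtils.sigmoid(1.5));     // exact value, for comparison
System.out.println(table.getSigmoid(42.0));     // 1.0: inputs above +maxX are clamped
System.out.println(table.getSigmoid(-42.0));    // 0.0: inputs below -maxX are clamped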
/src/common/TanhTable.java:
--------------------------------------------------------------------------------
1 | package common;
2 | 
3 | /**
4 |  * An instance of this class pre-computes values for the tanh function.
5 |  * Its main purpose is to increase the speed of the program (or so people say :P)
6 |  * since e^-x takes longer than mult/add
7 |  *
8 |  */
9 | public class TanhTable {
10 | 
11 |     // Default parameters for the table
12 |     public static final double DEFAULT_MAX_X = 6;
13 |     public static final int DEFAULT_TANH_TABLE_SIZE = 10000000;
14 | 
15 |     /*
16 |      * This tanhTable holds the precomputed tanh values of variables in the range
17 |      * [-maxX, maxX]
18 |      * tableSize decides the interval between two consecutive values that we
19 |      * compute the tanh function for, i.e. the precision of the returned
20 |      * tanh values
21 |      */
22 |     private double[] tanhTable;
23 |     private double maxX;
24 |     private int tableSize;
25 | 
26 | 
27 |     public TanhTable(int tableSize, double maxX) {
28 |         this.tableSize = tableSize;
29 |         this.maxX = maxX;
30 |         initTable();
31 |     }
32 | 
33 |     /**
34 |      * Default constructor
35 |      * Initialize with default values
36 |      */
37 |     public TanhTable() {
38 |         this(DEFAULT_TANH_TABLE_SIZE, DEFAULT_MAX_X);
39 |     }
40 | 
41 |     /**
42 |      * Initialize the precomputed tanh table.
43 |      * The table consists of "tableSize" precomputed values of the tanh
44 |      * function for input values from -maxX to maxX (the difference between two
45 |      * consecutive input values is: 2 * maxX / (tableSize - 1))
46 |      */
47 |     public void initTable() {
48 |         tanhTable = new double[tableSize];
49 |         double step = (2 * maxX) / (tableSize - 1);
50 |         for (int i = 0; i < tableSize; i++) {   // fill every entry, including x = maxX
51 |             double x = -maxX + i * step;
52 |             tanhTable[i] = MathUtils.tanh(x);
53 |         }
54 |     }
55 | 
56 |     /**
57 |      * Get the tanh function for x from the pre-computed table
58 |      */
59 |     public double getTanh(double x) {
60 |         // if (x > 1000) {
61 |         //     System.out.println("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
62 |         //     System.out.println("x: " + x);
63 |         //     return 1;
64 |         // } else if (x < -1000) {
65 |         //     System.out.println("-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
66 |         //     return -1;
67 |         // }
68 |         // return MathUtils.tanh(x);
69 |         if (x > maxX)
70 |             return 1;
71 |         else if (x < -maxX)
72 |             return -1;
73 |         else {
74 |             // int index = (int) Math.round((x + maxX) / (2 * maxX) * (tableSize - 1));
75 |             // return tanhTable[index];
76 |             return MathUtils.tanh(x);
77 |         }
78 | 
79 |     }
80 | 
81 | }
82 | 
--------------------------------------------------------------------------------
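Note that, as committed, getTanh uses maxX only for clamping: inside [-maxX, maxX] it falls through to MathUtils.tanh rather than the commented-out table lookup, so the precomputed tanhTable is effectively unused. A brief sketch of both behaviours (a fragment, assumed to run inside some main method):

TanhTable table = new TanhTable();
// The lookup variant would mirror SigmoidTable.getSigmoid:
//   int index = (int) Math.round((x + maxX) / (2 * maxX) * (tableSize - 1));
//   return tanhTable[index];
System.out.println(table.getTanh(0.5));    // ~0.46212, currently computed directly
System.out.println(table.getTanh(100.0));  // 1.0: clamped above +maxX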
/src/common/WordForm.java:
--------------------------------------------------------------------------------
1 | package common;
2 | 
3 | /**
4 |  * Constant values for word format
5 |  *
6 |  */
7 | public class WordForm {
8 |     public static final int WORD = 1;
9 |     public static final int LEMMA = 2;
10 |     public static final int WORD_POS = 3;
11 |     public static final int LEMMA_POS = 4;
12 | }
13 | 
--------------------------------------------------------------------------------
/src/common/correlation/AreaUnderCurve.java:
--------------------------------------------------------------------------------
1 | package common.correlation;
2 | 
3 | import java.util.Arrays;
4 | 
5 | import common.exception.ValueException;
6 | 
7 | public class AreaUnderCurve {
8 |     public static double computeAUC(double[] golds, double[] predicted) {
9 |         int positive = 0;
10 |         for (double score: golds) {
11 |             if (score == 1) positive++;
12 |         }
13 |         int negative = golds.length - positive;
14 | 
15 |         int total_count = golds.length;
16 |         Point[] point_set = new Point[total_count];
17 |         for (int i = 0; i < golds.length; i++) {
18 |             if (!(golds[i] == 1) && !(golds[i] == 0)) {
19 |                 throw new ValueException("For evaluating AUC, gold scores are required to be 0 or 1.");
20 |             }
21 |             point_set[i] = new Point(golds[i], predicted[i]);
22 |         }
23 | 
24 |         Arrays.sort(point_set);
25 | 
26 |         double xi = 1.0;
27 |         double yi = 1.0;
28 |         double xi_old = 1.0;
29 |         double true_positive = positive;
30 |         double false_positive = negative;
31 |         double auc = 0;
32 | 
33 |         for (int i = 0; i < total_count; i++) {
34 |             if (point_set[i].gold == 1) {
35 |                 true_positive -= 1;
36 |                 yi = true_positive / positive;
37 |             } else {
38 |                 false_positive -= 1;
39 |                 xi = false_positive / negative;
40 |                 auc += (xi_old - xi) * yi;
41 |                 xi_old = xi;
42 |             }
43 |         }
44 |         return auc;
45 |     }
46 | 
47 | 
48 |     static class Point implements Comparable<Point> {
49 |         double gold;
50 |         double score;
51 |         public Point(double gold, double score) {
52 |             this.gold = gold;
53 |             this.score = score;
54 |         }
55 |         @Override
56 |         public int compareTo(Point o) {
57 |             // sort points by ascending score
58 |             if (this.score > o.score) return 1;
59 |             if (this.score < o.score) return -1;
60 |             return 0;
61 |         }
62 |     }
63 | }
64 | 
--------------------------------------------------------------------------------
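computeAUC sorts the (gold, score) points by ascending score and sweeps them once; each time a negative point is passed, the false-positive rate xi drops and the rectangle (xi_old - xi) * yi under the current true-positive rate yi is added, which accumulates exactly the area under the ROC curve. A worked example with made-up scores (a fragment, assumed to run inside some main method):

double[] golds  = { 1.0, 1.0, 0.0, 0.0 };
double[] scores = { 0.9, 0.4, 0.6, 0.2 };
// Of the four positive/negative score pairs, three are ranked correctly
// (only 0.4 vs 0.6 is inverted), so AUC = 3/4.
System.out.println(AreaUnderCurve.computeAUC(golds, scores));  // 0.75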
/src/common/correlation/MenCorrelation.java:
--------------------------------------------------------------------------------
1 | package common.correlation;
2 | 
3 | import java.util.ArrayList;
4 | 
5 | import org.apache.commons.math3.stat.correlation.PearsonsCorrelation;
6 | import org.apache.commons.math3.stat.correlation.SpearmansCorrelation;
7 | 
8 | import space.SemanticSpace;
9 | 
10 | import common.IOUtils;
11 | 
12 | 
13 | /**
14 |  * This class can be used to evaluate a word vector space by computing the
15 |  * correlation between the cosine of the words' vectors and their gold-standard
16 |  * similarities (typically based on human judgment).
17 |  * The name is kind of misleading since we can use datasets other than MEN
18 |  *
19 |  */
20 | 
21 | public class MenCorrelation {
22 |     String[][] wordPairs;
23 |     double[] golds;
24 |     PearsonsCorrelation pearson;
25 |     SpearmansCorrelation spearman;
26 |     String name = "";
27 | 
28 |     /**
29 |      * Initialize with the path to the dataset file
30 |      * @param dataset
31 |      */
32 |     public MenCorrelation(String dataset) {
33 |         pearson = new PearsonsCorrelation();
34 |         spearman = new SpearmansCorrelation();
35 |         readDataset(dataset);
36 |     }
37 | 
38 | 
39 |     public MenCorrelation(String[][] wordPairs, double[] golds) {
40 |         pearson = new PearsonsCorrelation();
41 |         spearman = new SpearmansCorrelation();
42 |         this.wordPairs = wordPairs;
43 |         this.golds = golds;
44 |     }
45 | 
46 | 
47 |     /**
48 |      * Read the word pairs and the gold standard from the dataset
49 |      * @param dataset
50 |      */
51 |     public void readDataset(String dataset) {
52 |         ArrayList<String> data = IOUtils.readFile(dataset);
53 |         golds = new double[data.size()];
54 |         wordPairs = new String[data.size()][2];
55 |         for (int i = 0; i < data.size(); i++) {
56 |             String dataPiece = data.get(i);
57 |             String elements[] = dataPiece.split("\t");
58 |             wordPairs[i][0] = elements[0];
59 |             wordPairs[i][1] = elements[1];
60 |             golds[i] = Double.parseDouble(elements[2]);
61 |             //golds[i] = Double.parseDouble(elements[3]);
62 |         }
63 |     }
64 | 
65 |     /**
66 |      * Compute the pearson correlation of the predicted values against the gold
67 |      * standard
68 |      * @param predicts
69 |      * @return
70 |      */
71 |     public double pearsonCorrelation(double[] predicts) {
72 |         return pearson.correlation(golds, predicts);
73 |     }
74 | 
75 |     /**
76 |      * Compute the spearman correlation of the predicted values against the gold
77 |      * standard
78 |      * @param predicts
79 |      * @return
80 |      */
81 |     public double spearmanCorrelation(double[] predicts) {
82 |         return spearman.correlation(golds, predicts);
83 |     }
84 | 
85 | 
86 |     /**
87 |      * Evaluate the space using the pearson correlation
88 |      * @param space
89 |      * @return
90 |      */
91 |     public double evaluateSpacePearson(SemanticSpace space) {
92 |         double[] predicts = new double[golds.length];
93 |         for (int i = 0; i < golds.length; i++) {
94 |             predicts[i] = space.getSim(wordPairs[i][0], wordPairs[i][1]);
95 |             // System.out.println(wordPairs[i][0]);
96 |             // System.out.println(wordPairs[i][1]);
97 |         }
98 |         return pearson.correlation(golds, predicts);
99 |     }
100 | 
101 | 
102 |     /**
103 |      * Evaluate the space using the spearman correlation
104 |      * @param space
105 |      * @return
106 |      */
107 |     public double evaluateSpaceSpearman(SemanticSpace space) {
108 |         double[] predicts = new double[golds.length];
109 |         for (int i = 0; i < golds.length; i++) {
110 |             predicts[i] = space.getSim(wordPairs[i][0], wordPairs[i][1]);
111 |         }
112 |         return spearman.correlation(golds, predicts);
113 |     }
114 | 
115 | 
116 |     /**
117 |      * @return the gold standard (human judgments of the similarities)
118 |      */
119 |     public double[] getGolds() {
120 |         return golds;
121 |     }
122 | 
123 |     public void setName(String name) {
124 |         this.name = name;
125 |     }
126 | 
127 |     public String getName() {
128 |         return this.name;
129 |     }
130 | 
131 |     public String[][] getWordPairs() {
132 |         return this.wordPairs;
133 |     }
134 | 
135 |     public static void main(String[] args) {
136 |     }
137 | }
138 | 
--------------------------------------------------------------------------------
/src/common/exception/OutOfVocabularyException.java:
--------------------------------------------------------------------------------
1 | package common.exception;
2 | 
3 | public class OutOfVocabularyException extends RuntimeException {
4 | 
5 |     /**
6 |      *
7 |      */
8 |     private static final long serialVersionUID = 1L;
9 | 
10 |     public OutOfVocabularyException(String msg) {
11 |         super(msg);
12 |     }
13 | 
14 | }
15 | 
--------------------------------------------------------------------------------
/src/common/exception/ValueException.java:
--------------------------------------------------------------------------------
1 | package common.exception;
2 | 
3 | public class ValueException extends RuntimeException {
4 | 
5 |     /**
6 |      *
7 |      */
8 |     private static final long serialVersionUID = 1L;
9 |     public ValueException(String msg) {
10 |         super(msg);
11 |     }
12 | }
13 | 
--------------------------------------------------------------------------------
/src/common/wordnet/LexicalHypernym.java:
--------------------------------------------------------------------------------
1 | package common.wordnet;
2 | 
3 | import common.IOUtils;
4 | import vocab.Vocab;
5 | 
6 | import java.io.IOException;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 | import java.util.HashSet;
10 | import java.util.List;
11 | import java.util.Random;
12 | import java.util.Set;
13 | import com.google.common.collect.Sets;
14 | 
15 | 
16 | public class LexicalHypernym {
17 |     HashMap<Integer, HashSet<Integer>> features;
18 |     HashMap<Integer, HashSet<Integer>> cohyponyms;
19 |     HashMap<Integer, HashSet<Integer>> hypernyms;
20 |     //Vocab vocab;
21 |     Random random;
22 | 
23 |     public LexicalHypernym(String hypeFile, String cohypoFile, String featureFile, Vocab vocab) throws IOException {
24 |         cohyponyms = readLexical(cohypoFile, vocab);
25 |         hypernyms = readLexical(hypeFile, vocab);
26 |         features = readFeatures(featureFile, vocab);
27 |         //this.vocab = vocab;
28 |         random = new Random();
29 |     }
30 | 
31 |     public HashMap<Integer, HashSet<Integer>> readLexical(String inputFile, Vocab vocab) throws IOException {
32 |         HashMap<Integer, HashSet<Integer>> lexical = new HashMap<Integer, HashSet<Integer>>();
33 |         ArrayList<String> data = IOUtils.readFile(inputFile);
34 |         for (int i = 0; i < data.size(); i++) {
35 |             String dataPiece = data.get(i);
36 |             String elements[] = dataPiece.split("\t");
37 |             String key = elements[0];
38 |             int keyIndex = vocab.getWordIndex(key);
39 |             if (keyIndex == -1) continue;
40 |             HashSet<Integer> value = new HashSet<Integer>();
41 |             for (int j = 1; j < elements.length; j++) {
42 |                 int wordIndex = vocab.getWordIndex(elements[j]);
43 |                 if
(wordIndex == -1) continue; 44 | //value.add(elements[j]); 45 | value.add(wordIndex); 46 | } 47 | lexical.put(keyIndex, value); 48 | } 49 | return lexical; 50 | } 51 | 52 | public HashMap> readFeatures(String inputFile, Vocab vocab) throws IOException { 53 | HashMap> features = new HashMap>(); 54 | ArrayList data = IOUtils.readFile(inputFile); 55 | for (int i = 0; i < data.size(); i++) { 56 | String dataPiece = data.get(i); 57 | String elements[] = dataPiece.split("\t"); 58 | String key = elements[0]; 59 | int keyIndex = vocab.getWordIndex(key); 60 | if (keyIndex == -1) continue; 61 | HashSet value = new HashSet(); 62 | for (int j = 1; j < elements.length; j++ ) { 63 | int wordIndex = -1; 64 | wordIndex = vocab.getWordIndex(elements[j]); 65 | if (wordIndex == -1) continue; 66 | value.add(wordIndex); 67 | } 68 | features.put(keyIndex, value); 69 | } 70 | return features; 71 | } 72 | /* 73 | public HashSet intersectionAnt(String target, String feature) { 74 | HashSet intersection = new HashSet(); 75 | if (antonyms.containsKey(target) && features.containsKey(feature)) { 76 | HashSet setTargets = antonyms.get(target); 77 | HashSet setFeatures = features.get(feature); 78 | intersection = getIntersection(setTargets, setFeatures); 79 | } 80 | return intersection; 81 | } 82 | 83 | public HashSet intersectionSyn(String target, String feature) { 84 | HashSet intersection = new HashSet(); 85 | if (synonyms.containsKey(target) && features.containsKey(feature)) { 86 | HashSet setTargets = synonyms.get(target); 87 | HashSet setFeatures = features.get(feature); 88 | intersection = getIntersection(setTargets, setFeatures); 89 | } 90 | return intersection; 91 | }*/ 92 | 93 | public Set intersectionHype(Integer targetIndex, Integer featureIndex) { 94 | Set intersection = new HashSet(); 95 | if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 96 | Set setHypes = hypernyms.get(targetIndex); 97 | Set setFeatures = features.get(featureIndex); 98 | intersection = Sets.intersection(setHypes, setFeatures); 99 | } 100 | return intersection; 101 | } 102 | 103 | public Set intersectionCohypo(Integer targetIndex, Integer featureIndex) { 104 | Set intersection = new HashSet(); 105 | if (cohyponyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 106 | Set setCohypos = cohyponyms.get(targetIndex); 107 | Set setFeatures = features.get(featureIndex); 108 | intersection = Sets.intersection(setCohypos, setFeatures); 109 | } 110 | return intersection; 111 | } 112 | 113 | public HashSet getIntersection(HashSet hs1, HashSet hs2) { 114 | HashSet intersection = new HashSet(); 115 | for (Integer element: hs1) { 116 | if (hs2.contains(element)) intersection.add(element); 117 | } 118 | return intersection; 119 | } 120 | 121 | public int getRandom(Set antonyms) { 122 | List listAnts = new ArrayList(antonyms); 123 | int id = random.nextInt(listAnts.size()); 124 | return listAnts.get(id); 125 | } 126 | 127 | public boolean hasHypeCohypo(Integer targetIndex){ 128 | return hypernyms.containsKey(targetIndex) || cohyponyms.containsKey(targetIndex); 129 | } 130 | 131 | public boolean hasHypernyms(Integer targetIndex){ 132 | return hypernyms.containsKey(targetIndex); 133 | } 134 | 135 | public boolean hasCohyponyms(Integer targetIndex){ 136 | return cohyponyms.containsKey(targetIndex); 137 | } 138 | 139 | public boolean hasFeature(Integer featureIndex) { 140 | return features.containsKey(featureIndex); 141 | } 142 | 143 | public HashSet getFeatures(Integer featureIndex) { 144 | return 
features.get(featureIndex); 145 | } 146 | 147 | public HashSet getHypernyms(Integer targetIndex) { 148 | return hypernyms.get(targetIndex); 149 | } 150 | 151 | public HashSet getCohyponyms(Integer targetIndex) { 152 | return cohyponyms.get(targetIndex); 153 | } 154 | 155 | } 156 | -------------------------------------------------------------------------------- /src/common/wordnet/LexicalResource.java: -------------------------------------------------------------------------------- 1 | package common.wordnet; 2 | 3 | import common.IOUtils; 4 | import vocab.Vocab; 5 | 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Random; 12 | import java.util.Set; 13 | import com.google.common.collect.Sets; 14 | 15 | 16 | public class LexicalResource { 17 | HashMap> antonyms; 18 | HashMap> synonyms; 19 | HashMap> features; 20 | //HashMap> cohyponyms; 21 | //HashMap> hypernyms; 22 | //Vocab vocab; 23 | Random random; 24 | 25 | public LexicalResource(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException { 26 | antonyms = readLexical(antFile, vocab); 27 | synonyms = readLexical(synFile, vocab); 28 | //cohyponyms = readLexical(cohypoFile, vocab); 29 | //hypernyms = readLexical(hypeFile, vocab); 30 | features = readFeatures(featureFile, vocab); 31 | //this.vocab = vocab; 32 | random = new Random(); 33 | } 34 | 35 | public HashMap> readLexical(String inputFile, Vocab vocab) throws IOException { 36 | HashMap> lexical = new HashMap>(); 37 | ArrayList data = IOUtils.readFile(inputFile); 38 | for (int i = 0; i < data.size(); i++) { 39 | String dataPiece = data.get(i); 40 | String elements[] = dataPiece.split("\t"); 41 | String key = elements[0]; 42 | int keyIndex = vocab.getWordIndex(key); 43 | if (keyIndex == -1) continue; 44 | HashSet value = new HashSet(); 45 | for (int j = 1; j < elements.length; j++ ) { 46 | int wordIndex = vocab.getWordIndex(elements[j]);; 47 | if (wordIndex == -1) continue; 48 | //value.add(elements[j]); 49 | value.add(wordIndex); 50 | } 51 | lexical.put(keyIndex, value); 52 | } 53 | return lexical; 54 | } 55 | 56 | public HashMap> readFeatures(String inputFile, Vocab vocab) throws IOException { 57 | HashMap> features = new HashMap>(); 58 | ArrayList data = IOUtils.readFile(inputFile); 59 | for (int i = 0; i < data.size(); i++) { 60 | String dataPiece = data.get(i); 61 | String elements[] = dataPiece.split("\t"); 62 | String key = elements[0]; 63 | int keyIndex = vocab.getWordIndex(key); 64 | if (keyIndex == -1) continue; 65 | HashSet value = new HashSet(); 66 | for (int j = 1; j < elements.length; j++ ) { 67 | int wordIndex = -1; 68 | wordIndex = vocab.getWordIndex(elements[j]); 69 | if (wordIndex == -1) continue; 70 | value.add(wordIndex); 71 | } 72 | features.put(keyIndex, value); 73 | } 74 | return features; 75 | } 76 | /* 77 | public HashSet intersectionAnt(String target, String feature) { 78 | HashSet intersection = new HashSet(); 79 | if (antonyms.containsKey(target) && features.containsKey(feature)) { 80 | HashSet setTargets = antonyms.get(target); 81 | HashSet setFeatures = features.get(feature); 82 | intersection = getIntersection(setTargets, setFeatures); 83 | } 84 | return intersection; 85 | } 86 | 87 | public HashSet intersectionSyn(String target, String feature) { 88 | HashSet intersection = new HashSet(); 89 | if (synonyms.containsKey(target) && features.containsKey(feature)) { 90 | HashSet setTargets = synonyms.get(target); 
91 | HashSet setFeatures = features.get(feature); 92 | intersection = getIntersection(setTargets, setFeatures); 93 | } 94 | return intersection; 95 | }*/ 96 | 97 | public Set intersectionAnt(Integer targetIndex, Integer featureIndex) { 98 | Set intersection = new HashSet(); 99 | if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 100 | Set setTargets = antonyms.get(targetIndex); 101 | Set setFeatures = features.get(featureIndex); 102 | intersection = Sets.intersection(setTargets, setFeatures); 103 | } 104 | return intersection; 105 | } 106 | 107 | public Set intersectionSyn(Integer targetIndex, Integer featureIndex) { 108 | Set intersection = new HashSet(); 109 | if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 110 | Set setTargets = synonyms.get(targetIndex); 111 | Set setFeatures = features.get(featureIndex); 112 | intersection = Sets.intersection(setTargets, setFeatures); 113 | } 114 | return intersection; 115 | } 116 | 117 | // public Set intersectionHype(Integer targetIndex, Integer featureIndex) { 118 | // Set intersection = new HashSet(); 119 | // if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 120 | // Set setHypes = hypernyms.get(targetIndex); 121 | // Set setFeatures = features.get(featureIndex); 122 | // intersection = Sets.intersection(setHypes, setFeatures); 123 | // } 124 | // return intersection; 125 | // } 126 | // 127 | // public Set intersectionCohypo(Integer targetIndex, Integer featureIndex) { 128 | // Set intersection = new HashSet(); 129 | // if (cohyponyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 130 | // Set setCohypos = cohyponyms.get(targetIndex); 131 | // Set setFeatures = features.get(featureIndex); 132 | // intersection = Sets.intersection(setCohypos, setFeatures); 133 | // } 134 | // return intersection; 135 | // } 136 | 137 | public HashSet getIntersection(HashSet hs1, HashSet hs2) { 138 | HashSet intersection = new HashSet(); 139 | for (Integer element: hs1) { 140 | if (hs2.contains(element)) intersection.add(element); 141 | } 142 | return intersection; 143 | } 144 | 145 | public int getRandom(Set antonyms) { 146 | List listAnts = new ArrayList(antonyms); 147 | int id = random.nextInt(listAnts.size()); 148 | return listAnts.get(id); 149 | } 150 | 151 | // public boolean hasHypeCohypo(Integer targetIndex){ 152 | // return hypernyms.containsKey(targetIndex) || cohyponyms.containsKey(targetIndex); 153 | // } 154 | // 155 | // public boolean hasHypernyms(Integer targetIndex){ 156 | // return hypernyms.containsKey(targetIndex); 157 | // } 158 | // 159 | // public boolean hasCohyponyms(Integer targetIndex){ 160 | // return cohyponyms.containsKey(targetIndex); 161 | // } 162 | 163 | public boolean hasTarget(Integer targetIndex) { 164 | return antonyms.containsKey(targetIndex) || synonyms.containsKey(targetIndex); 165 | } 166 | 167 | public boolean hasAntonyms(Integer targetIndex) { 168 | return antonyms.containsKey(targetIndex); 169 | } 170 | 171 | public boolean hasSynonyms(Integer targetIndex) { 172 | return synonyms.containsKey(targetIndex); 173 | } 174 | 175 | public boolean hasFeature(Integer featureIndex) { 176 | return features.containsKey(featureIndex); 177 | } 178 | 179 | public HashSet getAntonyms(Integer targetIndex) { 180 | return antonyms.get(targetIndex); 181 | } 182 | 183 | public HashSet getSynonyms(Integer targetIndex) { 184 | return synonyms.get(targetIndex); 185 | } 186 | 187 | public HashSet getFeatures(Integer featureIndex) { 188 | 
return features.get(featureIndex); 189 | } 190 | 191 | // public HashSet getHypernyms(Integer targetIndex) { 192 | // return hypernyms.get(targetIndex); 193 | // } 194 | // 195 | // public HashSet getCohyponyms(Integer targetIndex) { 196 | // return cohyponyms.get(targetIndex); 197 | // } 198 | 199 | } 200 | -------------------------------------------------------------------------------- /src/common/wordnet/LexicalResourceAdj.java: -------------------------------------------------------------------------------- 1 | package common.wordnet; 2 | 3 | import common.IOUtils; 4 | import vocab.Vocab; 5 | 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Random; 12 | import java.util.Set; 13 | import com.google.common.collect.Sets; 14 | 15 | 16 | public class LexicalResourceAdj { 17 | HashMap> antonyms; 18 | HashMap> synonyms; 19 | HashMap> features; 20 | Random random; 21 | 22 | public LexicalResourceAdj(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException { 23 | features = readFeatures(featureFile, vocab); 24 | antonyms = readLexical(antFile, vocab); 25 | synonyms = readLexical(synFile, vocab); 26 | //this.vocab = vocab; 27 | //System.out.println("The number of antonyms: " + antonyms.size()); 28 | //System.out.println("The number of synonyms: " + synonyms.size()); 29 | //System.out.println("The number of contexts: " + features.size()); 30 | random = new Random(); 31 | } 32 | 33 | public HashMap> readLexical(String inputFile, Vocab vocab) throws IOException { 34 | HashMap> lexical = new HashMap>(); 35 | ArrayList data = IOUtils.readFile(inputFile); 36 | for (int i = 0; i < data.size(); i++) { 37 | String dataPiece = data.get(i); 38 | String elements[] = dataPiece.split("\t"); 39 | String key = elements[0]; 40 | int keyIndex = vocab.getWordIndex(key); 41 | if (keyIndex == -1) continue; 42 | HashSet value = new HashSet(); 43 | for (int j = 1; j < elements.length; j++ ) { 44 | int wordIndex = vocab.getWordIndex(elements[j]);; 45 | if (wordIndex == -1) continue; 46 | //value.add(elements[j]); 47 | value.add(wordIndex); 48 | } 49 | lexical.put(keyIndex, value); 50 | //random antonym 51 | //List listAnts = new ArrayList(value); 52 | //int id = random.nextInt(listAnts.size()); 53 | //antRandom.put(keyIndex, listAnts.get(id)); 54 | } 55 | return lexical; 56 | } 57 | 58 | public HashMap> readFeatures(String inputFile, Vocab vocab) throws IOException { 59 | HashMap> features = new HashMap>(); 60 | ArrayList data = IOUtils.readFile(inputFile); 61 | for (int i = 0; i < data.size(); i++) { 62 | String dataPiece = data.get(i); 63 | String elements[] = dataPiece.split("\t"); 64 | String key = elements[0]; 65 | int keyIndex = vocab.getWordIndex(key); 66 | if (keyIndex == -1) continue; 67 | HashSet value = new HashSet(); 68 | for (int j = 1; j < elements.length; j++ ) { 69 | int wordIndex = -1; 70 | wordIndex = vocab.getWordIndex(elements[j]); 71 | if (wordIndex == -1) continue; 72 | value.add(wordIndex); 73 | } 74 | features.put(keyIndex, value); 75 | } 76 | return features; 77 | } 78 | /* 79 | public HashSet intersectionAnt(String target, String feature) { 80 | HashSet intersection = new HashSet(); 81 | if (antonyms.containsKey(target) && features.containsKey(feature)) { 82 | HashSet setTargets = antonyms.get(target); 83 | HashSet setFeatures = features.get(feature); 84 | intersection = getIntersection(setTargets, setFeatures); 85 | } 86 | return intersection; 
87 | } 88 | 89 | public HashSet intersectionSyn(String target, String feature) { 90 | HashSet intersection = new HashSet(); 91 | if (synonyms.containsKey(target) && features.containsKey(feature)) { 92 | HashSet setTargets = synonyms.get(target); 93 | HashSet setFeatures = features.get(feature); 94 | intersection = getIntersection(setTargets, setFeatures); 95 | } 96 | return intersection; 97 | }*/ 98 | 99 | public Set intersectionAnt(Integer targetIndex, Integer featureIndex) { 100 | Set intersection = new HashSet(); 101 | if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 102 | Set setTargets = antonyms.get(targetIndex); 103 | Set setFeatures = features.get(featureIndex); 104 | intersection = Sets.intersection(setTargets, setFeatures); 105 | } 106 | return intersection; 107 | } 108 | 109 | public Set intersectionSyn(Integer targetIndex, Integer featureIndex) { 110 | Set intersection = new HashSet(); 111 | if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 112 | Set setTargets = synonyms.get(targetIndex); 113 | Set setFeatures = features.get(featureIndex); 114 | intersection = Sets.intersection(setTargets, setFeatures); 115 | } 116 | return intersection; 117 | } 118 | 119 | public HashSet getIntersection(HashSet hs1, HashSet hs2) { 120 | HashSet intersection = new HashSet(); 121 | for (Integer element: hs1) { 122 | if (hs2.contains(element)) intersection.add(element); 123 | } 124 | return intersection; 125 | } 126 | 127 | public int getRandom(Set antonyms) { 128 | List listAnts = new ArrayList(antonyms); 129 | int id = random.nextInt(listAnts.size()); 130 | return listAnts.get(id); 131 | } 132 | 133 | public boolean hasTarget(Integer targetIndex) { 134 | return antonyms.containsKey(targetIndex) || synonyms.containsKey(targetIndex); 135 | } 136 | 137 | public boolean hasAntonyms(Integer targetIndex) { 138 | return antonyms.containsKey(targetIndex); 139 | } 140 | 141 | public boolean hasSynonyms(Integer targetIndex) { 142 | return synonyms.containsKey(targetIndex); 143 | } 144 | 145 | public boolean hasFeature(Integer featureIndex) { 146 | return features.containsKey(featureIndex); 147 | } 148 | 149 | public HashSet getAntonyms(Integer targetIndex) { 150 | return antonyms.get(targetIndex); 151 | } 152 | 153 | public HashSet getSynonyms(Integer targetIndex) { 154 | return synonyms.get(targetIndex); 155 | } 156 | 157 | public HashSet getFeatures(Integer featureIndex) { 158 | return features.get(featureIndex); 159 | } 160 | 161 | } 162 | -------------------------------------------------------------------------------- /src/common/wordnet/LexicalResourceNoun.java: -------------------------------------------------------------------------------- 1 | package common.wordnet; 2 | 3 | import common.IOUtils; 4 | import vocab.Vocab; 5 | 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Random; 12 | import java.util.Set; 13 | import com.google.common.collect.Sets; 14 | 15 | 16 | public class LexicalResourceNoun { 17 | HashMap> antonyms; 18 | HashMap> synonyms; 19 | HashMap> features; 20 | //HashMap> cohyponyms; 21 | //HashMap> hypernyms; 22 | //Vocab vocab; 23 | Random random; 24 | 25 | public LexicalResourceNoun(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException { 26 | antonyms = readLexical(antFile, vocab); 27 | synonyms = readLexical(synFile, vocab); 28 | //cohyponyms = readLexical(cohypoFile, 
vocab); 29 | //hypernyms = readLexical(hypeFile, vocab); 30 | features = readFeatures(featureFile, vocab); 31 | //this.vocab = vocab; 32 | random = new Random(); 33 | } 34 | 35 | public HashMap> readLexical(String inputFile, Vocab vocab) throws IOException { 36 | HashMap> lexical = new HashMap>(); 37 | ArrayList data = IOUtils.readFile(inputFile); 38 | for (int i = 0; i < data.size(); i++) { 39 | String dataPiece = data.get(i); 40 | String elements[] = dataPiece.split("\t"); 41 | String key = elements[0]; 42 | int keyIndex = vocab.getWordIndex(key); 43 | if (keyIndex == -1) continue; 44 | HashSet value = new HashSet(); 45 | for (int j = 1; j < elements.length; j++ ) { 46 | int wordIndex = vocab.getWordIndex(elements[j]);; 47 | if (wordIndex == -1) continue; 48 | //value.add(elements[j]); 49 | value.add(wordIndex); 50 | } 51 | lexical.put(keyIndex, value); 52 | } 53 | return lexical; 54 | } 55 | 56 | public HashMap> readFeatures(String inputFile, Vocab vocab) throws IOException { 57 | HashMap> features = new HashMap>(); 58 | ArrayList data = IOUtils.readFile(inputFile); 59 | for (int i = 0; i < data.size(); i++) { 60 | String dataPiece = data.get(i); 61 | String elements[] = dataPiece.split("\t"); 62 | String key = elements[0]; 63 | int keyIndex = vocab.getWordIndex(key); 64 | if (keyIndex == -1) continue; 65 | HashSet value = new HashSet(); 66 | for (int j = 1; j < elements.length; j++ ) { 67 | int wordIndex = -1; 68 | wordIndex = vocab.getWordIndex(elements[j]); 69 | if (wordIndex == -1) continue; 70 | value.add(wordIndex); 71 | } 72 | features.put(keyIndex, value); 73 | } 74 | return features; 75 | } 76 | /* 77 | public HashSet intersectionAnt(String target, String feature) { 78 | HashSet intersection = new HashSet(); 79 | if (antonyms.containsKey(target) && features.containsKey(feature)) { 80 | HashSet setTargets = antonyms.get(target); 81 | HashSet setFeatures = features.get(feature); 82 | intersection = getIntersection(setTargets, setFeatures); 83 | } 84 | return intersection; 85 | } 86 | 87 | public HashSet intersectionSyn(String target, String feature) { 88 | HashSet intersection = new HashSet(); 89 | if (synonyms.containsKey(target) && features.containsKey(feature)) { 90 | HashSet setTargets = synonyms.get(target); 91 | HashSet setFeatures = features.get(feature); 92 | intersection = getIntersection(setTargets, setFeatures); 93 | } 94 | return intersection; 95 | }*/ 96 | 97 | public Set intersectionAnt(Integer targetIndex, Integer featureIndex) { 98 | Set intersection = new HashSet(); 99 | if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 100 | Set setTargets = antonyms.get(targetIndex); 101 | Set setFeatures = features.get(featureIndex); 102 | intersection = Sets.intersection(setTargets, setFeatures); 103 | } 104 | return intersection; 105 | } 106 | 107 | public Set intersectionSyn(Integer targetIndex, Integer featureIndex) { 108 | Set intersection = new HashSet(); 109 | if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 110 | Set setTargets = synonyms.get(targetIndex); 111 | Set setFeatures = features.get(featureIndex); 112 | intersection = Sets.intersection(setTargets, setFeatures); 113 | } 114 | return intersection; 115 | } 116 | 117 | // public Set intersectionHype(Integer targetIndex, Integer featureIndex) { 118 | // Set intersection = new HashSet(); 119 | // if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 120 | // Set setHypes = hypernyms.get(targetIndex); 121 | // Set setFeatures = 
features.get(featureIndex); 122 | // intersection = Sets.intersection(setHypes, setFeatures); 123 | // } 124 | // return intersection; 125 | // } 126 | // 127 | // public Set intersectionCohypo(Integer targetIndex, Integer featureIndex) { 128 | // Set intersection = new HashSet(); 129 | // if (cohyponyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 130 | // Set setCohypos = cohyponyms.get(targetIndex); 131 | // Set setFeatures = features.get(featureIndex); 132 | // intersection = Sets.intersection(setCohypos, setFeatures); 133 | // } 134 | // return intersection; 135 | // } 136 | 137 | public HashSet getIntersection(HashSet hs1, HashSet hs2) { 138 | HashSet intersection = new HashSet(); 139 | for (Integer element: hs1) { 140 | if (hs2.contains(element)) intersection.add(element); 141 | } 142 | return intersection; 143 | } 144 | 145 | public int getRandom(Set antonyms) { 146 | List listAnts = new ArrayList(antonyms); 147 | int id = random.nextInt(listAnts.size()); 148 | return listAnts.get(id); 149 | } 150 | 151 | // public boolean hasHypeCohypo(Integer targetIndex){ 152 | // return hypernyms.containsKey(targetIndex) || cohyponyms.containsKey(targetIndex); 153 | // } 154 | // 155 | // public boolean hasHypernyms(Integer targetIndex){ 156 | // return hypernyms.containsKey(targetIndex); 157 | // } 158 | // 159 | // public boolean hasCohyponyms(Integer targetIndex){ 160 | // return cohyponyms.containsKey(targetIndex); 161 | // } 162 | 163 | public boolean hasTarget(Integer targetIndex) { 164 | return antonyms.containsKey(targetIndex) || synonyms.containsKey(targetIndex); 165 | } 166 | 167 | public boolean hasAntonyms(Integer targetIndex) { 168 | return antonyms.containsKey(targetIndex); 169 | } 170 | 171 | public boolean hasSynonyms(Integer targetIndex) { 172 | return synonyms.containsKey(targetIndex); 173 | } 174 | 175 | public boolean hasFeature(Integer featureIndex) { 176 | return features.containsKey(featureIndex); 177 | } 178 | 179 | public HashSet getAntonyms(Integer targetIndex) { 180 | return antonyms.get(targetIndex); 181 | } 182 | 183 | public HashSet getSynonyms(Integer targetIndex) { 184 | return synonyms.get(targetIndex); 185 | } 186 | 187 | public HashSet getFeatures(Integer featureIndex) { 188 | return features.get(featureIndex); 189 | } 190 | 191 | // public HashSet getHypernyms(Integer targetIndex) { 192 | // return hypernyms.get(targetIndex); 193 | // } 194 | // 195 | // public HashSet getCohyponyms(Integer targetIndex) { 196 | // return cohyponyms.get(targetIndex); 197 | // } 198 | 199 | } 200 | -------------------------------------------------------------------------------- /src/common/wordnet/LexicalResourceVerb.java: -------------------------------------------------------------------------------- 1 | package common.wordnet; 2 | 3 | import common.IOUtils; 4 | import vocab.Vocab; 5 | 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Random; 12 | import java.util.Set; 13 | import com.google.common.collect.Sets; 14 | 15 | 16 | public class LexicalResourceVerb { 17 | HashMap> antonyms; 18 | HashMap> synonyms; 19 | HashMap> features; 20 | //HashMap> cohyponyms; 21 | //HashMap> hypernyms; 22 | //Vocab vocab; 23 | Random random; 24 | 25 | public LexicalResourceVerb(String antFile, String synFile, String featureFile, Vocab vocab) throws IOException { 26 | antonyms = readLexical(antFile, vocab); 27 | synonyms = readLexical(synFile, vocab); 
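// Both readLexical() and readFeatures() expect one target word per line,
// followed by its related words, all tab-separated; for a verb resource a
// line might look like this (illustrative values only):
//   open<TAB>close<TAB>shut
// Entries whose words are not in the vocabulary are silently skipped.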
28 | features = readFeatures(featureFile, vocab); 29 | //cohyponyms = readLexical(cohypoFile, vocab); 30 | //hypernyms = readLexical(hypeFile, vocab); 31 | //this.vocab = vocab; 32 | random = new Random(); 33 | } 34 | 35 | public HashMap> readLexical(String inputFile, Vocab vocab) throws IOException { 36 | HashMap> lexical = new HashMap>(); 37 | ArrayList data = IOUtils.readFile(inputFile); 38 | for (int i = 0; i < data.size(); i++) { 39 | String dataPiece = data.get(i); 40 | String elements[] = dataPiece.split("\t"); 41 | String key = elements[0]; 42 | int keyIndex = vocab.getWordIndex(key); 43 | if (keyIndex == -1) continue; 44 | HashSet value = new HashSet(); 45 | for (int j = 1; j < elements.length; j++ ) { 46 | int wordIndex = vocab.getWordIndex(elements[j]);; 47 | if (wordIndex == -1) continue; 48 | //value.add(elements[j]); 49 | value.add(wordIndex); 50 | } 51 | lexical.put(keyIndex, value); 52 | } 53 | return lexical; 54 | } 55 | 56 | public HashMap> readFeatures(String inputFile, Vocab vocab) throws IOException { 57 | HashMap> features = new HashMap>(); 58 | ArrayList data = IOUtils.readFile(inputFile); 59 | for (int i = 0; i < data.size(); i++) { 60 | String dataPiece = data.get(i); 61 | String elements[] = dataPiece.split("\t"); 62 | String key = elements[0]; 63 | int keyIndex = vocab.getWordIndex(key); 64 | if (keyIndex == -1) continue; 65 | HashSet value = new HashSet(); 66 | for (int j = 1; j < elements.length; j++ ) { 67 | int wordIndex = -1; 68 | wordIndex = vocab.getWordIndex(elements[j]); 69 | if (wordIndex == -1) continue; 70 | value.add(wordIndex); 71 | } 72 | features.put(keyIndex, value); 73 | } 74 | return features; 75 | } 76 | /* 77 | public HashSet intersectionAnt(String target, String feature) { 78 | HashSet intersection = new HashSet(); 79 | if (antonyms.containsKey(target) && features.containsKey(feature)) { 80 | HashSet setTargets = antonyms.get(target); 81 | HashSet setFeatures = features.get(feature); 82 | intersection = getIntersection(setTargets, setFeatures); 83 | } 84 | return intersection; 85 | } 86 | 87 | public HashSet intersectionSyn(String target, String feature) { 88 | HashSet intersection = new HashSet(); 89 | if (synonyms.containsKey(target) && features.containsKey(feature)) { 90 | HashSet setTargets = synonyms.get(target); 91 | HashSet setFeatures = features.get(feature); 92 | intersection = getIntersection(setTargets, setFeatures); 93 | } 94 | return intersection; 95 | }*/ 96 | 97 | public Set intersectionAnt(Integer targetIndex, Integer featureIndex) { 98 | Set intersection = new HashSet(); 99 | if (antonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 100 | Set setTargets = antonyms.get(targetIndex); 101 | Set setFeatures = features.get(featureIndex); 102 | intersection = Sets.intersection(setTargets, setFeatures); 103 | } 104 | return intersection; 105 | } 106 | 107 | public Set intersectionSyn(Integer targetIndex, Integer featureIndex) { 108 | Set intersection = new HashSet(); 109 | if (synonyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 110 | Set setTargets = synonyms.get(targetIndex); 111 | Set setFeatures = features.get(featureIndex); 112 | intersection = Sets.intersection(setTargets, setFeatures); 113 | } 114 | return intersection; 115 | } 116 | 117 | // public Set intersectionHype(Integer targetIndex, Integer featureIndex) { 118 | // Set intersection = new HashSet(); 119 | // if (hypernyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 120 | // Set setHypes = 
hypernyms.get(targetIndex); 121 | // Set setFeatures = features.get(featureIndex); 122 | // intersection = Sets.intersection(setHypes, setFeatures); 123 | // } 124 | // return intersection; 125 | // } 126 | // 127 | // public Set intersectionCohypo(Integer targetIndex, Integer featureIndex) { 128 | // Set intersection = new HashSet(); 129 | // if (cohyponyms.containsKey(targetIndex) && features.containsKey(featureIndex)) { 130 | // Set setCohypos = cohyponyms.get(targetIndex); 131 | // Set setFeatures = features.get(featureIndex); 132 | // intersection = Sets.intersection(setCohypos, setFeatures); 133 | // } 134 | // return intersection; 135 | // } 136 | 137 | public HashSet getIntersection(HashSet hs1, HashSet hs2) { 138 | HashSet intersection = new HashSet(); 139 | for (Integer element: hs1) { 140 | if (hs2.contains(element)) intersection.add(element); 141 | } 142 | return intersection; 143 | } 144 | 145 | public int getRandom(Set antonyms) { 146 | List listAnts = new ArrayList(antonyms); 147 | int id = random.nextInt(listAnts.size()); 148 | return listAnts.get(id); 149 | } 150 | 151 | // public boolean hasHypeCohypo(Integer targetIndex){ 152 | // return hypernyms.containsKey(targetIndex) || cohyponyms.containsKey(targetIndex); 153 | // } 154 | // 155 | // public boolean hasHypernyms(Integer targetIndex){ 156 | // return hypernyms.containsKey(targetIndex); 157 | // } 158 | // 159 | // public boolean hasCohyponyms(Integer targetIndex){ 160 | // return cohyponyms.containsKey(targetIndex); 161 | // } 162 | 163 | public boolean hasTarget(Integer targetIndex) { 164 | return antonyms.containsKey(targetIndex) || synonyms.containsKey(targetIndex); 165 | } 166 | 167 | public boolean hasAntonyms(Integer targetIndex) { 168 | return antonyms.containsKey(targetIndex); 169 | } 170 | 171 | public boolean hasSynonyms(Integer targetIndex) { 172 | return synonyms.containsKey(targetIndex); 173 | } 174 | 175 | public boolean hasFeature(Integer featureIndex) { 176 | return features.containsKey(featureIndex); 177 | } 178 | 179 | public HashSet getAntonyms(Integer targetIndex) { 180 | return antonyms.get(targetIndex); 181 | } 182 | 183 | public HashSet getSynonyms(Integer targetIndex) { 184 | return synonyms.get(targetIndex); 185 | } 186 | 187 | public HashSet getFeatures(Integer featureIndex) { 188 | return features.get(featureIndex); 189 | } 190 | 191 | // public HashSet getHypernyms(Integer targetIndex) { 192 | // return hypernyms.get(targetIndex); 193 | // } 194 | // 195 | // public HashSet getCohyponyms(Integer targetIndex) { 196 | // return cohyponyms.get(targetIndex); 197 | // } 198 | 199 | } 200 | -------------------------------------------------------------------------------- /src/common/wordnet/Synset.java: -------------------------------------------------------------------------------- 1 | package common.wordnet; 2 | 3 | import java.util.ArrayList; 4 | 5 | public class Synset { 6 | String id; 7 | String synsetType; 8 | String[] words; 9 | String antonymSSId; 10 | String[] simSSId; 11 | String[] hypoSSId; 12 | String[] hyperSSId; 13 | 14 | public Synset(String line) { 15 | String[] elements = line.split(" "); 16 | id = elements[0]; 17 | synsetType = elements[2]; 18 | readWords(elements); 19 | } 20 | 21 | public void readWords(String[] elements) { 22 | // read words 23 | 24 | int size = Integer.parseInt(elements[3],16); 25 | words = new String[size]; 26 | for (int i = 0; i< size; i++) { 27 | words[i] = elements[4 + 2 * i].replaceAll("_", "-"); 28 | } 29 | 30 | // read info 31 | int fieldNum = 
Integer.parseInt(elements[4 + size * 2], 10);
32 | 
33 |         ArrayList<String> simList = new ArrayList<String>();
34 |         ArrayList<String> hypoList = new ArrayList<String>();
35 |         ArrayList<String> hyperList = new ArrayList<String>();
36 |         for (int i = 0; i < fieldNum; i++) {
37 |             String type = elements[4 + size * 2 + 1 + i * 4];
38 |             String id = elements[4 + size * 2 + 2 + i * 4];
39 |             String pos = elements[4 + size * 2 + 3 + i * 4];
40 |             if (type.equals("&") || type.equals("^")) {
41 |                 if (pos.equals("a"))
42 |                     simList.add(id);
43 |             } else if (type.equals("!")) {
44 |                 antonymSSId = id;
45 |             } else if (type.equals("@")) {
46 |                 hyperList.add(id);
47 |             } else if (type.equals("~")) {
48 |                 hypoList.add(id);
49 |             }
50 |         }
51 |         simSSId = new String[simList.size()];
52 |         simSSId = simList.toArray(simSSId);
53 |         hyperSSId = new String[hyperList.size()];
54 |         hyperSSId = hyperList.toArray(hyperSSId);
55 |         hypoSSId = new String[hypoList.size()];
56 |         hypoSSId = hypoList.toArray(hypoSSId);
57 |     }
58 | }
59 | 
--------------------------------------------------------------------------------
/src/common/wordnet/WordNetReader.java:
--------------------------------------------------------------------------------
1 | package common.wordnet;
2 | 
3 | import java.io.BufferedReader;
4 | import java.io.FileReader;
5 | import java.io.IOException;
6 | import java.util.ArrayList;
7 | import java.util.HashMap;
8 | 
9 | public class WordNetReader {
10 |     public static HashMap<String, Synset> readSynsets(String fileName) throws IOException {
11 |         HashMap<String, Synset> data = new HashMap<String, Synset>();
12 |         BufferedReader reader = new BufferedReader(new FileReader(fileName));
13 |         String line = reader.readLine();
14 |         while (line != null) {
15 |             if (!line.startsWith(" ")) {
16 |                 Synset synset = new Synset(line);
17 |                 data.put(synset.id, synset);
18 |             }
19 |             line = reader.readLine();
20 |         }
21 |         reader.close();
22 |         return data;
23 |     }
24 | 
25 |     public static HashMap<String, ArrayList<String>> getWord2SynsetIds(HashMap<String, Synset> synsetMap) {
26 |         HashMap<String, ArrayList<String>> word2SynsetIds = new HashMap<String, ArrayList<String>>();
27 |         for (String id: synsetMap.keySet()) {
28 |             Synset synset = synsetMap.get(id);
29 |             for (String word: synset.words) {
30 |                 if (!word2SynsetIds.containsKey(word)) {
31 |                     word2SynsetIds.put(word, new ArrayList<String>());
32 |                 }
33 |                 word2SynsetIds.get(word).add(id);
34 |             }
35 |         }
36 |         return word2SynsetIds;
37 |     }
38 | 
39 |     public static void main(String[] args) throws IOException {
40 |         String adjFile = args[0];
41 |         HashMap<String, Synset> synsetMap = readSynsets(adjFile);
42 |         for (String id: synsetMap.keySet()) {
43 |             Synset synset = synsetMap.get(id);
44 |             System.out.print(synset.id);
45 |             for (String word: synset.words) {
46 |                 System.out.print(" " + word);
47 |             }
48 |             System.out.println();
49 |         }
50 | 
51 |         HashMap<String, ArrayList<String>> word2SynsetIds = getWord2SynsetIds(synsetMap);
52 |         for (String word: word2SynsetIds.keySet()) {
53 |             System.out.print(word + ": ");
54 |             System.out.print(word2SynsetIds.get(word) + "\n");
55 |         }
56 |     }
57 | }
58 | 
--------------------------------------------------------------------------------
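Synset parses lines of the raw WordNet database files: elements[0] is the synset id, elements[2] the synset type, elements[3] a hexadecimal word count followed by word/lex-id pairs, then a decimal pointer count and four-field pointers whose symbols are interpreted above (& and ^ as similarity, ! as antonymy, @ as hypernymy, ~ as hyponymy). A usage sketch for the reader below; the dictionary path and lookup word are placeholders:

// Load an adjective database file from a standard WordNet layout (path assumed):
HashMap<String, Synset> synsets = WordNetReader.readSynsets("dict/data.adj");
HashMap<String, ArrayList<String>> word2Ids =
        WordNetReader.getWord2SynsetIds(synsets);
System.out.println(word2Ids.get("able"));  // ids of all synsets containing "able"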
/src/demo/HyperVecLearning.java:
--------------------------------------------------------------------------------
1 | package demo;
2 | 
3 | import io.sentence.PlainSentenceInputStream;
4 | import io.word.CombinedWordInputStream;
5 | import io.word.PushBackWordStream;
6 | import io.word.WordInputStream;
7 | import io.sentence.SentenceInputStream;
8 | 
9 | import java.io.File;
10 | import java.io.IOException;
11 | import java.util.ArrayList;
12 | 
13 | import common.wordnet.LexicalHypernym;
14 | import vocab.Vocab;
15 | import word2vec.MultiThreadWord2Vec;
16 | import word2vec.multitask.Hyper2Vec;
17 | 
18 | 
19 | 
20 | public class HyperVecLearning {
21 |     public static void main(String[] args) throws IOException {
22 | 
23 | 
24 |         MultiThreadWord2Vec word2vec = null;
25 |         String configFile = args[0];
26 |         int size = Integer.parseInt(args[1]);
27 |         int window = Integer.parseInt(args[2]);
28 | 
29 |         W2vProperties properties = new W2vProperties(configFile);
30 |         boolean softmax = Boolean.parseBoolean(properties.getProperty("HierarchialSoftmax"));
31 |         int negativeSamples = Integer.parseInt(properties.getProperty("NegativeSampling"));
32 |         double subSampling = Double.parseDouble(properties.getProperty("SubSampling"));
33 |         String trainDirPath = properties.getProperty("TrainDir");
34 |         String outputFile = properties.getProperty("WordVectorFile");
35 |         String vocabFile = properties.getProperty("VocabFile");
36 | 
37 |         outputFile = outputFile.replaceAll(".bin", "_" + size + ".bin");
38 | 
39 |         File trainDir = new File(trainDirPath);
40 |         File[] trainFiles = trainDir.listFiles();
41 |         System.out.println("Starting training using dir " + trainDirPath);
42 |         System.out.println("Output file: " + outputFile);
43 | 
44 |         boolean learnVocab = !(new File(vocabFile)).exists();
45 |         Vocab vocab = new Vocab(Integer.parseInt(properties.getProperty("MinFrequency")));
46 |         if (!learnVocab)
47 |             vocab.loadVocab(vocabFile);// ,minFrequency);
48 |         else {
49 |             ArrayList<WordInputStream> wordStreamList = new ArrayList<>();
50 |             for (File trainFile: trainFiles) {
51 |                 WordInputStream wordStream = new PushBackWordStream(trainFile.getAbsolutePath(), 200);
52 |                 wordStreamList.add(wordStream);
53 |             }
54 | 
55 |             CombinedWordInputStream wordStream = new CombinedWordInputStream(wordStreamList);
56 |             vocab.learnVocabFromTrainStream(wordStream);
57 |             // save vocabulary
58 |             vocab.saveVocab(vocabFile);
59 |         }
60 | 
61 |         word2vec = new Hyper2Vec(size, window, softmax, negativeSamples, subSampling);
62 |         Hyper2Vec hypervec = (Hyper2Vec) word2vec;
63 | 
64 |         LexicalHypernym hypeNoun = new LexicalHypernym(properties.getProperty("hypeNoun"),
65 |                 properties.getProperty("cohypoNoun"),
66 |                 properties.getProperty("featureNoun"),
67 |                 vocab);
68 |         LexicalHypernym hypeVerb = new LexicalHypernym(properties.getProperty("hypeVerb"),
69 |                 properties.getProperty("cohypoVerb"),
70 |                 properties.getProperty("featureVerb"),
71 |                 vocab);
72 |         hypervec.setLexicalHypeNoun(hypeNoun);
73 |         outputFile = outputFile.replaceAll(".bin", "_HypeNoun.bin");
74 |         hypervec.setLexicalHypeVerb(hypeVerb);
75 |         outputFile = outputFile.replaceAll(".bin", "_HypeVerb.bin");
76 | 
77 | 
78 |         word2vec.setVocab(vocab);
79 |         word2vec.initNetwork();
80 | 
81 |         System.out.println("Start training");
82 |         try {
83 |             ArrayList<SentenceInputStream> inputStreams = new ArrayList<SentenceInputStream>();
84 |             for (File trainFile: trainFiles) {
85 |                 SentenceInputStream sentenceInputStream = new PlainSentenceInputStream(
86 |                         new PushBackWordStream(trainFile.getAbsolutePath(), 200));
87 |                 inputStreams.add(sentenceInputStream);
88 |             }
89 | 
90 |             word2vec.trainModel(inputStreams);
91 |             word2vec.saveVector(outputFile, true);
92 | 
93 |             System.out.println("The vocab size: " + vocab.getVocabSize() + " words");
94 |         } catch (IOException e) {
95 |             System.exit(1);
96 |         }
97 | 
98 |     }
99 | }
100 | 
--------------------------------------------------------------------------------
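HyperVecLearning is driven by a Java properties file and invoked as java demo.HyperVecLearning <configFile> <size> <window>. The repository ships a config.cfg whose contents are not reproduced here, so the sketch below is hypothetical: the key names are taken from the getProperty calls above (and from W2vProperties, which derives VocabFile and WordVectorFile from ProjectDir), while all values are placeholders.

# Hypothetical config.cfg sketch -- keys as read by HyperVecLearning/W2vProperties,
# all values are placeholders:
ProjectDir=/path/to/HyperVec
TrainDir=/path/to/training/corpus
VocabFileName=vocab.txt
WordVectorFileName=hypervec.bin
MinFrequency=50
HierarchialSoftmax=false
NegativeSampling=15
SubSampling=1e-5
hypeNoun=hypernymy_resources/hypernym_n.txt.gz
cohypoNoun=hypernymy_resources/cohyponym_n.txt.gz
featureNoun=/path/to/noun_features.txt
hypeVerb=hypernymy_resources/hypernym_v.txt.gz
cohypoVerb=hypernymy_resources/cohyponym_v.txt.gz
featureVerb=/path/to/verb_features.txt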
/src/demo/W2vProperties.java:
--------------------------------------------------------------------------------
1 | package demo;
2 | 
3 | import java.io.BufferedReader;
4 | import java.io.FileReader;
5 | import java.io.IOException;
6 | import java.util.Properties;
7 | 
8 | public class W2vProperties {
9 |     protected Properties properties;
10 |     public W2vProperties(String configFile) throws IOException {
11 | 
12 | 
13 |         properties = new Properties();
14 |         BufferedReader reader = new BufferedReader(new FileReader(configFile));
15 |         properties.load(reader);
16 |         reader.close();
17 | 
18 | 
19 |         // PROJECT DIR
20 |         String projectDir = properties.getProperty("ProjectDir");
21 | 
22 |         // TRAIN DIR
23 |         String sTrainDir = properties.getProperty("STrainDirName");
24 |         String sTrainDirPath = projectDir + "/" + sTrainDir;
25 |         properties.setProperty("STrainDir", sTrainDirPath);
26 | 
27 |         String outputDir = projectDir;
28 |         properties.setProperty("OutputDir", outputDir);
29 | 
30 |         // OUTPUT NAME
31 |         String sOutputName = properties.getProperty("SOutputFileTemplate");
32 |         String sOutputFilePath = outputDir + "/" + sOutputName;
33 |         properties.setProperty("SOutputFile", sOutputFilePath);
34 | 
35 |         // VOCAB FILE
36 |         String vocabFileName = properties.getProperty("VocabFileName");
37 |         String vocabFile = outputDir + "/" + vocabFileName;
38 |         properties.setProperty("VocabFile", vocabFile);
39 | 
40 |         // OUTPUT W2V
41 | 
42 |         String wordVectorFileName = properties.getProperty("WordVectorFileName");
43 |         String wordVectorFilePath = outputDir + "/" + wordVectorFileName;
44 |         properties.setProperty("WordVectorFile", wordVectorFilePath);
45 |     }
46 | 
47 |     public String getProperty(String key) {
48 |         return properties.getProperty(key);
49 |     }
50 | 
51 | }
52 | 
--------------------------------------------------------------------------------
/src/io/sentence/PlainSentenceInputStream.java:
--------------------------------------------------------------------------------
1 | package io.sentence;
2 | 
3 | import io.word.Phrase;
4 | import io.word.WordInputStream;
5 | 
6 | import java.io.IOException;
7 | import java.util.ArrayList;
8 | 
9 | import common.DataStructureUtils;
10 | 
11 | import vocab.Vocab;
12 | 
13 | public class PlainSentenceInputStream implements SentenceInputStream {
14 |     public static final int DEFAULT_MAX_SENTENCE_LENGTH = 1000;
15 |     WordInputStream inputStream;
16 |     long wordCount;
17 |     int[] sentence;
18 | 
19 |     public PlainSentenceInputStream(WordInputStream inputStream) {
20 |         this.inputStream = inputStream;
21 |         wordCount = 0;
22 |     }
23 | 
24 |     @Override
25 |     public boolean readNextSentence(Vocab vocab) throws IOException {
26 |         ArrayList<Integer> currentSentence = new ArrayList<Integer>();
27 |         while (true) {
28 |             // read the next word & the word index
29 |             String word = "";
30 |             word = inputStream.readWord();
31 | 
32 |             if ("".equals(word))
33 |                 break;
34 |             int wordIndex = vocab.getWordIndex(word);
35 | 
36 |             // if the word is not in the vocabulary, continue
37 |             if (wordIndex == -1)
38 |                 continue;
39 |             else
40 |                 wordCount++;
41 | 
42 |             // end of sentence -> break;
43 |             if (wordIndex == 0) {
44 |                 // System.out.println("end of sentence: " + word);
45 |                 break;
46 |             }
47 | 
48 |             currentSentence.add(wordIndex);
49 |             // break if sentence is too long
50 |             if (currentSentence.size() >= DEFAULT_MAX_SENTENCE_LENGTH)
51 |                 break;
52 | 
53 |         }
54 |         // System.out.println("sentence length: " + sentence.size());
55 |         sentence = DataStructureUtils.intListToArray(currentSentence);
56 |         if (sentence.length == 0 && inputStream.endOfFile())
57 |             return false;
58 |         else
59 |             return true;
60 | 
61 |     }
62 | 
63 |     @Override
64 |     public int[] getCurrentSentence() throws IOException {
65 |         return sentence;
66 |     }
67 | 
68 |     @Override
69 |     public Phrase[] getCurrentPhrases() throws IOException {
70
| return new Phrase[0]; 71 | } 72 | 73 | @Override 74 | public long getWordCount() { 75 | return wordCount; 76 | } 77 | 78 | @Override 79 | public boolean crossDocBoundary() { 80 | // TODO Auto-generated method stub 81 | return false; 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/io/sentence/SentenceInputStream.java: -------------------------------------------------------------------------------- 1 | package io.sentence; 2 | 3 | import io.word.Phrase; 4 | 5 | import java.io.IOException; 6 | 7 | import vocab.Vocab; 8 | 9 | public interface SentenceInputStream { 10 | public boolean readNextSentence(Vocab vocab) throws IOException; 11 | 12 | public int[] getCurrentSentence() throws IOException; 13 | 14 | public boolean crossDocBoundary(); 15 | 16 | public Phrase[] getCurrentPhrases() throws IOException; 17 | 18 | public long getWordCount(); 19 | } 20 | -------------------------------------------------------------------------------- /src/io/sentence/SubSamplingSentenceInputStream.java: -------------------------------------------------------------------------------- 1 | package io.sentence; 2 | 3 | import io.word.Phrase; 4 | 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.Random; 8 | 9 | import common.DataStructureUtils; 10 | 11 | import vocab.Vocab; 12 | import vocab.VocabEntry; 13 | 14 | public class SubSamplingSentenceInputStream implements SentenceInputStream { 15 | 16 | SentenceInputStream inputStream; 17 | double frequencyThreshold; 18 | int[] sentence; 19 | Phrase[] phrases; 20 | Random rand = new Random(); 21 | 22 | public SubSamplingSentenceInputStream(SentenceInputStream inputStream, 23 | double frequencyThreshold) { 24 | this.inputStream = inputStream; 25 | this.frequencyThreshold = frequencyThreshold; 26 | } 27 | 28 | protected boolean isSampled(long count, long totalCount) { 29 | double randomThreshold = (double) (Math.sqrt(count 30 | / (frequencyThreshold * totalCount)) + 1) 31 | * (frequencyThreshold * totalCount) / count; 32 | if (randomThreshold >= rand.nextFloat()) { 33 | return true; 34 | } else { 35 | return false; 36 | } 37 | } 38 | 39 | protected void filterSentence(int[] unFilteredSentence, 40 | Phrase[] unFilteredPhrases, Vocab vocab) { 41 | ArrayList filteredIndices = new ArrayList(); 42 | long totalCount = vocab.getTrainWords(); 43 | int[] newPositions = new int[unFilteredSentence.length]; 44 | int newPosition = 0; 45 | for (int i = 0; i < unFilteredSentence.length; i++) { 46 | int vocabEntryIndex = unFilteredSentence[i]; 47 | if (vocabEntryIndex == -1) { 48 | newPositions[i] = Integer.MIN_VALUE; 49 | continue; 50 | } 51 | VocabEntry entry = vocab.getEntry(vocabEntryIndex); 52 | long count = entry.frequency; 53 | 54 | if (isSampled(count, totalCount)) { 55 | filteredIndices.add(vocabEntryIndex); 56 | newPositions[i] = newPosition; 57 | newPosition++; 58 | } 59 | // set those words'positions that are not in vocab to -1 60 | else { 61 | newPositions[i] = Integer.MIN_VALUE; 62 | } 63 | } 64 | // System.out.println("\nOld Sentence:"); 65 | // for (int i = 0; i < unFilteredSentence.length; i++) 66 | // { 67 | // System.out.print(" "+unFilteredSentence[i]); 68 | // } 69 | // System.out.println("\nOld phrase:"); 70 | // for (int i = 0; i < unFilteredPhrases.length; i++) 71 | // { 72 | // System.out.print("("+unFilteredPhrases[i].startPosition + " " + +unFilteredPhrases[i].endPosition + ") "); 73 | // } 74 | // System.out.println(); 75 | sentence = 
--------------------------------------------------------------------------------
/src/io/sentence/SubSamplingSentenceInputStream.java:
--------------------------------------------------------------------------------
package io.sentence;

import io.word.Phrase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;

import common.DataStructureUtils;

import vocab.Vocab;
import vocab.VocabEntry;

public class SubSamplingSentenceInputStream implements SentenceInputStream {

    SentenceInputStream inputStream;
    double frequencyThreshold;
    int[] sentence;
    Phrase[] phrases;
    Random rand = new Random();

    public SubSamplingSentenceInputStream(SentenceInputStream inputStream,
            double frequencyThreshold) {
        this.inputStream = inputStream;
        this.frequencyThreshold = frequencyThreshold;
    }

    protected boolean isSampled(long count, long totalCount) {
        // word2vec-style subsampling: keep a word with probability
        // (sqrt(f / t) + 1) * (t / f), where f = count / totalCount
        // and t is the frequency threshold
        double randomThreshold = (Math.sqrt(count
                / (frequencyThreshold * totalCount)) + 1)
                * (frequencyThreshold * totalCount) / count;
        return randomThreshold >= rand.nextFloat();
    }

    protected void filterSentence(int[] unFilteredSentence,
            Phrase[] unFilteredPhrases, Vocab vocab) {
        ArrayList<Integer> filteredIndices = new ArrayList<Integer>();
        long totalCount = vocab.getTrainWords();
        int[] newPositions = new int[unFilteredSentence.length];
        int newPosition = 0;
        for (int i = 0; i < unFilteredSentence.length; i++) {
            int vocabEntryIndex = unFilteredSentence[i];
            if (vocabEntryIndex == -1) {
                newPositions[i] = Integer.MIN_VALUE;
                continue;
            }
            VocabEntry entry = vocab.getEntry(vocabEntryIndex);
            long count = entry.frequency;

            if (isSampled(count, totalCount)) {
                filteredIndices.add(vocabEntryIndex);
                newPositions[i] = newPosition;
                newPosition++;
            } else {
                // mark the positions of words that were sampled out
                newPositions[i] = Integer.MIN_VALUE;
            }
        }
        sentence = DataStructureUtils.intListToArray(filteredIndices);

        ArrayList<Phrase> filteredPhraseList = new ArrayList<Phrase>();
        for (Phrase unFilteredPhrase : unFilteredPhrases) {
            int phraseType = unFilteredPhrase.phraseType;
            int startPosition = newPositions[unFilteredPhrase.startPosition];
            int endPosition = newPositions[unFilteredPhrase.endPosition];
            // TODO: check if this condition is correct
            if (endPosition - startPosition == unFilteredPhrase.endPosition
                    - unFilteredPhrase.startPosition) {
                // the whole phrase survived subsampling
                filteredPhraseList.add(new Phrase(phraseType, startPosition,
                        endPosition, unFilteredPhrase.tree));
            } else if (Math.max(startPosition, endPosition) >= 0) {
                // only one end survived: collapse the phrase to that position
                int maxPosition = Math.max(startPosition, endPosition);
                filteredPhraseList.add(new Phrase(phraseType, maxPosition,
                        maxPosition, unFilteredPhrase.tree));
            }
        }
        phrases = DataStructureUtils.phraseListToArray(filteredPhraseList);
    }

    @Override
    public boolean readNextSentence(Vocab vocab) throws IOException {
        boolean hasNextSentence = inputStream.readNextSentence(vocab);
        if (hasNextSentence) {
            int[] unFilteredSentence = inputStream.getCurrentSentence();
            Phrase[] unFilteredPhrases = inputStream.getCurrentPhrases();
            filterSentence(unFilteredSentence, unFilteredPhrases, vocab);
        }
        return hasNextSentence;
    }

    @Override
    public int[] getCurrentSentence() throws IOException {
        return sentence;
    }

    @Override
    public Phrase[] getCurrentPhrases() throws IOException {
        return phrases;
    }

    @Override
    public long getWordCount() {
        return inputStream.getWordCount();
    }

    @Override
    public boolean crossDocBoundary() {
        return inputStream.crossDocBoundary();
    }
}
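To make isSampled() concrete, a small worked sketch (not repository code; 1e-3 is only word2vec's customary threshold, not a value fixed here):

    // keep probability p = (sqrt(f/t) + 1) * (t/f), with f = count/totalCount
    static double keepProbability(double f, double t) {
        return (Math.sqrt(f / t) + 1) * (t / f);
    }
    // keepProbability(0.01, 1e-3) ~ 0.416 -> very frequent word, often dropped
    // keepProbability(1e-5, 1e-3) = 110.0 -> p >= 1, rare word always kept
    // (p is compared against a uniform draw in [0, 1))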
--------------------------------------------------------------------------------
/src/io/sentence/TreeInputStream.java:
--------------------------------------------------------------------------------
package io.sentence;

import java.io.IOException;

import tree.Tree;

public interface TreeInputStream {
    // returns null at end of file
    public Tree readTree() throws IOException;

    public long getReadLine();

    public void close() throws IOException;
}
--------------------------------------------------------------------------------
/src/io/word/CombinedWordInputStream.java:
--------------------------------------------------------------------------------
package io.word;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;

public class CombinedWordInputStream implements WordInputStream {
    Iterator<WordInputStream> streamIterator;
    WordInputStream currentStream;
    int streamCount = 0;

    public CombinedWordInputStream(List<WordInputStream> inputStreams) {
        streamIterator = inputStreams.iterator();
        if (streamIterator.hasNext()) {
            currentStream = streamIterator.next();
            streamCount++;
            System.out.println("reading stream " + streamCount);
        } else {
            currentStream = null;
        }
    }

    @Override
    public String readWord() throws IOException {
        if (currentStream == null) {
            return "";
        }
        while (true) {
            String word = currentStream.readWord();
            if (!word.equals("")) {
                return word;
            }
            if (!currentStream.endOfFile()) {
                // "" marks an end of sentence here, not the end of the stream
                return word;
            }
            // the current stream is exhausted: close it and move to the next
            currentStream.close();
            boolean hasNextStream = false;
            while (streamIterator.hasNext()) {
                currentStream = streamIterator.next();
                streamCount++;
                if (currentStream == null) {
                    System.out.println("stream " + streamCount + " is null");
                    continue;
                } else {
                    System.out.println("reading stream " + streamCount);
                    hasNextStream = true;
                    break;
                }
            }
            if (!hasNextStream) {
                currentStream = null;
                return "";
            }
        }
    }

    @Override
    public boolean endOfFile() {
        if (currentStream == null)
            return true;
        else if (currentStream.endOfFile())
            // at the end only once no further streams remain
            return !streamIterator.hasNext();
        else
            return false;
    }

    @Override
    public void close() throws IOException {
        if (currentStream == null)
            return;
        currentStream.close();
        while (streamIterator.hasNext()) {
            currentStream = streamIterator.next();
            if (currentStream != null) {
                currentStream.close();
            }
        }
    }
}
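A hedged sketch of stitching two corpus parts into one word stream (file names are placeholders; PushBackWordStream appears below):

    List<WordInputStream> parts = new ArrayList<WordInputStream>();
    parts.add(new PushBackWordStream("corpus.part1.txt", 50));
    parts.add(new PushBackWordStream("corpus.part2.txt", 50));
    WordInputStream all = new CombinedWordInputStream(parts);
    while (!all.endOfFile()) {
        String word = all.readWord(); // "" marks a sentence boundary
        // ... feed 'word' to the vocabulary builder ...
    }
    all.close();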
--------------------------------------------------------------------------------
/src/io/word/Phrase.java:
--------------------------------------------------------------------------------
package io.word;

import common.WordForm;

import tree.CcgTree;

public class Phrase {
    public int phraseType;
    public int startPosition;
    public int endPosition;
    public CcgTree tree;

    public Phrase(int phraseType, int startPosition, int endPosition,
            CcgTree tree) {
        this.phraseType = phraseType;
        this.startPosition = startPosition;
        this.endPosition = endPosition;
        this.tree = tree;
    }

    public String toString() {
        StringBuffer sbResult = new StringBuffer();
        sbResult.append("phrase type:" + phraseType + "\n");
        sbResult.append("start:" + startPosition + "\n");
        sbResult.append("end:" + endPosition + "\n");
        sbResult.append("surface: '" + tree.getSurfaceString(WordForm.WORD) + "'\n");
        return sbResult.toString();
    }
}
--------------------------------------------------------------------------------
/src/io/word/PushBackWordStream.java:
--------------------------------------------------------------------------------
package io.word;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

public class PushBackWordStream implements WordInputStream {
    protected PushbackInputStream inputStream;
    protected int maxWordLength;
    boolean reachedEndOfFile = false;

    public PushBackWordStream(String filePath, int maxWordLength)
            throws IOException {
        this.maxWordLength = maxWordLength;
        inputStream = new PushbackInputStream(new BufferedInputStream(
                new FileInputStream(filePath)));
    }

    public PushBackWordStream(InputStream is, int maxWordLength) {
        this.maxWordLength = maxWordLength;
        inputStream = new PushbackInputStream(new BufferedInputStream(is));
    }

    @Override
    public String readWord() throws IOException {
        StringBuffer buff = new StringBuffer();
        boolean newString = true;
        char ch;
        while (true) {
            int nextCh = inputStream.read();
            if (nextCh == -1) {
                reachedEndOfFile = true;
                break;
            }
            ch = (char) nextCh;
            // skip carriage returns (Windows line endings)
            if (ch == 13)
                continue;
            if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
                if (!newString) {
                    // push the newline back so the next call can report
                    // the end of the sentence
                    if (ch == '\n') {
                        inputStream.unread(ch);
                    }
                    break;
                }
                // end of line = end of sentence
                if (ch == '\n') {
                    return "";
                } else
                    continue;
            }
            buff.append(ch);
            newString = false;
        }
        String result = buff.toString();
        // truncate overly long tokens
        if (result.length() > maxWordLength) {
            return result.substring(0, maxWordLength);
        } else {
            return result;
        }
    }

    @Override
    public void close() throws IOException {
        inputStream.close();
    }

    @Override
    public boolean endOfFile() {
        return reachedEndOfFile;
    }
}
--------------------------------------------------------------------------------
/src/io/word/WordFilter.java:
--------------------------------------------------------------------------------
package io.word;

public interface WordFilter {
    public boolean isFiltered(String word);
}
--------------------------------------------------------------------------------
/src/io/word/WordInputStream.java:
--------------------------------------------------------------------------------
package io.word;

import java.io.IOException;

public interface WordInputStream {
    /**
     * Get the next word from the stream.
     *
     * @return the next word; "" at the end of the stream
     *         or at the end of a sentence
     * @throws IOException
     */
    public String readWord() throws IOException;

    /**
     * Check whether the end of the stream has been reached.
     * Slightly redundant, since readWord() already signals this, but it
     * lets callers distinguish end-of-sentence from end-of-stream.
     *
     * @return true if the end of the stream has been reached, false otherwise
     */
    public boolean endOfFile();

    /**
     * Close the stream.
     *
     * @throws IOException
     */
    public void close() throws IOException;
}
--------------------------------------------------------------------------------
/src/neural/function/ActivationFunction.java:
--------------------------------------------------------------------------------
package neural.function;

public interface ActivationFunction {
    public double activation(double x);

    public double derivative(double x);

    public String getName();
}
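A quick hedged check of the tokenizer contract, feeding PushBackWordStream from memory instead of a file (not repository code):

    InputStream in = new ByteArrayInputStream("a b\nc d\n".getBytes());
    WordInputStream words = new PushBackWordStream(in, 50);
    // readWord() yields: "a", "b", "" (end of sentence), "c", "d", "",
    // and finally "" again with endOfFile() == true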
--------------------------------------------------------------------------------
/src/neural/function/Correlation.java:
--------------------------------------------------------------------------------
package neural.function;

import java.util.Random;

import common.MathUtils;
import common.exception.ValueException;

public class Correlation {
    String name = "correlation";
    double[] gold;
    int length;
    double aveY;   // mean of the gold scores
    double aveY2;  // mean of the squared gold scores

    public Correlation(double[] gold) {
        this.gold = gold;
        precompute();
    }

    protected void precompute() {
        aveY = 0;
        aveY2 = 0;
        length = gold.length;

        for (int i = 0; i < gold.length; i++) {
            aveY += gold[i];
            aveY2 += gold[i] * gold[i];
        }
        aveY /= length;
        aveY2 /= length;
    }

    public Correlation(double[][] vectors, int[][] pairs) {
        gold = new double[pairs.length];
        for (int i = 0; i < pairs.length; i++) {
            gold[i] = MathUtils.cosine(vectors[pairs[i][0]], vectors[pairs[i][1]]);
        }
        precompute();
    }

    public double[] derivative(double[] predicted) {
        if (length != predicted.length) {
            throw new ValueException("predicted and gold must have the same length");
        }
        double[] result = new double[length];
        double aveX = 0;
        double aveX2 = 0;
        double aveXY = 0;
        for (int i = 0; i < gold.length; i++) {
            aveX += predicted[i];
            aveX2 += predicted[i] * predicted[i];
            aveXY += predicted[i] * gold[i];
        }
        aveX /= length;
        aveX2 /= length;
        aveXY /= length;
        double ave2X = aveX * aveX;
        double ave2Y = aveY * aveY;
        double covXY = aveXY - (aveX * aveY);
        double covX = aveX2 - ave2X;  // variance of the predictions
        double covY = aveY2 - ave2Y;  // variance of the gold scores
        double sCovX = Math.sqrt(covX);
        double sCovY = Math.sqrt(covY);

        double correlation = covXY / (sCovX * sCovY);
        // gradient of the Pearson correlation w.r.t. each prediction
        for (int i = 0; i < length; i++) {
            result[i] = 1 / (covX * sCovY);
            result[i] *= (((gold[i] - aveY) * sCovX)
                    - ((covXY / sCovY) * (predicted[i] - aveX))) / length;
        }

        System.out.println(name + ": " + correlation);
        return result;
    }

    public double[][] derivative(double[][] vectors, int[][] pairs) {
        int vocabSize = vectors.length;
        int vectorSize = vectors[0].length;
        double[][] result = new double[vocabSize][vectorSize];
        double[] cosines = new double[gold.length];
        for (int i = 0; i < pairs.length; i++) {
            cosines[i] = MathUtils.cosine(vectors[pairs[i][0]], vectors[pairs[i][1]]);
        }
        // chain rule: d corr / d vector = (d corr / d cosine) * (d cosine / d vector)
        double[] cosDerivative = derivative(cosines);
        for (int i = 0; i < pairs.length; i++) {
            int index1 = pairs[i][0];
            int index2 = pairs[i][1];
            // TODO: optimize here
            double[] deltaX1 = MathUtils.cosineDerivative(vectors[index1], vectors[index2]);
            double[] deltaX2 = MathUtils.cosineDerivative(vectors[index2], vectors[index1]);
            for (int j = 0; j < vectorSize; j++) {
                result[index1][j] += cosDerivative[i] * deltaX1[j];
                result[index2][j] += cosDerivative[i] * deltaX2[j];
            }
        }
        return result;
    }

    public void setName(String name) {
        this.name = name;
    }

    // sanity check: gradient ascent on the correlation should drive
    // 'predicted' towards the gold scores
    public static void testPearsonDerivative() {
        Random random = new Random();
        int arrayLength = 1000;
        double[] gold = new double[arrayLength];
        double[] predicted = new double[arrayLength];
        for (int i = 0; i < arrayLength; i++) {
            gold[i] = random.nextDouble();
            predicted[i] = random.nextDouble();
        }
        double alpha = 1;
        int iteration = 1000;
        Correlation cor = new Correlation(gold);
        for (int i = 0; i < iteration; i++) {
            double[] derivative = cor.derivative(predicted);
            for (int j = 0; j < derivative.length; j++) {
                predicted[j] += alpha * derivative[j];
            }
        }
    }
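    // Added note: the loop in derivative(double[]) implements the analytic
    // gradient of the Pearson correlation
    //     r = cov(X, Y) / (sigma_X * sigma_Y),
    //     dr/dx_i = (1/n) * ((y_i - aveY) / (sigma_X * sigma_Y)
    //                        - r * (x_i - aveX) / sigma_X^2),
    // which equals result[i] after factoring out 1 / (covX * sCovY).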
    // sanity check for the pair-based gradient: ascent should align the
    // pairwise cosines with the gold scores
    public static void testPearsonCosDerivative() {
        Random random = new Random();
        int arrayLength = 3000;
        double[] gold = new double[arrayLength];
        for (int i = 0; i < arrayLength; i++) {
            gold[i] = random.nextDouble();
        }
        Correlation cor = new Correlation(gold);
        int vectorNum = 1000;
        int[][] pairs = new int[arrayLength][2];
        int index = 0;
        while (index < arrayLength) {
            int i = random.nextInt(vectorNum);
            int j = random.nextInt(vectorNum);
            if (i == j) continue;
            pairs[index][0] = i;
            pairs[index][1] = j;
            index++;
        }
        int vectorSize = 100;
        double[][] vectors = new double[vectorNum][vectorSize];
        for (int i = 0; i < vectorNum; i++) {
            for (int j = 0; j < vectorSize; j++) {
                vectors[i][j] = random.nextDouble();
            }
        }

        int iteration = 10000;
        double alpha = 1;
        for (int iter = 0; iter < iteration; iter++) {
            double[][] delta = cor.derivative(vectors, pairs);
            for (int i = 0; i < vectorNum; i++) {
                for (int j = 0; j < vectorSize; j++) {
                    vectors[i][j] += alpha * delta[i][j];
                }
            }
        }
    }

    public static void main(String[] args) {
        // testPearsonDerivative();
        testPearsonCosDerivative();
    }
}
--------------------------------------------------------------------------------
/src/neural/function/Sigmoid.java:
--------------------------------------------------------------------------------
package neural.function;

import common.SigmoidTable;

public class Sigmoid implements ActivationFunction {
    public static final SigmoidTable sigmoidTable = new SigmoidTable();

    @Override
    public double activation(double x) {
        return sigmoidTable.getSigmoid(x);
    }

    @Override
    public double derivative(double x) {
        // sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))
        double sigmoid = sigmoidTable.getSigmoid(x);
        return sigmoid * (1 - sigmoid);
    }

    @Override
    public String getName() {
        return "sigmoid";
    }
}
--------------------------------------------------------------------------------
/src/neural/function/Tanh.java:
--------------------------------------------------------------------------------
package neural.function;

import common.TanhTable;

public class Tanh implements ActivationFunction {
    public static final TanhTable tanhTable = new TanhTable();

    @Override
    public double activation(double x) {
        return tanhTable.getTanh(x);
    }

    @Override
    public double derivative(double x) {
        // tanh'(x) = 1 - tanh(x)^2
        double tanh = tanhTable.getTanh(x);
        return 1 - (tanh * tanh);
    }

    @Override
    public String getName() {
        return "tanh";
    }
}
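SigmoidTable and TanhTable (under src/common, not shown in this part of the dump) are assumed to be precomputed lookup tables in the style of the original word2vec. A self-contained sketch of that trick; the class name, constants, and bounds below are illustrative, not the repository's:

    class LookupSigmoid {
        static final int TABLE_SIZE = 1000;
        static final double MAX_X = 6.0;
        static final double[] TABLE = new double[TABLE_SIZE];
        static {
            // precompute sigmoid on [-MAX_X, MAX_X] once
            for (int i = 0; i < TABLE_SIZE; i++) {
                double x = (2.0 * i / TABLE_SIZE - 1.0) * MAX_X;
                TABLE[i] = 1.0 / (1.0 + Math.exp(-x));
            }
        }
        static double sigmoid(double x) {
            // answer queries with a table lookup instead of Math.exp()
            if (x >= MAX_X) return 1.0;
            if (x <= -MAX_X) return 0.0;
            int i = (int) ((x + MAX_X) / (2.0 * MAX_X) * TABLE_SIZE);
            return TABLE[Math.min(i, TABLE_SIZE - 1)];
        }
    }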
--------------------------------------------------------------------------------
/src/space/AbstractSemanticSpace.java:
--------------------------------------------------------------------------------
package space;

import org.ejml.simple.SimpleMatrix;

import common.SimpleMatrixUtils;
import common.exception.OutOfVocabularyException;

public abstract class AbstractSemanticSpace implements SemanticSpace {

    @Override
    public double getSim(String word1, String word2) {
        SimpleMatrix vector1 = getVector(word1);
        SimpleMatrix vector2 = getVector(word2);
        if (vector1 == null) {
            throw new OutOfVocabularyException(word1 + " not found");
        } else if (vector2 == null) {
            throw new OutOfVocabularyException(word2 + " not found");
        }
        return SimpleMatrixUtils.cosine(vector1, vector2);
    }
}
--------------------------------------------------------------------------------
/src/space/Neighbor.java:
--------------------------------------------------------------------------------
package space;

import java.util.Comparator;

public class Neighbor {
    public String word;
    public double sim;

    public Neighbor(String word, double sim) {
        this.word = word;
        this.sim = sim;
    }

    // sorts neighbors by descending similarity
    public static Comparator<Neighbor> NeighborComparator = new Comparator<Neighbor>() {

        @Override
        public int compare(Neighbor o1, Neighbor o2) {
            if (o1.sim > o2.sim) {
                return -1;
            } else if (o1.sim == o2.sim) {
                return 0;
            } else {
                return 1;
            }
        }
    };
}
--------------------------------------------------------------------------------
/src/space/SemanticSpace.java:
--------------------------------------------------------------------------------
package space;

import org.ejml.simple.SimpleMatrix;

public interface SemanticSpace {
    // public boolean containsWord(String word);
    public int getVectorSize();

    public SimpleMatrix getVector(String word);

    public double getSim(String word1, String word2);

    public double getDirection(String word1, String word2);

    public Neighbor[] getNeighbors(String word, int noNeighbor);

    public Neighbor[] getNeighbors(SimpleMatrix vector, int noNeighbor, String[] excludedWords);
}
--------------------------------------------------------------------------------
/src/space/Similarity.java:
--------------------------------------------------------------------------------
package space;

import org.ejml.simple.SimpleMatrix;

import common.MathUtils;
import common.SimpleMatrixUtils;

public class Similarity {
    public static double cosine(double[] v1, double[] v2) {
        return MathUtils.cosine(v1, v2);
    }

    public static double cosine(SimpleMatrix v1, SimpleMatrix v2) {
        return SimpleMatrixUtils.cosine(v1, v2);
    }

    // batch cosine of a whole matrix against a single vector
    public static SimpleMatrix massCosine(SimpleMatrix matrix, SimpleMatrix vector) {
        return SimpleMatrixUtils.massCosine(matrix, vector);
    }
}
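A tiny hedged example of the helpers (the vectors are made up):

    double[] v1 = {1.0, 0.0, 1.0};
    double[] v2 = {1.0, 1.0, 0.0};
    double sim = Similarity.cosine(v1, v2); // 1 / (sqrt(2) * sqrt(2)) = 0.5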
--------------------------------------------------------------------------------
/src/vocab/HuffmanTree.java:
--------------------------------------------------------------------------------
package vocab;

import java.util.ArrayList;
import java.util.Collections;

import common.DataStructureUtils;

public class HuffmanTree {
    int[] binaries;
    int[] parentNodes;
    long[] counts;
    int vocabSize;

    /*
     * Create a binary Huffman tree using the word counts. inCounts must
     * already be sorted in descending order.
     */
    public HuffmanTree(long[] inCounts) {

        vocabSize = inCounts.length;

        /*
         * counts: the count of each node in the tree
         * binaries: the code bit going from the parent to the current node
         * parentNodes: the direct parent of each node
         *
         * These arrays are split into 2 groups:
         *   leaf nodes: 0 .. vocabSize - 1 (descending counts)
         *   internal nodes: vocabSize .. 2 * vocabSize - 2 (ascending counts)
         */
        counts = new long[2 * vocabSize - 1];
        binaries = new int[2 * vocabSize - 1];
        parentNodes = new int[2 * vocabSize - 1];

        // initialize leaf counts; internal nodes start at a sentinel,
        // effectively infinite, count
        for (int i = 0; i < vocabSize; i++) {
            counts[i] = inCounts[i];
        }
        for (int i = vocabSize; i < vocabSize * 2 - 1; i++) {
            counts[i] = (long) 1e15;
        }

        int pos1 = vocabSize - 1; // traverses the leaf node indices
        int pos2 = vocabSize;     // traverses the internal node indices

        /*
         * The following algorithm constructs the Huffman tree by creating
         * one internal node at a time.
         */
        int min1i, min2i;
        for (int i = 0; i < vocabSize - 1; i++) {

            // First, find the node with the smallest count, 'min1i'
            if (pos1 >= 0) {
                if (counts[pos1] < counts[pos2]) {
                    min1i = pos1;
                    pos1--;
                } else {
                    min1i = pos2;
                    pos2++;
                }
            } else {
                min1i = pos2;
                pos2++;
            }
            // Then, find the node with the next smallest count, 'min2i'
            if (pos1 >= 0) {
                if (counts[pos1] < counts[pos2]) {
                    min2i = pos1;
                    pos1--;
                } else {
                    min2i = pos2;
                    pos2++;
                }
            } else {
                min2i = pos2;
                pos2++;
            }

            // sum the counts and create a new node with the sum as its count
            counts[vocabSize + i] = counts[min1i] + counts[min2i];
            // update the code and parent information
            parentNodes[min1i] = vocabSize + i;
            parentNodes[min2i] = vocabSize + i;
            binaries[min1i] = 0; // the default in Java anyway
            binaries[min2i] = 1;
        }
    }

    /*
     * Retrieve the Huffman code of the index-th input entry (i.e. word
     * in a vocab).
     */
    public String getCode(int index) {
        int parentIndex = index;

        StringBuffer code = new StringBuffer();
        // traverse from the node to the root to collect the reversed code,
        // then reverse and return it
        while (true) {
            code.append(binaries[parentIndex]);
            parentIndex = parentNodes[parentIndex];
            if (parentIndex > vocabSize * 2 - 2) {
                // should never happen
                System.out.println(parentIndex);
            }
            if (parentIndex == vocabSize * 2 - 2) {
                break;
            }
        }
        return new StringBuilder(code.toString()).reverse().toString();
    }

    /*
     * Retrieve the ancestors of the index-th input entry in the Huffman tree.
     */
    public int[] getParentIndices(int index) {
        int currentIndex = index;
        ArrayList<Integer> parentIndices = new ArrayList<Integer>();

        /*
         * Traverse from the node to the root to collect the reversed list of
         * parent indices in the internal node list (the original indices
         * shifted down by vocabSize); reverse the list, turn it into an
         * array and return it.
         */
        while (true) {
            int parentIndex = parentNodes[currentIndex];
            parentIndices.add(parentIndex - vocabSize);
            currentIndex = parentIndex;
            if (parentIndex == vocabSize * 2 - 2) {
                break;
            }
        }
        Collections.reverse(parentIndices);
        return DataStructureUtils.intListToArray(parentIndices);
    }
}
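A small hedged example of the tree in action (the counts are invented, sorted descending as the constructor requires):

    HuffmanTree tree = new HuffmanTree(new long[] {40, 30, 20, 10});
    for (int i = 0; i < 4; i++) {
        System.out.println(i + " -> " + tree.getCode(i));
    }
    // More frequent words get shorter codes: index 0 gets the 1-bit
    // code "0", while index 3 gets the 3-bit code "100".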
--------------------------------------------------------------------------------
/src/vocab/VocabEntry.java:
--------------------------------------------------------------------------------
package vocab;

import java.util.Comparator;

public class VocabEntry {
    // frequency of the word in the training file
    public long frequency;

    // the ancestors' indices in the Huffman tree
    public int[] ancestors;

    // the surface string
    public String word;

    // the Huffman code
    public String code;

    public VocabEntry() {
        word = "";
        frequency = 0;
    }

    public VocabEntry(String word, int frequency) {
        this.word = word;
        this.frequency = frequency;
    }

    // sorts entries by ascending frequency
    public static Comparator<VocabEntry> VocabEntryFrequencyComparator = new Comparator<VocabEntry>() {

        @Override
        public int compare(VocabEntry o1, VocabEntry o2) {
            if (o1.frequency > o2.frequency) {
                return 1;
            } else if (o1.frequency < o2.frequency) {
                return -1;
            } else {
                return 0;
            }
        }
    };
}
--------------------------------------------------------------------------------
/src/vocab/VocabEntryFilter.java:
--------------------------------------------------------------------------------
package vocab;

public interface VocabEntryFilter {
    public boolean isFiltered(VocabEntry entry);
}
--------------------------------------------------------------------------------
/src/vocab/filter/MinFrequencyVocabFilter.java:
--------------------------------------------------------------------------------
package vocab.filter;

import vocab.VocabEntry;
import vocab.VocabEntryFilter;

public class MinFrequencyVocabFilter implements VocabEntryFilter {
    protected int minFrequency;

    public MinFrequencyVocabFilter(int minFrequency) {
        this.minFrequency = minFrequency;
    }

    @Override
    public boolean isFiltered(VocabEntry entry) {
        // return true, i.e. filter the word out, if it occurs fewer
        // than minFrequency times
        return entry.frequency < minFrequency;
    }
}
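The filter interface is deliberately minimal; a hedged sketch of a hypothetical stopword filter next to the frequency filter above (illustrative only, not repository code):

    VocabEntryFilter rare = new MinFrequencyVocabFilter(5);
    VocabEntryFilter stop = new VocabEntryFilter() {
        @Override
        public boolean isFiltered(VocabEntry entry) {
            return entry.word.equals("the") || entry.word.equals("of");
        }
    };
    // a vocabulary builder would drop entries for which either filter fires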
System.out.println("first word:" + vocab.getEntry(0).word); 46 | System.out.println("last word:" + vocab.getEntry(vocab.getVocabSize() - 1).word); 47 | 48 | //The number of sentences in the corpus 49 | TrainingThread[] threads = new TrainingThread[inputStreams.size()]; 50 | for (int i = 0; i < inputStreams.size(); i++) { 51 | SentenceInputStream inputStream = inputStreams.get(i); 52 | if (subSample > 0) { 53 | inputStream = new SubSamplingSentenceInputStream(inputStream, subSample); 54 | } 55 | threads[i] = new TrainingThread(inputStream); 56 | threads[i].start(); 57 | } 58 | try { 59 | for (TrainingThread thread: threads) { 60 | thread.join(); 61 | } 62 | } catch (InterruptedException e) { 63 | // TODO Auto-generated catch block 64 | e.printStackTrace(); 65 | } 66 | 67 | System.out.println("total word count: " + wordCount); 68 | } 69 | 70 | protected void trainModelThread(SentenceInputStream inputStream) { 71 | long oldWordCount = 0; 72 | try { 73 | while (true) { 74 | 75 | // read the whole sentence, 76 | // the output would be the list of the word's indices in the 77 | // dictionary 78 | boolean hasNextSentence = inputStream.readNextSentence(vocab); 79 | if (!hasNextSentence) break; 80 | int[] sentence = inputStream.getCurrentSentence(); 81 | // if end of file, finish 82 | if (sentence.length == 0) { 83 | continue; 84 | // if (!hasNextSentence) 85 | // break; 86 | } 87 | 88 | // check word count 89 | // update alpha 90 | long newSentenceWordCount = inputStream.getWordCount() - oldWordCount; 91 | oldWordCount = inputStream.getWordCount(); 92 | 93 | synchronized (this) { 94 | wordCount = wordCount + newSentenceWordCount; 95 | if (wordCount - lastWordCount >= 10000) { 96 | lastWordCount = wordCount; 97 | iteration++; 98 | // update alpha 99 | // what about thread safe??? 
                        alpha = starting_alpha
                                * (1 - (double) wordCount / (trainWords + 1));
                        if (alpha < starting_alpha * 0.0001) {
                            alpha = starting_alpha * 0.0001;
                        }
                        if (iteration % 10 == 0) {
                            System.out.println("Trained: " + wordCount
                                    + " words, training rate: " + alpha);
                        }
                    }
                }

                trainSentence(sentence);
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    public void printStatistics() {
    }

    public abstract void trainSentence(int[] sentence);

    protected class TrainingThread extends Thread {
        SentenceInputStream inputStream; // feeds the sentences of the corpus

        public TrainingThread(SentenceInputStream inputStream) {
            this.inputStream = inputStream;
        }

        public void run() {
            trainModelThread(inputStream);
        }
    }
}
--------------------------------------------------------------------------------
/src/word2vec/UniGram.java:
--------------------------------------------------------------------------------
package word2vec;

import java.util.Random;

import vocab.Vocab;

public class UniGram {
    public static final int DEFAULT_TABLE_SIZE = 100000000;

    protected int randomTableSize;
    protected int[] randomTable;
    private Random random;

    public UniGram(Vocab vocab, int tableSize) {
        this.randomTableSize = tableSize;
        initUnigramTable(vocab);
        random = new Random();
    }

    public UniGram(Vocab vocab) {
        this(vocab, DEFAULT_TABLE_SIZE);
    }

    /**
     * Create a unigram table to randomly generate a word. The probability of
     * generating a word is proportional to its frequency^(3/4).
     */
    protected void initUnigramTable(Vocab vocab) {
        long trainWordsPow = 0;
        double sumPow;
        double power = 0.75;
        int vocabSize = vocab.getVocabSize();
        randomTable = new int[randomTableSize];

        // trainWordsPow = sum over the vocabulary of frequency^(3/4)
        for (int i = 0; i < vocabSize; i++) {
            trainWordsPow += Math.pow(vocab.getEntry(i).frequency, power);
        }
        int index = 0;
        sumPow = Math.pow(vocab.getEntry(index).frequency, power)
                / trainWordsPow;

        // fill up the unigram table with words from the vocabulary;
        // the number of cells a word occupies is proportional to its
        // frequency^(3/4)
        for (int i = 0; i < randomTableSize; i++) {
            randomTable[i] = index;
            if (i / (double) randomTableSize > sumPow) {
                index++;
                if (index < vocabSize) {
                    sumPow += Math.pow(vocab.getEntry(index).frequency, power)
                            / trainWordsPow;
                } else {
                    // rounding overran the vocabulary; clamped below
                    System.out.println("unigram table: index ran past the vocabulary");
                }
            }
            if (index >= vocabSize)
                index = vocabSize - 1;
        }
    }

    public int randomWordIndex() {
        int randomInt = random.nextInt(randomTableSize);
        return randomTable[randomInt];
    }
}
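A hedged sketch of drawing negative samples from the table (the Vocab instance is assumed to be built elsewhere by the training pipeline):

    UniGram uniGram = new UniGram(vocab); // vocab: a populated vocab.Vocab
    for (int k = 0; k < 5; k++) {
        int negIndex = uniGram.randomWordIndex();
        // frequent words are drawn more often, smoothed by the 3/4 power
    }

--------------------------------------------------------------------------------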