├── .gitignore ├── LICENSE ├── README.md ├── data_preparation.ipynb ├── demo_input ├── negative_samples_small.tsv ├── negative_with_entities_small.tsv ├── positive_samples_small.tsv └── positive_with_entities_small.tsv ├── demo_output ├── background_samples.txt ├── temp_doc2vec.txt └── top_scoring_positive_class.txt ├── features_classification.ipynb ├── packages ├── data_preparation_tools.py ├── features_generation_tools.py ├── mirna_detector │ ├── LICENSE │ ├── README.md │ ├── __init__.py │ ├── data │ │ └── mirna.txt │ ├── test.py │ └── tests │ │ └── sentences.txt └── model_tools.py └── webservice ├── app.py ├── scorer.py └── scorer_ws.pyproj /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | # private data 65 | *private.* 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Catalyst Code 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # corpus-to-graph-ml 2 | This repository contains the machine learning work for the corpus-to-graph project, including Jupyter research notebooks and a Flask webservice to host the trained model. 3 | 4 | The packages folder contains Python code with the main logic for the data transformation and feature generation tools. 5 | 6 | The webservice folder contains an example of a Flask-based scoring service that can be used to expose the trained model. 7 | 8 | The data_preparation notebook contains an example of running the data transformation pipeline, and the features_classification notebook contains code examples for generating different features and for training and evaluating different classifiers. 9 | 10 | The only missing piece that should be provided is an entity recognition endpoint (here we used [GNAT](http://gnat.sourceforge.net/)). Alternatively, you can provide a text file with the results of the entity recognition process. 11 | 12 | # Dependencies: 13 | 14 | We highly recommend using the [Anaconda](https://www.continuum.io/downloads) distribution (or any similar distribution) to make your life easier, as it comes with most of the packages we use in these notebooks. 15 | 16 | In our notebooks, we use the following libraries: 17 | 18 | - [scikit-learn](http://scikit-learn.org/stable/install.html) (comes pre-installed with Anaconda) 19 | - [NLTK](http://www.nltk.org/install.html) (make sure to install the NLTK stopwords, lemmatization, and stemming packages by [calling nltk.download() manually](http://www.nltk.org/data.html)) 20 | - [gensim](https://radimrehurek.com/gensim/install.html) (make sure that you have *cython* installed beforehand in order to run the optimized version of the code) 21 | - [spacy.io](https://spacy.io/docs/#getting-started) - for spaCy, make sure you have the English model installed 22 | - [requests](https://pypi.python.org/pypi/requests) 23 | -------------------------------------------------------------------------------- /data_preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "scrolled": true 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "producing inputs for binary case...\n", 16 | "before data split\n", 17 | "splitted data\n", 18 | "running step:entities\n", 19 | "running step:trim\n", 20 | "running step:normalize\n", 21 | "running step:rmdigits\n", 22 | "Finished producing files, took:0:00:00.156000\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "import sys\n", 28 | "sys.path.append(\"./packages\")\n", 29 | "\n", 30 | "import data_preparation_tools as dpt\n", 31 | "reload(dpt)\n", 32 | "import datetime\n", 33 | "from datetime import date\n", 34 | "\n", 35 | "# uncomment when downloading nltk data for the first time\n", 36 | "#import nltk\n", 37 | "#nltk.download()\n", 38 | "\n", 39 | "# an optional optimization:\n", 40 | "# import preprocessed text with entity recognition data\n", 41 | "# in order to prevent the server from going to GNAT each time\n", 42 | "# the input files are tab-separated files with the first\n", 43 | "# column containing the sentence and the remaining columns containing \n", 44 | "# the recognized entities\n",
45 | "texts_with_entities_file_paths = [r\"./demo_input/positive_with_entities_small.tsv\",\n", 46 | " r\"./demo_input/negative_with_entities_small.tsv\"]\n", 47 | "\n", 48 | "for p in texts_with_entities_file_paths:\n", 49 | " dpt.import_to_texts_entities_dictonary_from_file(p)\n", 50 | "today = date.today()\n", 51 | "prefix = \"%s_%s_%s\"%(today.month,today.day,today.year)\n", 52 | "\n", 53 | "# fill out these paths\n", 54 | "\n", 55 | "output_dir = r\"./demo_output\"\n", 56 | "positive_samples_path = r\"./demo_input/positive_samples_small.tsv\"\n", 57 | "negative_samples_path = r\"./demo_input/negative_samples_small.tsv\"\n", 58 | "\n", 59 | "start_time = datetime.datetime.now()\n", 60 | "\n", 61 | "# this pipeline will produce files with all possible outputs\n", 62 | "dpt.run_data_preparation_pipeline(positive_samples_path,negative_samples_path, prefix, output_dir, run_multiclass=False)\n", 63 | "\n", 64 | "run_time = datetime.datetime.now() - start_time\n", 65 | "print \"Finished producing files, took:%s\"%run_time" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 2", 81 | "language": "python", 82 | "name": "python2" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 2 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython2", 94 | "version": "2.7.11" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 0 99 | } 100 | -------------------------------------------------------------------------------- /demo_input/negative_samples_small.tsv: -------------------------------------------------------------------------------- 1 | 406953 MESH:D012021 11693395 mir18a \N \N \N The influence of microgravity on voluntary and electrically evoked contractions of the triceps surae (TS) of seven crewmembers was studied before and after spaceflights (space MIR flights: MIR-18, -22, -25, -26, and -27). The influence of microgravity on voluntary and electrically evoked contractions of the triceps surae ( TS ) of seven crewmembers was studied before and after spaceflights ( space MIR flights : MIR-18 , -22 , -25 , -26 , and -27 ) . The influence microgravity voluntary electrically evoked contractions triceps surae TS seven crewmembers studied spaceflights space MIR flights MIR-18 -22 -25 -26 -27 MIR-18, -22, -25, -26, and -27 triceps surae 2 | 574502 9606 12749511 mir500a \N \N \N Eligible patients were to have a glycated hemoglobin (HbA1c) value < or = 8.5% and mean fasting plasma glucose (FPG) concentrations < or = 200 mg/dL while receiving MIR 500 mg BID for at least 8 weeks. Eligible patients were to have a glycated hemoglobin ( HbA1c ) value < or = 8.5 % and mean fasting plasma glucose ( FPG ) concentrations < or = 200 mg/dL while receiving MIR 500 mg BID for at least 8 weeks . Eligible patients glycated hemoglobin HbA1c value 8.5 mean fasting plasma glucose FPG concentrations 200 mg/dL receiving MIR 500 mg BID least 8 weeks MIR 500 patient 3 | 574502 9606 12749511 mir500a \N \N \N After a 2-week, single-blind lead-in period, patients were randomly assigned to receive MXR 1000 or 1500 mg QD for 24 weeks or to continue MIR 500 mg BID for 24 weeks. 
After a 2-week , single-blind lead-in period , patients were randomly assigned to receive MXR 1000 or 1500 mg QD for 24 weeks or to continue MIR 500 mg BID for 24 weeks . After 2-week single-blind lead-in period patients randomly assigned receive MXR 1000 1500 mg QD 24 weeks continue MIR 500 mg BID 24 weeks MIR 500 patient 4 | 494324 CHEBI:29108 15538371 mir375 \N \N \N The mechanism by which secretion is modified by miR-375 is independent of changes in glucose metabolism or intracellular Ca2+-signalling but correlated with a direct effect on insulin exocytosis. The mechanism by which secretion is modified by miR-375 is independent of changes in glucose metabolism or intracellular Ca2+-signalling but correlated with a direct effect on insulin exocytosis . The mechanism secretion modified miR-375 independent changes glucose metabolism intracellular Ca2+-signalling correlated direct effect insulin exocytosis miR-375 Ca2+ 5 | 494324 MESH:D005947 15538371 mir375 \N \N \N The mechanism by which secretion is modified by miR-375 is independent of changes in glucose metabolism or intracellular Ca2+-signalling but correlated with a direct effect on insulin exocytosis. The mechanism by which secretion is modified by miR-375 is independent of changes in glucose metabolism or intracellular Ca2+-signalling but correlated with a direct effect on insulin exocytosis . The mechanism secretion modified miR-375 independent changes glucose metabolism intracellular Ca2+-signalling correlated direct effect insulin exocytosis miR-375 glucose 6 | 406934 MESH:D007938 15737576 mir142 \N \N \N Some human miRNAs are linked to leukemias: the miR-15a/miR-16 locus is frequently deleted or down-regulated in patients with B-cell chronic lymphocytic leukemia and miR-142 is at a translocation site found in a case of aggressive B-cell leukemia. Some human miRNAs are linked to leukemias : the miR-15a/miR-16 locus is frequently deleted or down-regulated in patients with B-cell chronic lymphocytic leukemia and miR-142 is at a translocation site found in a case of aggressive B-cell leukemia . Some human miRNAs linked leukemias miR-15a/miR-16 locus frequently deleted down-regulated patients B-cell chronic lymphocytic leukemia miR-142 translocation site found case aggressive B-cell leukemia miR-142 leukemias 7 | 406948 9606 15737576 mir15a \N \N \N Some human miRNAs are linked to leukemias: the miR-15a/miR-16 locus is frequently deleted or down-regulated in patients with B-cell chronic lymphocytic leukemia and miR-142 is at a translocation site found in a case of aggressive B-cell leukemia. Some human miRNAs are linked to leukemias : the miR-15a/miR-16 locus is frequently deleted or down-regulated in patients with B-cell chronic lymphocytic leukemia and miR-142 is at a translocation site found in a case of aggressive B-cell leukemia . Some human miRNAs linked leukemias miR-15a/miR-16 locus frequently deleted down-regulated patients B-cell chronic lymphocytic leukemia miR-142 translocation site found case aggressive B-cell leukemia miR-15a patients 8 | 406906 9606 15901636 mir122 \N \N \N We propose that specific microRNAs, such as Mirn122a, could be involved in the posttranscriptional regulation of mRNAs such as Tnp2 in the mammalian testis. We propose that specific microRNAs , such as Mirn122a , could be involved in the posttranscriptional regulation of mRNAs such as Tnp2 in the mammalian testis . 
We propose specific microRNAs Mirn122a could involved posttranscriptional regulation mRNAs Tnp2 mammalian testis Mirn122a mammalian 9 | 406952 MESH:D016393 15944707 mir17 \N \N \N Enforced expression of the mir-17-92 cluster acted with c-myc expression to accelerate tumour development in a mouse B-cell lymphoma model. Enforced expression of the mir-17-92 cluster acted with c-myc expression to accelerate tumour development in a mouse B-cell lymphoma model . Enforced expression mir-17-92 cluster acted c-myc expression accelerate tumour development mouse B-cell lymphoma model mir-17 B-cell lymphoma 10 | 406952 MESH:D008223 15944707 mir17 \N \N \N Tumours derived from haematopoietic stem cells expressing a subset of the mir-17-92 cluster and c-myc could be distinguished by an absence of apoptosis that was otherwise prevalent in c-myc-induced lymphomas. Tumours derived from haematopoietic stem cells expressing a subset of the mir-17-92 cluster and c-myc could be distinguished by an absence of apoptosis that was otherwise prevalent in c-myc-induced lymphomas . Tumours derived haematopoietic stem cells expressing subset mir-17-92 cluster c-myc could distinguished absence apoptosis otherwise prevalent c-myc-induced lymphomas mir-17 lymphomas 11 | -------------------------------------------------------------------------------- /demo_input/negative_with_entities_small.tsv: -------------------------------------------------------------------------------- 1 | The measured Mir-18 crew skin dose equivalent rate was 1133 microSv/day . Mir-18 2 | The influence of microgravity on voluntary and electrically evoked contractions of the triceps surae ( TS ) of seven crewmembers was studied before and after spaceflights ( space MIR flights : MIR-18 , -22 , -25 , -26 , and -27 ) . TS MIR MIR-18 3 | Eligible patients were to have a glycated hemoglobin ( HbA1c ) value < or = 8.5 % and mean fasting plasma glucose ( FPG ) concentrations < or = 200 mg/dL while receiving MIR 500 mg BID for at least 8 weeks . hemoglobin MIR 500 BID mg FPG dL 4 | After a 2-week , single-blind lead-in period , patients were randomly assigned to receive MXR 1000 or 1500 mg QD for 24 weeks or to continue MIR 500 mg BID for 24 weeks . mg MXR a 2 BID MIR 500 5 | The mechanism by which secretion is modified by miR-375 is independent of changes in glucose metabolism or intracellular Ca2+-signalling but correlated with a direct effect on insulin exocytosis . miR-375 insulin Ca2 6 | Some human miRNAs are linked to leukemias : the miR-15a/miR-16 locus is frequently deleted or down-regulated in patients with B-cell chronic lymphocytic leukemia and miR-142 is at a translocation site found in a case of aggressive B-cell leukemia . miR-15a B miR-142 miR-16 7 | We propose that specific microRNAs , such as Mirn122a , could be involved in the posttranscriptional regulation of mRNAs such as Tnp2 in the mammalian testis . Tnp2 Mirn122a 8 | Enforced expression of the mir-17-92 cluster acted with c-myc expression to accelerate tumour development in a mouse B-cell lymphoma model . c-myc B mir-17-92 9 | Tumours derived from haematopoietic stem cells expressing a subset of the mir-17-92 cluster and c-myc could be distinguished by an absence of apoptosis that was otherwise prevalent in c-myc-induced lymphomas . 
c-myc mir-17-92 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /demo_input/positive_samples_small.tsv: -------------------------------------------------------------------------------- 1 | 406884 136319 15806104 mirlet7b POSITIVE DIRECT DOWN In particular, we experimentally validate common regulation of Mtpn by miR-375, miR-124 and let-7b and thus provide evidence for coordinate microRNA control in mammals. In particular , we experimentally validate common regulation of Mtpn by miR-375 , miR-124 and let-7b and thus provide evidence for coordinate microRNA control in mammals . In particular experimentally validate common regulation Mtpn miR-375 miR-124 let-7b thus provide evidence coordinate microRNA control mammals let-7b Mtpn 2 | 406884 1573 22761738 mirlet7b NEGATIVE DIRECT NA CONCLUSIONS: Our results demonstrated that the decreased expression of let-7b could lead to the high expression of CYP2J2 protein in cancerous tissues. CONCLUSIONS : Our results demonstrated that the decreased expression of let-7b could lead to the high expression of CYP2J2 protein in cancerous tissues . CONCLUSIONS Our results demonstrated decreased expression let-7b could lead high expression CYP2J2 protein cancerous tissues Let-7b CYP2J2 3 | 406884 1573 22761738 mirlet7b NEGATIVE DIRECT NA CONCLUSIONS: Our results demonstrated that the decreased expression of let-7b could lead to the high expression of CYP2J2 protein in cancerous tissues. CONCLUSIONS : Our results demonstrated that the decreased expression of let-7b could lead to the high expression of CYP2J2 protein in cancerous tissues . CONCLUSIONS Our results demonstrated decreased expression let-7b could lead high expression CYP2J2 protein cancerous tissues let-7b CYP2J2 4 | 406884 1573 22761738 mirlet7b NEGATIVE DIRECT NA Furthermore, let-7b may diminish cell proliferation and promote cell apoptosis of tumor cells via posttranscriptional repression of CYP2J2. Furthermore , let-7b may diminish cell proliferation and promote cell apoptosis of tumor cells via posttranscriptional repression of CYP2J2 . Furthermore let-7b may diminish cell proliferation promote cell apoptosis tumor cells via posttranscriptional repression CYP2J2 Let-7b CYP2J2 5 | 406884 1573 22761738 mirlet7b NEGATIVE DIRECT NA Furthermore, let-7b may diminish cell proliferation and promote cell apoptosis of tumor cells via posttranscriptional repression of CYP2J2. Furthermore , let-7b may diminish cell proliferation and promote cell apoptosis of tumor cells via posttranscriptional repression of CYP2J2 . Furthermore let-7b may diminish cell proliferation promote cell apoptosis tumor cells via posttranscriptional repression CYP2J2 let-7b CYP2J2 6 | 406884 1573 22761738 mirlet7b NEGATIVE DIRECT NA In addition, let-7b decreased the enzymatic activity of endogenous CYP2J2. In addition , let-7b decreased the enzymatic activity of endogenous CYP2J2 . In addition let-7b decreased enzymatic activity endogenous CYP2J2 Let-7b CYP2J2 7 | 406892 10217 24785011 mir100 NEGATIVE INDIRECT NA CONCLUSION: I treatment inhibited the expression of miR-100, which modulated RBSP3 in FTC cells. CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . CONCLUSION I treatment inhibited expression miR-100 modulated RBSP3 FTC cells miR-100 RBSP3 8 | 406892 10217 24785011 mir100 NEGATIVE INDIRECT NA CONCLUSION: I treatment inhibited the expression of miR-100, which modulated RBSP3 in FTC cells. 
CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . CONCLUSION I treatment inhibited expression miR-100 modulated RBSP3 FTC cells miR-100 RBSP3 9 | 406892 10217 24785011 mir100 NEGATIVE INDIRECT NA CONCLUSION: I treatment inhibited the expression of miR-100, which modulated RBSP3 in FTC cells. CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . CONCLUSION I treatment inhibited expression miR-100 modulated RBSP3 FTC cells miR-100 RBSP3 10 | 406892 2261 25344675 mir100 POSITIVE INDIRECT DOWN FGFR3 was significantly downregulated by overexpressing miR-100 in pancreatic cancer cells and knocking down FGFR3 by siRNA exerted similar effect as miR-100. FGFR3 was significantly downregulated by overexpressing miR-100 in pancreatic cancer cells and knocking down FGFR3 by siRNA exerted similar effect as miR-100 . FGFR3 significantly downregulated overexpressing miR-100 pancreatic cancer cells knocking FGFR3 siRNA exerted similar effect miR-100 miR-100 FGFR3 11 | 406892 2261 25344675 mir100 POSITIVE INDIRECT DOWN Luciferase essay showed FGFR3 was direct target of miR-100. Luciferase essay showed FGFR3 was direct target of miR-100 . Luciferase essay showed FGFR3 direct target miR-100 miR-100 FGFR3 12 | 406892 10217 24785011 mir100 NEGATIVE INDIRECT NA CONCLUSION: I treatment inhibited the expression of miR-100, which modulated RBSP3 in FTC cells. CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . CONCLUSION I treatment inhibited expression miR-100 modulated RBSP3 FTC cells miR-100 RBSP3 13 | 406892 10217 24785011 mir100 NEGATIVE INDIRECT NA CONCLUSION: I treatment inhibited the expression of miR-100, which modulated RBSP3 in FTC cells. CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . CONCLUSION I treatment inhibited expression miR-100 modulated RBSP3 FTC cells miR-100 RBSP3 14 | 406892 2261 25344675 mir100 POSITIVE INDIRECT DOWN Our study demonstrated that miR-100 played an important role in pancreatic cancer development, possibly through targeting FGFR3. Our study demonstrated that miR-100 played an important role in pancreatic cancer development , possibly through targeting FGFR3 . Our study demonstrated miR-100 played important role pancreatic cancer development possibly targeting FGFR3 miR-100 FGFR3 15 | 406892 2261 25344675 mir100 POSITIVE INDIRECT DOWN The predicted target of miR-100, fibroblast growth factor receptor 3 (FGFR3), was downregulated by siRNA to examine its effect on pancreatic cancer cells. The predicted target of miR-100 , fibroblast growth factor receptor 3 ( FGFR3 ) , was downregulated by siRNA to examine its effect on pancreatic cancer cells . The predicted target miR-100 fibroblast growth factor receptor 3 FGFR3 downregulated siRNA examine effect pancreatic cancer cells miR-100 FGFR3 16 | 406892 2261 25344675 mir100 POSITIVE INDIRECT DOWN The predicted target of miR-100, fibroblast growth factor receptor 3 (FGFR3), was downregulated by siRNA to examine its effect on pancreatic cancer cells. The predicted target of miR-100 , fibroblast growth factor receptor 3 ( FGFR3 ) , was downregulated by siRNA to examine its effect on pancreatic cancer cells . 
The predicted target miR-100 fibroblast growth factor receptor 3 FGFR3 downregulated siRNA examine effect pancreatic cancer cells miR-100 fibroblast growth factor receptor 3 17 | 406892 2261 25493074 mir100 NEGATIVE INDIRECT NA Our aim is to analyze the role of miR-100 in bladder cancer cell lines in controlling the expression of some of its possible target genes, including FGFR3 and its relationship with proliferation, apoptosis and DNA ploidy. Our aim is to analyze the role of miR-100 in bladder cancer cell lines in controlling the expression of some of its possible target genes , including FGFR3 and its relationship with proliferation , apoptosis and DNA ploidy . Our aim analyze role miR-100 bladder cancer cell lines controlling expression possible target genes including FGFR3 relationship proliferation apoptosis DNA ploidy miR-100 FGFR3 18 | 406892 2261 25493074 mir100 NEGATIVE INDIRECT NA Recently, lowered expression of miR-100, resulting in upregulation of FGFR3, has been correlated with low-grade, non-invasive bladder urothelial cancer, as an alternative oncogenesis pathway to the typical FGFR3 gene mutation. Recently , lowered expression of miR-100 , resulting in upregulation of FGFR3 , has been correlated with low-grade , non-invasive bladder urothelial cancer , as an alternative oncogenesis pathway to the typical FGFR3 gene mutation . Recently lowered expression miR-100 resulting upregulation FGFR3 correlated low-grade non-invasive bladder urothelial cancer alternative oncogenesis pathway typical FGFR3 gene mutation miR-100 FGFR3 19 | 406892 2261 25493074 mir100 POSITIVE INDIRECT DOWN Our aim is to analyze the role of miR-100 in bladder cancer cell lines in controlling the expression of some of its possible target genes, including FGFR3 and its relationship with proliferation, apoptosis and DNA ploidy. Our aim is to analyze the role of miR-100 in bladder cancer cell lines in controlling the expression of some of its possible target genes , including FGFR3 and its relationship with proliferation , apoptosis and DNA ploidy . Our aim analyze role miR-100 bladder cancer cell lines controlling expression possible target genes including FGFR3 relationship proliferation apoptosis DNA ploidy miR-100 FGFR3 20 | 406892 2261 25493074 mir100 POSITIVE INDIRECT DOWN Recently, lowered expression of miR-100, resulting in upregulation of FGFR3, has been correlated with low-grade, non-invasive bladder urothelial cancer, as an alternative oncogenesis pathway to the typical FGFR3 gene mutation. Recently , lowered expression of miR-100 , resulting in upregulation of FGFR3 , has been correlated with low-grade , non-invasive bladder urothelial cancer , as an alternative oncogenesis pathway to the typical FGFR3 gene mutation . Recently lowered expression miR-100 resulting upregulation FGFR3 correlated low-grade non-invasive bladder urothelial cancer alternative oncogenesis pathway typical FGFR3 gene mutation miR-100 FGFR3 21 | 406892 2261 26018508 mir100 NEGATIVE INDIRECT NA Bioinformatics analysis and luciferase reporter assay suggest that miR-100 binds to the 3'UTR of FGFR3 mRNA to prevent its translation. Bioinformatics analysis and luciferase reporter assay suggest that miR-100 binds to the 3'UTR of FGFR3 mRNA to prevent its translation . 
Bioinformatics analysis luciferase reporter assay suggest miR-100 binds 3'UTR FGFR3 mRNA prevent translation miR-100 FGFR3 22 | 406892 2261 26018508 mir100 NEGATIVE INDIRECT NA Here we reported significantly higher levels of fibroblast growth factor receptor 3 (FGFR3) and significantly lower levels of miR-100 in the OS specimen, compared to those in the paired normal bone tissues. Here we reported significantly higher levels of fibroblast growth factor receptor 3 ( FGFR3 ) and significantly lower levels of miR-100 in the OS specimen , compared to those in the paired normal bone tissues . Here reported significantly higher levels fibroblast growth factor receptor 3 FGFR3 significantly lower levels miR-100 OS specimen compared paired normal bone tissues miR-100 FGFR3 23 | 406892 2261 26018508 mir100 NEGATIVE INDIRECT NA Here we reported significantly higher levels of fibroblast growth factor receptor 3 (FGFR3) and significantly lower levels of miR-100 in the OS specimen, compared to those in the paired normal bone tissues. Here we reported significantly higher levels of fibroblast growth factor receptor 3 ( FGFR3 ) and significantly lower levels of miR-100 in the OS specimen , compared to those in the paired normal bone tissues . Here reported significantly higher levels fibroblast growth factor receptor 3 FGFR3 significantly lower levels miR-100 OS specimen compared paired normal bone tissues miR-100 fibroblast growth factor receptor 3 24 | 406892 2261 26018508 mir100 NEGATIVE INDIRECT NA Taken together, our data demonstrate that miR-100 may inhibit the growth of OS through FGFR3. Taken together , our data demonstrate that miR-100 may inhibit the growth of OS through FGFR3 . Taken together data demonstrate miR-100 may inhibit growth OS FGFR3 miR-100 FGFR3 25 | 406892 2261 26018508 mir100 NEGATIVE INDIRECT NA We found that overexpression of miR-100 in OS cells decreased FGFR3 protein levels, whereas inhibition of miR-100 increased FGFR3 protein levels, without affecting FGFR3 transcripts. We found that overexpression of miR-100 in OS cells decreased FGFR3 protein levels , whereas inhibition of miR-100 increased FGFR3 protein levels , without affecting FGFR3 transcripts . We found overexpression miR-100 OS cells decreased FGFR3 protein levels whereas inhibition miR-100 increased FGFR3 protein levels without affecting FGFR3 transcripts miR-100 FGFR3 26 | 406892 2261 26018508 mir100 POSITIVE INDIRECT DOWN Bioinformatics analysis and luciferase reporter assay suggest that miR-100 binds to the 3'UTR of FGFR3 mRNA to prevent its translation. Bioinformatics analysis and luciferase reporter assay suggest that miR-100 binds to the 3'UTR of FGFR3 mRNA to prevent its translation . Bioinformatics analysis luciferase reporter assay suggest miR-100 binds 3'UTR FGFR3 mRNA prevent translation miR-100 FGFR3 27 | 406892 2261 26018508 mir100 POSITIVE INDIRECT DOWN Here we reported significantly higher levels of fibroblast growth factor receptor 3 (FGFR3) and significantly lower levels of miR-100 in the OS specimen, compared to those in the paired normal bone tissues. Here we reported significantly higher levels of fibroblast growth factor receptor 3 ( FGFR3 ) and significantly lower levels of miR-100 in the OS specimen , compared to those in the paired normal bone tissues . 
Here reported significantly higher levels fibroblast growth factor receptor 3 FGFR3 significantly lower levels miR-100 OS specimen compared paired normal bone tissues miR-100 FGFR3 -------------------------------------------------------------------------------- /demo_input/positive_with_entities_small.tsv: -------------------------------------------------------------------------------- 1 | In particular , we experimentally validate common regulation of Mtpn by miR-375 , miR-124 and let-7b and thus provide evidence for coordinate microRNA control in mammals . let-7b miR-124 Mtpn miR-375 2 | CONCLUSIONS : Our results demonstrated that the decreased expression of let-7b could lead to the high expression of CYP2J2 protein in cancerous tissues . let-7b CYP2J2 3 | Furthermore , let-7b may diminish cell proliferation and promote cell apoptosis of tumor cells via posttranscriptional repression of CYP2J2 . CYP2J2 let-7b tumor 4 | In addition , let-7b decreased the enzymatic activity of endogenous CYP2J2 . CYP2J2 let-7b 5 | Let-7b significantly inhibited the tumor phenotype by targeting CYP2J2 . CYP2J2 Let-7b tumor 6 | Luciferase and western blot assays revealed that CYP2J2 was regulated by let-7b . let-7b CYP2J2 7 | CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . RBSP3 miR-100 8 | Our study demonstrated that miR-100 played an important role in pancreatic cancer development , possibly through targeting FGFR3 . pancreatic FGFR3 miR-100 9 | The predicted target of miR-100 , fibroblast growth factor receptor 3 ( FGFR3 ) , was downregulated by siRNA to examine its effect on pancreatic cancer cells . pancreatic FGFR3 fibroblast growth factor receptor 3 receptor miR-100 10 | Our aim is to analyze the role of miR-100 in bladder cancer cell lines in controlling the expression of some of its possible target genes , including FGFR3 and its relationship with proliferation , apoptosis and DNA ploidy . bladder cancer FGFR3 miR-100 11 | Recently , lowered expression of miR-100 , resulting in upregulation of FGFR3 , has been correlated with low-grade , non-invasive bladder urothelial cancer , as an alternative oncogenesis pathway to the typical FGFR3 gene mutation . FGFR3 miR-100 12 | Bioinformatics analysis and luciferase reporter assay suggest that miR-100 binds to the 3'UTR of FGFR3 mRNA to prevent its translation . FGFR3 miR-100 13 | Here we reported significantly higher levels of fibroblast growth factor receptor 3 ( FGFR3 ) and significantly lower levels of miR-100 in the OS specimen , compared to those in the paired normal bone tissues . OS FGFR3 miR-100 fibroblast growth factor receptor 3 14 | Luciferase essay showed FGFR3 was direct target of miR-100 . miR-100 FGFR3 15 | Taken together , our data demonstrate that miR-100 may inhibit the growth of OS through FGFR3 . miR-100 FGFR3 OS 16 | We found that overexpression of miR-100 in OS cells decreased FGFR3 protein levels , whereas inhibition of miR-100 increased FGFR3 protein levels , without affecting FGFR3 transcripts . miR-100 OS FGFR3 17 | FGFR3 was significantly downregulated by overexpressing miR-100 in pancreatic cancer cells and knocking down FGFR3 by siRNA exerted similar effect as miR-100 . 
FGFR3 miR-100 pancreatic -------------------------------------------------------------------------------- /demo_output/background_samples.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CatalystCode/corpus-to-graph-ml/e8b431f7342b674d4ee07c20c403531c9f6aad3f/demo_output/background_samples.txt -------------------------------------------------------------------------------- /demo_output/temp_doc2vec.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CatalystCode/corpus-to-graph-ml/e8b431f7342b674d4ee07c20c403531c9f6aad3f/demo_output/temp_doc2vec.txt -------------------------------------------------------------------------------- /demo_output/top_scoring_positive_class.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CatalystCode/corpus-to-graph-ml/e8b431f7342b674d4ee07c20c403531c9f6aad3f/demo_output/top_scoring_positive_class.txt -------------------------------------------------------------------------------- /features_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import sys\n", 14 | "sys.path.append(\"./packages\")\n", 15 | "\n", 16 | "import matplotlib\n", 17 | "import datetime\n", 18 | "import numpy as np\n", 19 | "import data_preparation_tools as dpt\n", 20 | "import features_generation_tools as fgt\n", 21 | "import model_tools\n", 22 | "from sklearn.linear_model import LogisticRegression\n", 23 | "from sklearn.linear_model import LogisticRegressionCV\n", 24 | "from sklearn.metrics import f1_score\n", 25 | "from sklearn.metrics import accuracy_score\n", 26 | "from sklearn.metrics import make_scorer\n", 27 | "from sklearn.ensemble import GradientBoostingClassifier" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false, 35 | "scrolled": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "# path used to save temporary doc2vec files\n", 40 | "temp_doc2vec_file = r\"./demo_output/temp_doc2vec.txt\"\n", 41 | "# path to text file that contains background sentences used in doc2vec\n", 42 | "background_samples_file_path = r\"./demo_output/background_samples.txt\"\n", 43 | "\n", 44 | "doc2vec_func = lambda x_train,x_test : fgt.get_doc2vec_features(x_train, x_test, temp_doc2vec_file, background_samples_file_path)\n", 45 | "bow_func = lambda x_train,x_test : fgt.get_bow_features(x_train, x_test, (1,3))\n", 46 | "\n", 47 | "# evaluate different features\n", 48 | "gen_features_methods = [\n", 49 | "fgt.GenFeaturesMethod(\"bow_1_gram\", lambda x_train,x_test : fgt.get_bow_features(x_train, x_test, (1,1))),\n", 50 | "fgt.GenFeaturesMethod(\"bow_2_gram\", lambda x_train,x_test : fgt.get_bow_features(x_train, x_test, (2,2))),\n", 51 | "fgt.GenFeaturesMethod(\"bow_3_gram\", lambda x_train,x_test : fgt.get_bow_features(x_train, x_test, (3,3))),\n", 52 | "fgt.GenFeaturesMethod(\"bow_1_3_gram\", lambda x_train,x_test : fgt.get_bow_features(x_train, x_test, (1,3))),\n", 53 | "fgt.GenFeaturesMethod(\"doc2vec\", lambda x_train,x_test : fgt.get_doc2vec_features(x_train, x_test, temp_doc2vec_file, background_samples_file_path)),\n", 54 | "fgt.GenFeaturesMethod(\"pos_3_3\", 
lambda x_train,x_test : fgt.to_pos_bow(x_train, x_test, (3,3))),\n", 55 | "fgt.GenFeaturesMethod(\"bow_1_3_pos_3_3\", lambda x_train,x_test : fgt.get_bow_and_pos_features(x_train, x_test, (1,3), (3,3))),\n", 56 | "fgt.GenFeaturesMethod(\"bow_1_3_doc2vec\", lambda x_train,x_test : fgt.get_compound_features(x_train, x_test, [bow_func, doc2vec_func]))\n", 57 | "]\n", 58 | "\n", 59 | "#Cs= [0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 0.8] + np.linspace(1,5, 9).tolist()\n", 60 | "Cs = np.linspace(0.005,0.25,10)\n", 61 | "\n", 62 | "# evaluates different classifiers\n", 63 | "evaluation_methods = [\n", 64 | " fgt.EvaluationMethod(\"logistic regression l1\", lambda: LogisticRegression(C=0.1, penalty='l1', solver='liblinear')),\n", 65 | " fgt.EvaluationMethod(\"lr l1 cv\", lambda: LogisticRegressionCV(penalty='l1', cv=5, scoring=make_scorer(f1_score), solver='liblinear', Cs=Cs, refit=True)),\n", 66 | " fgt.EvaluationMethod(\"lr l2 cv\", lambda: LogisticRegressionCV(penalty='l2', cv=5, scoring=make_scorer(f1_score), solver='liblinear', Cs=Cs, refit=True)),\n", 67 | " #fgt.EvaluationMethod(\"GBC\", lambda: GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=10, random_state=0))\n", 68 | "]\n", 69 | "\n", 70 | "# path to input dir \n", 71 | "input_dir = r\"./demo_output\"\n", 72 | "startTime = datetime.datetime.now()\n", 73 | "\n", 74 | "models = fgt.run_gen_features_pipeline(input_dir, gen_features_methods, evaluation_methods)\n", 75 | "\n", 76 | "runTime = datetime.datetime.now() - startTime\n", 77 | "print \"Finished generating features, took:%s\"%runTime" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 2", 84 | "language": "python", 85 | "name": "python2" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 2 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython2", 97 | "version": "2.7.11" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 0 102 | } 103 | -------------------------------------------------------------------------------- /packages/data_preparation_tools.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import nltk 4 | import urllib 5 | import requests 6 | import re 7 | from mirna_detector import is_mirna 8 | from os import path 9 | from sklearn.cross_validation import train_test_split 10 | 11 | 12 | #constants: 13 | DEFAULT_VARIABLE_FORMAT_TABLE = {'gene': 'GGVARENTTY%dGG', 14 | 'mirna' : 'MMVARENTTY%dMM' } 15 | 16 | PAIR_ENTITTY_VARIABLE_NAMES = {'gene': 'GGVARENTTYGG', 17 | 'mirna' : 'MMVARENTTYMM' } 18 | 19 | OTHER_ENTITY_VARIABLE_TABLE = {'gene': 'GGVARENTTYOTRGG', 20 | 'mirna' : 'MMVARENTTYOTRMM' } 21 | 22 | # expects output in GNAT's output format 23 | ENTITY_RECOGNITION_SERVICE_URL_FORMAT = "" 24 | 25 | DEFAULT_SENTENCE_COLUMN = 8 26 | DEFAULT_LABEL_COLUMNS = [4,5] 27 | DEFAULT_MIRNA_ENTITY_COLUMN = 10 28 | DEFAULT_GENE_ENTITY_COLUMN = 11 29 | DEFAULT_MIN_SENTENCE_LENGTH = 6 30 | DEFAULT_EXTRA_WORDS_COUNT = 3 31 | ENTITY_REGEX = re.compile(r"GGVARENTTYGG|MMVARENTTYMM") 32 | DEFAULT_TEST_SIZE = 0.25 33 | 34 | # TODO: we should probably take this from a file 35 | DEFAULT_NON_ENTITY_DICTIONARY = ['and', 'lab'] 36 | DEFAULT_MIN_ENTITY_LENGTH = 3 37 | 38 | # Object to pass along and contains the input to the different transformations 39 | class InputData: 40 | def __init__(self, 
data_train, data_test, contexts_train, contexts_test): 41 | self.data_train = data_train 42 | self.data_test = data_test 43 | self.contexts_train = contexts_train 44 | self.contexts_test = contexts_test 45 | 46 | # entities is a list of dictionaries in the format of {"text" : "...", type: 47 | # "...", } 48 | def replace_entities_with_variables(text, entities, variable_format_table=OTHER_ENTITY_VARIABLE_TABLE): 49 | text_parts = [] 50 | 51 | for entity in entities: 52 | entity_replacement = variable_format_table[entity["type"]] 53 | text = text.replace(entity["text"], entity_replacement) 54 | 55 | return text 56 | 57 | # entities is a list of dictionaries in the format of {"text" : "...", type: 58 | # "...", } 59 | def replace_entities_with_variables_old(text, entities, variable_name_table=DEFAULT_VARIABLE_FORMAT_TABLE): 60 | text_parts = [] 61 | 62 | locations_by_type = {} 63 | 64 | for entity in entities: 65 | index = 0 66 | index = text.find(entity["text"]) 67 | if (index == -1): 68 | continue 69 | 70 | if (not locations_by_type.has_key(entity["type"])): 71 | locations_by_type[entity["type"]] = {} 72 | 73 | locations_by_type[entity["type"]][index] = entity 74 | 75 | for t in locations_by_type: 76 | entities_to_variables = {} 77 | index = 1 78 | # assign a variable to each entity 79 | for i in sorted(locations_by_type[t]): 80 | entities_to_variables[locations_by_type[t][i]["text"]] = variable_format_table[t] % index 81 | index = index + 1 82 | 83 | for entity in entities_to_variables: 84 | text = text.replace(entity, entities_to_variables[entity]) 85 | 86 | return text 87 | 88 | texts_entities_dictionary = {} 89 | def get_entities_for_text(text): 90 | # in case we already have the result... 91 | if (texts_entities_dictionary.has_key(text)): 92 | return texts_entities_dictionary[text] 93 | 94 | url = ENTITY_RECOGNITION_SERVICE_URL_FORMAT % (urllib.quote(text)) 95 | r = requests.get(url) 96 | if (r.status_code != 200): 97 | # TODO: Throw exception... 
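# Sketch (not part of the original source): one way to resolve the TODO above would be to fail loudly instead of returning None, assuming callers are prepared to catch the error, e.g.: raise RuntimeError("entity recognition request failed with HTTP status %d" % r.status_code)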
98 | print "got bad error code:%d" % r.status_code 99 | return None 100 | 101 | parsed_entities = [] 102 | entities = [] 103 | for line in r.text.split("\n"): 104 | if (len(line) == 0): 105 | continue 106 | line_parts = line.split("\t") 107 | entity_type = line_parts[2] 108 | start_index = int(line_parts[5]) 109 | end_index = int(line_parts[6]) 110 | text = line_parts[7] 111 | 112 | # only look at genes for now 113 | if (entity_type != 'gene' and entity_type != 'mirna'): 114 | continue 115 | 116 | # check if we don't have collisions, always take the longer string 117 | objects_to_remove = [] 118 | add_entity = True 119 | for e in parsed_entities: 120 | if (start_index <= e["startIndex"] and end_index >= e["endIndex"] or start_index <= e["startIndex"] and end_index >= e["startIndex"] or start_index <= e["endIndex"] and end_index >= e["endIndex"]): 121 | currentLen = len(text) 122 | second_len = len(e["text"]) 123 | if (currentLen <= second_len): 124 | add_entity = False 125 | break 126 | else: 127 | objects_to_remove.append(e) 128 | 129 | for e in objects_to_remove: 130 | parsed_entities.remove(e) 131 | 132 | if (add_entity): 133 | parsed_entities.append({ 134 | "text" : text, 135 | "startIndex" : start_index, 136 | "endIndex" : end_index 137 | }) 138 | 139 | for e in parsed_entities: 140 | if (not e["text"] in entities): 141 | entities.append(e["text"]) 142 | 143 | # merge entities in case there is overlap 144 | 145 | return entities 146 | 147 | # utility function to add text to entities data to the dictionary 148 | def import_to_texts_entities_dictonary_from_file(file_path): 149 | with open(file_path) as handle: 150 | for line in handle: 151 | parts = line.rstrip().split("\t") 152 | texts_entities_dictionary[parts[0]] = parts[1:] 153 | 154 | 155 | # the results of the entityt recognition might be noisy 156 | def filter_entities(entities, non_entities_dictionary=DEFAULT_NON_ENTITY_DICTIONARY, min_entity_length=DEFAULT_MIN_ENTITY_LENGTH): 157 | filtered_entities = [] 158 | for e in entities: 159 | if (len(e) < min_entity_length): 160 | continue 161 | if (e in non_entities_dictionary): 162 | continue 163 | 164 | filtered_entities.append(e) 165 | 166 | return filtered_entities 167 | 168 | def entity_list_to_descriptors(entities): 169 | result = [] 170 | for e in entities: 171 | type = "mirna" if is_mirna(e) else "gene" 172 | result.append({"text" : e, "type" : type}) 173 | return result 174 | 175 | def extract_and_replace_entities(text, context=None, return_descriptors=False): 176 | if (context != None and context.has_key("pair_entities")): 177 | replaced_text = replace_entities_with_variables(text, context["pair_entities"], PAIR_ENTITTY_VARIABLE_NAMES) 178 | if (context != None and context.has_key("all_entities")): 179 | # TODO: run unification logic here as well? 
180 | entity_descriptors = context["all_entities"] 181 | else: 182 | entities = get_entities_for_text(text) 183 | entities = filter_entities(entities) 184 | entity_descriptors = entity_list_to_descriptors(entities) 185 | 186 | replaced_text = replace_entities_with_variables(replaced_text, entity_descriptors) 187 | if (return_descriptors): 188 | return (replaced_text, entity_descriptors) 189 | else: 190 | return replaced_text 191 | 192 | # extract data from CSV/TSV file 193 | def extract_sentences(input_file_path, 194 | sentence_columns=DEFAULT_SENTENCE_COLUMN, mirna_entity_column=DEFAULT_MIRNA_ENTITY_COLUMN, 195 | gene_entity_column=DEFAULT_GENE_ENTITY_COLUMN, label_column_indices=None, 196 | label_tag=None, sample_size=-1): 197 | sentences = [] 198 | labels = [] 199 | contexts = [] 200 | all_sentences = {} 201 | with open(input_file_path) as input: 202 | for line in input: 203 | splitted_line = line.rstrip().split("\t") 204 | sentence = splitted_line[sentence_columns] 205 | 206 | # TODO: randomly sample one sentence instead of just taking the first one in the list 207 | # We do this in order to make sure that there isn't a bias in the model towards sentences 208 | # that appear more often than others 209 | if (all_sentences.has_key(sentence)): 210 | continue 211 | 212 | all_sentences[sentence] = 1 213 | 214 | contexts.append({"pair_entities" : [{"text" : splitted_line[mirna_entity_column], "type": "mirna"}, 215 | {"text" : splitted_line[gene_entity_column], "type": "gene"}]}) 216 | 217 | if (label_column_indices != None): 218 | label = splitted_line[label_column_indices[0]] 219 | for i in label_column_indices[1:]: 220 | label = label + "_" + splitted_line[i] 221 | labels.append(label) 222 | 223 | sentences.append(sentence) 224 | 225 | if (sample_size != -1): 226 | indices = np.random.choice(len(sentences), sample_size) 227 | sentences = [sentences[index] for index in indices] 228 | contexts = [contexts[index] for index in indices] 229 | 230 | if (label_tag != None): 231 | labels = [label_tag] * len(sentences) 232 | return (sentences, labels, contexts) 233 | 234 | def extract_sentences_with_multiclass_labels(input_file_path, sample_size=-1): 235 | return extract_sentences(input_file_path, label_column_indices=DEFAULT_LABEL_COLUMNS, sample_size=sample_size) 236 | 237 | def is_sequence(arg): 238 | return (not hasattr(arg, "strip") and hasattr(arg, "__getitem__") or hasattr(arg, "__iter__")) 239 | 240 | def write_lines_or_tuples_to_file(lines, output_file_path, seperator="\t"): 241 | are_tuples = is_sequence(lines[0]) 242 | with open(output_file_path, "w") as handle: 243 | for o in lines: 244 | if are_tuples: 245 | text = o[0] 246 | for item in o[1:]: 247 | text = text + seperator + item 248 | else: 249 | text = o 250 | handle.write(text + "\n") 251 | 252 | # get all entities for a list of sentences: 253 | def get_entities_for_file(in_file_path, out_file_path): 254 | sentences = extract_sentences(in_file_path)[0] # extract_sentences returns (sentences, labels, contexts) 255 | with open(out_file_path, "w") as out_handle: 256 | for s,i in zip(sentences, xrange(len(sentences))): 257 | print "%d of %d" % (i, len(sentences)) 258 | entities = get_entities_for_text(s) 259 | out_handle.write(s + "\t" + "\t".join(entities) + "\n") 260 | 261 | def trim_sentence_around_entities(text, context=None, min_length=DEFAULT_MIN_SENTENCE_LENGTH, extra_words_count=DEFAULT_EXTRA_WORDS_COUNT): 262 | sentence_parts = text.split() 263 | 264 | if (len(sentence_parts) < min_length): 265 | return text 266 | 267 | first_index = -1 268 | last_index = -1 269 | 270 | for part,i in
zip(sentence_parts, xrange(len(sentence_parts))): 271 | if (ENTITY_REGEX.match(part)): 272 | if (first_index == -1): 273 | first_index = i 274 | last_index = i 275 | 276 | size = last_index - first_index + extra_words_count * 2 277 | 278 | # ensure 279 | if (size < min_length): 280 | extra_words_count = extra_words_count + math.ceil((min_length - size) / 2) 281 | 282 | first_index = max(0, first_index - extra_words_count) 283 | last_index = min(len(sentence_parts), last_index + extra_words_count + 1) 284 | 285 | trimmed_sentence_parts = sentence_parts[first_index:last_index] 286 | return " ".join(trimmed_sentence_parts) 287 | 288 | # langueage based sentence process 289 | def normalize_text(sent, context=None): 290 | return sent.lower() 291 | 292 | # stop words removal 293 | def remove_stop_words(sent, context=None): 294 | processed_tokens = [] 295 | tokens = nltk.word_tokenize(sent) 296 | for t in tokens: 297 | # ignore stop words 298 | if (t in nltk.corpus.stopwords.words('english') or len(t) < 2): 299 | continue 300 | processed_tokens.append(t) 301 | 302 | return " ".join(processed_tokens) 303 | 304 | # digits removal 305 | def remove_all_digit_tokens(sent, context=None): 306 | processed_tokens = [] 307 | tokens = nltk.word_tokenize(sent) 308 | for t in tokens: 309 | # ignore stop words 310 | if (t.isdigit()): 311 | continue 312 | processed_tokens.append(t) 313 | 314 | return " ".join(processed_tokens) 315 | 316 | # run stemmer on the words 317 | def stem_text(sent, context=None): 318 | processed_tokens = [] 319 | tokens = nltk.word_tokenize(sent) 320 | porter = nltk.PorterStemmer() 321 | for t in tokens: 322 | t = porter.stem(t) 323 | processed_tokens.append(t) 324 | 325 | return " ".join(processed_tokens) 326 | 327 | # Split to train and test sample sets: 328 | def split_to_test_and_train(data, labels, entities, test_size=DEFAULT_TEST_SIZE): 329 | d_train, d_test, l_train, l_test, c_train, c_test = train_test_split(data, labels, entities, test_size=test_size) 330 | d_test_2 = [] 331 | l_test_2 = [] 332 | c_test_2 = [] 333 | 334 | train_dict = {} 335 | for d in d_train: 336 | train_dict[d] = 1 337 | 338 | for d,l,c in zip(d_test, l_test, c_test): 339 | if (train_dict.has_key(d)): 340 | continue 341 | d_test_2.append(d) 342 | l_test_2.append(l) 343 | c_test_2.append(c) 344 | 345 | return (d_train, d_test_2, l_train, l_test_2, c_train, c_test_2) 346 | 347 | # utility to extracts entities from preproceseed files 348 | def extract_entities_from_entity_file(input_file_paths, out_file_path): 349 | all_entities = {} 350 | 351 | for file_path in input_file_paths: 352 | with open(file_path) as handle: 353 | for l in handle: 354 | parts = l.rstrip().split("\t") 355 | for e in parts[1:]: 356 | all_entities[e] = 1 357 | 358 | with open(out_file_path, "w") as handle: 359 | for e in sorted(all_entities.keys()): 360 | handle.write(e + "\n") 361 | 362 | def run_step(step_name, step_func, inputs_dict, required): 363 | temp_dict = {} 364 | for k in inputs_dict: 365 | 366 | if (required != None): 367 | found_all = True 368 | for r in required: 369 | if (k.find(r) == -1): 370 | found_all = False 371 | if (not found_all): 372 | continue 373 | 374 | result_train = [] 375 | result_test = [] 376 | 377 | for l,c in zip(inputs_dict[k].data_train, inputs_dict[k].contexts_train): 378 | result_train.append(step_func(l, context=c)) 379 | 380 | for l,c in zip(inputs_dict[k].data_test, inputs_dict[k].contexts_test): 381 | result_test.append(step_func(l, context=c)) 382 | 383 | temp_dict[k + "_" + step_name] = 
InputData(result_train, result_test, 384 | inputs_dict[k].contexts_train, inputs_dict[k].contexts_test) 385 | 386 | for k in temp_dict: 387 | inputs_dict[k] = temp_dict[k] 388 | 389 | def run_step_unlabeled_data(step_name, step_func, inputs_dict, required): 390 | temp_dict = {} 391 | for k in inputs_dict: 392 | 393 | if (required != None): 394 | found_all = True 395 | for r in required: 396 | if (k.find(r) == -1): 397 | found_all = False 398 | if (not found_all): 399 | continue 400 | 401 | results = [] 402 | 403 | for l in inputs_dict[k]: 404 | results.append(step_func(l)) 405 | 406 | temp_dict[k + "_" + step_name] = results 407 | 408 | for k in temp_dict: 409 | inputs_dict[k] = temp_dict[k] 410 | 411 | # 3rd part specify required step 412 | TRANSFORMATION_STEPS = [('entities', extract_and_replace_entities), 413 | ('trim', trim_sentence_around_entities, ['entities']), 414 | ('normalize', normalize_text, ['entities']), 415 | ('rmdigits', remove_all_digit_tokens, ['entities']), 416 | # ('rmstopwords', remove_stop_words, ['entities']) 417 | # ('stem', stem_text, ['entities']) 418 | ] 419 | 420 | TRANSFORMATION_STEPS_UNLABELED = [('entities', extract_and_replace_entities), 421 | ('normalize', normalize_text, ['entities']), 422 | ('rmdigits', remove_all_digit_tokens, ['entities']), 423 | ('rmstopwords', remove_stop_words, ['entities'])] 424 | 425 | # steps is an array of tuples, with the first as name and the second is the processing function 426 | def run_transformations_on_single_sentence(text, context, steps=TRANSFORMATION_STEPS): 427 | current = text 428 | for s in steps: 429 | current = s[1](current, context=context) 430 | return current 431 | 432 | def run_transformations_on_data(sentences, labels, contexts, output_files_prefix, output_dir, write_context_to_files=True): 433 | # split to train and test: 434 | print "before data split" 435 | s_train, s_test, l_train, l_test, c_train, c_test = split_to_test_and_train(sentences, labels, contexts) 436 | inputs = { 437 | 'data' : InputData(s_train, s_test, c_train, c_test) 438 | } 439 | 440 | print "splitted data" 441 | # run each step on all of the already existing ones 442 | # TODO: pass entities metadata and let the trimming work even without the 443 | # regexes.. 
444 | 445 | for s in TRANSFORMATION_STEPS: 446 | print "running step:%s" % (s[0]) 447 | required = None 448 | if (len(s) > 2): 449 | required = s[2] 450 | run_step(s[0], s[1], inputs, required) 451 | 452 | # todo: write outputs to files 453 | for name in inputs: 454 | train_file_path = path.join(output_dir, output_files_prefix + "_" + name + "_train.tsv") 455 | test_file_path = path.join(output_dir, output_files_prefix + "_" + name + "_test.tsv") 456 | 457 | train_data = inputs[name].data_train 458 | test_data = inputs[name].data_test 459 | 460 | with open(train_file_path, "w") as handle: 461 | for text,label in zip(train_data,l_train): 462 | if (len(text.strip()) == 0): 463 | continue 464 | handle.write("%s\t%s\n" % (label, text)) 465 | 466 | with open(test_file_path, "w") as handle: 467 | for text,label in zip(test_data,l_test): 468 | if (len(text.strip()) == 0): 469 | continue 470 | handle.write("%s\t%s\n" % (label, text)) 471 | 472 | if (write_context_to_files): 473 | train_context_file_path = path.join(output_dir, output_files_prefix + "_context_train.tsv") 474 | test_context_file_path = path.join(output_dir, output_files_prefix + "_context_test.tsv") 475 | 476 | #({"pair_entities" : [{"text" : splitted_line[mirna_entity_column], "type": "mirna"}, 477 | # {"text" : splitted_line[gene_entity_column], "type": "gene"}]}) 478 | 479 | with open(train_context_file_path, "w") as handle: 480 | for c in c_train: 481 | # TODO: generalize this, for now we just do this quick and dirty.. 482 | entity_1_type = c["pair_entities"][0]["type"] 483 | 484 | if (entity_1_type == "mirna"): 485 | mirna = c["pair_entities"][0]["text"] 486 | gene = c["pair_entities"][1]["text"] 487 | else: 488 | mirna = c["pair_entities"][1]["text"] 489 | gene = c["pair_entities"][0]["text"] 490 | 491 | handle.write(mirna + "\t" + gene + "\n") 492 | 493 | with open(test_context_file_path, "w") as handle: 494 | for c in c_test: 495 | # TODO: generalize this, for now we just do this quick and dirty.. 
496 | entity_1_type = c["pair_entities"][0]["type"] 497 | 498 | if (entity_1_type == "mirna"): 499 | mirna = c["pair_entities"][0]["text"] 500 | gene = c["pair_entities"][1]["text"] 501 | else: 502 | mirna = c["pair_entities"][1]["text"] 503 | gene = c["pair_entities"][0]["text"] 504 | 505 | handle.write(mirna + "\t" + gene + "\n") 506 | 507 | 508 | 509 | def write_lines_to_file(lines, out_file_path): 510 | with open(out_file_path,"w") as handle: 511 | for line in lines: 512 | handle.write(line + "\n") 513 | 514 | def read_lines_from_file(in_file_path): 515 | with open(in_file_path) as handle: 516 | lines = [line.rstrip() for line in handle] 517 | return lines 518 | 519 | def sample_from_file(in_file_path, out_file_path, sample_size): 520 | lines = read_lines_from_file(in_file_path) 521 | sampled = [lines[index] for index in np.random.choice(len(lines), sample_size)] 522 | write_lines_to_file(sampled, out_file_path) 523 | 524 | def run_data_preparation_pipeline(positive_samples_file_path, negative_samples_file_path, 525 | output_files_prefix, output_dir, run_multiclass=False): 526 | # two classes case: 527 | pos_sentences, pos_labels, pos_contexts = extract_sentences(positive_samples_file_path, label_tag='RELATION') 528 | neg_sentences, neg_labels, neg_contexts = extract_sentences(negative_samples_file_path, label_tag='NO_RELATION', sample_size = len(pos_sentences)) 529 | contexts = pos_contexts + neg_contexts 530 | sentences = pos_sentences + neg_sentences 531 | labels = pos_labels + neg_labels 532 | 533 | print "producing inputs for binary case..." 534 | 535 | run_transformations_on_data(sentences, labels, contexts, output_files_prefix + "_binary", output_dir) 536 | 537 | if (run_multiclass): 538 | # multi class case: 539 | print "producing inputs for multiclass case..." 540 | pos_sentences, pos_labels, pos_entities = extract_sentences_with_multiclass_labels(positive_samples_file_path) 541 | sentences = pos_sentences + neg_sentences 542 | labels = pos_labels + neg_labels 543 | contexts = pos_contexts + neg_contexts 544 | 545 | run_transformations_on_data(sentences, labels, contexts, output_files_prefix + "_multiclass", output_dir) 546 | 547 | 548 | def run_transformations_on_unlabeled_data(sentences, output_files_prefix, output_dir): 549 | inputs = { 550 | 'data' : sentences 551 | } 552 | 553 | # run each step on all of the already existing ones 554 | # TODO: pass entities metadata and let the trimming work even without the 555 | # regexes.. 
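# Usage sketch (hypothetical input path, not part of the original file): this transformation loop is normally driven by run_unlabeled_data_preparation_pipeline, defined further below, e.g. run_unlabeled_data_preparation_pipeline(r"./demo_input/background_sentences.txt", "8_1_2016", r"./demo_output"), which reads the raw sentences and writes one text file per combination of applied steps into the output directory.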
556 | 557 | for s in TRANSFORMATION_STEPS_UNLABELED: 558 | print "running step:%s" % (s[0]) 559 | required = None 560 | if (len(s) > 2): 561 | required = s[2] 562 | run_step_unlabeled_data(s[0], s[1], inputs, required) 563 | 564 | # todo: write outputs to files 565 | for name in inputs: 566 | out_file_path = path.join(output_dir, output_files_prefix + "_" + name + ".txt") 567 | 568 | with open(out_file_path, "w") as handle: 569 | for text in inputs[name]: 570 | if (len(text.strip()) == 0): 571 | continue 572 | handle.write("%s\n" % (text)) 573 | 574 | 575 | def run_unlabeled_data_preparation_pipeline(samples_file_path, output_files_prefix, output_dir): 576 | sentences = read_lines_from_file(samples_file_path) 577 | run_transformations_on_unlabeled_data(sentences, output_files_prefix + "_binary", output_dir) -------------------------------------------------------------------------------- /packages/features_generation_tools.py: -------------------------------------------------------------------------------- 1 | import data_preparation_tools as dpt 2 | import fnmatch 3 | import gensim 4 | import logging 5 | import multiprocessing 6 | import numpy as np 7 | import sklearn.metrics as metrics 8 | import re 9 | from gensim.models.doc2vec import * 10 | from os import listdir 11 | from os.path import isfile, join 12 | from sklearn.feature_extraction.text import CountVectorizer 13 | from scipy.sparse import hstack 14 | from spacy.en import English 15 | 16 | DEFAULT_BOW_NGRAM_RANGE = (1,1) 17 | DEFAULT_BOW_MAX_FEATURES = None 18 | DEFAULT_BOW_BINARY = True 19 | ENTITY_REGEX = re.compile(r"GGVARENTTY[0-9]+GG|MMVARENTTY[0-9]+MM", re.IGNORECASE) 20 | 21 | BINARY_LABELS_TO_CLASSES_TABLE = { 22 | 'NO_RELATION' : 0, 23 | 'RELATION' : 1 24 | } 25 | 26 | MULTICLASS_LABELS_TO_CLASSES_TABLE = { 27 | 'NO_RELATION' : 0, 28 | 'NEGATIVE_DIRECT' : 1, 29 | 'NEGATIVE_INDIRECT' : 2, 30 | 'POSITIVE_DIRECT' : 3, 31 | 'POSITIVE_INDIRECT' : 4 32 | } 33 | 34 | class TrainTestData: 35 | def __init__(self, train_data, train_labels, test_data, test_labels, is_multiclass, feature_gen_model = None): 36 | self.train_data = train_data 37 | self.train_labels = train_labels 38 | self.test_data = test_data 39 | self.test_labels = test_labels 40 | self.is_multiclass = is_multiclass 41 | self.feature_gen_model = feature_gen_model 42 | 43 | class EvaluationResult: 44 | def __init__(self, model, features_gen_model, test_data, scores): 45 | self.model = model 46 | self.test_data = test_data 47 | self.scores = scores 48 | self.features_gen_model = features_gen_model 49 | 50 | # read texts and labels from data file: 51 | def read_data_from_file(file_path): 52 | with open(file_path) as handle: 53 | labels = [] 54 | data = [] 55 | for l in handle: 56 | parts = l.rstrip().split("\t") 57 | if (len(parts) < 2): 58 | continue 59 | labels.append(parts[0]) 60 | data.append(parts[1]) 61 | return data,labels 62 | 63 | def read_train_and_test_data_from_path(path): 64 | only_files = [f for f in listdir(path) if (isfile(join(path, f)))] 65 | train_files = [f for f in only_files if fnmatch.fnmatch(f, '*_train.tsv')] 66 | data_names = ["_".join(f.split("_")[:-1]) for f in train_files] 67 | data_table = {} 68 | data_table_no_entities = {} 69 | 70 | for name in data_names: 71 | train_data, train_labels = read_data_from_file(join(path, name + "_train.tsv")) 72 | test_data, test_labels = read_data_from_file(join(path, name + "_test.tsv")) 73 | 74 | is_multiclass = name.find('multiclass') > -1 75 | 76 | # without entities as well: 77 | train_data_no_entities, 
indices_to_remove = remove_entities_from_text(train_data) 78 | train_labels_no_entities = train_labels 79 | test_data_no_entities, indices_to_remove = remove_entities_from_text(test_data) 80 | test_labels_no_entities = test_labels 81 | 82 | data_table[name] = TrainTestData(train_data, train_labels, test_data, test_labels, is_multiclass) 83 | data_table_no_entities[name] = TrainTestData(train_data_no_entities, train_labels_no_entities, 84 | test_data_no_entities, test_labels_no_entities, is_multiclass) 85 | 86 | return data_table, data_table_no_entities 87 | 88 | def remove_entities_from_text(sentences): 89 | fixed_sentences = [] 90 | indices_to_remove = [] 91 | for s,i in zip(sentences,range(len(sentences))): 92 | new_sent = [] 93 | for t in s.split(): 94 | if (not ENTITY_REGEX.match(t)): 95 | new_sent.append(t) 96 | if (len(new_sent) == 0): 97 | indices_to_remove.append(i) 98 | #else: 99 | fixed_sentences.append(" ".join(new_sent)) 100 | 101 | return fixed_sentences, indices_to_remove 102 | 103 | nlp_parser = None 104 | def to_nlp_objs(sentences): 105 | global nlp_parser 106 | # init once 107 | if (nlp_parser == None): 108 | nlp_parser = English() 109 | 110 | nlp_objs = [] 111 | for s in sentences: 112 | nlp_objs.append(nlp_parser(s.decode('unicode-escape'), entity=False)) 113 | return nlp_objs 114 | 115 | def get_nlp_features(sentences): 116 | parsed = to_nlp_objs(sentences) 117 | pos_tags = [] 118 | for p in parsed: 119 | pos_tags.append([s.pos_ for s in p]) 120 | 121 | return pos_tags 122 | 123 | def to_pos_bow(train_samples, test_samples, ngram_range=DEFAULT_BOW_NGRAM_RANGE, binary=DEFAULT_BOW_BINARY): 124 | #TODO: can do this more efficiently, this is a workaround for now 125 | pos_tags_train = [" ".join(s) for s in get_nlp_features(train_samples)] 126 | pos_tags_test = [" ".join(s) for s in get_nlp_features(test_samples)] 127 | return to_bag_of_words(pos_tags_train, pos_tags_test, ngram_range=ngram_range, binary=binary, max_features=None) 128 | 129 | def to_bag_of_words(train_samples, test_samples, ngram_range=DEFAULT_BOW_NGRAM_RANGE, 130 | max_features=DEFAULT_BOW_MAX_FEATURES, binary=DEFAULT_BOW_BINARY): 131 | #Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool. 
132 | vectorizer = CountVectorizer(analyzer = "word", 133 | tokenizer = None, 134 | preprocessor = None, 135 | stop_words = None, 136 | max_features = max_features, 137 | binary = binary, 138 | ngram_range=ngram_range) 139 | 140 | train_data_features = vectorizer.fit_transform(train_samples) 141 | test_data_features = vectorizer.transform(test_samples) 142 | return train_data_features, test_data_features, vectorizer 143 | 144 | def get_bow_features(train_samples, test_samples, ngram_range): 145 | return to_bag_of_words(train_samples, test_samples, ngram_range=ngram_range) 146 | 147 | def get_bow_and_pos_features(train_samples, test_samples, ngram_range, pos_ngram_range): 148 | bow_train_features, bow_test_features = get_bow_features(train_samples, test_samples, ngram_range) 149 | pos_train_features, pos_test_features = to_pos_bow(train_samples, test_samples, ngram_range=pos_ngram_range) 150 | 151 | 152 | train_features = hstack((bow_train_features, pos_train_features)) 153 | test_features = hstack((bow_test_features, pos_test_features)) 154 | 155 | return train_features, test_features 156 | 157 | def get_compound_features(train_data, test_data, feature_gen_methods): 158 | train_features_list = [] 159 | test_features_list = [] 160 | 161 | for m in feature_gen_methods: 162 | train_features, test_features = m(train_data, test_data) 163 | train_features_list.append(train_features) 164 | test_features_list.append(test_features) 165 | 166 | train_features = train_features_list[0] 167 | test_features = test_features_list[0] 168 | 169 | for i in xrange(1,len(feature_gen_methods)): 170 | train_features = hstack((train_features, train_features_list[i])) 171 | test_features = hstack((test_features, test_features_list[i])) 172 | 173 | return train_features, test_features 174 | 175 | def merge_into_file(input_path_or_data, output): 176 | if (input_path_or_data == None): 177 | return 178 | 179 | # if it's data and not path 180 | if (dpt.is_sequence(input_path_or_data)): 181 | for l in input_path_or_data: 182 | output.write(l + "\n") 183 | return len(input_path_or_data) 184 | 185 | count = 0; 186 | with open(input_path_or_data) as input: 187 | for l in input: 188 | output.write(l) 189 | count = count + 1 190 | return count 191 | 192 | def build_doc2vec_model(data, temp_doc2vec_input_file_path, background_samples_file_path = None, 193 | model_file_path = None, should_log = False): 194 | 195 | if (should_log): 196 | reload(logging) 197 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 198 | logger = logging.getLogger() 199 | 200 | # merge the data into one file and then let gensim TaggedLineDocument class take care of the rest 201 | # this can be further optimized by creating a custom doc2vec iterator that will read the files in sequence 202 | print "creating temp file..." 203 | with open(temp_doc2vec_input_file_path,"w") as output: 204 | merge_into_file(data, output) 205 | merge_into_file(background_samples_file_path, output) 206 | 207 | with open(temp_doc2vec_input_file_path) as handle: 208 | print "creating model..." 209 | # TODO: add min_count = 5, but deal with empty sentences.. 
210 | ncpus = multiprocessing.cpu_count() 211 | model = Doc2Vec(TaggedLineDocument(handle), size = 200, window=8, min_count = 5, workers = ncpus) 212 | print "model built" 213 | if (model_file_path != None): 214 | model.save(model_file_path) 215 | 216 | #return model 217 | return model 218 | 219 | # get the doc2vec feature vectors 220 | def get_doc2vec_features(train_data, test_data, 221 | temp_doc2vec_input_file_path, background_samples_file_path = None): 222 | 223 | input_data = train_data + test_data 224 | model = build_doc2vec_model(input_data, temp_doc2vec_input_file_path, background_samples_file_path, should_log = True) 225 | 226 | # extract the vectors according to their class 227 | train_embeddings = [model.docvecs[index] for index in xrange(len(train_data))] 228 | test_embeddings = [model.docvecs[index] for index in xrange(len(train_data), len(train_data) + len(test_data))] 229 | #background_embeddings = [model.docvecs[index] for index in xrange(len(train_data) + len(test_data), model.docvecs.count)] 230 | 231 | return train_embeddings, test_embeddings, model 232 | 233 | def label_to_class(label, is_multiclass, auto_add_classes=False): 234 | if (is_multiclass==True): 235 | if (not MULTICLASS_LABELS_TO_CLASSES_TABLE.has_key(label) and auto_add_classes): 236 | max_class = max([MULTICLASS_LABELS_TO_CLASSES_TABLE[k] for k in MULTICLASS_LABELS_TO_CLASSES_TABLE]) 237 | MULTICLASS_LABELS_TO_CLASSES_TABLE[label] = max_class + 1 238 | 239 | return MULTICLASS_LABELS_TO_CLASSES_TABLE[label] 240 | 241 | return BINARY_LABELS_TO_CLASSES_TABLE[label] 242 | 243 | def labels_to_classes(labels, is_multiclass=False): 244 | classes = [] 245 | for label in labels: 246 | classes.append(label_to_class(label, is_multiclass)) 247 | return classes 248 | 249 | 250 | def gen_features_and_classes(train_test_data, gen_features_func): 251 | train_classes = labels_to_classes(train_test_data.train_labels, is_multiclass = train_test_data.is_multiclass) 252 | test_classes = labels_to_classes(train_test_data.test_labels, is_multiclass = train_test_data.is_multiclass) 253 | 254 | train_features, test_features, model = gen_features_func(train_test_data.train_data, train_test_data.test_data) 255 | 256 | return TrainTestData(train_features, train_classes, test_features, test_classes, train_test_data.is_multiclass, model) 257 | 258 | def write_features_classes_to_file(file_path, data, labels): 259 | with open(file_path, "w") as handle: 260 | for d,l in zip(data, labels): 261 | line_text = str(l) + "," + ",".join([str (x) for x in d.toarray()[0]]) 262 | handle.write(line_text + "\n") 263 | 264 | # feature evaluation 265 | def read_data_labels(file_path): 266 | data = [] 267 | labels = [] 268 | with open(file_path) as handle: 269 | for l in handle: 270 | parts = l.rstrip().split(",") 271 | labels.append(float(parts[0])) 272 | data.append([float(i) for i in parts[1:]]) 273 | 274 | return data, labels 275 | 276 | def read_train_test_data(input_dir, name): 277 | train_file_path = join(input_dir, name + "_train.csv") 278 | test_file_path = join(input_dir, name + "_test.csv") 279 | train_data, train_labels = read_data_labels(train_file_path) 280 | test_data, test_labels = read_data_labels(test_file_path) 281 | 282 | is_multiclass = name.find("multiclass") > -1 283 | 284 | return TrainTestData(train_data, train_labels, test_data, test_labels, is_multiclass) 285 | 286 | from sklearn.feature_selection import SelectFromModel 287 | from sklearn.linear_model import LassoCV 288 | 289 | def evaluate_model(train_test_data, 
model_initializer): 290 | clf = model_initializer() 291 | clf = clf.fit(train_test_data.train_data, train_test_data.train_labels) 292 | 293 | labels_predicted = clf.predict(train_test_data.test_data) 294 | scores_predicted = clf.predict_proba(train_test_data.test_data) 295 | print metrics.classification_report(train_test_data.test_labels, labels_predicted) 296 | return EvaluationResult(clf, train_test_data.feature_gen_model, train_test_data.test_data, scores_predicted) 297 | 298 | class GenFeaturesMethod: 299 | def __init__(self, name, func, no_entities = False): 300 | self.name = name 301 | self.func = func 302 | self.no_entities = no_entities 303 | 304 | class EvaluationMethod: 305 | def __init__(self, name, func): 306 | self.name = name 307 | self.func = func 308 | 309 | # get path to the data input dir, and a list of GenFeaturesMethod objects 310 | def run_gen_features_pipeline(input_dir, gen_features_methods, evaluation_methods): 311 | data_dict, data_dict_no_entities = read_train_and_test_data_from_path(input_dir) 312 | results = [] 313 | for name in data_dict: 314 | for gfm in gen_features_methods: 315 | print "generating %s features for %s"%(gfm.name, name) 316 | if (gfm.no_entities): 317 | data = data_dict_no_entities[name] 318 | else: 319 | data = data_dict[name] 320 | 321 | train_test_data = gen_features_and_classes(data, gfm.func) 322 | 323 | for em in evaluation_methods: 324 | print "model evaluation for: %s, %s, %s"%(name, gfm.name, em.name) 325 | result = evaluate_model(train_test_data, em.func) 326 | results.append(result) 327 | return results 328 | -------------------------------------------------------------------------------- /packages/mirna_detector/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Miroculus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /packages/mirna_detector/README.md: -------------------------------------------------------------------------------- 1 | # miRNA-detector 2 | Python library to detect miRNA mentions in plain texts. 
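A minimal usage sketch (the `sys.path` tweak assumes you run from the repository root; the example sentence and printed formatting are illustrative, not part of the library):

```python
import sys
sys.path.append("./packages")

from mirna_detector import validate, is_mirna

# validate() returns the sentence together with the detected, normalized miRNA mentions
result = validate("We found that both miR-17 and miR-20a target the UBE2C gene.")
for hit in result["detectedMirnas"]:
    print hit["mirna"], "<-", hit["origin"]

# is_mirna() is a boolean check on a single candidate string
print is_mirna("miR-17")  # True
```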
3 | -------------------------------------------------------------------------------- /packages/mirna_detector/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import re 5 | import time 6 | from os import path 7 | from random import shuffle 8 | 9 | expression = '((hsa|mmu|\b)?-?(miRNA-|miR|\(miR\)|micoRNA|hsa-let|let-|microRNA-|micro ribonucleic acid)(-|\s|\d)?((-|\w|\*|\/)*\d+(-|\w|\/)*)+\*?)' 10 | extraction = '(hsa-(mir|let)-\d+(-|\w|\/)*)+' 11 | 12 | mirnas = [] 13 | dataFilePath = path.join(path.dirname(path.realpath(__file__)), 'data/mirna.txt') 14 | with open(dataFilePath) as f: 15 | 16 | content = f.readlines() 17 | for line in content: 18 | values = line.split('\t') 19 | mirna = values[2] 20 | if mirna.find('hsa') != -1: 21 | mirnas.append(mirna) 22 | values = values[3].split(';') 23 | if values[0]!='': 24 | mirnas+=values 25 | 26 | 27 | def refine(results): 28 | parsed = [] 29 | for result in results: 30 | value = re.search(expression, result, flags=re.I) 31 | if value: 32 | parsed.append(value.group()) 33 | 34 | return parsed 35 | 36 | 37 | def filterMirnas(results): 38 | global mirnas 39 | 40 | return [result for result in results if (result in mirnas or re.search(extraction, result, flags=re.I))] 41 | 42 | 43 | def normalize(candidate): 44 | 45 | #--------- 46 | if not re.search(expression, candidate, flags=re.I): 47 | return (candidate, '', '') 48 | #--------- 49 | 50 | 51 | value = re.search(r'\d+', candidate) 52 | if not value: 53 | return (candidate,'','') 54 | 55 | if candidate[0] == '-': 56 | return (candidate, '', '') 57 | 58 | base = value.group() 59 | namePrefix = 'hsa-mir-' 60 | if re.search('let', candidate, re.I): 61 | namePrefix = 'hsa-let-' 62 | 63 | baseIndex = candidate.find(base) + len(base) 64 | nameSufix = candidate[baseIndex:] 65 | 66 | return (namePrefix, base, nameSufix) 67 | 68 | 69 | def splitter(results): 70 | results = [result for result in results.split('/') if len(result)>0] 71 | # extract the mirna number base aka mir-(\d+) 72 | namePrefix, base, nameSufix = normalize(results[0]) 73 | splitted = [] 74 | 75 | splitted.append(namePrefix+base+nameSufix) 76 | 77 | for result in results[1:]: 78 | 79 | if not result[0].isdigit() and len(result)==1: 80 | splitted.append(namePrefix+base+result) 81 | else: 82 | np, b, ns = normalize(result) 83 | if np == result: 84 | value = namePrefix+result 85 | value = value.replace('--', '-') 86 | splitted.append(value) 87 | else: 88 | splitted.append(np+b+ns) 89 | return refine(splitted) 90 | 91 | 92 | def expand(sentence, result, limit): 93 | value = result[0] 94 | 95 | if sentence[result[2]] == ',' or sentence[result[2]:result[2]+5]==' and ' if len(sentence)>result[2]+5 else False: 96 | expanded = sentence[result[2]:limit].replace(', ','/') 97 | expanded = re.sub(r'\s?and ', '/', expanded) 98 | spaceIndex = expanded.find(' ') 99 | value += expanded 100 | return value 101 | 102 | 103 | def validate(sentence): 104 | detected = {"sentence":sentence, "detectedMirnas":[]} 105 | 106 | if type(sentence) == list: 107 | sentence = ' '.join(sentence) 108 | p = re.compile(expression, flags=re.I|re.M) 109 | 110 | results = [] 111 | for m in p.finditer(sentence): 112 | results.append((m.group(), m.start(), m.end())) 113 | 114 | parsedResults = [] 115 | for index, result in enumerate(results): 116 | lastIndex = 0 117 | if index+10: 126 | for miRNA in values: 127 | detected["detectedMirnas"].append({ 128 | "mirna": 
miRNA, 129 | "origin": extractValue 130 | }) 131 | parsedResults += values 132 | 133 | return detected 134 | 135 | def is_mirna(text): 136 | values = splitter(text) 137 | values = filterMirnas(values) 138 | return (len(values) > 0) 139 | -------------------------------------------------------------------------------- /packages/mirna_detector/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __init__ import validate 5 | 6 | total = 0 7 | errors = 0 8 | 9 | def testValidate(value, expected, index): 10 | global total 11 | global errors 12 | total+=1 13 | try: 14 | result = validate(value) 15 | result = [value['mirna'] for value in result['detectedMirnas']] 16 | assert result == expected 17 | except AssertionError as e: 18 | errors+=1 19 | print index, ': >>> ', result, 'not equals to', expected 20 | 21 | 22 | testValidate("""Validation assays confirmed the dysregulation of miR-223, 23 | miR-146a and miR-155 previously associated with human rheumatoid 24 | arthritis (RA) pathology, as well as that of miR-221/222 and 25 | miR-323-3p.""", 26 | ['hsa-mir-223', 'hsa-mir-146a', 'hsa-mir-155', 'hsa-mir-221', 'hsa-mir-222', 'hsa-mir-323-3p'], 27 | 0) 28 | 29 | testValidate("""We found that both miR-17 and miR-20a (miR-17/20a) target 30 | the UBE2C gene in gastric cancer cells.""", 31 | ['hsa-mir-17', 'hsa-mir-20a', 'hsa-mir-17', 'hsa-mir-20a'], 32 | 1) 33 | 34 | testValidate("""Collectively, these data identify the E2F1/miR-421/Pink 35 | axis as a regulator of mitochondrial fragmentation and cardiomyocyte 36 | apoptosis, and suggest potential therapeutic targets in treatment of 37 | cardiac diseases.""", 38 | ['hsa-mir-421'], 39 | 2) 40 | 41 | testValidate("""Based on extant data linking MIR 137 gene with structural 42 | brain anomalies and functional brain activations in schizophrenia, 43 | we hypothesized that MIR137 risk variants rs1625579 and rs1198588 44 | would be associated with reduced fractional anisotropy in 45 | frontostriatal brain regions, impaired neurocognitive functioning 46 | and worse psychotic symptoms in schizophrenia patients compared with 47 | healthy controls.""", 48 | ['hsa-mir-137', 'hsa-mir-137'],3) 49 | 50 | testValidate("""Betulinic acid-dependent repression of Sp1, Sp3, Sp4, 51 | and Sp-regulated genes was due, in part, to induction of the Sp 52 | repressor ZBTB10 and downregulation of microRNA-27a (miR-27a), which 53 | constitutively inhibits ZBTB10 expression, and we show for the first 54 | time that the effects of betulinic acid on the miR-27a:ZBTB10-Sp 55 | transcription factor axis were cannabinoid 1 (CB1) and CB2 56 | receptor-dependent, thus identifying a new cellular target for this 57 | anticancer agent.""", 58 | ['hsa-mir-27a', 'hsa-mir-27a', 'hsa-mir-27a'],4) 59 | 60 | 61 | testValidate("""The reintroduction of miR-148a and miR-34b/c in cancer 62 | cells with epigenetic inactivation inhibited their motility, reduced 63 | tumor growth, and inhibited metastasis formation in xenograft models, 64 | with an associated down-regulation of the miRNA oncogenic target 65 | genes, such as C-MYC, E2F3, CDK6, and TGIF2.""", 66 | ['hsa-mir-148a', 'hsa-mir-34b', 'hsa-mir-34c'],5) 67 | 68 | 69 | testValidate("""TGF), known to inhibit mir-200c expression in tumour 70 | cells , along with feedforward and negative feedback loops between the 71 | miR-200/ZEB1/JAG1 (; ; this study) would establish a vicious circle' in 72 | the metastatic bone microenvironment (model ), as may also 
apply to the 73 | organ tropism of other cancers.""", 74 | ['hsa-mir-200c', 'hsa-mir-200'],6) 75 | 76 | 77 | testValidate("""The roles of miR-17-5p and p21 were evaluated with 78 | specific antisense oligonucleotides (ODN) that were designed and 79 | used to inhibit their expression.""", 80 | ['hsa-mir-17-5p'],7) 81 | 82 | 83 | testValidate("""Consistently, depletion of miR-26a/b by miR-26 sponge 84 | could increase the activity of luciferase reporter genes fused to 85 | the 3 UTR of the same cohort of nine genes (AGPAT5, CHD1, ERLIN1, 86 | GREB1, HSPA8, KPNA2, MREG, NARG1 and PLOD2) by more than 30% 87 | (Figured).""", 88 | ['hsa-mir-26a', 'hsa-mir-26b', 'hsa-mir-26'],8) 89 | 90 | 91 | testValidate("""In our present study, we found that the expression of 92 | miR-361-5p in CRPC was lower than in androgen-dependent prostate 93 | cancer (ADPC), indicating that miR-361-5p may play an important role 94 | in the progression of ADPC to CRPC.""", 95 | ['hsa-mir-361-5p', 'hsa-mir-361-5p'],9) 96 | 97 | 98 | testValidate("""The expression of miR-146a, CD40, CD80 and CD86 on AchR 99 | specific B cells were analyzed by qRT-PCR and flow cytometry.""", 100 | ['hsa-mir-146a'],10) 101 | 102 | 103 | testValidate("""METHODS: We examined the association between the 104 | expression of miR-16, miR-21, miR-93, miR-135b, miR-146a, and miR-182 105 | in total RNA from the placentas of 86 term infants as measured by 106 | quantitative real-time PCR and newborn neurobehavioral outcomes as 107 | assessed using the NICU Network Neurobehavioral Scales (NNNS).""", 108 | ['hsa-mir-16', 'hsa-mir-21', 'hsa-mir-93', 'hsa-mir-135b', 'hsa-mir-146a', 'hsa-mir-182'], 109 | 11) 110 | 111 | 112 | testValidate("""METHODS: The current study validates nine miRNAs 113 | (miR-18a/b miR-25, miR-29c, miR-106b, miR375, miR-424, miR-505 and 114 | let-7b) significantly correlated with established prognostic 115 | breast cancer biomarkers.""", 116 | ['hsa-mir-18a', 'hsa-mir-18b', 'hsa-mir-25', 'hsa-mir-29c', 'hsa-mir-106b', 'hsa-mir-375', 117 | 'hsa-mir-424', 'hsa-mir-505', 'hsa-let-7b'],12) 118 | 119 | 120 | testValidate("""No significant relationships were observed between these 121 | two single nucleotide polymorphisms (SNPs) and onset risk of HCC 122 | after adjusting the factors as age, gender, smoking and drinking 123 | status in comparison with HBsAg positive controls: hsa-mir-146a 124 | rs2910164 (CG + GG vs CC): adjusting OR = 1.10, 95%CI: 0.90 - 1.36; 125 | hsa-mir-196-a2 rs11614913 (CC + CT vs TT): adjusting OR = 1.01, 126 | 95%CI: 0.81 - 1.25; as well as in comparison with HBsAg negative 127 | controls: hsa-mir-146a rs2910164 (CG + GG vs CC): adjusting OR = 128 | 1.06, 95%CI: 0.87 - 1.29; hsa-mir-196-a2 rs11614913 (CC + CT vs TT): 129 | adjusting OR = 0.94, 95%CI: 0.76 - 1.16.""", 130 | ['hsa-mir-146a', 'hsa-mir-196-a2', 'hsa-mir-146a', 'hsa-mir-196-a2'], 131 | 13) 132 | 133 | 134 | testValidate("""Hsa-miR-96 caused a decrease in SOX5 3-UTR luciferase 135 | activity by 60.34%4.79%, and both hsa-miR-7 and hsa-miR-17 caused 136 | a decrease in NR4A3 3-UTR luciferase activity by 65.01%4.07% and 137 | 45.11%6.76, respectively, compared with controls (FigureC).""", 138 | ['hsa-mir-96', 'hsa-mir-7', 'hsa-mir-17'],14) 139 | 140 | 141 | testValidate("""Furthermore, miR-20a and miR-17-5p were increased in 142 | the metastatic carcinoma and six atypical pituitary adenomas as 143 | compared to eight typical pituitary adenomas as measured by 144 | quantitative real-time PCR.""", 145 | ['hsa-mir-20a', 'hsa-mir-17-5p'],15) 146 | 147 | 148 | 
testValidate("""These results suggested that anti-miR-33a inhibit 149 | activation and extracellular matrix production, at least in part, 150 | via the activation of PI3K/Akt pathway and PPAR-a and anti sense 151 | of miR-33a may be a novel potential therapeutic approach for 152 | treating hepatic fibrosis in the future.""", 153 | ['hsa-mir-33a'],16) 154 | 155 | 156 | testValidate("""The results suggested that the miR-181b, miR-219-2-3p, 157 | miR-346, miR-195, miR-1308, miR-92a, miR-17, miR-103 and let-7g are 158 | the key players to reflect the schizophrenia illnesses status and 159 | may serve as candidate biomarkers for diagnosis of schizophrenia.""", 160 | ['hsa-mir-181b', 'hsa-mir-219-2-3p', 'hsa-mir-346', 'hsa-mir-195', 'hsa-mir-1308', 161 | 'hsa-mir-92a', 'hsa-mir-17', 'hsa-mir-103', 'hsa-let-7g'],17) 162 | 163 | 164 | testValidate("""Specifically discussed miRs include miR-7, miR-9/miR-9*, 165 | miR-10a/miR-10a*/miR-10b, miR-15b, miR-17-92, miR-21, miR-26a, 166 | miR-34a, miR-93, miR-101, miR-124, miR-125a, miR-125b, miR-128, 167 | miR-137, miR-146b-5p, miR-153, miR-181a/miR-181b, miR-196a/miR-196b, 168 | miR-218, miR-221/miR-222, miR-296, miR-302-367, miR-326, miR-381, 169 | miR-451, and let-7a.""", 170 | ['hsa-mir-7', 'hsa-mir-9', 'hsa-mir-9*', 'hsa-mir-10a', 'hsa-mir-10a*', 'hsa-mir-10b', 171 | 'hsa-mir-15b', 'hsa-mir-17-92', 'hsa-mir-21', 'hsa-mir-26a', 'hsa-mir-34a', 'hsa-mir-93', 172 | 'hsa-mir-101', 'hsa-mir-124', 'hsa-mir-125a', 'hsa-mir-125b', 'hsa-mir-128', 'hsa-mir-137', 173 | 'hsa-mir-146b-5p', 'hsa-mir-153', 'hsa-mir-181a', 'hsa-mir-181b', 'hsa-mir-196a', 174 | 'hsa-mir-196b', 'hsa-mir-218', 'hsa-mir-221', 'hsa-mir-222', 'hsa-mir-296', 'hsa-mir-302-367', 175 | 'hsa-mir-326', 'hsa-mir-381', 'hsa-mir-451', 'hsa-let-7a'],18) 176 | 177 | 178 | testValidate("""TT genotype for miR-196a2 gene also showed 3.2-fold 179 | risk toward LC and the risk was fivefold higher for squamous cell 180 | carcinoma.""", 181 | ['hsa-mir-196a2'],19) 182 | 183 | 184 | testValidate("""Thus, loss of miR-125b-1 may have a key role in the 185 | pathogenesis and progression of squamous cell carcinomas of head 186 | and neck and possibly of other tumors.""", 187 | ['hsa-mir-125b-1'],20) 188 | 189 | 190 | testValidate("""The present prospective case-control study investigated 191 | the involvement of microRNA (miR)-10b in the development of bone 192 | metastasis arising from primary breast carcinoma.""", 193 | ['hsa-mir-10b'],21) 194 | 195 | 196 | testValidate("""Four other miRNAs (miR-146b, -181b, let-7a and let-7c) 197 | are known oncogenic or tumor suppressor miRNAs.""", 198 | ['hsa-mir-146b', 'hsa-mir-181b', 'hsa-let-7a', 'hsa-let-7c'],22) 199 | 200 | 201 | testValidate("""BACKGROUND: The purpose of this study was to identify 202 | new tumour suppressor microRNAs (miRs) in clear cell renal cell 203 | carcinoma (ccRCC), carry out functional analysis of their suppressive 204 | role and identify their specific target genes.""", 205 | [],23) 206 | 207 | 208 | testValidate("""Subsequent quantitative PCR analyses of these splenic 209 | B cells revealed that C/EBPb, a transcriptional regulator of 210 | interleukin-6 that is linked to B-cell lymphoproliferative 211 | disorders, is downregulated when either miR-K12-11 or miR-155 is 212 | ectopically expressed.""", 213 | ['hsa-mir-K12-11', 'hsa-mir-155'],24) 214 | 215 | 216 | testValidate("""Thus, there is a possibility that the lack of change 217 | in miRs-182 and -96 following acoustic trauma is due to a slower 218 | degradation rate or no degradation 
compared to the targeted degradation 219 | of miR-183, which in turn may lead to the inconsistent expression 220 | pattern of these miRNAs within the cluster.""", 221 | ['hsa-mir-182', 'hsa-mir-96', 'hsa-mir-183'],25) 222 | 223 | 224 | testValidate("""Hsa-miR-92b and hsa-miR-9/9* were reported previously 225 | to be expressed in brain tumors and in cell lines derived from 226 | brain tumors and were documented to be expressed specifically in 227 | the developing nervous system """, 228 | ['hsa-mir-92b', 'hsa-mir-9', 'hsa-mir-9*'],26) 229 | 230 | 231 | testValidate("""However, only 3 miRNAs (miR-199a-5p, -27a, and -29a) 232 | correlated with hypertrophy; more importantly, only miR-29a 233 | correlated also with fibrosis.""", 234 | ['hsa-mir-199a-5p', 'hsa-mir-27a', 'hsa-mir-29a', 'hsa-mir-29a'],27) 235 | 236 | 237 | testValidate("""We found that target genes such as CDH1 (miR-1/206), 238 | ATM (miR-18a/b), KLF6 (miR-18a/b and miR-181c), Smad2(miR-18a/b, 239 | miR-1/206 and miR-149), Dicer were down expressed with the 240 | development of NPC, while BCL2L2 (miR-29a/b/c and miR-203), and YY1 241 | (miR-29a/b/c) were overexpressed during the development of NPC.""", 242 | ['hsa-mir-1', 'hsa-mir-206', 'hsa-mir-18a', 'hsa-mir-18b', 'hsa-mir-18a', 'hsa-mir-18b', 243 | 'hsa-mir-181c', 'hsa-mir-18a', 'hsa-mir-18b', 'hsa-mir-1', 'hsa-mir-206', 'hsa-mir-149', 244 | 'hsa-mir-29a', 'hsa-mir-29b', 'hsa-mir-29c', 'hsa-mir-203', 'hsa-mir-29a', 'hsa-mir-29b', 245 | 'hsa-mir-29c'],28) 246 | 247 | 248 | testValidate("""The miR-200 family (miR-200a, -200b, -200c, -141 and -429) 249 | and miR-205 are frequently silenced in advanced cancer and have been 250 | implicated in epithelial to mesenchymal transition (EMT) and tumor 251 | invasion by targeting the transcriptional repressors of E-cadherin, 252 | ZEB1 and ZEB2.""", 253 | ['hsa-mir-200', 'hsa-mir-200a', 'hsa-mir-200b', 'hsa-mir-200c', 'hsa-mir-141', 'hsa-mir-429', 254 | 'hsa-mir-205'],29) 255 | 256 | 257 | testValidate("""Here, the expression of the miRNAs miR-15a/16-1 in 258 | PBMC, CD4, and CD8 from RR-MS patients has been investigated.""", 259 | ['hsa-mir-15a', 'hsa-mir-16-1'],30) 260 | 261 | 262 | testValidate("""Subsequent quantitative PCR analyses of these splenic 263 | B cells revealed that C/EBPb, a transcriptional regulator of 264 | interleukin-6 that is linked to B-cell lymphoproliferative 265 | disorders, is downregulated when either miR-K12-11 is 266 | ectopically expressed.""", 267 | ['hsa-mir-K12-11'], 31) 268 | 269 | print total-errors, '/',total, (total-errors)*100/total, '% test passed' -------------------------------------------------------------------------------- /packages/model_tools.py: -------------------------------------------------------------------------------- 1 | import data_preparation_tools as dpt 2 | import features_generation_tools as fgt 3 | from sklearn.externals import joblib 4 | 5 | DEFAULT_VERSION = "1.1" 6 | 7 | class ScoringModel: 8 | # TODO: add the ability to pass in the array of transformations as well...for now we just use eveyrthing 9 | def __init__(self, features_generator, ml_model, transformations=dpt.TRANSFORMATION_STEPS, version=DEFAULT_VERSION): 10 | self.transformations = transformations 11 | self.features_generator = features_generator 12 | self.ml_model = ml_model 13 | self.version = version 14 | 15 | def score(self, text, context): 16 | transformed = dpt.run_transformations_on_single_sentence(text, context,self.transformations) 17 | features = self.features_generator.transform([transformed]) 18 | 
return self.ml_model.predict_proba(features)[0] 19 | 20 | def save_model(self, file_path): 21 | return joblib.dump(self,file_path, compress=True) 22 | 23 | @staticmethod 24 | def from_file(file_path): 25 | return joblib.load(file_path) -------------------------------------------------------------------------------- /webservice/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | from flask import request 3 | from flask import jsonify 4 | from flask import Response 5 | from sklearn.externals import joblib 6 | import json 7 | import scorer 8 | 9 | # convert the scorer result to the response format expected by the client 10 | # TODO: move to another module 11 | def scorer_result_to_response_format(scoring_results_and_entities): 12 | response_result = { 'modelVersion' : scorer.get_version(), 'relations' : []} 13 | 14 | for tup in scoring_results_and_entities: 15 | relation = {} 16 | scorer_result = tup[0] 17 | relation['entities'] = tup[1] 18 | max_score = -1 19 | max_score_class = "" 20 | scores_list = [] 21 | for i in range(len(scorer_result)): 22 | if (scorer_result[i] > max_score): 23 | max_score = scorer_result[i] 24 | max_score_class = str(i) 25 | 26 | relation['classification'] = max_score_class 27 | relation['score'] = max_score 28 | response_result['relations'].append(relation) 29 | 30 | return response_result 31 | 32 | app = Flask(__name__) 33 | 34 | @app.route('/') 35 | def api_root(): 36 | return 'Relation classification service' 37 | 38 | @app.route('/score', methods = ['POST']) 39 | def score(): 40 | if request.headers['Content-Type'] != 'application/json': 41 | resp = Response('Unssuported content type, expected application/json', status=500); 42 | return resp 43 | if (not request.json.has_key('text')): 44 | resp = Response('Bad request: missing "text" field in JSON body', status=500); 45 | return resp 46 | if (not request.json.has_key('entities')): 47 | resp = Response('Bad request: missing "entities" field in JSON body', status=500); 48 | return resp 49 | 50 | text = request.json['text'] 51 | entities = request.json['entities'] 52 | try: 53 | scorerResult = scorer.evaluate_score(text, entities) 54 | resp = jsonify(scorer_result_to_response_format(scorerResult)) 55 | resp.status_code = 200 56 | return resp 57 | except Exception as e: 58 | resp = Response("Internal Server Error: %s"%e, status = 500) 59 | return resp 60 | 61 | @app.route('/updatemodel', methods = ['POST']) 62 | def update_model(): 63 | if request.headers['Content-Type'] != 'application/json': 64 | resp = Response('Unssuported content type, expected application/json', status=500); 65 | return resp 66 | if (not request.json.has_key('path')): 67 | resp = Response('Bad request: missing "path" field in JSON body', status=500); 68 | return resp 69 | 70 | path = request.json['path'] 71 | try: 72 | scorer.load_model_from_url(path) 73 | resp = Response("", status=200); 74 | return resp 75 | except Exception as e: 76 | resp = Response("Internal Server Error: %s"%e, status = 500) 77 | return resp 78 | 79 | 80 | if __name__ == '__main__': 81 | app.run() -------------------------------------------------------------------------------- /webservice/scorer.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import itertools 3 | import os 4 | import pickle 5 | import sys 6 | import sklearn 7 | import urllib 8 | from mirna_detector import is_mirna 9 | from os import path 10 | 11 | # add the dir above to path 
12 | # TODO: make this more standard... 13 | current_dir_path = path.dirname(path.realpath(__file__)) 14 | sys.path.append(path.join(path.dirname(current_dir_path),"packages")) 15 | from model_tools import ScoringModel 16 | 17 | # TODO: Create dev and prod envs.. 18 | model_file_name = r"scoring_model.pkl" 19 | model_directory_path = path.join(current_dir_path, r"model") 20 | model_file_path = path.join(model_directory_path,model_file_name) 21 | 22 | scoring_model = None 23 | 24 | try: 25 | scoring_model = ScoringModel.from_file(model_file_path) 26 | except Exception as e: 27 | print "Failed loading model: %s"%e 28 | 29 | def get_text_from_entity_dict(e): 30 | if e.has_key("origin"): 31 | return e["origin"] 32 | if e.has_key("value"): 33 | return e["value"] 34 | return None 35 | 36 | def get_version(): 37 | if (scoring_model == None): 38 | raise Exception("No model file loaded, or bad model exists") 39 | return scoring_model.version 40 | 41 | def get_temp_model_path(): 42 | return path.join(model_directory_path, model_file_name + "_" + datetime.datetime.now().strftime("%y%m%d_%H%M%S")) 43 | 44 | def load_model_from_url(url): 45 | # TODO: move this into a class.. 46 | global scoring_model 47 | url_opener = urllib.URLopener() 48 | temp_model_path = get_temp_model_path() 49 | url_opener.retrieve(url, temp_model_path) 50 | 51 | # try to load the model: 52 | try: 53 | temp_model = ScoringModel.from_file(temp_model_path) 54 | except Exception as e: 55 | print "Failed to load donwloaded model: %s"%e 56 | os.remove(temp_model_path) 57 | raise RuntimeError("Failed to load donwloaded model! error: %s"%e) 58 | 59 | # update model: 60 | scoring_model = temp_model 61 | 62 | # delete existing model 63 | if (path.isfile(model_file_path)): 64 | os.remove(model_file_path) 65 | os.rename(temp_model_path, model_file_path) 66 | 67 | 68 | # TODO: move this to an object with an init function... 
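# evaluate_score (below) expects `entities` as a list of dicts carrying at least:
#   "type"          - entity type string (e.g. "gene" or "mirna"; lower-cased here),
#   "from" / "to"   - integer character offsets of the mention inside `sentence`,
#   "origin" or "value" - the surface text of the mention.
# It returns a list of (class-probability array, (mirna_entity, gene_entity))
# tuples, one per miRNA/gene pair found in the sentence.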
69 | def evaluate_score(sentence, entities): 70 | if (scoring_model == None): 71 | raise Exception("No model file loaded, or bad model exists") 72 | 73 | for e in entities: 74 | if e.has_key("type"): 75 | e["type"] = e["type"].lower() 76 | 77 | # TODO: 78 | # We merge the entities here such that there are no overlaps, and also check for the mirna specifically 79 | # this should basically solved on the entity recognition side 80 | # we should consider removing this part when this issue is solved 81 | filtered_entities = [] 82 | for entity in entities: 83 | # check if we don't have collisions, always take the longer string 84 | objects_to_remove = [] 85 | add_entity = True 86 | 87 | start_index = int(entity["from"]) 88 | end_index = int(entity["to"]) 89 | text = get_text_from_entity_dict(entity) 90 | 91 | for e in filtered_entities: 92 | if (start_index <= e["start_index"] and end_index >= e["end_index"] 93 | or start_index <= e["start_index"] and end_index >= e["start_index"] 94 | or start_index <= e["end_index"] and end_index >= e["end_index"]): 95 | current_len = len(text) 96 | second_len = len(e["text"]) 97 | if (current_len <= second_len): 98 | add_entity = False 99 | break 100 | else: 101 | objects_to_remove.append(e) 102 | 103 | for e in objects_to_remove: 104 | filtered_entities.remove(e) 105 | 106 | # todo: for now using the mirna detector to detect miRNA since it seems that 107 | # the current results are currently not accurate 108 | if (not add_entity): 109 | continue 110 | 111 | type = entity["type"] 112 | if (is_mirna(text)): 113 | type = "mirna" 114 | entity["type"] = type 115 | 116 | filtered_entities.append({ 117 | "text" : text, 118 | "type" : type, 119 | "start_index" : start_index, 120 | "end_index" : end_index, 121 | "original_entity" : entity 122 | }) 123 | 124 | # for now just return the same result for all pairs of gens / miRNA entities: 125 | mirna_entities = [e for e in filtered_entities if (e["type"]=="mirna")] 126 | gene_entities = [e for e in filtered_entities if (e["type"]=="gene")] 127 | scores = [] 128 | 129 | for p in itertools.product(mirna_entities, gene_entities): 130 | context = {"pair_entities" :[ 131 | p[0], 132 | p[1] 133 | ], 134 | "all_entities" : filtered_entities 135 | } 136 | 137 | score = scoring_model.score(sentence, context) 138 | scores.append((score, (p[0]["original_entity"],p[1]["original_entity"]))) 139 | 140 | return scores 141 | -------------------------------------------------------------------------------- /webservice/scorer_ws.pyproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Debug 5 | 2.0 6 | {594fde71-3dd6-4d9a-919b-56a57da8d3b7} 7 | 8 | app.py 9 | 10 | . 11 | . 12 | {888888a0-9f3d-457c-b088-3a5042f75d52} 13 | Standard Python launcher 14 | 15 | 16 | 17 | 18 | 19 | 20 | 10.0 21 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | --------------------------------------------------------------------------------
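For reference, a hedged sketch of exercising the Flask scoring service in `webservice/app.py` once a trained model has been saved to `webservice/model/scoring_model.pkl`. The host/port, example sentence, and character offsets below are assumptions for illustration only:

```python
import json
import requests

# The service must be running locally, e.g. `python webservice/app.py`
# (Flask's default host/port are assumed here).
payload = {
    "text": "miR-21 regulates PTEN expression in these cells.",
    "entities": [
        {"type": "mirna", "from": 0, "to": 6, "value": "miR-21"},
        {"type": "gene", "from": 17, "to": 21, "value": "PTEN"},
    ],
}

resp = requests.post("http://localhost:5000/score",
                     data=json.dumps(payload),
                     headers={"Content-Type": "application/json"})

print resp.status_code
# The response carries the model version plus one classification/score
# entry per miRNA-gene pair found in the sentence.
print resp.json()
```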