├── .gitignore ├── LICENSE ├── README.md ├── data_preparation.ipynb ├── demo_input ├── negative_samples_small.tsv ├── negative_with_entities_small.tsv ├── positive_samples_small.tsv └── positive_with_entities_small.tsv ├── demo_output ├── background_samples.txt ├── temp_doc2vec.txt └── top_scoring_positive_class.txt ├── features_classification.ipynb ├── packages ├── data_preparation_tools.py ├── features_generation_tools.py ├── mirna_detector │ ├── LICENSE │ ├── README.md │ ├── __init__.py │ ├── data │ │ └── mirna.txt │ ├── test.py │ └── tests │ │ └── sentences.txt └── model_tools.py └── webservice ├── app.py ├── scorer.py └── scorer_ws.pyproj /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | # private data 65 | *private.* 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Catalyst Code 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # corpus-to-graph-ml 2 | This repository contains the machine learning work for the corpus-to-graph project, including Jupyter research notebooks and a Flask webservice to host the trained model. 3 | 4 | The packages folder contains Python code with the main logic for the data transformation and feature generation tools. 5 | 6 | The webservice folder contains an example of a Flask-based scoring service that can be used to expose the trained model. 7 | 8 | The data_preparation notebook contains an example of running the data transformation pipeline, and the features_classification notebook contains code examples for generating different features and for training and evaluating different classifiers. 9 | 10 | The only missing piece that should be provided is an entity recognition endpoint (here we used [GNAT](http://gnat.sourceforge.net/)). Alternatively, you can provide a text file with the results of the entity recognition process. 11 | 12 | # Dependencies: 13 | 14 | We highly recommend using the [Anaconda](https://www.continuum.io/downloads) distribution (or any similar distribution) to make your life easier, as it comes with most of the packages we use in these notebooks. 15 | 16 | In our notebooks, we use the following libraries: 17 | 18 | - [scikit-learn](http://scikit-learn.org/stable/install.html) (comes pre-installed with Anaconda) 19 | - [NLTK](http://www.nltk.org/install.html) (make sure to install the NLTK stopwords, lemmatization, and stemming packages by [calling nltk.download() manually](http://www.nltk.org/data.html)) 20 | - [gensim](https://radimrehurek.com/gensim/install.html) (make sure that you have *cython* installed beforehand in order to run the optimized version of the code) 21 | - [spacy.io](https://spacy.io/docs/#getting-started) - for spaCy, make sure you have the English model installed 22 | - [requests](https://pypi.python.org/pypi/requests) 23 | -------------------------------------------------------------------------------- /data_preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "scrolled": true 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "producing inputs for binary case...\n", 16 | "before data split\n", 17 | "splitted data\n", 18 | "running step:entities\n", 19 | "running step:trim\n", 20 | "running step:normalize\n", 21 | "running step:rmdigits\n", 22 | "Finished producing files, took:0:00:00.156000\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "import sys\n", 28 | "sys.path.append(\"./packages\")\n", 29 | "\n", 30 | "import data_preparation_tools as dpt\n", 31 | "reload(dpt)\n", 32 | "import datetime\n", 33 | "from datetime import date\n", 34 | "\n", 35 | "# uncomment when downloading nltk data for the first time\n", 36 | "#import nltk\n", 37 | "#nltk.download()\n", 38 | "\n", 39 | "# an optional optimization:\n", 40 | "# import preprocessed text with entity recognition data\n", 41 | "# in order to prevent the server from going to GNAT each time\n", 42 | "# the input files are tab-separated files with the first\n", 43 | "# column containing the sentence and the remaining columns containing \n", 44 | "# the recognized entities\n",
45 | "texts_with_entities_file_paths = [r\"./demo_input/positive_with_entities_small.tsv\",\n", 46 | " r\"./demo_input/negative_with_entities_small.tsv\"]\n", 47 | "\n", 48 | "for p in texts_with_entities_file_paths:\n", 49 | " dpt.import_to_texts_entities_dictonary_from_file(p)\n", 50 | "today = date.today()\n", 51 | "prefix = \"%s_%s_%s\"%(today.month,today.day,today.year)\n", 52 | "\n", 53 | "# fill out these paths\n", 54 | "\n", 55 | "output_dir = r\"./demo_output\"\n", 56 | "positive_samples_path = r\"./demo_input/positive_samples_small.tsv\"\n", 57 | "negative_samples_path = r\"./demo_input/negative_samples_small.tsv\"\n", 58 | "\n", 59 | "start_time = datetime.datetime.now()\n", 60 | "\n", 61 | "# this pipeline will produce files with all possible outputs\n", 62 | "dpt.run_data_preparation_pipeline(positive_samples_path,negative_samples_path, prefix, output_dir, run_multiclass=False)\n", 63 | "\n", 64 | "run_time = datetime.datetime.now() - start_time\n", 65 | "print \"Finished producing files, took:%s\"%run_time" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 2", 81 | "language": "python", 82 | "name": "python2" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 2 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython2", 94 | "version": "2.7.11" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 0 99 | } 100 | -------------------------------------------------------------------------------- /demo_input/negative_samples_small.tsv: -------------------------------------------------------------------------------- 1 | 406953 MESH:D012021 11693395 mir18a \N \N \N The influence of microgravity on voluntary and electrically evoked contractions of the triceps surae (TS) of seven crewmembers was studied before and after spaceflights (space MIR flights: MIR-18, -22, -25, -26, and -27). The influence of microgravity on voluntary and electrically evoked contractions of the triceps surae ( TS ) of seven crewmembers was studied before and after spaceflights ( space MIR flights : MIR-18 , -22 , -25 , -26 , and -27 ) . The influence microgravity voluntary electrically evoked contractions triceps surae TS seven crewmembers studied spaceflights space MIR flights MIR-18 -22 -25 -26 -27 MIR-18, -22, -25, -26, and -27 triceps surae 2 | 574502 9606 12749511 mir500a \N \N \N Eligible patients were to have a glycated hemoglobin (HbA1c) value < or = 8.5% and mean fasting plasma glucose (FPG) concentrations < or = 200 mg/dL while receiving MIR 500 mg BID for at least 8 weeks. Eligible patients were to have a glycated hemoglobin ( HbA1c ) value < or = 8.5 % and mean fasting plasma glucose ( FPG ) concentrations < or = 200 mg/dL while receiving MIR 500 mg BID for at least 8 weeks . Eligible patients glycated hemoglobin HbA1c value 8.5 mean fasting plasma glucose FPG concentrations 200 mg/dL receiving MIR 500 mg BID least 8 weeks MIR 500 patient 3 | 574502 9606 12749511 mir500a \N \N \N After a 2-week, single-blind lead-in period, patients were randomly assigned to receive MXR 1000 or 1500 mg QD for 24 weeks or to continue MIR 500 mg BID for 24 weeks. 
After a 2-week , single-blind lead-in period , patients were randomly assigned to receive MXR 1000 or 1500 mg QD for 24 weeks or to continue MIR 500 mg BID for 24 weeks . After 2-week single-blind lead-in period patients randomly assigned receive MXR 1000 1500 mg QD 24 weeks continue MIR 500 mg BID 24 weeks MIR 500 patient 4 | 494324 CHEBI:29108 15538371 mir375 \N \N \N The mechanism by which secretion is modified by miR-375 is independent of changes in glucose metabolism or intracellular Ca2+-signalling but correlated with a direct effect on insulin exocytosis. The mechanism by which secretion is modified by miR-375 is independent of changes in glucose metabolism or intracellular Ca2+-signalling but correlated with a direct effect on insulin exocytosis . The mechanism secretion modified miR-375 independent changes glucose metabolism intracellular Ca2+-signalling correlated direct effect insulin exocytosis miR-375 Ca2+ 5 | 494324 MESH:D005947 15538371 mir375 \N \N \N The mechanism by which secretion is modified by miR-375 is independent of changes in glucose metabolism or intracellular Ca2+-signalling but correlated with a direct effect on insulin exocytosis. The mechanism by which secretion is modified by miR-375 is independent of changes in glucose metabolism or intracellular Ca2+-signalling but correlated with a direct effect on insulin exocytosis . The mechanism secretion modified miR-375 independent changes glucose metabolism intracellular Ca2+-signalling correlated direct effect insulin exocytosis miR-375 glucose 6 | 406934 MESH:D007938 15737576 mir142 \N \N \N Some human miRNAs are linked to leukemias: the miR-15a/miR-16 locus is frequently deleted or down-regulated in patients with B-cell chronic lymphocytic leukemia and miR-142 is at a translocation site found in a case of aggressive B-cell leukemia. Some human miRNAs are linked to leukemias : the miR-15a/miR-16 locus is frequently deleted or down-regulated in patients with B-cell chronic lymphocytic leukemia and miR-142 is at a translocation site found in a case of aggressive B-cell leukemia . Some human miRNAs linked leukemias miR-15a/miR-16 locus frequently deleted down-regulated patients B-cell chronic lymphocytic leukemia miR-142 translocation site found case aggressive B-cell leukemia miR-142 leukemias 7 | 406948 9606 15737576 mir15a \N \N \N Some human miRNAs are linked to leukemias: the miR-15a/miR-16 locus is frequently deleted or down-regulated in patients with B-cell chronic lymphocytic leukemia and miR-142 is at a translocation site found in a case of aggressive B-cell leukemia. Some human miRNAs are linked to leukemias : the miR-15a/miR-16 locus is frequently deleted or down-regulated in patients with B-cell chronic lymphocytic leukemia and miR-142 is at a translocation site found in a case of aggressive B-cell leukemia . Some human miRNAs linked leukemias miR-15a/miR-16 locus frequently deleted down-regulated patients B-cell chronic lymphocytic leukemia miR-142 translocation site found case aggressive B-cell leukemia miR-15a patients 8 | 406906 9606 15901636 mir122 \N \N \N We propose that specific microRNAs, such as Mirn122a, could be involved in the posttranscriptional regulation of mRNAs such as Tnp2 in the mammalian testis. We propose that specific microRNAs , such as Mirn122a , could be involved in the posttranscriptional regulation of mRNAs such as Tnp2 in the mammalian testis . 
We propose specific microRNAs Mirn122a could involved posttranscriptional regulation mRNAs Tnp2 mammalian testis Mirn122a mammalian 9 | 406952 MESH:D016393 15944707 mir17 \N \N \N Enforced expression of the mir-17-92 cluster acted with c-myc expression to accelerate tumour development in a mouse B-cell lymphoma model. Enforced expression of the mir-17-92 cluster acted with c-myc expression to accelerate tumour development in a mouse B-cell lymphoma model . Enforced expression mir-17-92 cluster acted c-myc expression accelerate tumour development mouse B-cell lymphoma model mir-17 B-cell lymphoma 10 | 406952 MESH:D008223 15944707 mir17 \N \N \N Tumours derived from haematopoietic stem cells expressing a subset of the mir-17-92 cluster and c-myc could be distinguished by an absence of apoptosis that was otherwise prevalent in c-myc-induced lymphomas. Tumours derived from haematopoietic stem cells expressing a subset of the mir-17-92 cluster and c-myc could be distinguished by an absence of apoptosis that was otherwise prevalent in c-myc-induced lymphomas . Tumours derived haematopoietic stem cells expressing subset mir-17-92 cluster c-myc could distinguished absence apoptosis otherwise prevalent c-myc-induced lymphomas mir-17 lymphomas 11 | -------------------------------------------------------------------------------- /demo_input/negative_with_entities_small.tsv: -------------------------------------------------------------------------------- 1 | The measured Mir-18 crew skin dose equivalent rate was 1133 microSv/day . Mir-18 2 | The influence of microgravity on voluntary and electrically evoked contractions of the triceps surae ( TS ) of seven crewmembers was studied before and after spaceflights ( space MIR flights : MIR-18 , -22 , -25 , -26 , and -27 ) . TS MIR MIR-18 3 | Eligible patients were to have a glycated hemoglobin ( HbA1c ) value < or = 8.5 % and mean fasting plasma glucose ( FPG ) concentrations < or = 200 mg/dL while receiving MIR 500 mg BID for at least 8 weeks . hemoglobin MIR 500 BID mg FPG dL 4 | After a 2-week , single-blind lead-in period , patients were randomly assigned to receive MXR 1000 or 1500 mg QD for 24 weeks or to continue MIR 500 mg BID for 24 weeks . mg MXR a 2 BID MIR 500 5 | The mechanism by which secretion is modified by miR-375 is independent of changes in glucose metabolism or intracellular Ca2+-signalling but correlated with a direct effect on insulin exocytosis . miR-375 insulin Ca2 6 | Some human miRNAs are linked to leukemias : the miR-15a/miR-16 locus is frequently deleted or down-regulated in patients with B-cell chronic lymphocytic leukemia and miR-142 is at a translocation site found in a case of aggressive B-cell leukemia . miR-15a B miR-142 miR-16 7 | We propose that specific microRNAs , such as Mirn122a , could be involved in the posttranscriptional regulation of mRNAs such as Tnp2 in the mammalian testis . Tnp2 Mirn122a 8 | Enforced expression of the mir-17-92 cluster acted with c-myc expression to accelerate tumour development in a mouse B-cell lymphoma model . c-myc B mir-17-92 9 | Tumours derived from haematopoietic stem cells expressing a subset of the mir-17-92 cluster and c-myc could be distinguished by an absence of apoptosis that was otherwise prevalent in c-myc-induced lymphomas . 
c-myc mir-17-92 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /demo_input/positive_samples_small.tsv: -------------------------------------------------------------------------------- 1 | 406884 136319 15806104 mirlet7b POSITIVE DIRECT DOWN In particular, we experimentally validate common regulation of Mtpn by miR-375, miR-124 and let-7b and thus provide evidence for coordinate microRNA control in mammals. In particular , we experimentally validate common regulation of Mtpn by miR-375 , miR-124 and let-7b and thus provide evidence for coordinate microRNA control in mammals . In particular experimentally validate common regulation Mtpn miR-375 miR-124 let-7b thus provide evidence coordinate microRNA control mammals let-7b Mtpn 2 | 406884 1573 22761738 mirlet7b NEGATIVE DIRECT NA CONCLUSIONS: Our results demonstrated that the decreased expression of let-7b could lead to the high expression of CYP2J2 protein in cancerous tissues. CONCLUSIONS : Our results demonstrated that the decreased expression of let-7b could lead to the high expression of CYP2J2 protein in cancerous tissues . CONCLUSIONS Our results demonstrated decreased expression let-7b could lead high expression CYP2J2 protein cancerous tissues Let-7b CYP2J2 3 | 406884 1573 22761738 mirlet7b NEGATIVE DIRECT NA CONCLUSIONS: Our results demonstrated that the decreased expression of let-7b could lead to the high expression of CYP2J2 protein in cancerous tissues. CONCLUSIONS : Our results demonstrated that the decreased expression of let-7b could lead to the high expression of CYP2J2 protein in cancerous tissues . CONCLUSIONS Our results demonstrated decreased expression let-7b could lead high expression CYP2J2 protein cancerous tissues let-7b CYP2J2 4 | 406884 1573 22761738 mirlet7b NEGATIVE DIRECT NA Furthermore, let-7b may diminish cell proliferation and promote cell apoptosis of tumor cells via posttranscriptional repression of CYP2J2. Furthermore , let-7b may diminish cell proliferation and promote cell apoptosis of tumor cells via posttranscriptional repression of CYP2J2 . Furthermore let-7b may diminish cell proliferation promote cell apoptosis tumor cells via posttranscriptional repression CYP2J2 Let-7b CYP2J2 5 | 406884 1573 22761738 mirlet7b NEGATIVE DIRECT NA Furthermore, let-7b may diminish cell proliferation and promote cell apoptosis of tumor cells via posttranscriptional repression of CYP2J2. Furthermore , let-7b may diminish cell proliferation and promote cell apoptosis of tumor cells via posttranscriptional repression of CYP2J2 . Furthermore let-7b may diminish cell proliferation promote cell apoptosis tumor cells via posttranscriptional repression CYP2J2 let-7b CYP2J2 6 | 406884 1573 22761738 mirlet7b NEGATIVE DIRECT NA In addition, let-7b decreased the enzymatic activity of endogenous CYP2J2. In addition , let-7b decreased the enzymatic activity of endogenous CYP2J2 . In addition let-7b decreased enzymatic activity endogenous CYP2J2 Let-7b CYP2J2 7 | 406892 10217 24785011 mir100 NEGATIVE INDIRECT NA CONCLUSION: I treatment inhibited the expression of miR-100, which modulated RBSP3 in FTC cells. CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . CONCLUSION I treatment inhibited expression miR-100 modulated RBSP3 FTC cells miR-100 RBSP3 8 | 406892 10217 24785011 mir100 NEGATIVE INDIRECT NA CONCLUSION: I treatment inhibited the expression of miR-100, which modulated RBSP3 in FTC cells. 
CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . CONCLUSION I treatment inhibited expression miR-100 modulated RBSP3 FTC cells miR-100 RBSP3 9 | 406892 10217 24785011 mir100 NEGATIVE INDIRECT NA CONCLUSION: I treatment inhibited the expression of miR-100, which modulated RBSP3 in FTC cells. CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . CONCLUSION I treatment inhibited expression miR-100 modulated RBSP3 FTC cells miR-100 RBSP3 10 | 406892 2261 25344675 mir100 POSITIVE INDIRECT DOWN FGFR3 was significantly downregulated by overexpressing miR-100 in pancreatic cancer cells and knocking down FGFR3 by siRNA exerted similar effect as miR-100. FGFR3 was significantly downregulated by overexpressing miR-100 in pancreatic cancer cells and knocking down FGFR3 by siRNA exerted similar effect as miR-100 . FGFR3 significantly downregulated overexpressing miR-100 pancreatic cancer cells knocking FGFR3 siRNA exerted similar effect miR-100 miR-100 FGFR3 11 | 406892 2261 25344675 mir100 POSITIVE INDIRECT DOWN Luciferase essay showed FGFR3 was direct target of miR-100. Luciferase essay showed FGFR3 was direct target of miR-100 . Luciferase essay showed FGFR3 direct target miR-100 miR-100 FGFR3 12 | 406892 10217 24785011 mir100 NEGATIVE INDIRECT NA CONCLUSION: I treatment inhibited the expression of miR-100, which modulated RBSP3 in FTC cells. CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . CONCLUSION I treatment inhibited expression miR-100 modulated RBSP3 FTC cells miR-100 RBSP3 13 | 406892 10217 24785011 mir100 NEGATIVE INDIRECT NA CONCLUSION: I treatment inhibited the expression of miR-100, which modulated RBSP3 in FTC cells. CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . CONCLUSION I treatment inhibited expression miR-100 modulated RBSP3 FTC cells miR-100 RBSP3 14 | 406892 2261 25344675 mir100 POSITIVE INDIRECT DOWN Our study demonstrated that miR-100 played an important role in pancreatic cancer development, possibly through targeting FGFR3. Our study demonstrated that miR-100 played an important role in pancreatic cancer development , possibly through targeting FGFR3 . Our study demonstrated miR-100 played important role pancreatic cancer development possibly targeting FGFR3 miR-100 FGFR3 15 | 406892 2261 25344675 mir100 POSITIVE INDIRECT DOWN The predicted target of miR-100, fibroblast growth factor receptor 3 (FGFR3), was downregulated by siRNA to examine its effect on pancreatic cancer cells. The predicted target of miR-100 , fibroblast growth factor receptor 3 ( FGFR3 ) , was downregulated by siRNA to examine its effect on pancreatic cancer cells . The predicted target miR-100 fibroblast growth factor receptor 3 FGFR3 downregulated siRNA examine effect pancreatic cancer cells miR-100 FGFR3 16 | 406892 2261 25344675 mir100 POSITIVE INDIRECT DOWN The predicted target of miR-100, fibroblast growth factor receptor 3 (FGFR3), was downregulated by siRNA to examine its effect on pancreatic cancer cells. The predicted target of miR-100 , fibroblast growth factor receptor 3 ( FGFR3 ) , was downregulated by siRNA to examine its effect on pancreatic cancer cells . 
The predicted target miR-100 fibroblast growth factor receptor 3 FGFR3 downregulated siRNA examine effect pancreatic cancer cells miR-100 fibroblast growth factor receptor 3 17 | 406892 2261 25493074 mir100 NEGATIVE INDIRECT NA Our aim is to analyze the role of miR-100 in bladder cancer cell lines in controlling the expression of some of its possible target genes, including FGFR3 and its relationship with proliferation, apoptosis and DNA ploidy. Our aim is to analyze the role of miR-100 in bladder cancer cell lines in controlling the expression of some of its possible target genes , including FGFR3 and its relationship with proliferation , apoptosis and DNA ploidy . Our aim analyze role miR-100 bladder cancer cell lines controlling expression possible target genes including FGFR3 relationship proliferation apoptosis DNA ploidy miR-100 FGFR3 18 | 406892 2261 25493074 mir100 NEGATIVE INDIRECT NA Recently, lowered expression of miR-100, resulting in upregulation of FGFR3, has been correlated with low-grade, non-invasive bladder urothelial cancer, as an alternative oncogenesis pathway to the typical FGFR3 gene mutation. Recently , lowered expression of miR-100 , resulting in upregulation of FGFR3 , has been correlated with low-grade , non-invasive bladder urothelial cancer , as an alternative oncogenesis pathway to the typical FGFR3 gene mutation . Recently lowered expression miR-100 resulting upregulation FGFR3 correlated low-grade non-invasive bladder urothelial cancer alternative oncogenesis pathway typical FGFR3 gene mutation miR-100 FGFR3 19 | 406892 2261 25493074 mir100 POSITIVE INDIRECT DOWN Our aim is to analyze the role of miR-100 in bladder cancer cell lines in controlling the expression of some of its possible target genes, including FGFR3 and its relationship with proliferation, apoptosis and DNA ploidy. Our aim is to analyze the role of miR-100 in bladder cancer cell lines in controlling the expression of some of its possible target genes , including FGFR3 and its relationship with proliferation , apoptosis and DNA ploidy . Our aim analyze role miR-100 bladder cancer cell lines controlling expression possible target genes including FGFR3 relationship proliferation apoptosis DNA ploidy miR-100 FGFR3 20 | 406892 2261 25493074 mir100 POSITIVE INDIRECT DOWN Recently, lowered expression of miR-100, resulting in upregulation of FGFR3, has been correlated with low-grade, non-invasive bladder urothelial cancer, as an alternative oncogenesis pathway to the typical FGFR3 gene mutation. Recently , lowered expression of miR-100 , resulting in upregulation of FGFR3 , has been correlated with low-grade , non-invasive bladder urothelial cancer , as an alternative oncogenesis pathway to the typical FGFR3 gene mutation . Recently lowered expression miR-100 resulting upregulation FGFR3 correlated low-grade non-invasive bladder urothelial cancer alternative oncogenesis pathway typical FGFR3 gene mutation miR-100 FGFR3 21 | 406892 2261 26018508 mir100 NEGATIVE INDIRECT NA Bioinformatics analysis and luciferase reporter assay suggest that miR-100 binds to the 3'UTR of FGFR3 mRNA to prevent its translation. Bioinformatics analysis and luciferase reporter assay suggest that miR-100 binds to the 3'UTR of FGFR3 mRNA to prevent its translation . 
Bioinformatics analysis luciferase reporter assay suggest miR-100 binds 3'UTR FGFR3 mRNA prevent translation miR-100 FGFR3 22 | 406892 2261 26018508 mir100 NEGATIVE INDIRECT NA Here we reported significantly higher levels of fibroblast growth factor receptor 3 (FGFR3) and significantly lower levels of miR-100 in the OS specimen, compared to those in the paired normal bone tissues. Here we reported significantly higher levels of fibroblast growth factor receptor 3 ( FGFR3 ) and significantly lower levels of miR-100 in the OS specimen , compared to those in the paired normal bone tissues . Here reported significantly higher levels fibroblast growth factor receptor 3 FGFR3 significantly lower levels miR-100 OS specimen compared paired normal bone tissues miR-100 FGFR3 23 | 406892 2261 26018508 mir100 NEGATIVE INDIRECT NA Here we reported significantly higher levels of fibroblast growth factor receptor 3 (FGFR3) and significantly lower levels of miR-100 in the OS specimen, compared to those in the paired normal bone tissues. Here we reported significantly higher levels of fibroblast growth factor receptor 3 ( FGFR3 ) and significantly lower levels of miR-100 in the OS specimen , compared to those in the paired normal bone tissues . Here reported significantly higher levels fibroblast growth factor receptor 3 FGFR3 significantly lower levels miR-100 OS specimen compared paired normal bone tissues miR-100 fibroblast growth factor receptor 3 24 | 406892 2261 26018508 mir100 NEGATIVE INDIRECT NA Taken together, our data demonstrate that miR-100 may inhibit the growth of OS through FGFR3. Taken together , our data demonstrate that miR-100 may inhibit the growth of OS through FGFR3 . Taken together data demonstrate miR-100 may inhibit growth OS FGFR3 miR-100 FGFR3 25 | 406892 2261 26018508 mir100 NEGATIVE INDIRECT NA We found that overexpression of miR-100 in OS cells decreased FGFR3 protein levels, whereas inhibition of miR-100 increased FGFR3 protein levels, without affecting FGFR3 transcripts. We found that overexpression of miR-100 in OS cells decreased FGFR3 protein levels , whereas inhibition of miR-100 increased FGFR3 protein levels , without affecting FGFR3 transcripts . We found overexpression miR-100 OS cells decreased FGFR3 protein levels whereas inhibition miR-100 increased FGFR3 protein levels without affecting FGFR3 transcripts miR-100 FGFR3 26 | 406892 2261 26018508 mir100 POSITIVE INDIRECT DOWN Bioinformatics analysis and luciferase reporter assay suggest that miR-100 binds to the 3'UTR of FGFR3 mRNA to prevent its translation. Bioinformatics analysis and luciferase reporter assay suggest that miR-100 binds to the 3'UTR of FGFR3 mRNA to prevent its translation . Bioinformatics analysis luciferase reporter assay suggest miR-100 binds 3'UTR FGFR3 mRNA prevent translation miR-100 FGFR3 27 | 406892 2261 26018508 mir100 POSITIVE INDIRECT DOWN Here we reported significantly higher levels of fibroblast growth factor receptor 3 (FGFR3) and significantly lower levels of miR-100 in the OS specimen, compared to those in the paired normal bone tissues. Here we reported significantly higher levels of fibroblast growth factor receptor 3 ( FGFR3 ) and significantly lower levels of miR-100 in the OS specimen , compared to those in the paired normal bone tissues . 
Here reported significantly higher levels fibroblast growth factor receptor 3 FGFR3 significantly lower levels miR-100 OS specimen compared paired normal bone tissues miR-100 FGFR3 -------------------------------------------------------------------------------- /demo_input/positive_with_entities_small.tsv: -------------------------------------------------------------------------------- 1 | In particular , we experimentally validate common regulation of Mtpn by miR-375 , miR-124 and let-7b and thus provide evidence for coordinate microRNA control in mammals . let-7b miR-124 Mtpn miR-375 2 | CONCLUSIONS : Our results demonstrated that the decreased expression of let-7b could lead to the high expression of CYP2J2 protein in cancerous tissues . let-7b CYP2J2 3 | Furthermore , let-7b may diminish cell proliferation and promote cell apoptosis of tumor cells via posttranscriptional repression of CYP2J2 . CYP2J2 let-7b tumor 4 | In addition , let-7b decreased the enzymatic activity of endogenous CYP2J2 . CYP2J2 let-7b 5 | Let-7b significantly inhibited the tumor phenotype by targeting CYP2J2 . CYP2J2 Let-7b tumor 6 | Luciferase and western blot assays revealed that CYP2J2 was regulated by let-7b . let-7b CYP2J2 7 | CONCLUSION : I treatment inhibited the expression of miR-100 , which modulated RBSP3 in FTC cells . RBSP3 miR-100 8 | Our study demonstrated that miR-100 played an important role in pancreatic cancer development , possibly through targeting FGFR3 . pancreatic FGFR3 miR-100 9 | The predicted target of miR-100 , fibroblast growth factor receptor 3 ( FGFR3 ) , was downregulated by siRNA to examine its effect on pancreatic cancer cells . pancreatic FGFR3 fibroblast growth factor receptor 3 receptor miR-100 10 | Our aim is to analyze the role of miR-100 in bladder cancer cell lines in controlling the expression of some of its possible target genes , including FGFR3 and its relationship with proliferation , apoptosis and DNA ploidy . bladder cancer FGFR3 miR-100 11 | Recently , lowered expression of miR-100 , resulting in upregulation of FGFR3 , has been correlated with low-grade , non-invasive bladder urothelial cancer , as an alternative oncogenesis pathway to the typical FGFR3 gene mutation . FGFR3 miR-100 12 | Bioinformatics analysis and luciferase reporter assay suggest that miR-100 binds to the 3'UTR of FGFR3 mRNA to prevent its translation . FGFR3 miR-100 13 | Here we reported significantly higher levels of fibroblast growth factor receptor 3 ( FGFR3 ) and significantly lower levels of miR-100 in the OS specimen , compared to those in the paired normal bone tissues . OS FGFR3 miR-100 fibroblast growth factor receptor 3 14 | Luciferase essay showed FGFR3 was direct target of miR-100 . miR-100 FGFR3 15 | Taken together , our data demonstrate that miR-100 may inhibit the growth of OS through FGFR3 . miR-100 FGFR3 OS 16 | We found that overexpression of miR-100 in OS cells decreased FGFR3 protein levels , whereas inhibition of miR-100 increased FGFR3 protein levels , without affecting FGFR3 transcripts . miR-100 OS FGFR3 17 | FGFR3 was significantly downregulated by overexpressing miR-100 in pancreatic cancer cells and knocking down FGFR3 by siRNA exerted similar effect as miR-100 . 
FGFR3 miR-100 pancreatic -------------------------------------------------------------------------------- /demo_output/background_samples.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CatalystCode/corpus-to-graph-ml/e8b431f7342b674d4ee07c20c403531c9f6aad3f/demo_output/background_samples.txt -------------------------------------------------------------------------------- /demo_output/temp_doc2vec.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CatalystCode/corpus-to-graph-ml/e8b431f7342b674d4ee07c20c403531c9f6aad3f/demo_output/temp_doc2vec.txt -------------------------------------------------------------------------------- /demo_output/top_scoring_positive_class.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CatalystCode/corpus-to-graph-ml/e8b431f7342b674d4ee07c20c403531c9f6aad3f/demo_output/top_scoring_positive_class.txt -------------------------------------------------------------------------------- /features_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import sys\n", 14 | "sys.path.append(\"./packages\")\n", 15 | "\n", 16 | "import matplotlib\n", 17 | "import datetime\n", 18 | "import numpy as np\n", 19 | "import data_preparation_tools as dpt\n", 20 | "import features_generation_tools as fgt\n", 21 | "import model_tools\n", 22 | "from sklearn.linear_model import LogisticRegression\n", 23 | "from sklearn.linear_model import LogisticRegressionCV\n", 24 | "from sklearn.metrics import f1_score\n", 25 | "from sklearn.metrics import accuracy_score\n", 26 | "from sklearn.metrics import make_scorer\n", 27 | "from sklearn.ensemble import GradientBoostingClassifier" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false, 35 | "scrolled": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "# path used to save temporary doc2vec files\n", 40 | "temp_doc2vec_file = r\"./demo_output/temp_doc2vec.txt\"\n", 41 | "# path to text file that contains background sentences used in doc2vec\n", 42 | "background_samples_file_path = r\"./demo_output/background_samples.txt\"\n", 43 | "\n", 44 | "doc2vec_func = lambda x_train,x_test : fgt.get_doc2vec_features(x_train, x_test, temp_doc2vec_file, background_samples_file_path)\n", 45 | "bow_func = lambda x_train,x_test : fgt.get_bow_features(x_train, x_test, (1,3))\n", 46 | "\n", 47 | "# evaluate different features\n", 48 | "gen_features_methods = [\n", 49 | "fgt.GenFeaturesMethod(\"bow_1_gram\", lambda x_train,x_test : fgt.get_bow_features(x_train, x_test, (1,1))),\n", 50 | "fgt.GenFeaturesMethod(\"bow_2_gram\", lambda x_train,x_test : fgt.get_bow_features(x_train, x_test, (2,2))),\n", 51 | "fgt.GenFeaturesMethod(\"bow_3_gram\", lambda x_train,x_test : fgt.get_bow_features(x_train, x_test, (3,3))),\n", 52 | "fgt.GenFeaturesMethod(\"bow_1_3_gram\", lambda x_train,x_test : fgt.get_bow_features(x_train, x_test, (1,3))),\n", 53 | "fgt.GenFeaturesMethod(\"doc2vec\", lambda x_train,x_test : fgt.get_doc2vec_features(x_train, x_test, temp_doc2vec_file, background_samples_file_path)),\n", 54 | "fgt.GenFeaturesMethod(\"pos_3_3\", 
lambda x_train,x_test : fgt.to_pos_bow(x_train, x_test, (3,3))),\n", 55 | "fgt.GenFeaturesMethod(\"bow_1_3_pos_3_3\", lambda x_train,x_test : fgt.get_bow_and_pos_features(x_train, x_test, (1,3), (3,3))),\n", 56 | "fgt.GenFeaturesMethod(\"bow_1_3_doc2vec\", lambda x_train,x_test : fgt.get_compound_features(x_train, x_test, [bow_func, doc2vec_func]))\n", 57 | "]\n", 58 | "\n", 59 | "#Cs= [0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 0.8] + np.linspace(1,5, 9).tolist()\n", 60 | "Cs = np.linspace(0.005,0.25,10)\n", 61 | "\n", 62 | "# evaluates different classifiers\n", 63 | "evaluation_methods = [\n", 64 | " fgt.EvaluationMethod(\"logistic regression l1\", lambda: LogisticRegression(C=0.1, penalty='l1', solver='liblinear')),\n", 65 | " fgt.EvaluationMethod(\"lr l1 cv\", lambda: LogisticRegressionCV(penalty='l1', cv=5, scoring=make_scorer(f1_score), solver='liblinear', Cs=Cs, refit=True)),\n", 66 | " fgt.EvaluationMethod(\"lr l2 cv\", lambda: LogisticRegressionCV(penalty='l2', cv=5, scoring=make_scorer(f1_score), solver='liblinear', Cs=Cs, refit=True)),\n", 67 | " #fgt.EvaluationMethod(\"GBC\", lambda: GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=10, random_state=0))\n", 68 | "]\n", 69 | "\n", 70 | "# path to input dir \n", 71 | "input_dir = r\"./demo_output\"\n", 72 | "startTime = datetime.datetime.now()\n", 73 | "\n", 74 | "models = fgt.run_gen_features_pipeline(input_dir, gen_features_methods, evaluation_methods)\n", 75 | "\n", 76 | "runTime = datetime.datetime.now() - startTime\n", 77 | "print \"Finished generating features, took:%s\"%runTime" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 2", 84 | "language": "python", 85 | "name": "python2" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 2 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython2", 97 | "version": "2.7.11" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 0 102 | } 103 | -------------------------------------------------------------------------------- /packages/data_preparation_tools.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import nltk 4 | import urllib 5 | import requests 6 | import re 7 | from mirna_detector import is_mirna 8 | from os import path 9 | from sklearn.cross_validation import train_test_split 10 | 11 | 12 | #constants: 13 | DEFAULT_VARIABLE_FORMAT_TABLE = {'gene': 'GGVARENTTY%dGG', 14 | 'mirna' : 'MMVARENTTY%dMM' } 15 | 16 | PAIR_ENTITTY_VARIABLE_NAMES = {'gene': 'GGVARENTTYGG', 17 | 'mirna' : 'MMVARENTTYMM' } 18 | 19 | OTHER_ENTITY_VARIABLE_TABLE = {'gene': 'GGVARENTTYOTRGG', 20 | 'mirna' : 'MMVARENTTYOTRMM' } 21 | 22 | # expects output in GNAT's output format 23 | ENTITY_RECOGNITION_SERVICE_URL_FORMAT = "" 24 | 25 | DEFAULT_SENTENCE_COLUMN = 8 26 | DEFAULT_LABEL_COLUMNS = [4,5] 27 | DEFAULT_MIRNA_ENTITY_COLUMN = 10 28 | DEFAULT_GENE_ENTITY_COLUMN = 11 29 | DEFAULT_MIN_SENTENCE_LENGTH = 6 30 | DEFAULT_EXTRA_WORDS_COUNT = 3 31 | ENTITY_REGEX = re.compile(r"GGVARENTTYGG|MMVARENTTYMM") 32 | DEFAULT_TEST_SIZE = 0.25 33 | 34 | # TODO: we should probably take this from a file 35 | DEFAULT_NON_ENTITY_DICTIONARY = ['and', 'lab'] 36 | DEFAULT_MIN_ENTITY_LENGTH = 3 37 | 38 | # Object to pass along and contains the input to the different transformations 39 | class InputData: 40 | def __init__(self, 
data_train, data_test, contexts_train, contexts_test): 41 | self.data_train = data_train 42 | self.data_test = data_test 43 | self.contexts_train = contexts_train 44 | self.contexts_test = contexts_test 45 | 46 | # entities is a list of dictionaries in the format of {"text" : "...", type: 47 | # "...", } 48 | def replace_entities_with_variables(text, entities, variable_format_table=OTHER_ENTITY_VARIABLE_TABLE): 49 | text_parts = [] 50 | 51 | for entity in entities: 52 | entity_replacement = variable_format_table[entity["type"]] 53 | text = text.replace(entity["text"], entity_replacement) 54 | 55 | return text 56 | 57 | # entities is a list of dictionaries in the format of {"text" : "...", type: 58 | # "...", } 59 | def replace_entities_with_variables_old(text, entities, variable_name_table=DEFAULT_VARIABLE_FORMAT_TABLE): 60 | text_parts = [] 61 | 62 | locations_by_type = {} 63 | 64 | for entity in entities: 65 | index = 0 66 | index = text.find(entity["text"]) 67 | if (index == -1): 68 | continue 69 | 70 | if (not locations_by_type.has_key(entity["type"])): 71 | locations_by_type[entity["type"]] = {} 72 | 73 | locations_by_type[entity["type"]][index] = entity 74 | 75 | for t in locations_by_type: 76 | entities_to_variables = {} 77 | index = 1 78 | # assign a variable to each entity 79 | for i in sorted(locations_by_type[t]): 80 | entities_to_variables[locations_by_type[t][i]["text"]] = variable_format_table[t] % index 81 | index = index + 1 82 | 83 | for entity in entities_to_variables: 84 | text = text.replace(entity, entities_to_variables[entity]) 85 | 86 | return text 87 | 88 | texts_entities_dictionary = {} 89 | def get_entities_for_text(text): 90 | # in case we already have the result... 91 | if (texts_entities_dictionary.has_key(text)): 92 | return texts_entities_dictionary[text] 93 | 94 | url = ENTITY_RECOGNITION_SERVICE_URL_FORMAT % (urllib.quote(text)) 95 | r = requests.get(url) 96 | if (r.status_code != 200): 97 | # TODO: Throw exception... 
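# Sketch (not part of the original source): one way to resolve the TODO above would be to fail loudly instead of returning None, assuming callers are prepared to catch the error, e.g.: raise RuntimeError("entity recognition request failed with HTTP status %d" % r.status_code)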
98 | print "got bad error code:%d" % r.status_code 99 | return None 100 | 101 | parsed_entities = [] 102 | entities = [] 103 | for line in r.text.split("\n"): 104 | if (len(line) == 0): 105 | continue 106 | line_parts = line.split("\t") 107 | entity_type = line_parts[2] 108 | start_index = int(line_parts[5]) 109 | end_index = int(line_parts[6]) 110 | text = line_parts[7] 111 | 112 | # only look at genes for now 113 | if (entity_type != 'gene' and entity_type != 'mirna'): 114 | continue 115 | 116 | # check if we don't have collisions, always take the longer string 117 | objects_to_remove = [] 118 | add_entity = True 119 | for e in parsed_entities: 120 | if (start_index <= e["startIndex"] and end_index >= e["endIndex"] or start_index <= e["startIndex"] and end_index >= e["startIndex"] or start_index <= e["endIndex"] and end_index >= e["endIndex"]): 121 | currentLen = len(text) 122 | second_len = len(e["text"]) 123 | if (currentLen <= second_len): 124 | add_entity = False 125 | break 126 | else: 127 | objects_to_remove.append(e) 128 | 129 | for e in objects_to_remove: 130 | parsed_entities.remove(e) 131 | 132 | if (add_entity): 133 | parsed_entities.append({ 134 | "text" : text, 135 | "startIndex" : start_index, 136 | "endIndex" : end_index 137 | }) 138 | 139 | for e in parsed_entities: 140 | if (not e["text"] in entities): 141 | entities.append(e["text"]) 142 | 143 | # merge entities in case there is overlap 144 | 145 | return entities 146 | 147 | # utility function to add text to entities data to the dictionary 148 | def import_to_texts_entities_dictonary_from_file(file_path): 149 | with open(file_path) as handle: 150 | for line in handle: 151 | parts = line.rstrip().split("\t") 152 | texts_entities_dictionary[parts[0]] = parts[1:] 153 | 154 | 155 | # the results of the entityt recognition might be noisy 156 | def filter_entities(entities, non_entities_dictionary=DEFAULT_NON_ENTITY_DICTIONARY, min_entity_length=DEFAULT_MIN_ENTITY_LENGTH): 157 | filtered_entities = [] 158 | for e in entities: 159 | if (len(e) < min_entity_length): 160 | continue 161 | if (e in non_entities_dictionary): 162 | continue 163 | 164 | filtered_entities.append(e) 165 | 166 | return filtered_entities 167 | 168 | def entity_list_to_descriptors(entities): 169 | result = [] 170 | for e in entities: 171 | type = "mirna" if is_mirna(e) else "gene" 172 | result.append({"text" : e, "type" : type}) 173 | return result 174 | 175 | def extract_and_replace_entities(text, context=None, return_descriptors=False): 176 | if (context != None and context.has_key("pair_entities")): 177 | replaced_text = replace_entities_with_variables(text, context["pair_entities"], PAIR_ENTITTY_VARIABLE_NAMES) 178 | if (context != None and context.has_key("all_entities")): 179 | # TODO: run unification logic here as well? 
180 | entity_descriptors = context["all_entities"] 181 | else: 182 | entities = get_entities_for_text(text) 183 | entities = filter_entities(entities) 184 | entity_descriptors = entity_list_to_descriptors(entities) 185 | 186 | replaced_text = replace_entities_with_variables(replaced_text, entity_descriptors) 187 | if (return_descriptors): 188 | return (replaced_text, entity_descriptors) 189 | else: 190 | return replaced_text 191 | 192 | # extract data from CSV/TSV file 193 | def extract_sentences(input_file_path, 194 | sentence_columns=DEFAULT_SENTENCE_COLUMN, mirna_entity_column=DEFAULT_MIRNA_ENTITY_COLUMN, 195 | gene_entity_column=DEFAULT_GENE_ENTITY_COLUMN, label_column_indices=None, 196 | label_tag=None, sample_size=-1): 197 | sentences = [] 198 | labels = [] 199 | contexts = [] 200 | all_sentences = {} 201 | with open(input_file_path) as input: 202 | for line in input: 203 | splitted_line = line.rstrip().split("\t") 204 | sentence = splitted_line[sentence_columns] 205 | 206 | # TODO: randomly sample one sentence instead of just taking the first one in the list 207 | # We do this in order to make sure that there isn't a bias in the model towards sentences 208 | # that appear more often than others 209 | if (all_sentences.has_key(sentence)): 210 | continue 211 | 212 | all_sentences[sentence] = 1 213 | 214 | contexts.append({"pair_entities" : [{"text" : splitted_line[mirna_entity_column], "type": "mirna"}, 215 | {"text" : splitted_line[gene_entity_column], "type": "gene"}]}) 216 | 217 | if (label_column_indices != None): 218 | label = splitted_line[label_column_indices[0]] 219 | for i in label_column_indices[1:]: 220 | label = label + "_" + splitted_line[i] 221 | labels.append(label) 222 | 223 | sentences.append(sentence) 224 | 225 | if (sample_size != -1): 226 | indices = np.random.choice(len(sentences), sample_size) 227 | sentences = [sentences[index] for index in indices] 228 | contexts = [contexts[index] for index in indices] 229 | 230 | if (label_tag != None): 231 | labels = [label_tag] * len(sentences) 232 | return (sentences, labels, contexts) 233 | 234 | def extract_sentences_with_multiclass_labels(input_file_path, sample_size=-1): 235 | return extract_sentences(input_file_path, label_column_indices=DEFAULT_LABEL_COLUMNS, sample_size=sample_size) 236 | 237 | def is_sequence(arg): 238 | return (not hasattr(arg, "strip") and hasattr(arg, "__getitem__") or hasattr(arg, "__iter__")) 239 | 240 | def write_lines_or_tuples_to_file(lines, output_file_path, seperator="\t"): 241 | are_tuples = is_sequence(lines[0]) 242 | with open(output_file_path, "w") as handle: 243 | for o in lines: 244 | if are_tuples: 245 | text = o[0] 246 | for item in o[1:]: 247 | text = text + seperator + item 248 | else: 249 | text = o 250 | handle.write(text + "\n") 251 | 252 | # get all entities for a list of sentences: 253 | def get_entities_for_file(in_file_path, out_file_path): 254 | sentences = extract_sentences(in_file_path)[0] # extract_sentences returns (sentences, labels, contexts) 255 | with open(out_file_path, "w") as out_handle: 256 | for s,i in zip(sentences, xrange(len(sentences))): 257 | print "%d of %d" % (i, len(sentences)) 258 | entities = get_entities_for_text(s) 259 | out_handle.write(s + "\t" + "\t".join(entities) + "\n") 260 | 261 | def trim_sentence_around_entities(text, context=None, min_length=DEFAULT_MIN_SENTENCE_LENGTH, extra_words_count=DEFAULT_EXTRA_WORDS_COUNT): 262 | sentence_parts = text.split() 263 | 264 | if (len(sentence_parts) < min_length): 265 | return text 266 | 267 | first_index = -1 268 | last_index = -1 269 | 270 | for part,i in
zip(sentence_parts, xrange(len(sentence_parts))): 271 | if (ENTITY_REGEX.match(part)): 272 | if (first_index == -1): 273 | first_index = i 274 | last_index = i 275 | 276 | size = last_index - first_index + extra_words_count * 2 277 | 278 | # ensure 279 | if (size < min_length): 280 | extra_words_count = extra_words_count + math.ceil((min_length - size) / 2) 281 | 282 | first_index = max(0, first_index - extra_words_count) 283 | last_index = min(len(sentence_parts), last_index + extra_words_count + 1) 284 | 285 | trimmed_sentence_parts = sentence_parts[first_index:last_index] 286 | return " ".join(trimmed_sentence_parts) 287 | 288 | # langueage based sentence process 289 | def normalize_text(sent, context=None): 290 | return sent.lower() 291 | 292 | # stop words removal 293 | def remove_stop_words(sent, context=None): 294 | processed_tokens = [] 295 | tokens = nltk.word_tokenize(sent) 296 | for t in tokens: 297 | # ignore stop words 298 | if (t in nltk.corpus.stopwords.words('english') or len(t) < 2): 299 | continue 300 | processed_tokens.append(t) 301 | 302 | return " ".join(processed_tokens) 303 | 304 | # digits removal 305 | def remove_all_digit_tokens(sent, context=None): 306 | processed_tokens = [] 307 | tokens = nltk.word_tokenize(sent) 308 | for t in tokens: 309 | # ignore stop words 310 | if (t.isdigit()): 311 | continue 312 | processed_tokens.append(t) 313 | 314 | return " ".join(processed_tokens) 315 | 316 | # run stemmer on the words 317 | def stem_text(sent, context=None): 318 | processed_tokens = [] 319 | tokens = nltk.word_tokenize(sent) 320 | porter = nltk.PorterStemmer() 321 | for t in tokens: 322 | t = porter.stem(t) 323 | processed_tokens.append(t) 324 | 325 | return " ".join(processed_tokens) 326 | 327 | # Split to train and test sample sets: 328 | def split_to_test_and_train(data, labels, entities, test_size=DEFAULT_TEST_SIZE): 329 | d_train, d_test, l_train, l_test, c_train, c_test = train_test_split(data, labels, entities, test_size=test_size) 330 | d_test_2 = [] 331 | l_test_2 = [] 332 | c_test_2 = [] 333 | 334 | train_dict = {} 335 | for d in d_train: 336 | train_dict[d] = 1 337 | 338 | for d,l,c in zip(d_test, l_test, c_test): 339 | if (train_dict.has_key(d)): 340 | continue 341 | d_test_2.append(d) 342 | l_test_2.append(l) 343 | c_test_2.append(c) 344 | 345 | return (d_train, d_test_2, l_train, l_test_2, c_train, c_test_2) 346 | 347 | # utility to extracts entities from preproceseed files 348 | def extract_entities_from_entity_file(input_file_paths, out_file_path): 349 | all_entities = {} 350 | 351 | for file_path in input_file_paths: 352 | with open(file_path) as handle: 353 | for l in handle: 354 | parts = l.rstrip().split("\t") 355 | for e in parts[1:]: 356 | all_entities[e] = 1 357 | 358 | with open(out_file_path, "w") as handle: 359 | for e in sorted(all_entities.keys()): 360 | handle.write(e + "\n") 361 | 362 | def run_step(step_name, step_func, inputs_dict, required): 363 | temp_dict = {} 364 | for k in inputs_dict: 365 | 366 | if (required != None): 367 | found_all = True 368 | for r in required: 369 | if (k.find(r) == -1): 370 | found_all = False 371 | if (not found_all): 372 | continue 373 | 374 | result_train = [] 375 | result_test = [] 376 | 377 | for l,c in zip(inputs_dict[k].data_train, inputs_dict[k].contexts_train): 378 | result_train.append(step_func(l, context=c)) 379 | 380 | for l,c in zip(inputs_dict[k].data_test, inputs_dict[k].contexts_test): 381 | result_test.append(step_func(l, context=c)) 382 | 383 | temp_dict[k + "_" + step_name] = 
InputData(result_train, result_test, 384 | inputs_dict[k].contexts_train, inputs_dict[k].contexts_test) 385 | 386 | for k in temp_dict: 387 | inputs_dict[k] = temp_dict[k] 388 | 389 | def run_step_unlabeled_data(step_name, step_func, inputs_dict, required): 390 | temp_dict = {} 391 | for k in inputs_dict: 392 | 393 | if (required != None): 394 | found_all = True 395 | for r in required: 396 | if (k.find(r) == -1): 397 | found_all = False 398 | if (not found_all): 399 | continue 400 | 401 | results = [] 402 | 403 | for l in inputs_dict[k]: 404 | results.append(step_func(l)) 405 | 406 | temp_dict[k + "_" + step_name] = results 407 | 408 | for k in temp_dict: 409 | inputs_dict[k] = temp_dict[k] 410 | 411 | # 3rd part specify required step 412 | TRANSFORMATION_STEPS = [('entities', extract_and_replace_entities), 413 | ('trim', trim_sentence_around_entities, ['entities']), 414 | ('normalize', normalize_text, ['entities']), 415 | ('rmdigits', remove_all_digit_tokens, ['entities']), 416 | # ('rmstopwords', remove_stop_words, ['entities']) 417 | # ('stem', stem_text, ['entities']) 418 | ] 419 | 420 | TRANSFORMATION_STEPS_UNLABELED = [('entities', extract_and_replace_entities), 421 | ('normalize', normalize_text, ['entities']), 422 | ('rmdigits', remove_all_digit_tokens, ['entities']), 423 | ('rmstopwords', remove_stop_words, ['entities'])] 424 | 425 | # steps is an array of tuples, with the first as name and the second is the processing function 426 | def run_transformations_on_single_sentence(text, context, steps=TRANSFORMATION_STEPS): 427 | current = text 428 | for s in steps: 429 | current = s[1](current, context=context) 430 | return current 431 | 432 | def run_transformations_on_data(sentences, labels, contexts, output_files_prefix, output_dir, write_context_to_files=True): 433 | # split to train and test: 434 | print "before data split" 435 | s_train, s_test, l_train, l_test, c_train, c_test = split_to_test_and_train(sentences, labels, contexts) 436 | inputs = { 437 | 'data' : InputData(s_train, s_test, c_train, c_test) 438 | } 439 | 440 | print "splitted data" 441 | # run each step on all of the already existing ones 442 | # TODO: pass entities metadata and let the trimming work even without the 443 | # regexes.. 
444 | 445 | for s in TRANSFORMATION_STEPS: 446 | print "running step:%s" % (s[0]) 447 | required = None 448 | if (len(s) > 2): 449 | required = s[2] 450 | run_step(s[0], s[1], inputs, required) 451 | 452 | # todo: write outputs to files 453 | for name in inputs: 454 | train_file_path = path.join(output_dir, output_files_prefix + "_" + name + "_train.tsv") 455 | test_file_path = path.join(output_dir, output_files_prefix + "_" + name + "_test.tsv") 456 | 457 | train_data = inputs[name].data_train 458 | test_data = inputs[name].data_test 459 | 460 | with open(train_file_path, "w") as handle: 461 | for text,label in zip(train_data,l_train): 462 | if (len(text.strip()) == 0): 463 | continue 464 | handle.write("%s\t%s\n" % (label, text)) 465 | 466 | with open(test_file_path, "w") as handle: 467 | for text,label in zip(test_data,l_test): 468 | if (len(text.strip()) == 0): 469 | continue 470 | handle.write("%s\t%s\n" % (label, text)) 471 | 472 | if (write_context_to_files): 473 | train_context_file_path = path.join(output_dir, output_files_prefix + "_context_train.tsv") 474 | test_context_file_path = path.join(output_dir, output_files_prefix + "_context_test.tsv") 475 | 476 | #({"pair_entities" : [{"text" : splitted_line[mirna_entity_column], "type": "mirna"}, 477 | # {"text" : splitted_line[gene_entity_column], "type": "gene"}]}) 478 | 479 | with open(train_context_file_path, "w") as handle: 480 | for c in c_train: 481 | # TODO: generalize this, for now we just do this quick and dirty.. 482 | entity_1_type = c["pair_entities"][0]["type"] 483 | 484 | if (entity_1_type == "mirna"): 485 | mirna = c["pair_entities"][0]["text"] 486 | gene = c["pair_entities"][1]["text"] 487 | else: 488 | mirna = c["pair_entities"][1]["text"] 489 | gene = c["pair_entities"][0]["text"] 490 | 491 | handle.write(mirna + "\t" + gene + "\n") 492 | 493 | with open(test_context_file_path, "w") as handle: 494 | for c in c_test: 495 | # TODO: generalize this, for now we just do this quick and dirty.. 
496 | entity_1_type = c["pair_entities"][0]["type"] 497 | 498 | if (entity_1_type == "mirna"): 499 | mirna = c["pair_entities"][0]["text"] 500 | gene = c["pair_entities"][1]["text"] 501 | else: 502 | mirna = c["pair_entities"][1]["text"] 503 | gene = c["pair_entities"][0]["text"] 504 | 505 | handle.write(mirna + "\t" + gene + "\n") 506 | 507 | 508 | 509 | def write_lines_to_file(lines, out_file_path): 510 | with open(out_file_path,"w") as handle: 511 | for line in lines: 512 | handle.write(line + "\n") 513 | 514 | def read_lines_from_file(in_file_path): 515 | with open(in_file_path) as handle: 516 | lines = [line.rstrip() for line in handle] 517 | return lines 518 | 519 | def sample_from_file(in_file_path, out_file_path, sample_size): 520 | lines = read_lines_from_file(in_file_path) 521 | sampled = [lines[index] for index in np.random.choice(len(lines), sample_size)] 522 | write_lines_to_file(sampled, out_file_path) 523 | 524 | def run_data_preparation_pipeline(positive_samples_file_path, negative_samples_file_path, 525 | output_files_prefix, output_dir, run_multiclass=False): 526 | # two classes case: 527 | pos_sentences, pos_labels, pos_contexts = extract_sentences(positive_samples_file_path, label_tag='RELATION') 528 | neg_sentences, neg_labels, neg_contexts = extract_sentences(negative_samples_file_path, label_tag='NO_RELATION', sample_size = len(pos_sentences)) 529 | contexts = pos_contexts + neg_contexts 530 | sentences = pos_sentences + neg_sentences 531 | labels = pos_labels + neg_labels 532 | 533 | print "producing inputs for binary case..." 534 | 535 | run_transformations_on_data(sentences, labels, contexts, output_files_prefix + "_binary", output_dir) 536 | 537 | if (run_multiclass): 538 | # multi class case: 539 | print "producing inputs for multiclass case..." 540 | pos_sentences, pos_labels, pos_entities = extract_sentences_with_multiclass_labels(positive_samples_file_path) 541 | sentences = pos_sentences + neg_sentences 542 | labels = pos_labels + neg_labels 543 | contexts = pos_contexts + neg_contexts 544 | 545 | run_transformations_on_data(sentences, labels, contexts, output_files_prefix + "_multiclass", output_dir) 546 | 547 | 548 | def run_transformations_on_unlabeled_data(sentences, output_files_prefix, output_dir): 549 | inputs = { 550 | 'data' : sentences 551 | } 552 | 553 | # run each step on all of the already existing ones 554 | # TODO: pass entities metadata and let the trimming work even without the 555 | # regexes.. 
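# Usage sketch (hypothetical input path, not part of the original file): this transformation loop is normally driven by run_unlabeled_data_preparation_pipeline, defined further below, e.g. run_unlabeled_data_preparation_pipeline(r"./demo_input/background_sentences.txt", "8_1_2016", r"./demo_output"), which reads the raw sentences and writes one text file per combination of applied steps into the output directory.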
556 | 557 | for s in TRANSFORMATION_STEPS_UNLABELED: 558 | print "running step:%s" % (s[0]) 559 | required = None 560 | if (len(s) > 2): 561 | required = s[2] 562 | run_step_unlabeled_data(s[0], s[1], inputs, required) 563 | 564 | # todo: write outputs to files 565 | for name in inputs: 566 | out_file_path = path.join(output_dir, output_files_prefix + "_" + name + ".txt") 567 | 568 | with open(out_file_path, "w") as handle: 569 | for text in inputs[name]: 570 | if (len(text.strip()) == 0): 571 | continue 572 | handle.write("%s\n" % (text)) 573 | 574 | 575 | def run_unlabeled_data_preparation_pipeline(samples_file_path, output_files_prefix, output_dir): 576 | sentences = read_lines_from_file(samples_file_path) 577 | run_transformations_on_unlabeled_data(sentences, output_files_prefix + "_binary", output_dir) -------------------------------------------------------------------------------- /packages/features_generation_tools.py: -------------------------------------------------------------------------------- 1 | import data_preparation_tools as dpt 2 | import fnmatch 3 | import gensim 4 | import logging 5 | import multiprocessing 6 | import numpy as np 7 | import sklearn.metrics as metrics 8 | import re 9 | from gensim.models.doc2vec import * 10 | from os import listdir 11 | from os.path import isfile, join 12 | from sklearn.feature_extraction.text import CountVectorizer 13 | from scipy.sparse import hstack 14 | from spacy.en import English 15 | 16 | DEFAULT_BOW_NGRAM_RANGE = (1,1) 17 | DEFAULT_BOW_MAX_FEATURES = None 18 | DEFAULT_BOW_BINARY = True 19 | ENTITY_REGEX = re.compile(r"GGVARENTTY[0-9]+GG|MMVARENTTY[0-9]+MM", re.IGNORECASE) 20 | 21 | BINARY_LABELS_TO_CLASSES_TABLE = { 22 | 'NO_RELATION' : 0, 23 | 'RELATION' : 1 24 | } 25 | 26 | MULTICLASS_LABELS_TO_CLASSES_TABLE = { 27 | 'NO_RELATION' : 0, 28 | 'NEGATIVE_DIRECT' : 1, 29 | 'NEGATIVE_INDIRECT' : 2, 30 | 'POSITIVE_DIRECT' : 3, 31 | 'POSITIVE_INDIRECT' : 4 32 | } 33 | 34 | class TrainTestData: 35 | def __init__(self, train_data, train_labels, test_data, test_labels, is_multiclass, feature_gen_model = None): 36 | self.train_data = train_data 37 | self.train_labels = train_labels 38 | self.test_data = test_data 39 | self.test_labels = test_labels 40 | self.is_multiclass = is_multiclass 41 | self.feature_gen_model = feature_gen_model 42 | 43 | class EvaluationResult: 44 | def __init__(self, model, features_gen_model, test_data, scores): 45 | self.model = model 46 | self.test_data = test_data 47 | self.scores = scores 48 | self.features_gen_model = features_gen_model 49 | 50 | # read texts and labels from data file: 51 | def read_data_from_file(file_path): 52 | with open(file_path) as handle: 53 | labels = [] 54 | data = [] 55 | for l in handle: 56 | parts = l.rstrip().split("\t") 57 | if (len(parts) < 2): 58 | continue 59 | labels.append(parts[0]) 60 | data.append(parts[1]) 61 | return data,labels 62 | 63 | def read_train_and_test_data_from_path(path): 64 | only_files = [f for f in listdir(path) if (isfile(join(path, f)))] 65 | train_files = [f for f in only_files if fnmatch.fnmatch(f, '*_train.tsv')] 66 | data_names = ["_".join(f.split("_")[:-1]) for f in train_files] 67 | data_table = {} 68 | data_table_no_entities = {} 69 | 70 | for name in data_names: 71 | train_data, train_labels = read_data_from_file(join(path, name + "_train.tsv")) 72 | test_data, test_labels = read_data_from_file(join(path, name + "_test.tsv")) 73 | 74 | is_multiclass = name.find('multiclass') > -1 75 | 76 | # without entities as well: 77 | train_data_no_entities, 
indices_to_remove = remove_entities_from_text(train_data) 78 | train_labels_no_entities = train_labels 79 | test_data_no_entities, indices_to_remove = remove_entities_from_text(test_data) 80 | test_labels_no_entities = test_labels 81 | 82 | data_table[name] = TrainTestData(train_data, train_labels, test_data, test_labels, is_multiclass) 83 | data_table_no_entities[name] = TrainTestData(train_data_no_entities, train_labels_no_entities, 84 | test_data_no_entities, test_labels_no_entities, is_multiclass) 85 | 86 | return data_table, data_table_no_entities 87 | 88 | def remove_entities_from_text(sentences): 89 | fixed_sentences = [] 90 | indices_to_remove = [] 91 | for s,i in zip(sentences,range(len(sentences))): 92 | new_sent = [] 93 | for t in s.split(): 94 | if (not ENTITY_REGEX.match(t)): 95 | new_sent.append(t) 96 | if (len(new_sent) == 0): 97 | indices_to_remove.append(i) 98 | #else: 99 | fixed_sentences.append(" ".join(new_sent)) 100 | 101 | return fixed_sentences, indices_to_remove 102 | 103 | nlp_parser = None 104 | def to_nlp_objs(sentences): 105 | global nlp_parser 106 | # init once 107 | if (nlp_parser == None): 108 | nlp_parser = English() 109 | 110 | nlp_objs = [] 111 | for s in sentences: 112 | nlp_objs.append(nlp_parser(s.decode('unicode-escape'), entity=False)) 113 | return nlp_objs 114 | 115 | def get_nlp_features(sentences): 116 | parsed = to_nlp_objs(sentences) 117 | pos_tags = [] 118 | for p in parsed: 119 | pos_tags.append([s.pos_ for s in p]) 120 | 121 | return pos_tags 122 | 123 | def to_pos_bow(train_samples, test_samples, ngram_range=DEFAULT_BOW_NGRAM_RANGE, binary=DEFAULT_BOW_BINARY): 124 | #TODO: can do this more efficiently, this is a workaround for now 125 | pos_tags_train = [" ".join(s) for s in get_nlp_features(train_samples)] 126 | pos_tags_test = [" ".join(s) for s in get_nlp_features(test_samples)] 127 | return to_bag_of_words(pos_tags_train, pos_tags_test, ngram_range=ngram_range, binary=binary, max_features=None) 128 | 129 | def to_bag_of_words(train_samples, test_samples, ngram_range=DEFAULT_BOW_NGRAM_RANGE, 130 | max_features=DEFAULT_BOW_MAX_FEATURES, binary=DEFAULT_BOW_BINARY): 131 | #Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool. 
132 | vectorizer = CountVectorizer(analyzer = "word", 133 | tokenizer = None, 134 | preprocessor = None, 135 | stop_words = None, 136 | max_features = max_features, 137 | binary = binary, 138 | ngram_range=ngram_range) 139 | 140 | train_data_features = vectorizer.fit_transform(train_samples) 141 | test_data_features = vectorizer.transform(test_samples) 142 | return train_data_features, test_data_features, vectorizer 143 | 144 | def get_bow_features(train_samples, test_samples, ngram_range): 145 | return to_bag_of_words(train_samples, test_samples, ngram_range=ngram_range) 146 | 147 | def get_bow_and_pos_features(train_samples, test_samples, ngram_range, pos_ngram_range): 148 | bow_train_features, bow_test_features = get_bow_features(train_samples, test_samples, ngram_range) 149 | pos_train_features, pos_test_features = to_pos_bow(train_samples, test_samples, ngram_range=pos_ngram_range) 150 | 151 | 152 | train_features = hstack((bow_train_features, pos_train_features)) 153 | test_features = hstack((bow_test_features, pos_test_features)) 154 | 155 | return train_features, test_features 156 | 157 | def get_compound_features(train_data, test_data, feature_gen_methods): 158 | train_features_list = [] 159 | test_features_list = [] 160 | 161 | for m in feature_gen_methods: 162 | train_features, test_features = m(train_data, test_data) 163 | train_features_list.append(train_features) 164 | test_features_list.append(test_features) 165 | 166 | train_features = train_features_list[0] 167 | test_features = test_features_list[0] 168 | 169 | for i in xrange(1,len(feature_gen_methods)): 170 | train_features = hstack((train_features, train_features_list[i])) 171 | test_features = hstack((test_features, test_features_list[i])) 172 | 173 | return train_features, test_features 174 | 175 | def merge_into_file(input_path_or_data, output): 176 | if (input_path_or_data == None): 177 | return 178 | 179 | # if it's data and not path 180 | if (dpt.is_sequence(input_path_or_data)): 181 | for l in input_path_or_data: 182 | output.write(l + "\n") 183 | return len(input_path_or_data) 184 | 185 | count = 0; 186 | with open(input_path_or_data) as input: 187 | for l in input: 188 | output.write(l) 189 | count = count + 1 190 | return count 191 | 192 | def build_doc2vec_model(data, temp_doc2vec_input_file_path, background_samples_file_path = None, 193 | model_file_path = None, should_log = False): 194 | 195 | if (should_log): 196 | reload(logging) 197 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 198 | logger = logging.getLogger() 199 | 200 | # merge the data into one file and then let gensim TaggedLineDocument class take care of the rest 201 | # this can be further optimized by creating a custom doc2vec iterator that will read the files in sequence 202 | print "creating temp file..." 203 | with open(temp_doc2vec_input_file_path,"w") as output: 204 | merge_into_file(data, output) 205 | merge_into_file(background_samples_file_path, output) 206 | 207 | with open(temp_doc2vec_input_file_path) as handle: 208 | print "creating model..." 209 | # TODO: add min_count = 5, but deal with empty sentences.. 
210 | ncpus = multiprocessing.cpu_count() 211 | model = Doc2Vec(TaggedLineDocument(handle), size = 200, window=8, min_count = 5, workers = ncpus) 212 | print "model built" 213 | if (model_file_path != None): 214 | model.save(model_file_path) 215 | 216 | #return model 217 | return model 218 | 219 | # get the doc2vec feature vectors 220 | def get_doc2vec_features(train_data, test_data, 221 | temp_doc2vec_input_file_path, background_samples_file_path = None): 222 | 223 | input_data = train_data + test_data 224 | model = build_doc2vec_model(input_data, temp_doc2vec_input_file_path, background_samples_file_path, should_log = True) 225 | 226 | # extract the vectors according to their class 227 | train_embeddings = [model.docvecs[index] for index in xrange(len(train_data))] 228 | test_embeddings = [model.docvecs[index] for index in xrange(len(train_data), len(train_data) + len(test_data))] 229 | #background_embeddings = [model.docvecs[index] for index in xrange(len(train_data) + len(test_data), model.docvecs.count)] 230 | 231 | return train_embeddings, test_embeddings, model 232 | 233 | def label_to_class(label, is_multiclass, auto_add_classes=False): 234 | if (is_multiclass==True): 235 | if (not MULTICLASS_LABELS_TO_CLASSES_TABLE.has_key(label) and auto_add_classes): 236 | max_class = max([MULTICLASS_LABELS_TO_CLASSES_TABLE[k] for k in MULTICLASS_LABELS_TO_CLASSES_TABLE]) 237 | MULTICLASS_LABELS_TO_CLASSES_TABLE[label] = max_class + 1 238 | 239 | return MULTICLASS_LABELS_TO_CLASSES_TABLE[label] 240 | 241 | return BINARY_LABELS_TO_CLASSES_TABLE[label] 242 | 243 | def labels_to_classes(labels, is_multiclass=False): 244 | classes = [] 245 | for label in labels: 246 | classes.append(label_to_class(label, is_multiclass)) 247 | return classes 248 | 249 | 250 | def gen_features_and_classes(train_test_data, gen_features_func): 251 | train_classes = labels_to_classes(train_test_data.train_labels, is_multiclass = train_test_data.is_multiclass) 252 | test_classes = labels_to_classes(train_test_data.test_labels, is_multiclass = train_test_data.is_multiclass) 253 | 254 | train_features, test_features, model = gen_features_func(train_test_data.train_data, train_test_data.test_data) 255 | 256 | return TrainTestData(train_features, train_classes, test_features, test_classes, train_test_data.is_multiclass, model) 257 | 258 | def write_features_classes_to_file(file_path, data, labels): 259 | with open(file_path, "w") as handle: 260 | for d,l in zip(data, labels): 261 | line_text = str(l) + "," + ",".join([str (x) for x in d.toarray()[0]]) 262 | handle.write(line_text + "\n") 263 | 264 | # feature evaluation 265 | def read_data_labels(file_path): 266 | data = [] 267 | labels = [] 268 | with open(file_path) as handle: 269 | for l in handle: 270 | parts = l.rstrip().split(",") 271 | labels.append(float(parts[0])) 272 | data.append([float(i) for i in parts[1:]]) 273 | 274 | return data, labels 275 | 276 | def read_train_test_data(input_dir, name): 277 | train_file_path = join(input_dir, name + "_train.csv") 278 | test_file_path = join(input_dir, name + "_test.csv") 279 | train_data, train_labels = read_data_labels(train_file_path) 280 | test_data, test_labels = read_data_labels(test_file_path) 281 | 282 | is_multiclass = name.find("multiclass") > -1 283 | 284 | return TrainTestData(train_data, train_labels, test_data, test_labels, is_multiclass) 285 | 286 | from sklearn.feature_selection import SelectFromModel 287 | from sklearn.linear_model import LassoCV 288 | 289 | def evaluate_model(train_test_data, 
model_initializer): 290 | clf = model_initializer() 291 | clf = clf.fit(train_test_data.train_data, train_test_data.train_labels) 292 | 293 | labels_predicted = clf.predict(train_test_data.test_data) 294 | scores_predicted = clf.predict_proba(train_test_data.test_data) 295 | print metrics.classification_report(train_test_data.test_labels, labels_predicted) 296 | return EvaluationResult(clf, train_test_data.feature_gen_model, train_test_data.test_data, scores_predicted) 297 | 298 | class GenFeaturesMethod: 299 | def __init__(self, name, func, no_entities = False): 300 | self.name = name 301 | self.func = func 302 | self.no_entities = no_entities 303 | 304 | class EvaluationMethod: 305 | def __init__(self, name, func): 306 | self.name = name 307 | self.func = func 308 | 309 | # get path to the data input dir, and a list of GenFeaturesMethod objects 310 | def run_gen_features_pipeline(input_dir, gen_features_methods, evaluation_methods): 311 | data_dict, data_dict_no_entities = read_train_and_test_data_from_path(input_dir) 312 | results = [] 313 | for name in data_dict: 314 | for gfm in gen_features_methods: 315 | print "generating %s features for %s"%(gfm.name, name) 316 | if (gfm.no_entities): 317 | data = data_dict_no_entities[name] 318 | else: 319 | data = data_dict[name] 320 | 321 | train_test_data = gen_features_and_classes(data, gfm.func) 322 | 323 | for em in evaluation_methods: 324 | print "model evaluation for: %s, %s, %s"%(name, gfm.name, em.name) 325 | result = evaluate_model(train_test_data, em.func) 326 | results.append(result) 327 | return results 328 | -------------------------------------------------------------------------------- /packages/mirna_detector/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Miroculus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /packages/mirna_detector/README.md: -------------------------------------------------------------------------------- 1 | # miRNA-detector 2 | Python library to detect miRNA mentions in plain texts. 
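A minimal usage sketch (the `sys.path` tweak assumes you run from the repository root; the example sentence and printed formatting are illustrative, not part of the library):

```python
import sys
sys.path.append("./packages")

from mirna_detector import validate, is_mirna

# validate() returns the sentence together with the detected, normalized miRNA mentions
result = validate("We found that both miR-17 and miR-20a target the UBE2C gene.")
for hit in result["detectedMirnas"]:
    print hit["mirna"], "<-", hit["origin"]

# is_mirna() is a boolean check on a single candidate string
print is_mirna("miR-17")  # True
```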
3 | -------------------------------------------------------------------------------- /packages/mirna_detector/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import re 5 | import time 6 | from os import path 7 | from random import shuffle 8 | 9 | expression = '((hsa|mmu|\b)?-?(miRNA-|miR|\(miR\)|micoRNA|hsa-let|let-|microRNA-|micro ribonucleic acid)(-|\s|\d)?((-|\w|\*|\/)*\d+(-|\w|\/)*)+\*?)' 10 | extraction = '(hsa-(mir|let)-\d+(-|\w|\/)*)+' 11 | 12 | mirnas = [] 13 | dataFilePath = path.join(path.dirname(path.realpath(__file__)), 'data/mirna.txt') 14 | with open(dataFilePath) as f: 15 | 16 | content = f.readlines() 17 | for line in content: 18 | values = line.split('\t') 19 | mirna = values[2] 20 | if mirna.find('hsa') != -1: 21 | mirnas.append(mirna) 22 | values = values[3].split(';') 23 | if values[0]!='': 24 | mirnas+=values 25 | 26 | 27 | def refine(results): 28 | parsed = [] 29 | for result in results: 30 | value = re.search(expression, result, flags=re.I) 31 | if value: 32 | parsed.append(value.group()) 33 | 34 | return parsed 35 | 36 | 37 | def filterMirnas(results): 38 | global mirnas 39 | 40 | return [result for result in results if (result in mirnas or re.search(extraction, result, flags=re.I))] 41 | 42 | 43 | def normalize(candidate): 44 | 45 | #--------- 46 | if not re.search(expression, candidate, flags=re.I): 47 | return (candidate, '', '') 48 | #--------- 49 | 50 | 51 | value = re.search(r'\d+', candidate) 52 | if not value: 53 | return (candidate,'','') 54 | 55 | if candidate[0] == '-': 56 | return (candidate, '', '') 57 | 58 | base = value.group() 59 | namePrefix = 'hsa-mir-' 60 | if re.search('let', candidate, re.I): 61 | namePrefix = 'hsa-let-' 62 | 63 | baseIndex = candidate.find(base) + len(base) 64 | nameSufix = candidate[baseIndex:] 65 | 66 | return (namePrefix, base, nameSufix) 67 | 68 | 69 | def splitter(results): 70 | results = [result for result in results.split('/') if len(result)>0] 71 | # extract the mirna number base aka mir-(\d+) 72 | namePrefix, base, nameSufix = normalize(results[0]) 73 | splitted = [] 74 | 75 | splitted.append(namePrefix+base+nameSufix) 76 | 77 | for result in results[1:]: 78 | 79 | if not result[0].isdigit() and len(result)==1: 80 | splitted.append(namePrefix+base+result) 81 | else: 82 | np, b, ns = normalize(result) 83 | if np == result: 84 | value = namePrefix+result 85 | value = value.replace('--', '-') 86 | splitted.append(value) 87 | else: 88 | splitted.append(np+b+ns) 89 | return refine(splitted) 90 | 91 | 92 | def expand(sentence, result, limit): 93 | value = result[0] 94 | 95 | if sentence[result[2]] == ',' or sentence[result[2]:result[2]+5]==' and ' if len(sentence)>result[2]+5 else False: 96 | expanded = sentence[result[2]:limit].replace(', ','/') 97 | expanded = re.sub(r'\s?and ', '/', expanded) 98 | spaceIndex = expanded.find(' ') 99 | value += expanded 100 | return value 101 | 102 | 103 | def validate(sentence): 104 | detected = {"sentence":sentence, "detectedMirnas":[]} 105 | 106 | if type(sentence) == list: 107 | sentence = ' '.join(sentence) 108 | p = re.compile(expression, flags=re.I|re.M) 109 | 110 | results = [] 111 | for m in p.finditer(sentence): 112 | results.append((m.group(), m.start(), m.end())) 113 | 114 | parsedResults = [] 115 | for index, result in enumerate(results): 116 | lastIndex = 0 117 | if index+10: 126 | for miRNA in values: 127 | detected["detectedMirnas"].append({ 128 | "mirna": 
miRNA, 129 | "origin": extractValue 130 | }) 131 | parsedResults += values 132 | 133 | return detected 134 | 135 | def is_mirna(text): 136 | values = splitter(text) 137 | values = filterMirnas(values) 138 | return (len(values) > 0) 139 | -------------------------------------------------------------------------------- /packages/mirna_detector/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __init__ import validate 5 | 6 | total = 0 7 | errors = 0 8 | 9 | def testValidate(value, expected, index): 10 | global total 11 | global errors 12 | total+=1 13 | try: 14 | result = validate(value) 15 | result = [value['mirna'] for value in result['detectedMirnas']] 16 | assert result == expected 17 | except AssertionError as e: 18 | errors+=1 19 | print index, ': >>> ', result, 'not equals to', expected 20 | 21 | 22 | testValidate("""Validation assays confirmed the dysregulation of miR-223, 23 | miR-146a and miR-155 previously associated with human rheumatoid 24 | arthritis (RA) pathology, as well as that of miR-221/222 and 25 | miR-323-3p.""", 26 | ['hsa-mir-223', 'hsa-mir-146a', 'hsa-mir-155', 'hsa-mir-221', 'hsa-mir-222', 'hsa-mir-323-3p'], 27 | 0) 28 | 29 | testValidate("""We found that both miR-17 and miR-20a (miR-17/20a) target 30 | the UBE2C gene in gastric cancer cells.""", 31 | ['hsa-mir-17', 'hsa-mir-20a', 'hsa-mir-17', 'hsa-mir-20a'], 32 | 1) 33 | 34 | testValidate("""Collectively, these data identify the E2F1/miR-421/Pink 35 | axis as a regulator of mitochondrial fragmentation and cardiomyocyte 36 | apoptosis, and suggest potential therapeutic targets in treatment of 37 | cardiac diseases.""", 38 | ['hsa-mir-421'], 39 | 2) 40 | 41 | testValidate("""Based on extant data linking MIR 137 gene with structural 42 | brain anomalies and functional brain activations in schizophrenia, 43 | we hypothesized that MIR137 risk variants rs1625579 and rs1198588 44 | would be associated with reduced fractional anisotropy in 45 | frontostriatal brain regions, impaired neurocognitive functioning 46 | and worse psychotic symptoms in schizophrenia patients compared with 47 | healthy controls.""", 48 | ['hsa-mir-137', 'hsa-mir-137'],3) 49 | 50 | testValidate("""Betulinic acid-dependent repression of Sp1, Sp3, Sp4, 51 | and Sp-regulated genes was due, in part, to induction of the Sp 52 | repressor ZBTB10 and downregulation of microRNA-27a (miR-27a), which 53 | constitutively inhibits ZBTB10 expression, and we show for the first 54 | time that the effects of betulinic acid on the miR-27a:ZBTB10-Sp 55 | transcription factor axis were cannabinoid 1 (CB1) and CB2 56 | receptor-dependent, thus identifying a new cellular target for this 57 | anticancer agent.""", 58 | ['hsa-mir-27a', 'hsa-mir-27a', 'hsa-mir-27a'],4) 59 | 60 | 61 | testValidate("""The reintroduction of miR-148a and miR-34b/c in cancer 62 | cells with epigenetic inactivation inhibited their motility, reduced 63 | tumor growth, and inhibited metastasis formation in xenograft models, 64 | with an associated down-regulation of the miRNA oncogenic target 65 | genes, such as C-MYC, E2F3, CDK6, and TGIF2.""", 66 | ['hsa-mir-148a', 'hsa-mir-34b', 'hsa-mir-34c'],5) 67 | 68 | 69 | testValidate("""TGF), known to inhibit mir-200c expression in tumour 70 | cells , along with feedforward and negative feedback loops between the 71 | miR-200/ZEB1/JAG1 (; ; this study) would establish a vicious circle' in 72 | the metastatic bone microenvironment (model ), as may also 
apply to the 73 | organ tropism of other cancers.""", 74 | ['hsa-mir-200c', 'hsa-mir-200'],6) 75 | 76 | 77 | testValidate("""The roles of miR-17-5p and p21 were evaluated with 78 | specific antisense oligonucleotides (ODN) that were designed and 79 | used to inhibit their expression.""", 80 | ['hsa-mir-17-5p'],7) 81 | 82 | 83 | testValidate("""Consistently, depletion of miR-26a/b by miR-26 sponge 84 | could increase the activity of luciferase reporter genes fused to 85 | the 3 UTR of the same cohort of nine genes (AGPAT5, CHD1, ERLIN1, 86 | GREB1, HSPA8, KPNA2, MREG, NARG1 and PLOD2) by more than 30% 87 | (Figured).""", 88 | ['hsa-mir-26a', 'hsa-mir-26b', 'hsa-mir-26'],8) 89 | 90 | 91 | testValidate("""In our present study, we found that the expression of 92 | miR-361-5p in CRPC was lower than in androgen-dependent prostate 93 | cancer (ADPC), indicating that miR-361-5p may play an important role 94 | in the progression of ADPC to CRPC.""", 95 | ['hsa-mir-361-5p', 'hsa-mir-361-5p'],9) 96 | 97 | 98 | testValidate("""The expression of miR-146a, CD40, CD80 and CD86 on AchR 99 | specific B cells were analyzed by qRT-PCR and flow cytometry.""", 100 | ['hsa-mir-146a'],10) 101 | 102 | 103 | testValidate("""METHODS: We examined the association between the 104 | expression of miR-16, miR-21, miR-93, miR-135b, miR-146a, and miR-182 105 | in total RNA from the placentas of 86 term infants as measured by 106 | quantitative real-time PCR and newborn neurobehavioral outcomes as 107 | assessed using the NICU Network Neurobehavioral Scales (NNNS).""", 108 | ['hsa-mir-16', 'hsa-mir-21', 'hsa-mir-93', 'hsa-mir-135b', 'hsa-mir-146a', 'hsa-mir-182'], 109 | 11) 110 | 111 | 112 | testValidate("""METHODS: The current study validates nine miRNAs 113 | (miR-18a/b miR-25, miR-29c, miR-106b, miR375, miR-424, miR-505 and 114 | let-7b) significantly correlated with established prognostic 115 | breast cancer biomarkers.""", 116 | ['hsa-mir-18a', 'hsa-mir-18b', 'hsa-mir-25', 'hsa-mir-29c', 'hsa-mir-106b', 'hsa-mir-375', 117 | 'hsa-mir-424', 'hsa-mir-505', 'hsa-let-7b'],12) 118 | 119 | 120 | testValidate("""No significant relationships were observed between these 121 | two single nucleotide polymorphisms (SNPs) and onset risk of HCC 122 | after adjusting the factors as age, gender, smoking and drinking 123 | status in comparison with HBsAg positive controls: hsa-mir-146a 124 | rs2910164 (CG + GG vs CC): adjusting OR = 1.10, 95%CI: 0.90 - 1.36; 125 | hsa-mir-196-a2 rs11614913 (CC + CT vs TT): adjusting OR = 1.01, 126 | 95%CI: 0.81 - 1.25; as well as in comparison with HBsAg negative 127 | controls: hsa-mir-146a rs2910164 (CG + GG vs CC): adjusting OR = 128 | 1.06, 95%CI: 0.87 - 1.29; hsa-mir-196-a2 rs11614913 (CC + CT vs TT): 129 | adjusting OR = 0.94, 95%CI: 0.76 - 1.16.""", 130 | ['hsa-mir-146a', 'hsa-mir-196-a2', 'hsa-mir-146a', 'hsa-mir-196-a2'], 131 | 13) 132 | 133 | 134 | testValidate("""Hsa-miR-96 caused a decrease in SOX5 3-UTR luciferase 135 | activity by 60.34%4.79%, and both hsa-miR-7 and hsa-miR-17 caused 136 | a decrease in NR4A3 3-UTR luciferase activity by 65.01%4.07% and 137 | 45.11%6.76, respectively, compared with controls (FigureC).""", 138 | ['hsa-mir-96', 'hsa-mir-7', 'hsa-mir-17'],14) 139 | 140 | 141 | testValidate("""Furthermore, miR-20a and miR-17-5p were increased in 142 | the metastatic carcinoma and six atypical pituitary adenomas as 143 | compared to eight typical pituitary adenomas as measured by 144 | quantitative real-time PCR.""", 145 | ['hsa-mir-20a', 'hsa-mir-17-5p'],15) 146 | 147 | 148 | 
testValidate("""These results suggested that anti-miR-33a inhibit 149 | activation and extracellular matrix production, at least in part, 150 | via the activation of PI3K/Akt pathway and PPAR-a and anti sense 151 | of miR-33a may be a novel potential therapeutic approach for 152 | treating hepatic fibrosis in the future.""", 153 | ['hsa-mir-33a'],16) 154 | 155 | 156 | testValidate("""The results suggested that the miR-181b, miR-219-2-3p, 157 | miR-346, miR-195, miR-1308, miR-92a, miR-17, miR-103 and let-7g are 158 | the key players to reflect the schizophrenia illnesses status and 159 | may serve as candidate biomarkers for diagnosis of schizophrenia.""", 160 | ['hsa-mir-181b', 'hsa-mir-219-2-3p', 'hsa-mir-346', 'hsa-mir-195', 'hsa-mir-1308', 161 | 'hsa-mir-92a', 'hsa-mir-17', 'hsa-mir-103', 'hsa-let-7g'],17) 162 | 163 | 164 | testValidate("""Specifically discussed miRs include miR-7, miR-9/miR-9*, 165 | miR-10a/miR-10a*/miR-10b, miR-15b, miR-17-92, miR-21, miR-26a, 166 | miR-34a, miR-93, miR-101, miR-124, miR-125a, miR-125b, miR-128, 167 | miR-137, miR-146b-5p, miR-153, miR-181a/miR-181b, miR-196a/miR-196b, 168 | miR-218, miR-221/miR-222, miR-296, miR-302-367, miR-326, miR-381, 169 | miR-451, and let-7a.""", 170 | ['hsa-mir-7', 'hsa-mir-9', 'hsa-mir-9*', 'hsa-mir-10a', 'hsa-mir-10a*', 'hsa-mir-10b', 171 | 'hsa-mir-15b', 'hsa-mir-17-92', 'hsa-mir-21', 'hsa-mir-26a', 'hsa-mir-34a', 'hsa-mir-93', 172 | 'hsa-mir-101', 'hsa-mir-124', 'hsa-mir-125a', 'hsa-mir-125b', 'hsa-mir-128', 'hsa-mir-137', 173 | 'hsa-mir-146b-5p', 'hsa-mir-153', 'hsa-mir-181a', 'hsa-mir-181b', 'hsa-mir-196a', 174 | 'hsa-mir-196b', 'hsa-mir-218', 'hsa-mir-221', 'hsa-mir-222', 'hsa-mir-296', 'hsa-mir-302-367', 175 | 'hsa-mir-326', 'hsa-mir-381', 'hsa-mir-451', 'hsa-let-7a'],18) 176 | 177 | 178 | testValidate("""TT genotype for miR-196a2 gene also showed 3.2-fold 179 | risk toward LC and the risk was fivefold higher for squamous cell 180 | carcinoma.""", 181 | ['hsa-mir-196a2'],19) 182 | 183 | 184 | testValidate("""Thus, loss of miR-125b-1 may have a key role in the 185 | pathogenesis and progression of squamous cell carcinomas of head 186 | and neck and possibly of other tumors.""", 187 | ['hsa-mir-125b-1'],20) 188 | 189 | 190 | testValidate("""The present prospective case-control study investigated 191 | the involvement of microRNA (miR)-10b in the development of bone 192 | metastasis arising from primary breast carcinoma.""", 193 | ['hsa-mir-10b'],21) 194 | 195 | 196 | testValidate("""Four other miRNAs (miR-146b, -181b, let-7a and let-7c) 197 | are known oncogenic or tumor suppressor miRNAs.""", 198 | ['hsa-mir-146b', 'hsa-mir-181b', 'hsa-let-7a', 'hsa-let-7c'],22) 199 | 200 | 201 | testValidate("""BACKGROUND: The purpose of this study was to identify 202 | new tumour suppressor microRNAs (miRs) in clear cell renal cell 203 | carcinoma (ccRCC), carry out functional analysis of their suppressive 204 | role and identify their specific target genes.""", 205 | [],23) 206 | 207 | 208 | testValidate("""Subsequent quantitative PCR analyses of these splenic 209 | B cells revealed that C/EBPb, a transcriptional regulator of 210 | interleukin-6 that is linked to B-cell lymphoproliferative 211 | disorders, is downregulated when either miR-K12-11 or miR-155 is 212 | ectopically expressed.""", 213 | ['hsa-mir-K12-11', 'hsa-mir-155'],24) 214 | 215 | 216 | testValidate("""Thus, there is a possibility that the lack of change 217 | in miRs-182 and -96 following acoustic trauma is due to a slower 218 | degradation rate or no degradation 
compared to the targeted degradation 219 | of miR-183, which in turn may lead to the inconsistent expression 220 | pattern of these miRNAs within the cluster.""", 221 | ['hsa-mir-182', 'hsa-mir-96', 'hsa-mir-183'],25) 222 | 223 | 224 | testValidate("""Hsa-miR-92b and hsa-miR-9/9* were reported previously 225 | to be expressed in brain tumors and in cell lines derived from 226 | brain tumors and were documented to be expressed specifically in 227 | the developing nervous system """, 228 | ['hsa-mir-92b', 'hsa-mir-9', 'hsa-mir-9*'],26) 229 | 230 | 231 | testValidate("""However, only 3 miRNAs (miR-199a-5p, -27a, and -29a) 232 | correlated with hypertrophy; more importantly, only miR-29a 233 | correlated also with fibrosis.""", 234 | ['hsa-mir-199a-5p', 'hsa-mir-27a', 'hsa-mir-29a', 'hsa-mir-29a'],27) 235 | 236 | 237 | testValidate("""We found that target genes such as CDH1 (miR-1/206), 238 | ATM (miR-18a/b), KLF6 (miR-18a/b and miR-181c), Smad2(miR-18a/b, 239 | miR-1/206 and miR-149), Dicer were down expressed with the 240 | development of NPC, while BCL2L2 (miR-29a/b/c and miR-203), and YY1 241 | (miR-29a/b/c) were overexpressed during the development of NPC.""", 242 | ['hsa-mir-1', 'hsa-mir-206', 'hsa-mir-18a', 'hsa-mir-18b', 'hsa-mir-18a', 'hsa-mir-18b', 243 | 'hsa-mir-181c', 'hsa-mir-18a', 'hsa-mir-18b', 'hsa-mir-1', 'hsa-mir-206', 'hsa-mir-149', 244 | 'hsa-mir-29a', 'hsa-mir-29b', 'hsa-mir-29c', 'hsa-mir-203', 'hsa-mir-29a', 'hsa-mir-29b', 245 | 'hsa-mir-29c'],28) 246 | 247 | 248 | testValidate("""The miR-200 family (miR-200a, -200b, -200c, -141 and -429) 249 | and miR-205 are frequently silenced in advanced cancer and have been 250 | implicated in epithelial to mesenchymal transition (EMT) and tumor 251 | invasion by targeting the transcriptional repressors of E-cadherin, 252 | ZEB1 and ZEB2.""", 253 | ['hsa-mir-200', 'hsa-mir-200a', 'hsa-mir-200b', 'hsa-mir-200c', 'hsa-mir-141', 'hsa-mir-429', 254 | 'hsa-mir-205'],29) 255 | 256 | 257 | testValidate("""Here, the expression of the miRNAs miR-15a/16-1 in 258 | PBMC, CD4, and CD8 from RR-MS patients has been investigated.""", 259 | ['hsa-mir-15a', 'hsa-mir-16-1'],30) 260 | 261 | 262 | testValidate("""Subsequent quantitative PCR analyses of these splenic 263 | B cells revealed that C/EBPb, a transcriptional regulator of 264 | interleukin-6 that is linked to B-cell lymphoproliferative 265 | disorders, is downregulated when either miR-K12-11 is 266 | ectopically expressed.""", 267 | ['hsa-mir-K12-11'], 31) 268 | 269 | print total-errors, '/',total, (total-errors)*100/total, '% test passed' -------------------------------------------------------------------------------- /packages/model_tools.py: -------------------------------------------------------------------------------- 1 | import data_preparation_tools as dpt 2 | import features_generation_tools as fgt 3 | from sklearn.externals import joblib 4 | 5 | DEFAULT_VERSION = "1.1" 6 | 7 | class ScoringModel: 8 | # TODO: add the ability to pass in the array of transformations as well...for now we just use eveyrthing 9 | def __init__(self, features_generator, ml_model, transformations=dpt.TRANSFORMATION_STEPS, version=DEFAULT_VERSION): 10 | self.transformations = transformations 11 | self.features_generator = features_generator 12 | self.ml_model = ml_model 13 | self.version = version 14 | 15 | def score(self, text, context): 16 | transformed = dpt.run_transformations_on_single_sentence(text, context,self.transformations) 17 | features = self.features_generator.transform([transformed]) 18 | 
return self.ml_model.predict_proba(features)[0] 19 | 20 | def save_model(self, file_path): 21 | return joblib.dump(self,file_path, compress=True) 22 | 23 | @staticmethod 24 | def from_file(file_path): 25 | return joblib.load(file_path) -------------------------------------------------------------------------------- /webservice/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | from flask import request 3 | from flask import jsonify 4 | from flask import Response 5 | from sklearn.externals import joblib 6 | import json 7 | import scorer 8 | 9 | # convert the scorer result to the response format expected by the client 10 | # TODO: move to another module 11 | def scorer_result_to_response_format(scoring_results_and_entities): 12 | response_result = { 'modelVersion' : scorer.get_version(), 'relations' : []} 13 | 14 | for tup in scoring_results_and_entities: 15 | relation = {} 16 | scorer_result = tup[0] 17 | relation['entities'] = tup[1] 18 | max_score = -1 19 | max_score_class = "" 20 | scores_list = [] 21 | for i in range(len(scorer_result)): 22 | if (scorer_result[i] > max_score): 23 | max_score = scorer_result[i] 24 | max_score_class = str(i) 25 | 26 | relation['classification'] = max_score_class 27 | relation['score'] = max_score 28 | response_result['relations'].append(relation) 29 | 30 | return response_result 31 | 32 | app = Flask(__name__) 33 | 34 | @app.route('/') 35 | def api_root(): 36 | return 'Relation classification service' 37 | 38 | @app.route('/score', methods = ['POST']) 39 | def score(): 40 | if request.headers['Content-Type'] != 'application/json': 41 | resp = Response('Unssuported content type, expected application/json', status=500); 42 | return resp 43 | if (not request.json.has_key('text')): 44 | resp = Response('Bad request: missing "text" field in JSON body', status=500); 45 | return resp 46 | if (not request.json.has_key('entities')): 47 | resp = Response('Bad request: missing "entities" field in JSON body', status=500); 48 | return resp 49 | 50 | text = request.json['text'] 51 | entities = request.json['entities'] 52 | try: 53 | scorerResult = scorer.evaluate_score(text, entities) 54 | resp = jsonify(scorer_result_to_response_format(scorerResult)) 55 | resp.status_code = 200 56 | return resp 57 | except Exception as e: 58 | resp = Response("Internal Server Error: %s"%e, status = 500) 59 | return resp 60 | 61 | @app.route('/updatemodel', methods = ['POST']) 62 | def update_model(): 63 | if request.headers['Content-Type'] != 'application/json': 64 | resp = Response('Unssuported content type, expected application/json', status=500); 65 | return resp 66 | if (not request.json.has_key('path')): 67 | resp = Response('Bad request: missing "path" field in JSON body', status=500); 68 | return resp 69 | 70 | path = request.json['path'] 71 | try: 72 | scorer.load_model_from_url(path) 73 | resp = Response("", status=200); 74 | return resp 75 | except Exception as e: 76 | resp = Response("Internal Server Error: %s"%e, status = 500) 77 | return resp 78 | 79 | 80 | if __name__ == '__main__': 81 | app.run() -------------------------------------------------------------------------------- /webservice/scorer.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import itertools 3 | import os 4 | import pickle 5 | import sys 6 | import sklearn 7 | import urllib 8 | from mirna_detector import is_mirna 9 | from os import path 10 | 11 | # add the dir above to path 
12 | # TODO: make this more standard... 13 | current_dir_path = path.dirname(path.realpath(__file__)) 14 | sys.path.append(path.join(path.dirname(current_dir_path),"packages")) 15 | from model_tools import ScoringModel 16 | 17 | # TODO: Create dev and prod envs.. 18 | model_file_name = r"scoring_model.pkl" 19 | model_directory_path = path.join(current_dir_path, r"model") 20 | model_file_path = path.join(model_directory_path,model_file_name) 21 | 22 | scoring_model = None 23 | 24 | try: 25 | scoring_model = ScoringModel.from_file(model_file_path) 26 | except Exception as e: 27 | print "Failed loading model: %s"%e 28 | 29 | def get_text_from_entity_dict(e): 30 | if e.has_key("origin"): 31 | return e["origin"] 32 | if e.has_key("value"): 33 | return e["value"] 34 | return None 35 | 36 | def get_version(): 37 | if (scoring_model == None): 38 | raise Exception("No model file loaded, or bad model exists") 39 | return scoring_model.version 40 | 41 | def get_temp_model_path(): 42 | return path.join(model_directory_path, model_file_name + "_" + datetime.datetime.now().strftime("%y%m%d_%H%M%S")) 43 | 44 | def load_model_from_url(url): 45 | # TODO: move this into a class.. 46 | global scoring_model 47 | url_opener = urllib.URLopener() 48 | temp_model_path = get_temp_model_path() 49 | url_opener.retrieve(url, temp_model_path) 50 | 51 | # try to load the model: 52 | try: 53 | temp_model = ScoringModel.from_file(temp_model_path) 54 | except Exception as e: 55 | print "Failed to load donwloaded model: %s"%e 56 | os.remove(temp_model_path) 57 | raise RuntimeError("Failed to load donwloaded model! error: %s"%e) 58 | 59 | # update model: 60 | scoring_model = temp_model 61 | 62 | # delete existing model 63 | if (path.isfile(model_file_path)): 64 | os.remove(model_file_path) 65 | os.rename(temp_model_path, model_file_path) 66 | 67 | 68 | # TODO: move this to an object with an init function... 
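# evaluate_score (below) expects `entities` as a list of dicts carrying at least:
#   "type"          - entity type string (e.g. "gene" or "mirna"; lower-cased here),
#   "from" / "to"   - integer character offsets of the mention inside `sentence`,
#   "origin" or "value" - the surface text of the mention.
# It returns a list of (class-probability array, (mirna_entity, gene_entity))
# tuples, one per miRNA/gene pair found in the sentence.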
69 | def evaluate_score(sentence, entities): 70 | if (scoring_model == None): 71 | raise Exception("No model file loaded, or bad model exists") 72 | 73 | for e in entities: 74 | if e.has_key("type"): 75 | e["type"] = e["type"].lower() 76 | 77 | # TODO: 78 | # We merge the entities here such that there are no overlaps, and also check for the mirna specifically 79 | # this should basically solved on the entity recognition side 80 | # we should consider removing this part when this issue is solved 81 | filtered_entities = [] 82 | for entity in entities: 83 | # check if we don't have collisions, always take the longer string 84 | objects_to_remove = [] 85 | add_entity = True 86 | 87 | start_index = int(entity["from"]) 88 | end_index = int(entity["to"]) 89 | text = get_text_from_entity_dict(entity) 90 | 91 | for e in filtered_entities: 92 | if (start_index <= e["start_index"] and end_index >= e["end_index"] 93 | or start_index <= e["start_index"] and end_index >= e["start_index"] 94 | or start_index <= e["end_index"] and end_index >= e["end_index"]): 95 | current_len = len(text) 96 | second_len = len(e["text"]) 97 | if (current_len <= second_len): 98 | add_entity = False 99 | break 100 | else: 101 | objects_to_remove.append(e) 102 | 103 | for e in objects_to_remove: 104 | filtered_entities.remove(e) 105 | 106 | # todo: for now using the mirna detector to detect miRNA since it seems that 107 | # the current results are currently not accurate 108 | if (not add_entity): 109 | continue 110 | 111 | type = entity["type"] 112 | if (is_mirna(text)): 113 | type = "mirna" 114 | entity["type"] = type 115 | 116 | filtered_entities.append({ 117 | "text" : text, 118 | "type" : type, 119 | "start_index" : start_index, 120 | "end_index" : end_index, 121 | "original_entity" : entity 122 | }) 123 | 124 | # for now just return the same result for all pairs of gens / miRNA entities: 125 | mirna_entities = [e for e in filtered_entities if (e["type"]=="mirna")] 126 | gene_entities = [e for e in filtered_entities if (e["type"]=="gene")] 127 | scores = [] 128 | 129 | for p in itertools.product(mirna_entities, gene_entities): 130 | context = {"pair_entities" :[ 131 | p[0], 132 | p[1] 133 | ], 134 | "all_entities" : filtered_entities 135 | } 136 | 137 | score = scoring_model.score(sentence, context) 138 | scores.append((score, (p[0]["original_entity"],p[1]["original_entity"]))) 139 | 140 | return scores 141 | -------------------------------------------------------------------------------- /webservice/scorer_ws.pyproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Debug 5 | 2.0 6 | {594fde71-3dd6-4d9a-919b-56a57da8d3b7} 7 | 8 | app.py 9 | 10 | . 11 | . 12 | {888888a0-9f3d-457c-b088-3a5042f75d52} 13 | Standard Python launcher 14 | 15 | 16 | 17 | 18 | 19 | 20 | 10.0 21 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | --------------------------------------------------------------------------------
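For reference, a hedged sketch of exercising the Flask scoring service in `webservice/app.py` once a trained model has been saved to `webservice/model/scoring_model.pkl`. The host/port, example sentence, and character offsets below are assumptions for illustration only:

```python
import json
import requests

# The service must be running locally, e.g. `python webservice/app.py`
# (Flask's default host/port are assumed here).
payload = {
    "text": "miR-21 regulates PTEN expression in these cells.",
    "entities": [
        {"type": "mirna", "from": 0, "to": 6, "value": "miR-21"},
        {"type": "gene", "from": 17, "to": 21, "value": "PTEN"},
    ],
}

resp = requests.post("http://localhost:5000/score",
                     data=json.dumps(payload),
                     headers={"Content-Type": "application/json"})

print resp.status_code
# The response carries the model version plus one classification/score
# entry per miRNA-gene pair found in the sentence.
print resp.json()
```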