├── netwalk
│   ├── __init__.py
│   ├── utils.py
│   ├── models.py
│   ├── walk.py
│   ├── walkdataset.py
│   └── translator.py
├── Makefile
├── JuxtaposeTutorial
│   ├── line.png
│   ├── keypair.png
│   ├── attachvolume.png
│   ├── attachvolume2.png
│   ├── keypairname.png
│   ├── securitygroup.png
│   ├── spotrequests.png
│   ├── connectinstance.png
│   ├── selectinstance.png
│   └── Embedding_Methodology.png
├── requirements.txt
├── regression.py
├── test
│   ├── data
│   │   ├── line-config.json
│   │   ├── cross-config.json
│   │   ├── circle-config.json
│   │   ├── prefrontal_cortex.json
│   │   └── brain-heart-config.json
│   ├── test_utils.py
│   ├── test_walkdataset.py
│   ├── test_walk.py
│   └── test_temp.py
├── Line
│   └── IDConvertor.json
├── make_directed.py
├── network_stats.py
├── test_translator.py
├── find_common_genes.py
├── random_tree_generator.py
├── dataset_generator.py
├── similarity.py
├── dangle.py
├── dimensionality_reduction.py
├── README.md
└── runner.py
/netwalk/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | setup:
2 | pip install -r requirements.txt
3 | regression:
4 | python regression.py
--------------------------------------------------------------------------------
/JuxtaposeTutorial/line.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/line.png
--------------------------------------------------------------------------------
/JuxtaposeTutorial/keypair.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/keypair.png
--------------------------------------------------------------------------------
/JuxtaposeTutorial/attachvolume.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/attachvolume.png
--------------------------------------------------------------------------------
/JuxtaposeTutorial/attachvolume2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/attachvolume2.png
--------------------------------------------------------------------------------
/JuxtaposeTutorial/keypairname.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/keypairname.png
--------------------------------------------------------------------------------
/JuxtaposeTutorial/securitygroup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/securitygroup.png
--------------------------------------------------------------------------------
/JuxtaposeTutorial/spotrequests.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/spotrequests.png
--------------------------------------------------------------------------------
/JuxtaposeTutorial/connectinstance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/connectinstance.png
--------------------------------------------------------------------------------
/JuxtaposeTutorial/selectinstance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/selectinstance.png
--------------------------------------------------------------------------------
/JuxtaposeTutorial/Embedding_Methodology.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/Embedding_Methodology.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | gensim==3.8.3
2 | matplotlib==3.1.3
3 | networkx==2.4
4 | numpy==1.18.1
5 | pandas==1.0.1
6 | scikit-learn==0.22.1
7 | scipy==1.4.1
8 | seaborn==0.10.0
9 | torch==1.6.0
10 | torchvision==0.7.0
--------------------------------------------------------------------------------
/regression.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 |
4 | # initialize the test suite
5 | loader = unittest.TestLoader()
6 | start_dir = './test/'
7 | suite = loader.discover(start_dir)
8 |
9 | # initialize a runner, pass it your suite and run it
10 | runner = unittest.TextTestRunner(verbosity=3)
11 | result = runner.run(suite)
--------------------------------------------------------------------------------
/test/data/line-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_replicates": 10,
3 | "percentage": 0.4,
4 | "n_anchors": 6,
5 | "anchor_test_ratio": 0.5,
6 | "min_dangle_size": 3,
7 | "max_dangle_size": 10,
8 | "anchor_file_address": "test/data/line_anchors.csv",
9 | "phenotypes": ["1", "2"],
10 | "experiment_name": "Line",
11 | "test_ratio": 0.5,
12 | "data_directory": "test/data"
13 | }
14 |
--------------------------------------------------------------------------------
/test/data/cross-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_replicates": 1,
3 | "percentage": 0.4,
4 | "n_anchors": 10,
5 | "anchor_test_ratio": 0.5,
6 | "min_dangle_size": 3,
7 | "max_dangle_size": 5,
8 | "anchor_file_address": "test/data/cross_anchors.csv",
9 | "phenotypes": ["cross_1", "cross_2"],
10 | "experiment_name": "Cross",
11 | "test_ratio": 0.5,
12 | "data_directory": "test/data"
13 | }
14 |
--------------------------------------------------------------------------------
/test/data/circle-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_replicates": 1,
3 | "percentage": 0.4,
4 | "n_anchors": 10,
5 | "anchor_test_ratio": 0.5,
6 | "min_dangle_size": 3,
7 | "max_dangle_size": 5,
8 | "anchor_file_address": "test/data/circle_anchors.csv",
9 | "phenotypes": ["circle_1", "circle_2"],
10 | "experiment_name": "Circle",
11 | "test_ratio": 0.5,
12 | "data_directory": "test/data"
13 | }
14 |
--------------------------------------------------------------------------------
/test/data/prefrontal_cortex.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_replicates": 1,
3 | "percentage": 0.3,
4 | "n_anchors": 20,
5 | "anchor_test_ratio": 0.2,
6 | "min_dangle_size": 10,
7 | "max_dangle_size": 15,
8 | "anchor_file_address": "data/common_cortex_genes.csv",
9 | "phenotypes": ["human", "chimpanzee", "macaque", "mouse"],
10 | "experiment_name": "Cortex",
11 | "test_ratio": 0.2,
12 | "data_directory": "pcortex_data"
13 | }
14 |
--------------------------------------------------------------------------------
/test/data/brain-heart-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_replicates": 1,
3 | "percentage": 0.4,
4 | "n_anchors": 20,
5 | "anchor_test_ratio": 0.5,
6 | "min_dangle_size": 7,
7 | "max_dangle_size": 10,
8 | "anchor_file_address": "data/heart_brain_shared.csv",
9 | "phenotypes": ["brain_1", "brain_2", "brain_3", "heart_1", "heart_2", "heart_3"],
10 | "experiment_name": "Heart_Brain",
11 | "test_ratio": 0.5,
12 | "data_directory": "data"
13 | }
14 |
--------------------------------------------------------------------------------
/Line/IDConvertor.json:
--------------------------------------------------------------------------------
1 | {"0": 0, "1": 1, "10": 2, "11": 3, "12": 4, "13": 5, "14": 6, "15": 7, "16": 8, "17": 9, "18": 10, "19": 11, "2": 12, "20": 13, "3": 14, "4": 15, "5": 16, "6": 17, "7": 18, "8": 19, "9": 20, "pseudo_14": 21, "pseudo_14_000": 22, "pseudo_14_001": 23, "pseudo_14_002": 24, "pseudo_15": 25, "pseudo_15_000": 26, "pseudo_15_001": 27, "pseudo_15_002": 28, "pseudo_15_003": 29, "pseudo_5": 30, "pseudo_5_000": 31, "pseudo_5_001": 32, "pseudo_5_002": 33, "pseudo_5_003": 34, "pseudo_5_004": 35, "pseudo_5_005": 36}
--------------------------------------------------------------------------------
/make_directed.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 |
3 | G = nx.read_edgelist("/home/farhad/Network/juxt/brain_heart_data/heart_1.csv", delimiter=',',nodetype=str, data=(('cor',float),))
4 | #print(G.edges(data=True))
5 | T = nx.algorithms.tree.mst.minimum_spanning_tree(G, weight='cor')
6 | edj = T.edges()
7 | print(edj)
8 | address = 'test/data/heart_directed_1.txt'
9 | with open(address, 'w') as fout:
10 | for e in edj:
11 | # write edges to file
12 | node_1 = e[0]
13 | node_2 = e[1]
14 | fout.write('{}\t{}\n'.format(str(node_1),str(node_2)))
15 |
--------------------------------------------------------------------------------
/network_stats.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def alignment_permutation_test(vocab1_length, vocab2_length, distance, actual_score, num_iteration=1000):
5 | n = min(vocab1_length, vocab2_length)
6 | indices = list(range(n))
7 | scores = []
8 |     for _ in range(num_iteration):
9 | v1 = np.random.choice(indices, size=n)
10 | v2 = np.random.choice(indices, size=n)
11 | s = 0
12 | for i, j in zip(v1, v2):
13 | s += distance[i, j]
14 | scores.append(s/n)
15 | scores = np.array(scores)
16 | print(scores)
17 | p = sum(scores >= actual_score) / num_iteration
18 | return p
19 |
20 |
21 |
--------------------------------------------------------------------------------
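A minimal sketch of how alignment_permutation_test might be called, assuming a precomputed pairwise distance matrix between two embeddings and an observed mean alignment score; all values and names below are illustrative, not taken from the repository.

import numpy as np
from network_stats import alignment_permutation_test

# Hypothetical inputs: a 100 x 100 distance matrix and an observed mean distance.
rng = np.random.default_rng(0)
distance = rng.random((100, 100))
observed_score = 0.4

# Fraction of random index pairings whose mean distance is at least as large
# as the observed score.
p_value = alignment_permutation_test(vocab1_length=100, vocab2_length=100,
                                     distance=distance,
                                     actual_score=observed_score,
                                     num_iteration=1000)
print('permutation p-value:', p_value)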
/test/test_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from netwalk.utils import Vocabulary
3 | from netwalk.utils import load
4 | from netwalk.utils import Similarity
5 |
6 | class TestUtils(unittest.TestCase):
7 | def test_load(self):
8 | similarity = load("data/similarity_file.csv", sep=",")
9 | expected = {("gene1", "gene2"): 0.5,
10 | ("gene2", "gene3"): 0.7,
11 | ("gene1", "gene3"): 0.0}
12 |
13 | self.assertDictEqual(similarity, expected)
14 |
15 | def test_Vocabulary(self):
16 | genes = ['g0', 'g1', 'g2', 'g3', 'g4']
17 | id_2_name_map = {0: 'g0', 1: 'g1', 2: 'g2', 3: 'g3', 4: 'g4'}
18 | name_2_id_map = {'g0': 0, 'g1': 1, 'g2': 2, 'g3': 3, 'g4': 4}
19 | vocab = Vocabulary(genes)
20 | self.assertDictEqual(vocab.index, name_2_id_map)
21 | self.assertDictEqual(vocab.name, id_2_name_map)
22 | self.assertListEqual(vocab.genes, genes)
23 |
24 | def test_Similarity(self):
25 | d = {("gene1", "gene2"): 0.5,
26 | ("gene2", "gene3"): 0.7,
27 | ("gene1", "gene3"): 0.6,
28 | ("gene1", "gene4"): 0.3}
29 | similarity = Similarity(d)
30 | symmetric_keys = similarity.symmetric_key_set()
31 | expected_sym_keys = [("gene1", "gene2"), ("gene2", "gene1"), ("gene2", "gene3"), ("gene3", "gene2"),
32 | ("gene1", "gene3"), ("gene3", "gene1"), ("gene1", "gene4"), ("gene4", "gene1")]
33 |
34 | assert set(expected_sym_keys) == set(symmetric_keys)
35 |
--------------------------------------------------------------------------------
/test_translator.py:
--------------------------------------------------------------------------------
1 | from netwalk.translator import IDCovertor
2 | import os
3 | import argparse
4 | import json
5 | import glob
6 |
7 | if __name__ == '__main__':
8 | parser = argparse.ArgumentParser(description='Translate datasets')
9 | parser.add_argument('-c', '--config', metavar='JSON file path',
10 | action='store', required=True,
11 | help='Path to a config file')
12 | args = parser.parse_args()
13 | # read config file
14 | with open(args.config) as fin:
15 | params = json.load(fin)
16 |
17 | edge_list_file_addresses = glob.glob(os.path.join(params['experiment_name'],
18 | 'anchored_*.csv'))
19 | print(os.listdir(params['experiment_name']))
20 | #output_file_address = 'Line/translated_line.csv'
21 | convertor = IDCovertor(edge_list_file_addresses, sep=',')
22 | for edge_list_file_address in edge_list_file_addresses:
23 | dir_path, file_name = os.path.split(edge_list_file_address)
24 | output_file_address = os.path.join(dir_path, f'translated_{file_name}')
25 | convertor.translate(edge_list_file_address, output_file_address, sep=',')
26 |
27 | convertor_file = os.path.join(params['experiment_name'],
28 | 'IDConvertor.json')
29 | convertor.save(convertor_file)
30 | con = IDCovertor.load(convertor_file)
31 | assert con.id2int == convertor.id2int
32 | assert con.int2id == convertor.int2id
33 | assert con.ids == convertor.ids
34 |
35 |
--------------------------------------------------------------------------------
/netwalk/utils.py:
--------------------------------------------------------------------------------
1 | ''' This module contains utilities classes and functions.
2 |
3 | '''
4 | import os.path
5 | import json
6 | import random
7 | import torch
8 | import copy
9 | import numpy as np
10 |
11 | class Vocabulary(object):
12 | '''Create a bijective mapping between gene name/ID and indices.
13 |
14 | Args:
15 | genes: An array-like containing gene names/IDs.
16 | '''
17 |
18 | def __init__(self, genes):
19 | self.genes = genes
20 | self.index = dict(zip(sorted(genes), range(len(genes))))
21 | self.name = {idx: gene for gene, idx in self.index.items()}
22 | self.dim = len(self.genes)
23 |
24 | def to_indices(self, genes):
25 | return [self.index[gene] for gene in genes]
26 |
27 | def to_names(self, indices):
28 | return [self.name[i] for i in indices]
29 |
30 | def __len__(self):
31 | return len(self.genes)
32 |
33 |
34 | def load_walks(file_dir='.', prefix='pair_walk', sep=','):
35 |     ''' Read walks from a CSV file.
36 | 
37 |     Args:
38 |         file_dir: Directory containing the walks file.
39 |         prefix: File name prefix; the file read is '<prefix>_walks.csv'.
40 |         sep: A field delimiter.
41 |     Returns:
42 |         walks: An array of walks, one walk per row.
43 |     '''
44 | walk_address = os.path.join(file_dir, prefix + '_walks.csv')
45 | walks = np.genfromtxt(walk_address, dtype=np.uint16, delimiter=sep)
46 | return walks
47 |
48 |
49 | def dump_walks(walks, out_dir='.', prefix='pair_walk', sep=','):
50 |     # Write walks to '<prefix>_walks.csv' (assumed counterpart of load_walks above).
51 |     walk_address = os.path.join(out_dir, prefix + '_walks.csv')
52 |     np.savetxt(walk_address, np.asarray(walks, dtype=np.uint16), fmt='%d', delimiter=sep)
53 |
54 |
--------------------------------------------------------------------------------
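A brief usage sketch of Vocabulary and load_walks; the expected outputs are shown in comments, and the commented load_walks path is hypothetical.

from netwalk.utils import Vocabulary, load_walks

vocab = Vocabulary(['g2', 'g0', 'g1'])
print(vocab.to_indices(['g0', 'g2']))   # [0, 2] -- indices follow sorted gene order
print(vocab.to_names([1]))              # ['g1']
print(len(vocab))                       # 3

# load_walks reads '<prefix>_walks.csv' from file_dir; the path below is illustrative.
# walks = load_walks(file_dir='Line', prefix='1_0', sep=',')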
/test/test_walkdataset.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from netwalk.walkdataset import WalkDataset
3 | from netwalk.walkdataset import PairWalkDataset
4 | from netwalk.utils import Vocabulary
5 |
6 |
7 | class TestWalkDataset(unittest.TestCase):
8 | def setUp(self):
9 | data = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
10 | [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
11 | [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
12 | [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
13 | [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]
14 | self.data = data
15 |
16 | def test_from_csv(self):
17 | dataset = WalkDataset.from_csv('data/sample_walk_dataset.csv', sep=',')
18 | self.assertListEqual(dataset.walks, self.data)
19 |
20 | def test__len__(self):
21 | dataset = WalkDataset(original_walks=[], vocab=Vocabulary([]))
22 | self.assertEqual(len(dataset), 0)
23 | dataset = WalkDataset.from_csv('data/sample_walk_dataset.csv', sep=',')
24 | self.assertEqual(len(dataset), 5)
25 |
26 | def test__getitem__(self):
27 | dataset = WalkDataset.from_csv('data/sample_walk_dataset.csv', sep=',')
28 | for i, expected in enumerate(self.data):
29 | self.assertListEqual(expected, dataset[i])
30 |
31 |
32 | class TestPairedWalkDataset(unittest.TestCase):
33 | def setUp(self):
34 | data = [([0, 1, 2, 3, 4], [0, 1, 2, 3, 4]),
35 | ([5, 6, 7, 8, 9], [5, 8, 7, 8, 9])]
36 | self.data = data
37 |
38 | def test_from_csv(self):
39 | dataset = PairWalkDataset.from_csv('data/sample_pair_walk_dataset.csv', sep=',')
40 | self.assertEqual(len(self.data), len(dataset))
41 | for i, observed_walk in enumerate(dataset):
42 | (expected_walk_a, expected_walk_b) = self.data[i]
43 | observed_walk_a, observed_walk_b = observed_walk
44 | self.assertListEqual(expected_walk_a, observed_walk_a)
45 | self.assertListEqual(expected_walk_b, observed_walk_b)
46 |
--------------------------------------------------------------------------------
/find_common_genes.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import multiprocessing as mp
3 |
4 |
5 | def load(address):
6 | genes = set()
7 | with open(address) as fin:
8 | for line in fin:
9 | line = line.strip()
10 | if line == '':
11 | continue
12 | gene_a, gene_b, _ = line.split(',')
13 | genes.add(gene_a)
14 | genes.add(gene_b)
15 | return genes
16 |
17 | def get_common_genes(edge_lists, genes_of_interest):
18 |     pool = mp.Pool(min(mp.cpu_count(), len(edge_lists)))
19 | profiles_genes = [pool.apply(load, args=(address, )) for address in edge_lists]
20 | pool.close()
21 | common_genes = set(genes_of_interest)
22 | for genes in profiles_genes:
23 | common_genes = genes & common_genes
24 | return common_genes
25 |
26 |
27 | if __name__ == '__main__':
28 | output_address = 'data/common_cortex_genes.csv'
29 |     edge_lists = [
30 | 'pcortex_data/network_1_12_chimpanzee.csv',
31 | 'pcortex_data/network_1_12_human.csv',
32 | 'pcortex_data/network_1_12_macaque.csv',
33 | 'pcortex_data/network_1_12_mouse.csv']
34 | #genes_of_interest = '/home/fam918/Documents/CodeRepos/WALKS/netwalk/data/homeostasis_genes.csv'
35 | #output_address = 'heart_brain_shared.csv'
36 | #edge_llists = ['/home/farhad/Network/netwalk/data/network_1_200_heart.csv',
37 | # '/home/farhad/Network/netwalk/data/network_2_200_heart.csv',
38 | # '/home/farhad/Network/netwalk/data/network_3_200_heart.csv',
39 | # '/home/farhad/Network/netwalk/data/network_1_200_brain.csv',
40 | # '/home/farhad/Network/netwalk/data/network_2_200_brain.csv',
41 | # '/home/farhad/Network/netwalk/data/network_3_200_brain.csv']
42 | genes_of_interest = 'data/cellular_homeostasis.csv'
43 | with open(genes_of_interest) as f:
44 | lines = f.read().splitlines()
45 |     common_genes = get_common_genes(edge_lists, lines)
46 | with open(output_address, 'w') as fout:
47 | for gene in common_genes:
48 | fout.write('{}\n'.format(gene))
49 |
--------------------------------------------------------------------------------
/random_tree_generator.py:
--------------------------------------------------------------------------------
1 | import random
2 | import networkx as nx
3 |
4 | START_SIZE=50
5 | CURRENT_SIZE=50
6 | FINAL_SIZE=200
7 | NODE_STEP=10
8 |
9 | while CURRENT_SIZE <= FINAL_SIZE:
10 | #print(CURRENT_SIZE - 1)
11 | address = 'test/data/random_tree_{}.csv'.format(CURRENT_SIZE)
12 | with open(address, 'w') as fout:
13 | if CURRENT_SIZE == START_SIZE:
14 | G = nx.generators.trees.random_tree(START_SIZE)
15 | while nx.number_connected_components(G) > 1:
16 |                 G = nx.generators.trees.random_tree(START_SIZE)
17 | edj = list(G.edges())
18 | n = list(G.nodes())
19 | for e in edj:
20 | # write edges to file
21 | fout.write('{},{},1\n'.format(str(e[0]),str(e[1])))
22 | fout.write('{},{},1\n'.format(str(e[1]),str(e[0])))
23 | elif CURRENT_SIZE != START_SIZE:
24 | # read in the previous graph and write it to file
25 | previous_address = 'test/data/random_tree_{}.csv'.format(CURRENT_SIZE-NODE_STEP)
26 | file_previous = open(previous_address, 'r')
27 | Lines = file_previous.readlines()
28 | fout.writelines(Lines)
29 |
30 | G = nx.generators.trees.random_tree(NODE_STEP)
31 | while nx.number_connected_components(G) > 1:
32 |                 G = nx.generators.trees.random_tree(NODE_STEP)
33 | edj = list(G.edges())
34 |
35 | for e in edj:
36 | # write edges to file
37 | node_1 = e[0] + CURRENT_SIZE-NODE_STEP - 1
38 | node_2 = e[1] + CURRENT_SIZE- NODE_STEP - 1
39 | fout.write('{},{},1\n'.format(str(node_1),str(node_2)))
40 | fout.write('{},{},1\n'.format(str(node_2),str(node_1)))
41 | # connect a node in graph to a random node in the original graph
42 | rand_node = random.randint(0, CURRENT_SIZE-1-NODE_STEP)
43 | #print(rand_node)
44 | print(CURRENT_SIZE,rand_node,node_1)
45 | fout.write('{},{},1\n'.format(str(rand_node), str(node_1)))
46 | fout.write('{},{},1\n'.format(str(node_1), str(rand_node)))
47 | CURRENT_SIZE= CURRENT_SIZE + NODE_STEP
48 |
49 |
50 | address = 'test/data/test_PPI.txt'
51 | with open(address, 'w') as fout:
52 | G = nx.scale_free_graph(100)
53 | edj = list(G.edges())
54 |
55 | for e in edj:
56 | # write edges to file
57 | node_1 = e[0]
58 | node_2 = e[1]
59 | fout.write('{}\t{}\t1\n'.format(str(node_1),str(node_2)))
60 |
--------------------------------------------------------------------------------
/dataset_generator.py:
--------------------------------------------------------------------------------
1 | ''' This module generates walk datasets.
2 | '''
3 | import argparse
4 | import random
5 | import numpy as np
6 | import netwalk.utils as utils
7 | import os.path
8 | from netwalk.walk import WalkGenerator
9 | from similarity import Similarity
10 | import gensim.models
11 | import time
12 | import multiprocessing as mp
13 | import json
14 | import pandas as pd
15 | import seaborn as sns
16 | import copy
17 |
18 | def generate_walks(edge_list_address, walk_per_node, walk_length, workers = 4):
19 | similarity = Similarity(correlation_file_path=edge_list_address, anchors=[],
20 | alphas=[], sep=',', prefix='pseudo')
21 | genes = list(similarity.idx.keys())
22 | start_time = time.time()
23 | gen_walk = WalkGenerator(similarity.matrix, genes, walk_length, walk_per_node)
24 | print("takes {} seconds to create walk object.".format(
25 | time.time() - start_time))
26 |
27 | num_cpus = workers
28 | pool = mp.Pool(num_cpus)
29 | arguments = list(range(len(gen_walk)))
30 | chunk_size = len(gen_walk) // num_cpus
31 | walks = pool.map(gen_walk, arguments, chunksize=chunk_size)
32 | return walks
33 |
34 |
35 | if __name__ == '__main__':
36 | parser = argparse.ArgumentParser(description='Generate datasets')
37 | parser.add_argument('-c', '--config', metavar='JSON file path',
38 | action='store', required=True,
39 | help='Path to a config file')
40 | args = parser.parse_args()
41 | # read config file
42 | with open(args.config) as fin:
43 | params = json.load(fin)
44 | # make walks and train the network
45 | for pheno in params['phenotypes']:
46 | for rep_id in range(params['n_replicates']):
47 | edge_list_address = os.path.join(params['experiment_name'],
48 | 'translated_anchored_{}_{}.csv'.format(pheno,
49 | str(rep_id)))
50 | # Create walks
51 | walks = generate_walks(edge_list_address, params['walk_per_node'],
52 | params['walk_length'], workers=params['n_workers'])
53 |
54 | # Write walks to file
55 | address = os.path.join(params['experiment_name'],
56 | '{}_{}_walks.csv'.format(pheno, str(rep_id)))
57 | with open(address, 'w') as fout:
58 | for w in walks:
59 | fout.write('{}\n'.format(','.join([str(s) for s in w])))
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
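A short sketch of calling generate_walks directly on one translated edge list. Note that the __main__ block also expects walk_per_node, walk_length and n_workers keys in the config, which the sample configs above do not define; the file path below is illustrative.

from dataset_generator import generate_walks

# Hypothetical translated edge list produced by dangle.py + test_translator.py.
walks = generate_walks('Line/translated_anchored_1_0.csv',
                       walk_per_node=10, walk_length=20, workers=2)
print(len(walks), walks[0])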
/test/test_walk.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 | from netwalk.walk import Walk
4 | from netwalk.walk import PairWalk
5 |
6 |
7 | class TestWalk(unittest.TestCase):
8 | def test_from_dict(self):
9 | d = {(1, 2): 0.7,
10 | (1, 3): 0.1,
11 | (2, 4): 0.5,
12 | (3, 4): 0.3}
13 | similarity = [[0, 7/8, 1/8, 0],
14 | [7/12, 0, 0, 5/12],
15 | [1/4, 0, 0, 3/4],
16 | [0, 5/8, 3/8, 0]]
17 | CDF = np.array([[0, 0.875, 1, 1],
18 | [7/12, 7/12, 7/12, 1],
19 | [0.25, 0.25, 0.25, 1],
20 | [0, 0.625, 1, 1]])
21 |
22 | walk = Walk(d)
23 | self.assertListEqual(list(walk._nodes), [1, 2, 3, 4])
24 | diff = np.array(similarity) - walk.prob
25 | self.assertAlmostEqual(np.linalg.norm(diff), 0)
26 | ids = np.array([walk._ids[node] for node in walk._nodes])
27 | diff = np.linalg.norm(ids - np.arange(len(walk._nodes)))
28 | self.assertAlmostEqual(diff, 0)
29 | diff = np.linalg.norm(walk.cdf - CDF)
30 | self.assertAlmostEqual(diff, 0)
31 |
32 | def test_generate(self):
33 | similarity = {(1, 2): 0.5, (1, 3): 0.0}
34 |
35 | walk = Walk(similarity)
36 | self.assertListEqual(walk.generate(3, 3), [2, 2, 2, 2])
37 |
38 | self.assertListEqual((walk.generate(1, 5)), [0, 1, 0, 1, 0, 1])
39 | self.assertListEqual((walk.generate(2, 5)), [1, 0, 1, 0, 1, 0])
40 |
41 | def test_make_walks(self):
42 | similarity = {(1, 2): 0.5, (1, 3): 0.0}
43 | walk = Walk(similarity)
44 | dataset = walk.make_walks(walk_per_node=2, walk_length=3)
45 | expected_dataset = [[0, 1, 0, 1], [0, 1, 0, 1], [1, 0, 1, 0],
46 | [1, 0, 1, 0], [2, 2, 2, 2], [2, 2, 2, 2]]
47 | for expected, observed in zip(expected_dataset, dataset['walks']):
48 | self.assertListEqual(expected, list(observed))
49 | expected_nodes = [1, 2, 3]
50 | self.assertListEqual(expected_nodes, list(dataset['nodes']))
51 | expected_ids = {1: 0, 2: 1, 3: 2}
52 | self.assertEqual(expected_ids, dataset['ids'])
53 |
54 |
55 | class TestPairWalk(unittest.TestCase):
56 |
57 | def test_generate(self):
58 | similarity = {(1, 2): 0.5, (1, 3): 0.0}
59 | walk = PairWalk(similarity)
60 | self.assertListEqual(walk.generate(3, 3), [2, 2, 2, 2, 2, 2, 2, 2])
61 | self.assertListEqual((walk.generate(1, 5)), [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
62 | self.assertListEqual((walk.generate(2, 5)), [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
63 |
--------------------------------------------------------------------------------
/netwalk/models.py:
--------------------------------------------------------------------------------
1 | ''' This module contains the models.
2 |
3 | '''
4 | import math
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch.nn import TransformerEncoder, TransformerEncoderLayer
9 |
10 |
11 | class TransformerModel(nn.Module):
12 |
13 | def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
14 | super(TransformerModel, self).__init__()
15 | self.model_type = 'Transformer'
16 | self.src_mask = None
17 | self.pos_encoder = PositionalEncoding(ninp, dropout)
18 | encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
19 | self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
20 | self.encoder = nn.Embedding(ntoken, ninp)
21 | self.ninp = ninp
22 | self.decoder = nn.Linear(ninp, ntoken)
23 | self.init_weights()
24 |
25 | def _generate_square_subsequent_mask(self, sz):
26 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
27 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
28 | return mask
29 |
30 | def init_weights(self):
31 | initrange = 0.1
32 | self.encoder.weight.data.uniform_(-initrange, initrange)
33 | self.decoder.bias.data.zero_()
34 | self.decoder.weight.data.uniform_(-initrange, initrange)
35 |
36 | def forward(self, src):
37 | if self.src_mask is None or self.src_mask.size(0) != len(src):
38 | device = src.device
39 | mask = self._generate_square_subsequent_mask(len(src)).to(device)
40 | self.src_mask = mask
41 | src = self.encoder(src) * math.sqrt(self.ninp)
42 | src = self.pos_encoder(src)
43 | output = self.transformer_encoder(src, self.src_mask)
44 | output = self.decoder(output)
45 | return F.log_softmax(output, dim=-1)
46 |
47 | def embedding(self, src):
48 |         self.eval()  # Turn on the evaluation mode
49 | with torch.no_grad():
50 | src = self.encoder(src) * math.sqrt(self.ninp)
51 | src = self.pos_encoder(src)
52 | output = self.transformer_encoder(src, None)
53 | return output
54 |
55 | def save(self, address):
56 | torch.save(self.state_dict(), address)
57 |
58 |     @classmethod
59 |     def load(cls, model, address, device='cpu'):
60 |         model.load_state_dict(torch.load(address, map_location=device))
61 |         model.to(device)
62 |         return model
63 |
64 |
65 | class PositionalEncoding(nn.Module):
66 |     ''' Positional encoding used in transformers.
67 | '''
68 |
69 | def __init__(self, d_model, dropout=0.1, max_len=5000):
70 | super(PositionalEncoding, self).__init__()
71 | self.dropout = nn.Dropout(p=dropout)
72 |
73 | pe = torch.zeros(max_len, d_model)
74 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
75 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
76 | pe[:, 0::2] = torch.sin(position * div_term)
77 | pe[:, 1::2] = torch.cos(position * div_term)
78 | pe = pe.unsqueeze(0).transpose(0, 1)
79 | self.register_buffer('pe', pe)
80 |
81 | def forward(self, x):
82 | x = x + self.pe[:x.size(0), :]
83 | return self.dropout(x)
84 |
--------------------------------------------------------------------------------
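A minimal sketch of running TransformerModel on a dummy batch of walks; the input follows the (sequence, batch) layout expected by torch.nn.TransformerEncoder, and all sizes are illustrative.

import torch
from netwalk.models import TransformerModel

ntoken = 100   # vocabulary size (number of distinct genes)
model = TransformerModel(ntoken=ntoken, ninp=32, nhead=4, nhid=64, nlayers=2, dropout=0.1)

# A dummy batch of 8 walks, each of length 10, as (seq_len, batch) token indices.
src = torch.randint(0, ntoken, (10, 8))
log_probs = model(src)
print(log_probs.shape)   # torch.Size([10, 8, 100])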
/similarity.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 |
4 |
5 | class Similarity(object):
6 | def __init__(self, correlation_file_path, anchors, alphas, sep=',',
7 | prefix='pseudo', string_id=False):
8 | self.real_genes = set()
9 | with open(correlation_file_path) as fin:
10 | for line in fin:
11 | line = line.strip()
12 | if line == '':
13 | continue
14 | a, b, _ = line.split(sep)
15 | a = a.strip()
16 | b = b.strip()
17 | if string_id is False:
18 | a = int(a)
19 | b = int(b)
20 | self.real_genes.add(a)
21 | self.real_genes.add(b)
22 | self.real_genes = list(sorted(self.real_genes))
23 | assert set(anchors).issubset(self.real_genes)
24 | self.pseudo_genes = []
25 | for anchor, alpha in zip(anchors, alphas):
26 | self.pseudo_genes.append('{}_{}'.format(prefix, anchor))
27 | for i in range(alpha):
28 | self.pseudo_genes.append('{}_{}_{:0>3d}'.format(prefix, anchor, i))
29 | genes = self.real_genes + self.pseudo_genes
30 | n = len(genes)
31 | self.matrix = np.zeros((n, n), dtype=np.float32)
32 | self.idx = {gene: i for i, gene in enumerate(genes)}
33 | # Assign values to the correlation matrix
34 | with open(correlation_file_path) as fin:
35 | for line in fin:
36 | line = line.strip()
37 | if line == '':
38 | continue
39 | a, b, cor = line.split(sep)
40 | if string_id is False:
41 | a = int(a)
42 | b = int(b)
43 | i = self.idx[a]
44 | j = self.idx[b]
45 | self.matrix[i,j] = np.float32(cor)
46 | self.matrix[j,i] = np.float32(cor)
47 |
48 | def average_correlation(self):
49 | n = len(self.real_genes)
50 | values = self.matrix[0:n, 0:n][np.nonzero(self.matrix[0:n, 0:n])]
51 | return np.mean(values)
52 |
53 |
54 | def __getitem__(self, item):
55 | a, b = item
56 | i = self.idx[a]
57 | j = self.idx[b]
58 | return self.matrix[i, j]
59 |
60 | def transform(self, transform=None):
61 | if transform is None:
62 | transform = lambda x: 0.5 * x + 0.5
63 | n = len(self.real_genes) + len(self.pseudo_genes)
64 | for i in range(n):
65 | for j in range(n):
66 | self.matrix[i, j] = transform(self.matrix[i, j])
67 |
68 | def apply_threshold(self, lower_cor, upper_cor, value):
69 | n = len(self.real_genes) + len(self.pseudo_genes)
70 | for i in range(n):
71 | for j in range(n):
72 | if self.matrix[i, j] > lower_cor and self.matrix[i, j] < upper_cor:
73 | self.matrix[i, j] = value
74 |
75 | def to_csv(self, file_name):
76 | n = len(self.real_genes) + len(self.pseudo_genes)
77 | genes = self.real_genes + self.pseudo_genes
78 | with open(file_name, 'w') as f:
79 | for i in range(n):
80 | for j in range(n):
81 | if i == j:
82 | break
83 | else:
84 | f.write(','.join([genes[i], genes[j], str(self.matrix[i, j])]))
85 | f.write("\n")
86 |
87 | def augment(self, dangles):
88 | genes = self.real_genes + self.pseudo_genes
89 | for (a, b), w in dangles.items():
90 | assert a in genes, "gene is missing from similarity matrix."
91 | assert b in genes, "gene is missing from similarity matrix."
92 | i = self.idx[a]
93 | j = self.idx[b]
94 | self.matrix[i, j] = w
95 | self.matrix[j, i] = w
96 |
--------------------------------------------------------------------------------
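A small sketch of building a Similarity object from an edge list and applying the same preprocessing steps used in dangle.py; the toy file name and edge values are illustrative.

from similarity import Similarity

# Hypothetical three-gene edge list (gene_a, gene_b, correlation).
with open('toy_edges.csv', 'w') as f:
    f.write('g1,g2,0.9\n')
    f.write('g2,g3,-0.4\n')

sim = Similarity('toy_edges.csv', anchors=['g1'], alphas=[2],
                 sep=',', string_id=True)
sim.transform()                                    # map correlations from [-1, 1] to [0, 1]
sim.apply_threshold(lower_cor=0.2, upper_cor=0.8, value=0)
print(sim['g1', 'g2'], sim.average_correlation())
sim.to_csv('toy_anchored.csv')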
/netwalk/walk.py:
--------------------------------------------------------------------------------
1 | ''' This module generates walks from a network.
2 |
3 | '''
4 | import numpy as np
5 | import gensim.models
6 | import seaborn as sns
7 | from matplotlib import pyplot as plt
8 | import pandas as pd
9 | import time
10 | import multiprocessing as mp
11 |
12 |
13 | EPSILON = 1E-6
14 |
15 | class Probability():
16 | def __init__(self, matrix, gene_names):
17 | n = matrix.shape[0]
18 | assert matrix.shape[0] == matrix.shape[1]
19 | assert len(gene_names) == n
20 | total_prob = matrix.sum(axis=1).reshape(n, 1)
21 | corrections = []
22 | for i, p in enumerate(total_prob):
23 | if total_prob[i] < EPSILON:
24 | total_prob[i] = 1
25 | corrections.append(i)
26 | self.prob = matrix / total_prob
27 | for i in corrections:
28 | self.prob[i, i] = 1
29 | for i in range(n):
30 | if abs(np.sum(self.prob[i]) - 1) > EPSILON:
31 | self.prob[i] /= (self.prob[i]).sum()
32 | print((self.prob[i]).sum())
33 | try:
34 | assert abs(np.sum(self.prob[i]) - 1) < EPSILON
35 | except:
36 | print(abs(np.sum(self.prob[i]) - 1))
37 | raise
38 | self.idx = {name:i for i, name in enumerate(gene_names)}
39 |
40 | def __getitem__(self, gene):
41 | i = self.idx[gene]
42 | return self.prob[i]
43 |
44 |
45 |
46 | class WalkGenerator(object):
47 |     ''' Create walks using a graph defined by a similarity matrix.
48 | 
49 |     Args:
50 |         similarity_matrix: A square matrix of pairwise gene similarities.
51 |         genes: Gene names/IDs corresponding to the rows of the matrix.
52 |         walk_length / walk_per_node: walk length and number of walks per fountain node.
53 |     '''
54 | def __init__(self, similarity_matrix, genes, walk_length, walk_per_node, fountains=None):
55 | self.walk_length = walk_length
56 | self.nodes = np.copy(genes)
57 | if fountains is None:
58 | self.fountains = np.copy(self.nodes)
59 | else:
60 | self.fountains = np.copy(fountains)
61 | self.starters = np.repeat(self.fountains, walk_per_node)
62 | np.random.shuffle(self.starters)
63 | self.LENGTH = len(self.starters)
64 | self.prob = Probability(similarity_matrix, self.nodes)
65 |
66 |
67 |
68 | def __len__(self):
69 | return self.LENGTH
70 |
71 | def __getitem__(self, i):
72 | ''' Generate a random walk starting from the i-th gene.
73 |
74 | Args:
75 | start: Starting point of the random walk.
76 | length: Length of the random walk.
77 | '''
78 | if i >= self.LENGTH:
79 | raise StopIteration
80 | current_node = self.starters[i]
81 | walk = [current_node]
82 | for _ in range(self.walk_length):
83 | next_node = np.random.choice(self.nodes, p=self.prob[current_node])
84 | walk.append(next_node)
85 | current_node = next_node
86 | return walk
87 |
88 | def __call__(self, i):
89 | return self[i]
90 |
91 |
92 | #if __name__ == '__main__':
93 | #similarity_matrix = np.random.rand(15000, 15000)
94 | #genes = np.array(range(15000), dtype=np.uint16)
95 |
96 | #similarity_matrix = np.array([[0.0, 0.0, 0.6, 0.0],
97 | # [0.0, 0.0, 0.3, 0.0],
98 | # [0.6, 0.3, 0.0, 0.0],
99 | # [0.0, 0.0, 0.0, 0.0]])
100 | #
101 | #genes = ['1', '2', '3', '4']
102 | #start_time = time.time()
103 | #walks = WalkGenerator(similarity_matrix,genes, 50, 100)
104 | #hours, rem = divmod(time.time() - start_time, 3600)
105 | #minutes, seconds = divmod(rem, 60)
106 | #print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
107 | #num_cpus = mp.cpu_count() - 1
108 | #pool = mp.Pool(num_cpus)
109 | #arguments = list(range(len(walks)))
110 | #chunk_size = len(walks) // num_cpus
111 | #results = pool.map(walks, arguments, chunksize=chunk_size)
112 | #with open('walks.csv', 'w') as fout:
113 | # for w in results:
114 | # fout.write('{}\n'.format(','.join([str(x) for x in w])))
115 |
116 | #for w in walks:
117 | # print(w)
118 |
119 | # colour_map = "Greens_r"
120 | # model = gensim.models.Word2Vec(sentences=walks,
121 | # size=5,
122 | # window=2,
123 | # min_count=2,
124 | # workers=3,
125 | # iter=1)
126 | # wv1 = model.wv
127 | # vocab_size = len(genes)
128 | # dist1 = np.zeros((vocab_size, vocab_size))
129 | # for i, gene_i in enumerate(genes):
130 | # for j, gene_j in enumerate(genes):
131 | # dist1[i,j] = np.linalg.norm(wv1[gene_i] - wv1[gene_j])
132 | #
133 | # df = pd.DataFrame(dist1, columns=genes, index=genes)
134 | # ax = sns.heatmap(df, cmap=colour_map, square=True)
135 | # plt.show()
136 |
--------------------------------------------------------------------------------
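A runnable sketch of WalkGenerator on a tiny similarity matrix, in the spirit of the commented-out block above; the gene names and weights are illustrative.

import numpy as np
from netwalk.walk import WalkGenerator

similarity_matrix = np.array([[0.0, 0.6, 0.3],
                              [0.6, 0.0, 0.0],
                              [0.3, 0.0, 0.0]])
genes = np.array(['g1', 'g2', 'g3'])

# Two walks of length 4 starting from every node (6 walks in total).
gen = WalkGenerator(similarity_matrix, genes, walk_length=4, walk_per_node=2)
for i in range(len(gen)):
    print(gen[i])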
/netwalk/walkdataset.py:
--------------------------------------------------------------------------------
1 | ''' This module contains WalkDataset.
2 | '''
3 | from torch.utils.data import Dataset
4 | from netwalk.utils import Vocabulary
5 |
6 |
7 | class WalkDataset(Dataset):
8 | ''' Create a dataset of walks, where each walk is a sequence of genes.
9 |
10 | Args:
11 | original_walks: a nested list of gene names/IDs.
12 | vocab: A Vocabulary object including all genes in the original_walks.
13 | '''
14 | def __init__(self, original_walks, vocab):
15 | super(WalkDataset, self).__init__()
16 | self.vocab = vocab
17 | self.walks = self._vocab_index(original_walks, vocab)
18 |
19 | @staticmethod
20 | def _vocab_index(original_walks, vocab):
21 | ''' Translate walks from original node names to integer indices.
22 | Args:
23 | original_walks: The original walks, where each node is represented
24 | by node name/ID.
25 | vocab: A Vocabulary object including all genes in the original_walks.
26 | Returns:
27 | A translated version of original_walks, where each node is
28 |             represented with an integer index. The indices run from
29 |             0 to n-1 with no gaps, where n is the number of distinct nodes
30 | present in at least one of the walks in original_walks.
31 |
32 | '''
33 | walks = []
34 | for walk in original_walks:
35 | walks.append([vocab.index[name] for name in walk])
36 | return walks
37 |
38 | @classmethod
39 | def read_csv(cls, address, sep):
40 | ''' Read dataset from a file.
41 |
42 | Args:
43 | address: Address of a CSV file containing walks.
44 | sep: A field delimiter.
45 | Returns:
46 | data: A list of walks.
47 | genes: The set of all genes that appear in at least one walk.
48 | '''
49 | data = []
50 | genes = set()
51 | with open(address) as fin:
52 | for line in fin:
53 | line = line.strip()
54 | if line == '':
55 | continue
56 | walk = [gene.strip() for gene in line.split(sep)]
57 | data.append(walk)
58 | genes.update(walk)
59 | return data, genes
60 |
61 | @classmethod
62 | def from_csv(cls, address, sep):
63 | ''' Create a WalkDataset from a CSV file.
64 |
65 | Args:
66 | address: Address of a CSV file containing walks.
67 | sep: A field delimiter.
68 | Returns:
69 | A WalkDataset object.
70 | '''
71 | data, genes = cls.read_csv(address, sep)
72 | vocab = Vocabulary(genes)
73 | return cls(data, vocab)
74 |
75 | def __getitem__(self, idx):
76 | return self.walks[idx]
77 |
78 | def __len__(self):
79 | return len(self.walks)
80 |
81 |
82 | class PairWalkDataset(WalkDataset):
83 |     ''' Create a dataset of paired walks, where each item is a pair of walks.
84 | 
85 |     Args:
86 |         original_walks: a list of (walk_a, walk_b) pairs of gene names/IDs.
87 |         vocab: A Vocabulary object including all genes in the original_walks.
88 | '''
89 | def __init__(self, original_walks, vocab):
90 | super(PairWalkDataset, self).__init__(original_walks, vocab)
91 |
92 |
93 | @classmethod
94 | def from_csv(cls, address, sep):
95 |         ''' Create a PairWalkDataset from a CSV file.
96 |
97 | Args:
98 | address: Address of a CSV file containing walks.
99 | sep: A field delimiter.
100 | Returns:
101 |             A PairWalkDataset object.
102 | '''
103 | data = []
104 | pair_walks, genes = cls.read_csv(address, sep)
105 | for walk_walk in pair_walks:
106 | middle = len(walk_walk) // 2
107 | walk_a, walk_b = walk_walk[:middle], walk_walk[middle:]
108 | data.append((walk_a, walk_b))
109 | vocab = Vocabulary(genes)
110 | return cls(data, vocab)
111 |
112 | @staticmethod
113 | def _vocab_index(original_walks, vocab):
114 | ''' Translate walks from original node names to integer indices.
115 | Args:
116 | original_walks: The original walks, where each node is represented
117 | by node name/ID.
118 | vocab: A Vocabulary object including all genes in the original_walks.
119 | Returns:
120 | A translated version of original_walks, where each node is
121 |             represented with an integer index. The indices run from
122 |             0 to n-1 with no gaps, where n is the number of distinct nodes
123 | present in at least one of the walks in original_walks.
124 |
125 | '''
126 | walks = []
127 | for walk_a, walk_b in original_walks:
128 | translated_walk_a = [vocab.index[name] for name in walk_a]
129 | translated_walk_b = [vocab.index[name] for name in walk_b]
130 | walks.append((translated_walk_a, translated_walk_b))
131 | return walks
132 |
--------------------------------------------------------------------------------
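A brief sketch of loading a WalkDataset and batching it with a PyTorch DataLoader; the CSV path follows the unit tests above and is assumed to exist relative to the working directory.

from torch.utils.data import DataLoader
from netwalk.walkdataset import WalkDataset

dataset = WalkDataset.from_csv('data/sample_walk_dataset.csv', sep=',')
print(len(dataset), dataset[0])   # number of walks, first walk as vocabulary indices

# Keep each batch as a plain list of walks instead of letting the default
# collate function stack them into tensors.
loader = DataLoader(dataset, batch_size=2, shuffle=True,
                    collate_fn=lambda batch: batch)
for batch in loader:
    print(batch)
    break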
/netwalk/translator.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import numpy as np
4 | import pandas as pd
5 |
6 |
7 |
8 | class ID2NameTranslator(object):
9 | def __init__(self, vocab_file_address, sep=','):
10 | assert os.path.isfile(vocab_file_address)
11 | df = pd.read_csv(vocab_file_address, sep=sep)
12 | df.columns = ['ID', 'Name']
13 | self.df = df
14 | self.ids = df.iloc[:, 0].values
15 | self.names = df.iloc[:, 1].values
16 | self.__id2name = {ID: name for ID, name in zip(self.ids, self.names)}
17 |
18 | def id2name(self, ensemble_id, default=''):
19 | return self.__id2name.get(ensemble_id, default)
20 |
21 | def names2ids(self, names):
22 | selected = self.df[self.df.iloc[:, 1].isin(names)]
23 | data = []
24 | for name in names:
25 | data.extend(selected[selected.iloc[:,1] == name].values)
26 | data = pd.DataFrame.from_records(data, columns=['ID', 'Name'])
27 | return list(data.ID), list(data.Name)
28 |
29 |
30 | class IDCovertor(object):
31 | def __init__(self, edge_list_file_addresses, sep=','):
32 | idset = set()
33 | for edge_list_file_address in edge_list_file_addresses:
34 | with open(edge_list_file_address) as fin:
35 | for line in fin:
36 | id1, id2, _ = line.strip().split(sep)
37 | idset.add(id1)
38 | idset.add(id2)
39 | n = len(idset)
40 | self.ids = sorted(idset)
41 | self.id2int = {a_id: i for a_id, i in zip(self.ids, range(n))}
42 | self.int2id = {i: a_id for a_id, i in zip(self.ids, range(n))}
43 |
44 | def ids2ints(self, ids):
45 | return [self.id2int[x] for x in ids if x in self.ids]
46 |
47 | def ints2ids(self, ints):
48 | return [self.int2id[int(x)] for x in ints]
49 |
50 | def save(self, json_file_address):
51 | with open(json_file_address, 'w') as fout:
52 | json.dump(self.id2int, fout)
53 |
54 | @classmethod
55 | def load(cls, json_file_address):
56 | with open(json_file_address) as fin:
57 | id2int = json.load(fin)
58 | convertor = IDCovertor([])
59 | convertor.id2int = id2int
60 | convertor.int2id = {i: a_id for a_id, i in id2int.items()}
61 | convertor.ids = sorted(id2int.keys())
62 | return convertor
63 |
64 | def translate(self, input_file_address, output_file_address, sep=','):
65 | df = pd.read_csv(input_file_address, sep=sep, header=None)
66 | df.columns = ['ID1', 'ID2', 'Cor']
67 | with open(output_file_address, 'w') as fout:
68 | for id1, id2, cor in df.itertuples(index=False):
69 | fout.write('{}{}{}{}{}\n'.format(self.id2int[id1], sep,
70 | self.id2int[id2], sep,
71 | cor))
72 |
73 |
74 | def vocab2id_and_name(vocab, id_convertor_file_path, id2name_translator_file, default_name='', sep=','):
75 | id2name_translator = ID2NameTranslator(id2name_translator_file, sep=sep)
76 | id_convertor = IDCovertor.load(id_convertor_file_path)
77 | id_names = {}
78 | for k in vocab:
79 | ensemble_id = id_convertor.int2id[int(k)]
80 | name = id2name_translator.id2name(ensemble_id, default_name)
81 | id_names[k] = (ensemble_id, name)
82 | return id_names
83 |
84 |
85 | if __name__ == '__main__':
86 | ensemble_id_name_file = '../skeletal_data/mouse.vocab'
87 | # trans = ID2NameTranslator(ensemble_id_name_file, sep=',')
88 | # assert trans.id2name('ENSMUSG00000064372') == 'mt-Tp'
89 | # assert trans.id2name('ENSMUSG00000106796') == 'AC124394.4'
90 | # edge_list_file_addresses = ['../Skeletal_Cells/anchored_chicken_imm_0.csv',
91 | # '../Skeletal_Cells/anchored_chicken_ost_0.csv',
92 | # '../Skeletal_Cells/anchored_gar_imm_0.csv',
93 | # '../Skeletal_Cells/anchored_gar_ost_0.csv',
94 | # '../Skeletal_Cells/anchored_frog_imm_0.csv',
95 | # '../Skeletal_Cells/anchored_frog_ost_0.csv',
96 | # '../Skeletal_Cells/anchored_mouse_imm_0.csv',
97 | # '../Skeletal_Cells/anchored_mouse_ost_0.csv']
98 | # output_file_address = '../skeletal_data/translated_chicken_imm.csv'
99 | # convertor = IDCovertor(edge_list_file_addresses, sep=',')
100 | # for edge_list_file_address in edge_list_file_addresses:
101 | # dir_path, file_name = os.path.split(edge_list_file_address)
102 | # output_file_address = os.path.join(dir_path, f'translated_{file_name}')
103 | # convertor.translate(edge_list_file_address, output_file_address, sep=',')
104 | convertor_file = '../skeletal_data/IDConvertor.json'
105 | # convertor.save(convertor_file)
106 | # con = IDCovertor.load(convertor_file)
107 | # assert con.id2int == convertor.id2int
108 | # assert con.int2id == convertor.int2id
109 | # assert con.ids == convertor.ids
110 | # # Test vocab2id_and_name
111 | v2id_name = vocab2id_and_name(['19210', '19211'], convertor_file, ensemble_id_name_file, default_name='', sep=',')
112 | assert v2id_name['19210'][0] == 'ENSMUSG00000114019'
113 | assert v2id_name['19211'][0] == 'ENSMUSG00000114025'
114 | print(v2id_name)
115 |
--------------------------------------------------------------------------------
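A minimal sketch of IDCovertor round-tripping gene IDs to integer indices; the toy edge-list file is created on the fly purely for illustration.

from netwalk.translator import IDCovertor

# Hypothetical two-edge list with string gene IDs (gene_a, gene_b, correlation).
with open('toy_edge_list.csv', 'w') as f:
    f.write('geneB,geneA,0.8\n')
    f.write('geneB,geneC,0.3\n')

convertor = IDCovertor(['toy_edge_list.csv'], sep=',')
print(convertor.ids2ints(['geneA', 'geneC']))   # [0, 2] -- IDs are sorted before numbering
print(convertor.ints2ids([1]))                  # ['geneB']

convertor.translate('toy_edge_list.csv', 'toy_translated.csv', sep=',')
convertor.save('toy_IDConvertor.json')
restored = IDCovertor.load('toy_IDConvertor.json')
assert restored.id2int == convertor.id2int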
/dangle.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from similarity import Similarity
4 | import numpy as np
5 | import pandas as pd
6 | import random
7 | import os
8 | import json
9 | from sklearn.model_selection import train_test_split
10 |
11 |
12 | def build_backbone(anchors, alphas, weight, edge_percentage):
13 | dangling = {}
14 | for anchor, alpha in zip(anchors,alphas):
15 | pseudo_anchor = 'pseudo_{}'.format(anchor)
16 | dangles = dangling_structure(pseudo_anchor,
17 | alpha,
18 | weight,
19 | edge_percentage)
20 | dangles[(anchor, pseudo_anchor)] = weight
21 | dangling.update(dangles)
22 | return dangling
23 |
24 |
25 | def dangling_structure(gene, alpha, weight, edge_percentage):
26 | num_dangles = alpha
27 | dangles = ['{}_{:0>3d}'.format(gene, i) for i in range(alpha)]
28 | sim = {}
29 | potential_edges = []
30 | for gene_i in dangles:
31 | for gene_j in dangles:
32 | if gene_i == gene_j:
33 | break
34 | else:
35 | potential_edges.append((gene_i, gene_j))
36 | random.shuffle(potential_edges)
37 | connected_genes = set()
38 | for gene_i, gene_j in potential_edges:
39 | if {gene_i, gene_j} < connected_genes:
40 | continue
41 | elif len(connected_genes) < num_dangles:
42 | connected_genes.add(gene_i)
43 | connected_genes.add(gene_j)
44 | sim[(gene_i, gene_j)] = weight
45 |
46 | sim[(gene, dangles[0])] = weight
47 | for gene_i, gene_j in potential_edges:
48 | if random.random() < edge_percentage:
49 | sim[(gene_i, gene_j)] = weight
50 |
51 | return sim
52 |
53 |
54 | def main(experiment_name, phenotypes, data_directory, anchor_genes,
55 | num_replicates=1, percent=0.4, num_anchors=50, min_dangle_size=3,
56 | max_dangle_size=10, test_ratio=0.5):
57 | assert isinstance(phenotypes, list)
58 | alphas = random.choices(range(min_dangle_size, max_dangle_size),
59 | k=int(num_anchors * test_ratio))
60 | assert len(alphas) < len(anchor_genes)
61 | anchor_train_groups = []
62 | anchor_test_groups = []
63 | backbones = []
64 | # Create all backbones
65 | for rep_id in range(num_replicates):
66 | random.shuffle(anchor_genes)
67 | candidates = anchor_genes[:int(num_anchors)]
68 | genes_of_interest_train, genes_of_interest_test = train_test_split(
69 | candidates,
70 | shuffle=True,
71 | test_size=test_ratio)
72 |
73 | anchor_train_groups.append(genes_of_interest_train)
74 | anchor_test_groups.append(genes_of_interest_test)
75 | backbones.append(
76 | build_backbone(anchors=anchor_train_groups[rep_id], alphas=alphas,
77 | weight=1, edge_percentage=percent))
78 | # Write train anchors to file
79 | with open(os.path.join(experiment_name, 'train_anchors.csv'), 'w') as fout:
80 | for gene_group in anchor_train_groups:
81 | fout.write(','.join(gene_group))
82 | fout.write("\n")
83 | # Write test anchors to file
84 | with open(os.path.join(experiment_name, 'test_anchors.csv'), 'w') as fout:
85 | for gene_group in anchor_test_groups:
86 | fout.write(','.join(gene_group))
87 | fout.write("\n")
88 | # Adding the backbones and create the similarity object
89 | for pheno in phenotypes:
90 | file_name = os.path.join(data_directory, "{}.csv".format(pheno))
91 | for rep_id in range(num_replicates):
92 | sim_file_name = "anchored_{}_{}.csv".format(pheno, str(rep_id))
93 | out_address = os.path.join(experiment_name, sim_file_name)
94 | similarity = Similarity(file_name,
95 | anchors=anchor_train_groups[rep_id],
96 | alphas=alphas, string_id=True)
97 | similarity.transform()
98 | similarity.apply_threshold(lower_cor=0.2, upper_cor=0.8,
99 | value=0)
100 | similarity.augment(backbones[rep_id])
101 |
102 | similarity.to_csv(out_address)
103 |
104 |
105 |
106 | if __name__ == '__main__':
107 | parser = argparse.ArgumentParser(description='Generate dangling structures')
108 | parser.add_argument('-c', '--config', metavar='JSON file path',
109 | action='store', required=True,
110 | help='Path to a config file')
111 | args = parser.parse_args()
112 | config_file_address = args.config
113 | with open(config_file_address) as fin:
114 | params = json.load(fin)
115 | homeostasis_genes = pd.read_csv(params['anchor_file_address'],
116 | dtype=str).iloc[:,0].values
117 | main(experiment_name=params['experiment_name'],
118 | phenotypes=params['phenotypes'],
119 | data_directory=params['data_directory'],
120 | anchor_genes=homeostasis_genes,
121 | num_replicates=params['n_replicates'],
122 | percent=params['percentage'],
123 | num_anchors=params['n_anchors'],
124 | min_dangle_size=params['min_dangle_size'],
125 | max_dangle_size=params['max_dangle_size'],
126 | test_ratio=params['test_ratio'])
127 |
128 |
--------------------------------------------------------------------------------
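A small sketch of what build_backbone produces for two anchors; the anchor names, dangle sizes and edge percentage are illustrative.

import random
from dangle import build_backbone

random.seed(0)   # dangling_structure shuffles edges, so seed for a reproducible example
backbone = build_backbone(anchors=['g1', 'g7'], alphas=[3, 3],
                          weight=1.0, edge_percentage=0.4)
for (a, b), w in sorted(backbone.items()):
    print(a, b, w)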
/test/test_temp.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from netwalk.temp import *
3 | from netwalk.utils import load
4 |
5 |
6 | class TestTemp(unittest.TestCase):
7 | def test_transform(self):
8 | edge_list = {("gene1", "gene2"): 0.5,
9 | ("gene2", "gene3"): -0.7,
10 | ("gene1", "gene3"): 0.0}
11 | transformed = transform(edge_list)
12 | transformed_dict = {k: val for k, val in transformed.items()}
13 | expected = {("gene1", "gene2"): 0.75,
14 | ("gene2", "gene3"): 0.15,
15 | ("gene1", "gene3"): 0.5}
16 | transformed_vals = list(transformed_dict.values())
17 | expected_vals = list(expected.values())
18 | for a, b in zip(transformed_vals, expected_vals):
19 | self.assertAlmostEqual(a, b, places=5)
20 |
21 | edge_list = load("data/fake_networks/network_2.csv", sep=",")
22 | transformed = transform(edge_list)
23 | expected = {("g1", "g2"): 0.9,
24 | ("g2", "g3"): 0.55,
25 | ("g2", "g1"): 0.9,
26 | ("g3", "g2"): 0.55,
27 | ("g2", "g4"): 0.75,
28 | ("g4", "g3"): 0.55,
29 | ("g3", "g1"): 0.75,
30 | ("g1", "g3"): 0.75,
31 | ("g1", "g4"): 0.10,
32 | ("g4", "g1"): 0.10,
33 | ("g4", "g2"): 0.75,
34 | ("g3", "g4"): 0.55}
35 |
36 | transformed_dict = {k: val for k, val in transformed.items()}
37 | transformed_vals = list(transformed_dict.values())
38 | expected_vals = list(expected.values())
39 | for a, b in zip(transformed_vals, expected_vals):
40 | self.assertAlmostEqual(a, b, places=5)
41 |
42 | def test_filter(self):
43 | edge_list = {("gene1", "gene2"): 0.5,
44 | ("gene2", "gene3"): -0.7,
45 | ("gene1", "gene3"): 0.0}
46 | transformed = transform(edge_list)
47 | transformed_dict = {k: val for k, val in transformed.items()}
48 | filtered = filter(transformed_dict, exclude=(0.3, 0.6))
49 | filtered_dict = {k: val for k, val in filtered.items()}
50 |
51 | expected = {("gene1", "gene2"): 0.75,
52 | ("gene2", "gene3"): 0.15}
53 |
54 | filtered_vals = list(filtered_dict.values())
55 | expected_vals = list(expected.values())
56 | for a, b in zip(filtered_vals, expected_vals):
57 | self.assertAlmostEqual(a, b, places=5)
58 |
59 | edge_list = {("gene1", "gene2"): 0.5,
60 | ("gene2", "gene3"): -0.7,
61 | ("gene1", "gene3"): 0.0}
62 |
63 | filtered = filter(edge_list, exclude=(0, 0.7))
64 | filtered_dict = {k: val for k, val in filtered.items()}
65 |
66 | expected = {("gene1", "gene2"): -0.7,
67 | ("gene2", "gene3"): 0.0}
68 |
69 | filtered_vals = list(filtered_dict.values())
70 | expected_vals = list(expected.values())
71 | for a, b in zip(filtered_vals, expected_vals):
72 | self.assertAlmostEqual(a, b, places=5)
73 |
74 | def test_overlay_networks(self):
75 | original_edge_list_1 = Similarity({("gene1", "gene2"): 0.5,
76 | ("gene2", "gene3"): 0.7,
77 | ("gene1", "gene3"): 0.6,
78 | ("gene1", "gene4"): 0.3,
79 | ("gene2", "gene4"): 0.1,
80 | ("gene3", "gene4"): 0.0,
81 | ("gene1", "gene3"): 0.0})
82 |
83 | original_edge_list_2 = Similarity({("gene1", "gene2"): 0.5,
84 | ("gene2", "gene3"): 0.7,
85 | ("gene1", "gene3"): 0.0,
86 | ("gene1", "gene4"): 0.0,
87 | ("gene2", "gene4"): 0.3,
88 | ("gene4", "gene3"): 0.7,
89 | ("gene1", "gene3"): 0.2})
90 |
91 | edge_list_1 = Similarity({("gene1", "gene2"): 0.5,
92 | ("gene2", "gene3"): 0.7,
93 | ("gene1", "gene3"): 0.6,
94 | ("gene1", "gene4"): 0.3})
95 |
96 | edge_list_2 = Similarity({("gene1", "gene2"): 0.5,
97 | ("gene2", "gene3"): 0.7,
98 | ("gene2", "gene4"): 0.3})
99 |
100 | net_1, net_2 = overlay_networks(net_a_similarity=edge_list_1, net_b_similarity=edge_list_2,
101 | original_net_a=original_edge_list_1, original_net_b=original_edge_list_2)
102 |
103 | assert set(net_1.symmetric_key_set()) == set(net_2.symmetric_key_set())
104 | net_1_dict = {k: val for k, val in net_1.items()}
105 | net_2_dict = {k: val for k, val in net_2.items()}
106 | assert set(net_1_dict.values()) == {0.5, 0.7, 0.6, 0.3, 0.1}
107 | assert set(net_2_dict.values()) == {0.5, 0.7, 0.3, 0.2, 0.0}
108 |
109 | def test_create_spine(self):
110 | net1 = Similarity.load("../data/fake_networks/network_1.csv", sep=",")
111 | net2 = Similarity.load("../data/fake_networks/network_2.csv", sep=",")
112 |
113 | expected_spine = ["g1", "g4"]
114 | expected_pseudo_spine = ["pseudo_g1", "pseudo_g4"]
115 | expected_similarity = {("pseudo_g1", "pseudo_g10"): 0.5,
116 | ("pseudo_g1", "pseudo_g11"): 0.5,
117 | ("pseudo_g1", "pseudo_g4"): -0.8,
118 | ("pseudo_g4", "pseudo_g40"): 0.5,
119 | ("pseudo_g4", "pseudo_g41"): 0.5}
120 |
121 | spine, pseudo_spine, backbone = create_spine(spine=["g1", "g4"], net_a_tsfmd_similarity=net1,
122 | net_b_tsfmd_similarity=net2,
123 | prefix='pseudo_', alpha=2, weight=0.5)
124 | assert set(spine) == set(expected_spine)
125 | assert set(pseudo_spine) == set(expected_pseudo_spine)
126 |
127 | self.assertEqual(len(backbone), len(expected_similarity))
128 |
129 | for key, val in backbone.items():
130 | assert key in expected_similarity.keys()
131 | assert expected_similarity[key] == val
132 |
133 | expected_spine = ["g1", "g4", "g3"]
134 | expected_pseudo_spine = ["pseudo_g1", "pseudo_g4", "pseudo_g3"]
135 | expected_similarity = {("pseudo_g1", "pseudo_g3"): 0.5,
136 | ("pseudo_g1", "pseudo_g10"): 0.5,
137 | ("pseudo_g1", "pseudo_g11"): 0.5,
138 | ("pseudo_g4", "pseudo_g1"): -0.8,
139 | ("pseudo_g3", "pseudo_g30"): 0.5,
140 | ("pseudo_g3", "pseudo_g31"): 0.5,
141 | ("pseudo_g4", "pseudo_g40"): 0.5,
142 | ("pseudo_g4", "pseudo_g41"): 0.5}
143 |
144 | spine, pseudo_spine, backbone = create_spine(spine=["g1", "g4", "g3"], net_a_tsfmd_similarity=net1,
145 | net_b_tsfmd_similarity=net2,
146 | prefix='pseudo_', alpha=2, weight=0.5)
147 |
148 | assert set(spine) == set(expected_spine)
149 | assert set(pseudo_spine) == set(expected_pseudo_spine)
150 |
151 | self.assertEqual(len(backbone), len(expected_similarity))
152 |
153 | for key, val in backbone.items():
154 | assert key in expected_similarity.keys()
155 | assert expected_similarity[key] == val
156 |
157 | def test_add_anchor(self):
158 | net1 = Similarity.load("../data/fake_networks/network_1.csv", sep=",")
159 | net2 = Similarity.load("../data/fake_networks/network_2.csv", sep=",")
160 |
161 | spine_similarity_1 = {("pseudo_g1", "pseudo_g3"): 0.5,
162 | ("pseudo_g1", "pseudo_g10"): 0.5,
163 | ("pseudo_g1", "pseudo_g11"): 0.5,
164 | ("pseudo_g4", "pseudo_g1"): -0.8,
165 | ("pseudo_g3", "pseudo_g30"): 0.5,
166 | ("pseudo_g3", "pseudo_g31"): 0.5,
167 | ("pseudo_g4", "pseudo_g40"): 0.5,
168 | ("pseudo_g4", "pseudo_g41"): 0.5}
169 |
170 | spine_similarity_2 = {("pseudo_g1", "pseudo_g10"): 0.5,
171 | ("pseudo_g1", "pseudo_g11"): 0.5,
172 | ("pseudo_g4", "pseudo_g1"): -0.8,
173 | ("pseudo_g4", "pseudo_g40"): 0.5,
174 | ("pseudo_g4", "pseudo_g41"): 0.5}
175 |
176 |         anchored_net1 = add_anchor(net1, pseudo_similarity=spine_similarity_1, spine_genes=["g1", "g4", "g3"],
177 |                                    pseudo_spine_genes=["pseudo_g1", "pseudo_g4", "pseudo_g3"],
178 |                                    weight=0.5)
179 |
180 |         anchored_net2 = add_anchor(net2, pseudo_similarity=spine_similarity_2, spine_genes=["g1", "g4"],
181 |                                    pseudo_spine_genes=["pseudo_g1", "pseudo_g4"],
182 |                                    weight=0.7)
183 |
184 |         # every anchoring (pseudo-spine) edge must be present in the anchored networks
185 |         for key in spine_similarity_1:
186 |             assert key in anchored_net1.keys()
187 |         for key in spine_similarity_2:
188 |             assert key in anchored_net2.keys()
189 |         # the anchored values must equal the original networks updated with the pseudo-spine edges
190 |         net1.update(spine_similarity_1)
191 |         net2.update(spine_similarity_2)
192 |         self.assertListEqual([anchored_net1[k] for k in anchored_net1.keys()], [net1[k] for k in anchored_net1.keys()])
193 |         self.assertListEqual([anchored_net2[k] for k in anchored_net2.keys()], [net2[k] for k in anchored_net2.keys()])
194 |
--------------------------------------------------------------------------------
/dimensionality_reduction.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import json
4 | import argparse
5 | import random
6 | from sklearn.decomposition import PCA
7 | from sklearn.manifold import TSNE
8 | from matplotlib import pyplot
9 | from gensim.models import Word2Vec
10 | import numpy as np
11 | import pandas as pd
12 | import seaborn as sns
13 | from netwalk.translator import ID2NameTranslator, IDCovertor
14 | from scipy.spatial.distance import cdist
15 | from itertools import combinations
16 |
17 | def pca_visualization(model, out_file_name):
18 | x = model[model.wv.vocab]
19 | pca = PCA(n_components=2)
20 | result = pca.fit_transform(x)
21 | # create a scatter plot of the projection
22 | pyplot.scatter(result[:, 0], result[:, 1])
23 | #pyplot.xlim(-25, 25)
24 | #pyplot.ylim(-25, 25)
25 | words = list(model.wv.vocab)
26 | #for i, word in enumerate(words):
27 | #pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
28 | pyplot.savefig(out_file_name)
29 |
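# Usage sketch for pca_visualization (the file names below are illustrative
# assumptions, not paths from this repository): load a trained embedding model
# and write its 2-D PCA scatter plot to disk.
#
#   pca_visualization(Word2Vec.load("heart_0.model"), "heart_0_pca.pdf")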
30 |
31 | def tsne_plot(model, out_file_name, perplexity=30, components=2, init='pca',
32 | num_iter=500, rand_state=0):
33 | labels = []
34 | tokens = []
35 |
36 | for word in model.wv.vocab:
37 | tokens.append(model[word])
38 | labels.append(word)
39 |
40 | tsne_model = TSNE(perplexity=perplexity, n_components=components,
41 | init=init, n_iter=num_iter, random_state=rand_state)
42 | new_values = tsne_model.fit_transform(tokens)
43 |
44 | x = []
45 | y = []
46 | for value in new_values:
47 | x.append(value[0])
48 | y.append(value[1])
49 |     print(labels)
50 |     print([lab for lab in labels if "pseudo" in lab])  # labels of the pseudo (anchor) nodes
51 |     c = ["royalblue" if "pseudo" in lab else "orangered" for lab in labels]
52 | pyplot.figure(figsize=(16, 16))
53 | for i in range(len(x)):
54 | pyplot.scatter(x[i], y[i], color=c[i], s=30)
55 |
56 | pyplot.savefig(out_file_name)
57 |
58 |
59 | def tsne_visualize(model, gene, list_names, vocab_length, num_components, out_file_name, converter, translate):
60 |     """ Plot with seaborn the t-SNE projection of the vectors of a query gene,
61 |     its most similar genes, and a list of other genes.
62 |     """
63 | arrays = np.empty((0, vocab_length), dtype='f')
64 | gene_labels = [gene]
65 | color_list = ['red']
66 |
67 | # adds the vector of the query gene
68 | arrays = np.append(arrays, model.wv.__getitem__([gene]), axis=0)
69 |
70 | # gets list of most similar genes
71 | close_genes = model.wv.most_similar([gene])
72 |
73 | # adds the vector for each of the closest genes to the array
74 | for gne_score in close_genes:
75 | gne_vector = model.wv.__getitem__([gne_score[0]])
76 | gene_labels.append(gne_score[0])
77 | color_list.append('blue')
78 | arrays = np.append(arrays, gne_vector, axis=0)
79 |
80 | # adds the vector for each of the genes from list_names to the array
81 | for gne in list_names:
82 | gne_vector = model.wv.__getitem__([gne])
83 | gene_labels.append(gne)
84 | color_list.append('green')
85 | arrays = np.append(arrays, gne_vector, axis=0)
86 |
87 |     # Reduce the dimensionality to num_components with PCA before running t-SNE
88 | reduc = PCA(n_components=num_components).fit_transform(arrays)
89 | # Finds t-SNE coordinates for 2 dimensions
90 | np.set_printoptions(suppress=True)
91 | Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)
92 |
93 | gene_labels = ints_to_names(gene_labels, translate, converter)
94 |
95 | # Sets everything up to plot
96 | df = pd.DataFrame({'x': [x for x in Y[:, 0]],
97 | 'y': [y for y in Y[:, 1]],
98 | 'genes': gene_labels,
99 | 'color': color_list})
100 |
101 | fig, _ = pyplot.subplots()
102 | fig.set_size_inches(9, 9)
103 |
104 | # Basic plot
105 | sns.set_style("ticks")
106 | p1 = sns.regplot(data=df,
107 | x="x",
108 | y="y",
109 | fit_reg=False,
110 | marker="o",
111 | scatter_kws={'s': 40,
112 | 'facecolors': df['color']
113 | }
114 | )
115 |
116 | # Adds annotations one by one with a loop
117 | for line in range(0, df.shape[0]):
118 | p1.text(df["x"][line],
119 | df['y'][line],
120 | ' ' + df["genes"][line].title(),
121 | horizontalalignment='left',
122 | verticalalignment='bottom', size='medium',
123 | color=df['color'][line],
124 | weight='normal'
125 | ).set_size(15)
126 |
127 | pyplot.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
128 | pyplot.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)
129 |
130 | pyplot.title('t-SNE visualization for {}'.format(gene.title()))
131 | pyplot.savefig(out_file_name)
132 |
133 | def names_to_ints(names, trans, convertor):
134 | ids, names = trans.names2ids(names)
135 | ints = [str(i) for i in convertor.ids2ints(ids)]
136 | return ints, ids, names
137 |
138 | def ints_to_names(ints, trans, convertor):
139 | #int to id
140 | ids = [i for i in convertor.ints2ids(ints)]
141 | #id to name
142 | names = [trans.id2name(x) for x in ids]
143 | return names
144 |
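# Round-trip sketch for the two helpers above (the gene names are illustrative
# assumptions, not identifiers shipped with this repository): names are mapped to
# IDs by the translator, IDs to the integer tokens used as the Word2Vec vocabulary,
# and ints_to_names reverses the trip.
#
#   ints, ids, names = names_to_ints(["BRCA1", "TP53"], trans, convertor)
#   assert ints_to_names(ints, trans, convertor) == names
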
145 | if __name__ == '__main__':
146 | parser = argparse.ArgumentParser(description='Generate datasets')
147 | parser.add_argument('-c', '--config', metavar='JSON file path',
148 | action='store', required=True,
149 | help='Path to a config file')
150 | args = parser.parse_args()
151 | # read training config
152 | with open(args.config) as fin:
153 | params = json.load(fin)
154 |
155 | ensemble_id_name_file = params['vocab']
156 | convertor_file = os.path.join(params['experiment_name'],'IDConvertor.json')
157 | trans = ID2NameTranslator(ensemble_id_name_file, sep=',')
158 | convertor = IDCovertor.load(convertor_file)
159 |
160 | for rep_id in range(params['n_replicates']):
161 | for pheno in params['phenotypes']:
162 | path = os.path.join(params['experiment_name'],
163 | '{}_{}.model'.format(pheno, rep_id))
164 | viz_path = os.path.join(params['experiment_name'],
165 | '{}_{}_tsne.pdf'.format(pheno,
166 | rep_id))
167 | pca_path = os.path.join(params['experiment_name'],
168 | '{}_{}_pca.pdf'.format(pheno,
169 | rep_id))
170 | # load model
171 | model = Word2Vec.load(path)
172 | genes = list(model.wv.vocab)
173 | # make tsne
174 | #tsne_plot(model, out_file_name=viz_path)
175 | #pca_visualization(model, out_file_name=pca_path)
176 | names = params['select_genes']
177 | ints, ids, names = names_to_ints(names, trans, convertor)
178 | for ind, g, name in zip(ints, ids, names):
179 | sim_path = os.path.join(params['experiment_name'],
180 | '{}_{}_most_similar_to_{}.pdf'.format(pheno,
181 | rep_id, name))
182 | rand_path = os.path.join(params['experiment_name'],
183 | '{}_{}_random_compared_to_{}.pdf'.format(pheno,
184 | rep_id, g))
185 | # make a visualization of select genes and their most similar genes
186 | if ind in model.wv.vocab:
187 | negative_ints = [i[0] for i in model.wv.most_similar(negative=[ind])]
188 | tsne_visualize(model, ind, list_names=negative_ints, vocab_length=params['embd_dim'], num_components=2, out_file_name=sim_path, translate=trans,converter=convertor)
189 | #sampled_ints = random.choices(genes, k=20)
190 | #tsne_visualize(model, ind, list_names=sampled_ints, vocab_length=params['embd_dim'], num_components=2, out_file_name=rand_path, translate=trans,converter=convertor)
191 | for pheno_1, pheno_2 in combinations(params['phenotypes'], 2):
192 | path_1 = os.path.join(params['experiment_name'],
193 | '{}_{}.model'.format(pheno_1, rep_id))
194 | path_2 = os.path.join(params['experiment_name'],
195 | '{}_{}.model'.format(pheno_2, rep_id))
196 | model_1 = Word2Vec.load(path_1)
197 | model_2 = Word2Vec.load(path_2)
198 |         ints, ids, names = names_to_ints(params['select_genes'], trans, convertor)
199 | targets_in_model1 = [ (gene_i, name_i) for gene_i, name_i in zip(ints, names) if gene_i in model_1.wv.vocab]
200 | targets_in_model2 = [ (gene_i, name_i) for gene_i, name_i in zip(ints, names) if gene_i in model_2.wv.vocab]
201 | x1 = np.array([model_1.wv[gene_i] for gene_i, _ in targets_in_model1])
202 | x2 = np.array([model_2.wv[gene_i] for gene_i, _ in targets_in_model2])
203 |         # Pairwise cosine distance matrix between x1 and x2; cosine distance lies in [0, 2], so /2 rescales it to [0, 1]
204 |         dist = cdist(x1, x2, metric='cosine') / 2
205 | df = pd.DataFrame(dist)
206 |
207 | df.columns = [name_i for _, name_i in targets_in_model2]
208 | df.index = [name_i for _, name_i in targets_in_model1]
209 | matrix_path = os.path.join(params['experiment_name'],
210 | '{}_{}_{}_matrix.pdf'.format(pheno_1, pheno_2, rep_id))
211 | pyplot.figure()
212 | print(df.index, df.columns)
213 | print(df.shape)
214 | ax = sns.heatmap(df, square=True, vmin=0, vmax=1)
215 | #ax.tick_params(left=False, bottom=False)
216 | ax.set_yticklabels(list(df.index))
217 | ax.set_xticklabels(list(df.columns))
218 | figure = ax.get_figure()
219 | figure.savefig(matrix_path)
220 | pyplot.close()
221 |
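# Sketch of the JSON config this script expects, inferred from the `params` keys read
# above; every path and value below is an illustrative assumption, not a file or setting
# shipped with the repository. 'experiment_name' is a directory that already holds the
# trained {phenotype}_{replicate}.model files and IDConvertor.json:
#
#   {
#     "experiment_name": "Line",
#     "vocab": "gene_id_to_name.csv",
#     "n_replicates": 2,
#     "phenotypes": ["brain", "heart"],
#     "select_genes": ["BRCA1", "TP53"],
#     "embd_dim": 128
#   }
#
# Run as, for example:  python dimensionality_reduction.py --config my_config.json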
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Juxtapose
2 |
3 |
4 | Table of Contents