├── netwalk ├── __init__.py ├── utils.py ├── models.py ├── walk.py ├── walkdataset.py └── translator.py ├── Makefile ├── JuxtaposeTutorial ├── line.png ├── keypair.png ├── attachvolume.png ├── attachvolume2.png ├── keypairname.png ├── securitygroup.png ├── spotrequests.png ├── connectinstance.png ├── selectinstance.png └── Embedding_Methodology.png ├── requirements.txt ├── regression.py ├── test ├── data │ ├── line-config.json │ ├── cross-config.json │ ├── circle-config.json │ ├── prefrontal_cortex.json │ └── brain-heart-config.json ├── test_utils.py ├── test_walkdataset.py ├── test_walk.py └── test_temp.py ├── Line └── IDConvertor.json ├── make_directed.py ├── network_stats.py ├── test_translator.py ├── find_common_genes.py ├── random_tree_generator.py ├── dataset_generator.py ├── similarity.py ├── dangle.py ├── dimensionality_reduction.py ├── README.md └── runner.py /netwalk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | setup: 2 | pip install -r requirements.txt 3 | regression: 4 | python regression.py -------------------------------------------------------------------------------- /JuxtaposeTutorial/line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/line.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/keypair.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/keypair.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/attachvolume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/attachvolume.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/attachvolume2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/attachvolume2.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/keypairname.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/keypairname.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/securitygroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/securitygroup.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/spotrequests.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/spotrequests.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/connectinstance.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/connectinstance.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/selectinstance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/selectinstance.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/Embedding_Methodology.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/Embedding_Methodology.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gensim==3.8.3 2 | matplotlib==3.1.3 3 | networkx==2.4 4 | numpy==1.18.1 5 | pandas==1.0.1 6 | scikit-learn==0.22.1 7 | scipy==1.4.1 8 | seaborn==0.10.0 9 | sklearn==0.0 10 | torch==1.6.0 11 | torchvision==0.7.0 -------------------------------------------------------------------------------- /regression.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | # initialize the test suite 5 | loader = unittest.TestLoader() 6 | start_dir = './test/' 7 | suite = loader.discover(start_dir) 8 | 9 | # initialize a runner, pass it your suite and run it 10 | runner = unittest.TextTestRunner(verbosity=3) 11 | result = runner.run(suite) -------------------------------------------------------------------------------- /test/data/line-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_replicates": 10, 3 | "percentage": 0.4, 4 | "n_anchors": 6, 5 | "anchor_test_ratio": 0.5, 6 | "min_dangle_size": 3, 7 | "max_dangle_size": 10, 8 | "anchor_file_address": "test/data/line_anchors.csv", 9 | "phenotypes": ["1", "2"], 10 | "experiment_name": "Line", 11 | "test_ratio": 0.5, 12 | "data_directory": "test/data" 13 | } 14 | -------------------------------------------------------------------------------- /test/data/cross-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_replicates": 1, 3 | "percentage": 0.4, 4 | "n_anchors": 10, 5 | "anchor_test_ratio": 0.5, 6 | "min_dangle_size": 3, 7 | "max_dangle_size": 5, 8 | "anchor_file_address": "test/data/cross_anchors.csv", 9 | "phenotypes": ["cross_1", "cross_2"], 10 | "experiment_name": "Cross", 11 | "test_ratio": 0.5, 12 | "data_directory": "test/data" 13 | } 14 | -------------------------------------------------------------------------------- /test/data/circle-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_replicates": 1, 3 | "percentage": 0.4, 4 | "n_anchors": 10, 5 | "anchor_test_ratio": 0.5, 6 | "min_dangle_size": 3, 7 | "max_dangle_size": 5, 8 | "anchor_file_address": "test/data/circle_anchors.csv", 9 | "phenotypes": ["circle_1", "circle_2"], 10 | "experiment_name": "Circle", 11 | "test_ratio": 0.5, 12 | "data_directory": "test/data" 13 | } 14 | -------------------------------------------------------------------------------- /test/data/prefrontal_cortex.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_replicates": 1, 3 | "percentage": 0.3, 4 | "n_anchors": 20, 5 | "anchor_test_ratio": 0.2, 6 | "min_dangle_size": 10, 7 | 
"max_dangle_size": 15, 8 | "anchor_file_address": "data/common_cortex_genes.csv", 9 | "phenotypes": ["human", "chimpanzee", "macaque", "mouse"], 10 | "experiment_name": "Cortex", 11 | "test_ratio": 0.2, 12 | "data_directory": "pcortex_data" 13 | } 14 | -------------------------------------------------------------------------------- /test/data/brain-heart-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_replicates": 1, 3 | "percentage": 0.4, 4 | "n_anchors": 20, 5 | "anchor_test_ratio": 0.5, 6 | "min_dangle_size": 7, 7 | "max_dangle_size": 10, 8 | "anchor_file_address": "data/heart_brain_shared.csv", 9 | "phenotypes": ["brain_1", "brain_2", "brain_3", "heart_1", "heart_2", "heart_3"], 10 | "experiment_name": "Heart_Brain", 11 | "test_ratio": 0.5, 12 | "data_directory": "data" 13 | } 14 | -------------------------------------------------------------------------------- /Line/IDConvertor.json: -------------------------------------------------------------------------------- 1 | {"0": 0, "1": 1, "10": 2, "11": 3, "12": 4, "13": 5, "14": 6, "15": 7, "16": 8, "17": 9, "18": 10, "19": 11, "2": 12, "20": 13, "3": 14, "4": 15, "5": 16, "6": 17, "7": 18, "8": 19, "9": 20, "pseudo_14": 21, "pseudo_14_000": 22, "pseudo_14_001": 23, "pseudo_14_002": 24, "pseudo_15": 25, "pseudo_15_000": 26, "pseudo_15_001": 27, "pseudo_15_002": 28, "pseudo_15_003": 29, "pseudo_5": 30, "pseudo_5_000": 31, "pseudo_5_001": 32, "pseudo_5_002": 33, "pseudo_5_003": 34, "pseudo_5_004": 35, "pseudo_5_005": 36} -------------------------------------------------------------------------------- /make_directed.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | G = nx.read_edgelist("/home/farhad/Network/juxt/brain_heart_data/heart_1.csv", delimiter=',',nodetype=str, data=(('cor',float),)) 4 | #print(G.edges(data=True)) 5 | T = nx.algorithms.tree.mst.minimum_spanning_tree(G, weight='cor') 6 | edj = T.edges() 7 | print(edj) 8 | address = 'test/data/heart_directed_1.txt' 9 | with open(address, 'w') as fout: 10 | for e in edj: 11 | # write edges to file 12 | node_1 = e[0] 13 | node_2 = e[1] 14 | fout.write('{}\t{}\n'.format(str(node_1),str(node_2))) 15 | -------------------------------------------------------------------------------- /network_stats.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def alignment_permutation_test(vocab1_length, vocab2_length, distance, actual_score, num_iteration=1000): 5 | n = min(vocab1_length, vocab2_length) 6 | indices = list(range(n)) 7 | scores = [] 8 | for i in range(num_iteration): 9 | v1 = np.random.choice(indices, size=n) 10 | v2 = np.random.choice(indices, size=n) 11 | s = 0 12 | for i, j in zip(v1, v2): 13 | s += distance[i, j] 14 | scores.append(s/n) 15 | scores = np.array(scores) 16 | print(scores) 17 | p = sum(scores >= actual_score) / num_iteration 18 | return p 19 | 20 | 21 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from netwalk.utils import Vocabulary 3 | from netwalk.utils import load 4 | from netwalk.utils import Similarity 5 | 6 | class TestUtils(unittest.TestCase): 7 | def test_load(self): 8 | similarity = load("data/similarity_file.csv", sep=",") 9 | expected = {("gene1", "gene2"): 0.5, 10 | ("gene2", "gene3"): 0.7, 11 | ("gene1", 
"gene3"): 0.0} 12 | 13 | self.assertDictEqual(similarity, expected) 14 | 15 | def test_Vocabulary(self): 16 | genes = ['g0', 'g1', 'g2', 'g3', 'g4'] 17 | id_2_name_map = {0: 'g0', 1: 'g1', 2: 'g2', 3: 'g3', 4: 'g4'} 18 | name_2_id_map = {'g0': 0, 'g1': 1, 'g2': 2, 'g3': 3, 'g4': 4} 19 | vocab = Vocabulary(genes) 20 | self.assertDictEqual(vocab.index, name_2_id_map) 21 | self.assertDictEqual(vocab.name, id_2_name_map) 22 | self.assertListEqual(vocab.genes, genes) 23 | 24 | def test_Similarity(self): 25 | d = {("gene1", "gene2"): 0.5, 26 | ("gene2", "gene3"): 0.7, 27 | ("gene1", "gene3"): 0.6, 28 | ("gene1", "gene4"): 0.3} 29 | similarity = Similarity(d) 30 | symmetric_keys = similarity.symmetric_key_set() 31 | expected_sym_keys = [("gene1", "gene2"), ("gene2", "gene1"), ("gene2", "gene3"), ("gene3", "gene2"), 32 | ("gene1", "gene3"), ("gene3", "gene1"), ("gene1", "gene4"), ("gene4", "gene1")] 33 | 34 | assert set(expected_sym_keys) == set(symmetric_keys) 35 | -------------------------------------------------------------------------------- /test_translator.py: -------------------------------------------------------------------------------- 1 | from netwalk.translator import IDCovertor 2 | import os 3 | import argparse 4 | import json 5 | import glob 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(description='Translate datasets') 9 | parser.add_argument('-c', '--config', metavar='JSON file path', 10 | action='store', required=True, 11 | help='Path to a config file') 12 | args = parser.parse_args() 13 | # read config file 14 | with open(args.config) as fin: 15 | params = json.load(fin) 16 | 17 | edge_list_file_addresses = glob.glob(os.path.join(params['experiment_name'], 18 | 'anchored_*.csv')) 19 | print(os.listdir(params['experiment_name'])) 20 | #output_file_address = 'Line/translated_line.csv' 21 | convertor = IDCovertor(edge_list_file_addresses, sep=',') 22 | for edge_list_file_address in edge_list_file_addresses: 23 | dir_path, file_name = os.path.split(edge_list_file_address) 24 | output_file_address = os.path.join(dir_path, f'translated_{file_name}') 25 | convertor.translate(edge_list_file_address, output_file_address, sep=',') 26 | 27 | convertor_file = os.path.join(params['experiment_name'], 28 | 'IDConvertor.json') 29 | convertor.save(convertor_file) 30 | con = IDCovertor.load(convertor_file) 31 | assert con.id2int == convertor.id2int 32 | assert con.int2id == convertor.int2id 33 | assert con.ids == convertor.ids 34 | 35 | -------------------------------------------------------------------------------- /netwalk/utils.py: -------------------------------------------------------------------------------- 1 | ''' This module contains utilities classes and functions. 2 | 3 | ''' 4 | import os.path 5 | import json 6 | import random 7 | import torch 8 | import copy 9 | import numpy as np 10 | 11 | class Vocabulary(object): 12 | '''Create a bijective mapping between gene name/ID and indices. 13 | 14 | Args: 15 | genes: An array-like containing gene names/IDs. 
16 | ''' 17 | 18 | def __init__(self, genes): 19 | self.genes = genes 20 | self.index = dict(zip(sorted(genes), range(len(genes)))) 21 | self.name = {idx: gene for gene, idx in self.index.items()} 22 | self.dim = len(self.genes) 23 | 24 | def to_indices(self, genes): 25 | return [self.index[gene] for gene in genes] 26 | 27 | def to_names(self, indices): 28 | return [self.name[i] for i in indices] 29 | 30 | def __len__(self): 31 | return len(self.genes) 32 | 33 | 34 | def load_walks(file_dir='.', prefix='pair_walk', sep=','): 35 | ''' Read dataset from a file. 36 | 37 | Args: 38 | address: Address of a CSV file containing walks. 39 | sep: A field delimiter. 40 | Returns: 41 | data: A list of walks. 42 | genes: The set of all genes that appear in at least one walk. 43 | ''' 44 | walk_address = os.path.join(file_dir, prefix + '_walks.csv') 45 | walks = np.genfromtxt(walk_address, dtype=np.uint16, delimiter=sep) 46 | return walks 47 | 48 | 49 | def dump_walks(walks, out_dir='.', prefix='pair_walk', sep=','): 50 | # Create walks file 51 | pass 52 | 53 | 54 | -------------------------------------------------------------------------------- /test/test_walkdataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from netwalk.walkdataset import WalkDataset 3 | from netwalk.walkdataset import PairWalkDataset 4 | from netwalk.utils import Vocabulary 5 | 6 | 7 | class TestWalkDataset(unittest.TestCase): 8 | def setUp(self): 9 | data = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 10 | [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 11 | [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 12 | [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], 13 | [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]] 14 | self.data = data 15 | 16 | def test_from_csv(self): 17 | dataset = WalkDataset.from_csv('data/sample_walk_dataset.csv', sep=',') 18 | self.assertListEqual(dataset.walks, self.data) 19 | 20 | def test__len__(self): 21 | dataset = WalkDataset(original_walks=[], vocab=Vocabulary([])) 22 | self.assertEqual(len(dataset), 0) 23 | dataset = WalkDataset.from_csv('data/sample_walk_dataset.csv', sep=',') 24 | self.assertEqual(len(dataset), 5) 25 | 26 | def test__getitem__(self): 27 | dataset = WalkDataset.from_csv('data/sample_walk_dataset.csv', sep=',') 28 | for i, expected in enumerate(self.data): 29 | self.assertListEqual(expected, dataset[i]) 30 | 31 | 32 | class TestPairedWalkDataset(unittest.TestCase): 33 | def setUp(self): 34 | data = [([0, 1, 2, 3, 4], [0, 1, 2, 3, 4]), 35 | ([5, 6, 7, 8, 9], [5, 8, 7, 8, 9])] 36 | self.data = data 37 | 38 | def test_from_csv(self): 39 | dataset = PairWalkDataset.from_csv('data/sample_pair_walk_dataset.csv', sep=',') 40 | self.assertEqual(len(self.data), len(dataset)) 41 | for i, observed_walk in enumerate(dataset): 42 | (expected_walk_a, expected_walk_b) = self.data[i] 43 | observed_walk_a, observed_walk_b = observed_walk 44 | self.assertListEqual(expected_walk_a, observed_walk_a) 45 | self.assertListEqual(expected_walk_b, observed_walk_b) 46 | -------------------------------------------------------------------------------- /find_common_genes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import multiprocessing as mp 3 | 4 | 5 | def load(address): 6 | genes = set() 7 | with open(address) as fin: 8 | for line in fin: 9 | line = line.strip() 10 | if line == '': 11 | continue 12 | gene_a, gene_b, _ = line.split(',') 13 | genes.add(gene_a) 14 | genes.add(gene_b) 15 | return genes 16 | 17 | def 
get_common_genes(edge_lists, genes_of_interest): 18 | pool = mp.Pool(min(mp.cpu_count(), len(edge_lists))) 19 | profiles_genes = [pool.apply(load, args=(address, )) for address in edge_lists] 20 | pool.close() 21 | common_genes = set(genes_of_interest) 22 | for genes in profiles_genes: 23 | common_genes = genes & common_genes 24 | return common_genes 25 | 26 | 27 | if __name__ == '__main__': 28 | output_address = 'data/common_cortex_genes.csv' 29 | edge_lists = [ 30 | 'pcortex_data/network_1_12_chimpanzee.csv', 31 | 'pcortex_data/network_1_12_human.csv', 32 | 'pcortex_data/network_1_12_macaque.csv', 33 | 'pcortex_data/network_1_12_mouse.csv'] 34 | #genes_of_interest = '/home/fam918/Documents/CodeRepos/WALKS/netwalk/data/homeostasis_genes.csv' 35 | #output_address = 'heart_brain_shared.csv' 36 | #edge_lists = ['/home/farhad/Network/netwalk/data/network_1_200_heart.csv', 37 | # '/home/farhad/Network/netwalk/data/network_2_200_heart.csv', 38 | # '/home/farhad/Network/netwalk/data/network_3_200_heart.csv', 39 | # '/home/farhad/Network/netwalk/data/network_1_200_brain.csv', 40 | # '/home/farhad/Network/netwalk/data/network_2_200_brain.csv', 41 | # '/home/farhad/Network/netwalk/data/network_3_200_brain.csv'] 42 | genes_of_interest = 'data/cellular_homeostasis.csv' 43 | with open(genes_of_interest) as f: 44 | lines = f.read().splitlines() 45 | common_genes = get_common_genes(edge_lists, lines) 46 | with open(output_address, 'w') as fout: 47 | for gene in common_genes: 48 | fout.write('{}\n'.format(gene)) 49 | -------------------------------------------------------------------------------- /random_tree_generator.py: -------------------------------------------------------------------------------- 1 | import random 2 | import networkx as nx 3 | 4 | START_SIZE=50 5 | CURRENT_SIZE=50 6 | FINAL_SIZE=200 7 | NODE_STEP=10 8 | 9 | while CURRENT_SIZE <= FINAL_SIZE: 10 | #print(CURRENT_SIZE - 1) 11 | address = 'test/data/random_tree_{}.csv'.format(CURRENT_SIZE) 12 | with open(address, 'w') as fout: 13 | if CURRENT_SIZE == START_SIZE: 14 | G = nx.generators.trees.random_tree(START_SIZE) 15 | while nx.number_connected_components(G) > 1: 16 | G = nx.generators.trees.random_tree(START_SIZE) 17 | edj = list(G.edges()) 18 | n = list(G.nodes()) 19 | for e in edj: 20 | # write edges to file 21 | fout.write('{},{},1\n'.format(str(e[0]),str(e[1]))) 22 | fout.write('{},{},1\n'.format(str(e[1]),str(e[0]))) 23 | elif CURRENT_SIZE != START_SIZE: 24 | # read in the previous graph and write it to file 25 | previous_address = 'test/data/random_tree_{}.csv'.format(CURRENT_SIZE-NODE_STEP) 26 | file_previous = open(previous_address, 'r') 27 | Lines = file_previous.readlines() 28 | fout.writelines(Lines) 29 | 30 | G = nx.generators.trees.random_tree(NODE_STEP) 31 | while nx.number_connected_components(G) > 1: 32 | G = nx.generators.trees.random_tree(NODE_STEP) 33 | edj = list(G.edges()) 34 | 35 | for e in edj: 36 | # write edges to file 37 | node_1 = e[0] + CURRENT_SIZE-NODE_STEP - 1 38 | node_2 = e[1] + CURRENT_SIZE- NODE_STEP - 1 39 | fout.write('{},{},1\n'.format(str(node_1),str(node_2))) 40 | fout.write('{},{},1\n'.format(str(node_2),str(node_1))) 41 | # connect a node in graph to a random node in the original graph 42 | rand_node = random.randint(0, CURRENT_SIZE-1-NODE_STEP) 43 | #print(rand_node) 44 | print(CURRENT_SIZE,rand_node,node_1) 45 | fout.write('{},{},1\n'.format(str(rand_node), str(node_1))) 46 | fout.write('{},{},1\n'.format(str(node_1), str(rand_node))) 47 | CURRENT_SIZE= CURRENT_SIZE + NODE_STEP 48 | 49 | 50 | 
address = 'test/data/test_PPI.txt' 51 | with open(address, 'w') as fout: 52 | G = nx.scale_free_graph(100) 53 | edj = list(G.edges()) 54 | 55 | for e in edj: 56 | # write edges to file 57 | node_1 = e[0] 58 | node_2 = e[1] 59 | fout.write('{}\t{}\t1\n'.format(str(node_1),str(node_2))) 60 | -------------------------------------------------------------------------------- /dataset_generator.py: -------------------------------------------------------------------------------- 1 | ''' This module generate walk datasets. 2 | ''' 3 | import argparse 4 | import random 5 | import numpy as np 6 | import netwalk.utils as utils 7 | import os.path 8 | from netwalk.walk import WalkGenerator 9 | from similarity import Similarity 10 | import gensim.models 11 | import time 12 | import multiprocessing as mp 13 | import json 14 | import pandas as pd 15 | import seaborn as sns 16 | import copy 17 | 18 | def generate_walks(edge_list_address, walk_per_node, walk_length, workers = 4): 19 | similarity = Similarity(correlation_file_path=edge_list_address, anchors=[], 20 | alphas=[], sep=',', prefix='pseudo') 21 | genes = list(similarity.idx.keys()) 22 | start_time = time.time() 23 | gen_walk = WalkGenerator(similarity.matrix, genes, walk_length, walk_per_node) 24 | print("takes {} seconds to create walk object.".format( 25 | time.time() - start_time)) 26 | 27 | num_cpus = workers 28 | pool = mp.Pool(num_cpus) 29 | arguments = list(range(len(gen_walk))) 30 | chunk_size = len(gen_walk) // num_cpus 31 | walks = pool.map(gen_walk, arguments, chunksize=chunk_size) 32 | return walks 33 | 34 | 35 | if __name__ == '__main__': 36 | parser = argparse.ArgumentParser(description='Generate datasets') 37 | parser.add_argument('-c', '--config', metavar='JSON file path', 38 | action='store', required=True, 39 | help='Path to a config file') 40 | args = parser.parse_args() 41 | # read config file 42 | with open(args.config) as fin: 43 | params = json.load(fin) 44 | # make walks and train the network 45 | for pheno in params['phenotypes']: 46 | for rep_id in range(params['n_replicates']): 47 | edge_list_address = os.path.join(params['experiment_name'], 48 | 'translated_anchored_{}_{}.csv'.format(pheno, 49 | str(rep_id))) 50 | # Create walks 51 | walks = generate_walks(edge_list_address, params['walk_per_node'], 52 | params['walk_length'], workers=params['n_workers']) 53 | 54 | # Write walks to file 55 | address = os.path.join(params['experiment_name'], 56 | '{}_{}_walks.csv'.format(pheno, str(rep_id))) 57 | with open(address, 'w') as fout: 58 | for w in walks: 59 | fout.write('{}\n'.format(','.join([str(s) for s in w]))) 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /test/test_walk.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from netwalk.walk import Walk 4 | from netwalk.walk import PairWalk 5 | 6 | 7 | class TestWalk(unittest.TestCase): 8 | def test_from_dict(self): 9 | d = {(1, 2): 0.7, 10 | (1, 3): 0.1, 11 | (2, 4): 0.5, 12 | (3, 4): 0.3} 13 | similarity = [[0, 7/8, 1/8, 0], 14 | [7/12, 0, 0, 5/12], 15 | [1/4, 0, 0, 3/4], 16 | [0, 5/8, 3/8, 0]] 17 | CDF = np.array([[0, 0.875, 1, 1], 18 | [7/12, 7/12, 7/12, 1], 19 | [0.25, 0.25, 0.25, 1], 20 | [0, 0.625, 1, 1]]) 21 | 22 | walk = Walk(d) 23 | self.assertListEqual(list(walk._nodes), [1, 2, 3, 4]) 24 | diff = np.array(similarity) - walk.prob 25 | self.assertAlmostEqual(np.linalg.norm(diff), 0) 26 | ids = np.array([walk._ids[node] for node in 
walk._nodes]) 27 | diff = np.linalg.norm(ids - np.arange(len(walk._nodes))) 28 | self.assertAlmostEqual(diff, 0) 29 | diff = np.linalg.norm(walk.cdf - CDF) 30 | self.assertAlmostEqual(diff, 0) 31 | 32 | def test_generate(self): 33 | similarity = {(1, 2): 0.5, (1, 3): 0.0} 34 | 35 | walk = Walk(similarity) 36 | self.assertListEqual(walk.generate(3, 3), [2, 2, 2, 2]) 37 | 38 | self.assertListEqual((walk.generate(1, 5)), [0, 1, 0, 1, 0, 1]) 39 | self.assertListEqual((walk.generate(2, 5)), [1, 0, 1, 0, 1, 0]) 40 | 41 | def test_make_walks(self): 42 | similarity = {(1, 2): 0.5, (1, 3): 0.0} 43 | walk = Walk(similarity) 44 | dataset = walk.make_walks(walk_per_node=2, walk_length=3) 45 | expected_dataset = [[0, 1, 0, 1], [0, 1, 0, 1], [1, 0, 1, 0], 46 | [1, 0, 1, 0], [2, 2, 2, 2], [2, 2, 2, 2]] 47 | for expected, observed in zip(expected_dataset, dataset['walks']): 48 | self.assertListEqual(expected, list(observed)) 49 | expected_nodes = [1, 2, 3] 50 | self.assertListEqual(expected_nodes, list(dataset['nodes'])) 51 | expected_ids = {1: 0, 2: 1, 3: 2} 52 | self.assertEqual(expected_ids, dataset['ids']) 53 | 54 | 55 | class TestPairWalk(unittest.TestCase): 56 | 57 | def test_generate(self): 58 | similarity = {(1, 2): 0.5, (1, 3): 0.0} 59 | walk = PairWalk(similarity) 60 | self.assertListEqual(walk.generate(3, 3), [2, 2, 2, 2, 2, 2, 2, 2]) 61 | self.assertListEqual((walk.generate(1, 5)), [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]) 62 | self.assertListEqual((walk.generate(2, 5)), [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) 63 | -------------------------------------------------------------------------------- /netwalk/models.py: -------------------------------------------------------------------------------- 1 | ''' This module contains the models. 2 | 3 | ''' 4 | import math 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn import TransformerEncoder, TransformerEncoderLayer 9 | 10 | 11 | class TransformerModel(nn.Module): 12 | 13 | def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): 14 | super(TransformerModel, self).__init__() 15 | self.model_type = 'Transformer' 16 | self.src_mask = None 17 | self.pos_encoder = PositionalEncoding(ninp, dropout) 18 | encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) 19 | self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) 20 | self.encoder = nn.Embedding(ntoken, ninp) 21 | self.ninp = ninp 22 | self.decoder = nn.Linear(ninp, ntoken) 23 | self.init_weights() 24 | 25 | def _generate_square_subsequent_mask(self, sz): 26 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) 27 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) 28 | return mask 29 | 30 | def init_weights(self): 31 | initrange = 0.1 32 | self.encoder.weight.data.uniform_(-initrange, initrange) 33 | self.decoder.bias.data.zero_() 34 | self.decoder.weight.data.uniform_(-initrange, initrange) 35 | 36 | def forward(self, src): 37 | if self.src_mask is None or self.src_mask.size(0) != len(src): 38 | device = src.device 39 | mask = self._generate_square_subsequent_mask(len(src)).to(device) 40 | self.src_mask = mask 41 | src = self.encoder(src) * math.sqrt(self.ninp) 42 | src = self.pos_encoder(src) 43 | output = self.transformer_encoder(src, self.src_mask) 44 | output = self.decoder(output) 45 | return F.log_softmax(output, dim=-1) 46 | 47 | def embedding(self, src): 48 | self.eval() # Turn on the evaluation mode 49 | with torch.no_grad(): 50 | src = self.encoder(src) * 
math.sqrt(self.ninp) 51 | src = self.pos_encoder(src) 52 | output = self.transformer_encoder(src, None) 53 | return output 54 | 55 | def save(self, address): 56 | torch.save(self.state_dict(), address) 57 | 58 | @classmethod 59 | def load(cls, model, address, device='cpu'): 60 | model.load_state_dict(torch.load(address)) 61 | model.to(device) 62 | return model 63 | 64 | 65 | class PositionalEncoding(nn.Module): 66 | ''' Positional encoding used in transformers. 67 | ''' 68 | 69 | def __init__(self, d_model, dropout=0.1, max_len=5000): 70 | super(PositionalEncoding, self).__init__() 71 | self.dropout = nn.Dropout(p=dropout) 72 | 73 | pe = torch.zeros(max_len, d_model) 74 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 75 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 76 | pe[:, 0::2] = torch.sin(position * div_term) 77 | pe[:, 1::2] = torch.cos(position * div_term) 78 | pe = pe.unsqueeze(0).transpose(0, 1) 79 | self.register_buffer('pe', pe) 80 | 81 | def forward(self, x): 82 | x = x + self.pe[:x.size(0), :] 83 | return self.dropout(x) 84 | -------------------------------------------------------------------------------- /similarity.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | 5 | class Similarity(object): 6 | def __init__(self, correlation_file_path, anchors, alphas, sep=',', 7 | prefix='pseudo', string_id=False): 8 | self.real_genes = set() 9 | with open(correlation_file_path) as fin: 10 | for line in fin: 11 | line = line.strip() 12 | if line == '': 13 | continue 14 | a, b, _ = line.split(sep) 15 | a = a.strip() 16 | b = b.strip() 17 | if string_id is False: 18 | a = int(a) 19 | b = int(b) 20 | self.real_genes.add(a) 21 | self.real_genes.add(b) 22 | self.real_genes = list(sorted(self.real_genes)) 23 | assert set(anchors).issubset(self.real_genes) 24 | self.pseudo_genes = [] 25 | for anchor, alpha in zip(anchors, alphas): 26 | self.pseudo_genes.append('{}_{}'.format(prefix, anchor)) 27 | for i in range(alpha): 28 | self.pseudo_genes.append('{}_{}_{:0>3d}'.format(prefix, anchor, i)) 29 | genes = self.real_genes + self.pseudo_genes 30 | n = len(genes) 31 | self.matrix = np.zeros((n, n), dtype=np.float32) 32 | self.idx = {gene: i for i, gene in enumerate(genes)} 33 | # Assign values to the correlation matrix 34 | with open(correlation_file_path) as fin: 35 | for line in fin: 36 | line = line.strip() 37 | if line == '': 38 | continue 39 | a, b, cor = line.split(sep) 40 | if string_id is False: 41 | a = int(a) 42 | b = int(b) 43 | i = self.idx[a] 44 | j = self.idx[b] 45 | self.matrix[i,j] = np.float32(cor) 46 | self.matrix[j,i] = np.float32(cor) 47 | 48 | def average_correlation(self): 49 | n = len(self.real_genes) 50 | values = self.matrix[0:n, 0:n][np.nonzero(self.matrix[0:n, 0:n])] 51 | return np.mean(values) 52 | 53 | 54 | def __getitem__(self, item): 55 | a, b = item 56 | i = self.idx[a] 57 | j = self.idx[b] 58 | return self.matrix[i, j] 59 | 60 | def transform(self, transform=None): 61 | if transform is None: 62 | transform = lambda x: 0.5 * x + 0.5 63 | n = len(self.real_genes) + len(self.pseudo_genes) 64 | for i in range(n): 65 | for j in range(n): 66 | self.matrix[i, j] = transform(self.matrix[i, j]) 67 | 68 | def apply_threshold(self, lower_cor, upper_cor, value): 69 | n = len(self.real_genes) + len(self.pseudo_genes) 70 | for i in range(n): 71 | for j in range(n): 72 | if self.matrix[i, j] > lower_cor and self.matrix[i, j] < upper_cor: 73 | 
self.matrix[i, j] = value 74 | 75 | def to_csv(self, file_name): 76 | n = len(self.real_genes) + len(self.pseudo_genes) 77 | genes = self.real_genes + self.pseudo_genes 78 | with open(file_name, 'w') as f: 79 | for i in range(n): 80 | for j in range(n): 81 | if i == j: 82 | break 83 | else: 84 | f.write(','.join([genes[i], genes[j], str(self.matrix[i, j])])) 85 | f.write("\n") 86 | 87 | def augment(self, dangles): 88 | genes = self.real_genes + self.pseudo_genes 89 | for (a, b), w in dangles.items(): 90 | assert a in genes, "gene is missing from similarity matrix." 91 | assert b in genes, "gene is missing from similarity matrix." 92 | i = self.idx[a] 93 | j = self.idx[b] 94 | self.matrix[i, j] = w 95 | self.matrix[j, i] = w 96 | -------------------------------------------------------------------------------- /netwalk/walk.py: -------------------------------------------------------------------------------- 1 | ''' This module generates walks from a network. 2 | 3 | ''' 4 | import numpy as np 5 | import gensim.models 6 | import seaborn as sns 7 | from matplotlib import pyplot as plt 8 | import pandas as pd 9 | import time 10 | import multiprocessing as mp 11 | 12 | 13 | EPSILON = 1E-6 14 | 15 | class Probability(): 16 | def __init__(self, matrix, gene_names): 17 | n = matrix.shape[0] 18 | assert matrix.shape[0] == matrix.shape[1] 19 | assert len(gene_names) == n 20 | total_prob = matrix.sum(axis=1).reshape(n, 1) 21 | corrections = [] 22 | for i, p in enumerate(total_prob): 23 | if total_prob[i] < EPSILON: 24 | total_prob[i] = 1 25 | corrections.append(i) 26 | self.prob = matrix / total_prob 27 | for i in corrections: 28 | self.prob[i, i] = 1 29 | for i in range(n): 30 | if abs(np.sum(self.prob[i]) - 1) > EPSILON: 31 | self.prob[i] /= (self.prob[i]).sum() 32 | print((self.prob[i]).sum()) 33 | try: 34 | assert abs(np.sum(self.prob[i]) - 1) < EPSILON 35 | except: 36 | print(abs(np.sum(self.prob[i]) - 1)) 37 | raise 38 | self.idx = {name:i for i, name in enumerate(gene_names)} 39 | 40 | def __getitem__(self, gene): 41 | i = self.idx[gene] 42 | return self.prob[i] 43 | 44 | 45 | 46 | class WalkGenerator(object): 47 | ''' Create walks using a graph defined by a similarity matrix. 48 | 49 | Args: 50 | similarity: A dictionary representing the similarity between 51 | pairs of nodes, where the similarity between nodes u and v 52 | is represented by similarity((u, v)). 53 | ''' 54 | def __init__(self, similarity_matrix, genes, walk_length, walk_per_node, fountains=None): 55 | self.walk_length = walk_length 56 | self.nodes = np.copy(genes) 57 | if fountains is None: 58 | self.fountains = np.copy(self.nodes) 59 | else: 60 | self.fountains = np.copy(fountains) 61 | self.starters = np.repeat(self.fountains, walk_per_node) 62 | np.random.shuffle(self.starters) 63 | self.LENGTH = len(self.starters) 64 | self.prob = Probability(similarity_matrix, self.nodes) 65 | 66 | 67 | 68 | def __len__(self): 69 | return self.LENGTH 70 | 71 | def __getitem__(self, i): 72 | ''' Generate a random walk starting from the i-th gene. 73 | 74 | Args: 75 | start: Starting point of the random walk. 76 | length: Length of the random walk. 
77 | ''' 78 | if i >= self.LENGTH: 79 | raise StopIteration 80 | current_node = self.starters[i] 81 | walk = [current_node] 82 | for _ in range(self.walk_length): 83 | next_node = np.random.choice(self.nodes, p=self.prob[current_node]) 84 | walk.append(next_node) 85 | current_node = next_node 86 | return walk 87 | 88 | def __call__(self, i): 89 | return self[i] 90 | 91 | 92 | #if __name__ == '__main__': 93 | #similarity_matrix = np.random.rand(15000, 15000) 94 | #genes = np.array(range(15000), dtype=np.uint16) 95 | 96 | #similarity_matrix = np.array([[0.0, 0.0, 0.6, 0.0], 97 | # [0.0, 0.0, 0.3, 0.0], 98 | # [0.6, 0.3, 0.0, 0.0], 99 | # [0.0, 0.0, 0.0, 0.0]]) 100 | # 101 | #genes = ['1', '2', '3', '4'] 102 | #start_time = time.time() 103 | #walks = WalkGenerator(similarity_matrix,genes, 50, 100) 104 | #hours, rem = divmod(time.time() - start_time, 3600) 105 | #minutes, seconds = divmod(rem, 60) 106 | #print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) 107 | #num_cpus = mp.cpu_count() - 1 108 | #pool = mp.Pool(num_cpus) 109 | #arguments = list(range(len(walks))) 110 | #chunk_size = len(walks) // num_cpus 111 | #results = pool.map(walks, arguments, chunksize=chunk_size) 112 | #with open('walks.csv', 'w') as fout: 113 | # for w in results: 114 | # fout.write('{}\n'.format(','.join([str(x) for x in w]))) 115 | 116 | #for w in walks: 117 | # print(w) 118 | 119 | # colour_map = "Greens_r" 120 | # model = gensim.models.Word2Vec(sentences=walks, 121 | # size=5, 122 | # window=2, 123 | # min_count=2, 124 | # workers=3, 125 | # iter=1) 126 | # wv1 = model.wv 127 | # vocab_size = len(genes) 128 | # dist1 = np.zeros((vocab_size, vocab_size)) 129 | # for i, gene_i in enumerate(genes): 130 | # for j, gene_j in enumerate(genes): 131 | # dist1[i,j] = np.linalg.norm(wv1[gene_i] - wv1[gene_j]) 132 | # 133 | # df = pd.DataFrame(dist1, columns=genes, index=genes) 134 | # ax = sns.heatmap(df, cmap=colour_map, square=True) 135 | # plt.show() 136 | -------------------------------------------------------------------------------- /netwalk/walkdataset.py: -------------------------------------------------------------------------------- 1 | ''' This module contains WalkDataset. 2 | ''' 3 | from torch.utils.data import Dataset 4 | from netwalk.utils import Vocabulary 5 | 6 | 7 | class WalkDataset(Dataset): 8 | ''' Create a dataset of walks, where each walk is a sequence of genes. 9 | 10 | Args: 11 | original_walks: a nested list of gene names/IDs. 12 | vocab: A Vocabulary object including all genes in the original_walks. 13 | ''' 14 | def __init__(self, original_walks, vocab): 15 | super(WalkDataset, self).__init__() 16 | self.vocab = vocab 17 | self.walks = self._vocab_index(original_walks, vocab) 18 | 19 | @staticmethod 20 | def _vocab_index(original_walks, vocab): 21 | ''' Translate walks from original node names to integer indices. 22 | Args: 23 | original_walks: The original walks, where each node is represented 24 | by node name/ID. 25 | vocab: A Vocabulary object including all genes in the original_walks. 26 | Returns: 27 | A translated version of original_walks, where each node is 28 | represented with an integer index. These indices starts with 29 | 0 to n with no gap, where n is the number of different nodes 30 | present in at least one of the walks in original_walks. 
31 | 32 | ''' 33 | walks = [] 34 | for walk in original_walks: 35 | walks.append([vocab.index[name] for name in walk]) 36 | return walks 37 | 38 | @classmethod 39 | def read_csv(cls, address, sep): 40 | ''' Read dataset from a file. 41 | 42 | Args: 43 | address: Address of a CSV file containing walks. 44 | sep: A field delimiter. 45 | Returns: 46 | data: A list of walks. 47 | genes: The set of all genes that appear in at least one walk. 48 | ''' 49 | data = [] 50 | genes = set() 51 | with open(address) as fin: 52 | for line in fin: 53 | line = line.strip() 54 | if line == '': 55 | continue 56 | walk = [gene.strip() for gene in line.split(sep)] 57 | data.append(walk) 58 | genes.update(walk) 59 | return data, genes 60 | 61 | @classmethod 62 | def from_csv(cls, address, sep): 63 | ''' Create a WalkDataset from a CSV file. 64 | 65 | Args: 66 | address: Address of a CSV file containing walks. 67 | sep: A field delimiter. 68 | Returns: 69 | A WalkDataset object. 70 | ''' 71 | data, genes = cls.read_csv(address, sep) 72 | vocab = Vocabulary(genes) 73 | return cls(data, vocab) 74 | 75 | def __getitem__(self, idx): 76 | return self.walks[idx] 77 | 78 | def __len__(self): 79 | return len(self.walks) 80 | 81 | 82 | class PairWalkDataset(WalkDataset): 83 | ''' Create a dataset of walks, where each walk is a sequence of genes. 84 | 85 | Args: 86 | original_walks: a nested list of gene names/IDs. 87 | vocab: A Vocabulary object including all genes in the original_walks. 88 | ''' 89 | def __init__(self, original_walks, vocab): 90 | super(PairWalkDataset, self).__init__(original_walks, vocab) 91 | 92 | 93 | @classmethod 94 | def from_csv(cls, address, sep): 95 | ''' Create a WalkDataset from a CSV file. 96 | 97 | Args: 98 | address: Address of a CSV file containing walks. 99 | sep: A field delimiter. 100 | Returns: 101 | A WalkDataset object. 102 | ''' 103 | data = [] 104 | pair_walks, genes = cls.read_csv(address, sep) 105 | for walk_walk in pair_walks: 106 | middle = len(walk_walk) // 2 107 | walk_a, walk_b = walk_walk[:middle], walk_walk[middle:] 108 | data.append((walk_a, walk_b)) 109 | vocab = Vocabulary(genes) 110 | return cls(data, vocab) 111 | 112 | @staticmethod 113 | def _vocab_index(original_walks, vocab): 114 | ''' Translate walks from original node names to integer indices. 115 | Args: 116 | original_walks: The original walks, where each node is represented 117 | by node name/ID. 118 | vocab: A Vocabulary object including all genes in the original_walks. 119 | Returns: 120 | A translated version of original_walks, where each node is 121 | represented with an integer index. These indices starts with 122 | 0 to n with no gap, where n is the number of different nodes 123 | present in at least one of the walks in original_walks. 
124 | 125 | ''' 126 | walks = [] 127 | for walk_a, walk_b in original_walks: 128 | translated_walk_a = [vocab.index[name] for name in walk_a] 129 | translated_walk_b = [vocab.index[name] for name in walk_b] 130 | walks.append((translated_walk_a, translated_walk_b)) 131 | return walks 132 | -------------------------------------------------------------------------------- /netwalk/translator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | 8 | class ID2NameTranslator(object): 9 | def __init__(self, vocab_file_address, sep=','): 10 | assert os.path.isfile(vocab_file_address) 11 | df = pd.read_csv(vocab_file_address, sep=sep) 12 | df.columns = ['ID', 'Name'] 13 | self.df = df 14 | self.ids = df.iloc[:, 0].values 15 | self.names = df.iloc[:, 1].values 16 | self.__id2name = {ID: name for ID, name in zip(self.ids, self.names)} 17 | 18 | def id2name(self, ensemble_id, default=''): 19 | return self.__id2name.get(ensemble_id, default) 20 | 21 | def names2ids(self, names): 22 | selected = self.df[self.df.iloc[:, 1].isin(names)] 23 | data = [] 24 | for name in names: 25 | data.extend(selected[selected.iloc[:,1] == name].values) 26 | data = pd.DataFrame.from_records(data, columns=['ID', 'Name']) 27 | return list(data.ID), list(data.Name) 28 | 29 | 30 | class IDCovertor(object): 31 | def __init__(self, edge_list_file_addresses, sep=','): 32 | idset = set() 33 | for edge_list_file_address in edge_list_file_addresses: 34 | with open(edge_list_file_address) as fin: 35 | for line in fin: 36 | id1, id2, _ = line.strip().split(sep) 37 | idset.add(id1) 38 | idset.add(id2) 39 | n = len(idset) 40 | self.ids = sorted(idset) 41 | self.id2int = {a_id: i for a_id, i in zip(self.ids, range(n))} 42 | self.int2id = {i: a_id for a_id, i in zip(self.ids, range(n))} 43 | 44 | def ids2ints(self, ids): 45 | return [self.id2int[x] for x in ids if x in self.ids] 46 | 47 | def ints2ids(self, ints): 48 | return [self.int2id[int(x)] for x in ints] 49 | 50 | def save(self, json_file_address): 51 | with open(json_file_address, 'w') as fout: 52 | json.dump(self.id2int, fout) 53 | 54 | @classmethod 55 | def load(cls, json_file_address): 56 | with open(json_file_address) as fin: 57 | id2int = json.load(fin) 58 | convertor = IDCovertor([]) 59 | convertor.id2int = id2int 60 | convertor.int2id = {i: a_id for a_id, i in id2int.items()} 61 | convertor.ids = sorted(id2int.keys()) 62 | return convertor 63 | 64 | def translate(self, input_file_address, output_file_address, sep=','): 65 | df = pd.read_csv(input_file_address, sep=sep, header=None) 66 | df.columns = ['ID1', 'ID2', 'Cor'] 67 | with open(output_file_address, 'w') as fout: 68 | for id1, id2, cor in df.itertuples(index=False): 69 | fout.write('{}{}{}{}{}\n'.format(self.id2int[id1], sep, 70 | self.id2int[id2], sep, 71 | cor)) 72 | 73 | 74 | def vocab2id_and_name(vocab, id_convertor_file_path, id2name_translator_file, default_name='', sep=','): 75 | id2name_translator = ID2NameTranslator(id2name_translator_file, sep=sep) 76 | id_convertor = IDCovertor.load(id_convertor_file_path) 77 | id_names = {} 78 | for k in vocab: 79 | ensemble_id = id_convertor.int2id[int(k)] 80 | name = id2name_translator.id2name(ensemble_id, default_name) 81 | id_names[k] = (ensemble_id, name) 82 | return id_names 83 | 84 | 85 | if __name__ == '__main__': 86 | ensemble_id_name_file = '../skeletal_data/mouse.vocab' 87 | # trans = ID2NameTranslator(ensemble_id_name_file, sep=',') 88 | # 
assert trans.id2name('ENSMUSG00000064372') == 'mt-Tp' 89 | # assert trans.id2name('ENSMUSG00000106796') == 'AC124394.4' 90 | # edge_list_file_addresses = ['../Skeletal_Cells/anchored_chicken_imm_0.csv', 91 | # '../Skeletal_Cells/anchored_chicken_ost_0.csv', 92 | # '../Skeletal_Cells/anchored_gar_imm_0.csv', 93 | # '../Skeletal_Cells/anchored_gar_ost_0.csv', 94 | # '../Skeletal_Cells/anchored_frog_imm_0.csv', 95 | # '../Skeletal_Cells/anchored_frog_ost_0.csv', 96 | # '../Skeletal_Cells/anchored_mouse_imm_0.csv', 97 | # '../Skeletal_Cells/anchored_mouse_ost_0.csv'] 98 | # output_file_address = '../skeletal_data/translated_chicken_imm.csv' 99 | # convertor = IDCovertor(edge_list_file_addresses, sep=',') 100 | # for edge_list_file_address in edge_list_file_addresses: 101 | # dir_path, file_name = os.path.split(edge_list_file_address) 102 | # output_file_address = os.path.join(dir_path, f'translated_{file_name}') 103 | # convertor.translate(edge_list_file_address, output_file_address, sep=',') 104 | convertor_file = '../skeletal_data/IDConvertor.json' 105 | # convertor.save(convertor_file) 106 | # con = IDCovertor.load(convertor_file) 107 | # assert con.id2int == convertor.id2int 108 | # assert con.int2id == convertor.int2id 109 | # assert con.ids == convertor.ids 110 | # # Test vocab2id_and_name 111 | v2id_name = vocab2id_and_name(['19210', '19211'], convertor_file, ensemble_id_name_file, default_name='', sep=',') 112 | assert v2id_name['19210'][0] == 'ENSMUSG00000114019' 113 | assert v2id_name['19211'][0] == 'ENSMUSG00000114025' 114 | print(v2id_name) 115 | -------------------------------------------------------------------------------- /dangle.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from similarity import Similarity 4 | import numpy as np 5 | import pandas as pd 6 | import random 7 | import os 8 | import json 9 | from sklearn.model_selection import train_test_split 10 | 11 | 12 | def build_backbone(anchors, alphas, weight, edge_percentage): 13 | dangling = {} 14 | for anchor, alpha in zip(anchors,alphas): 15 | pseudo_anchor = 'pseudo_{}'.format(anchor) 16 | dangles = dangling_structure(pseudo_anchor, 17 | alpha, 18 | weight, 19 | edge_percentage) 20 | dangles[(anchor, pseudo_anchor)] = weight 21 | dangling.update(dangles) 22 | return dangling 23 | 24 | 25 | def dangling_structure(gene, alpha, weight, edge_percentage): 26 | num_dangles = alpha 27 | dangles = ['{}_{:0>3d}'.format(gene, i) for i in range(alpha)] 28 | sim = {} 29 | potential_edges = [] 30 | for gene_i in dangles: 31 | for gene_j in dangles: 32 | if gene_i == gene_j: 33 | break 34 | else: 35 | potential_edges.append((gene_i, gene_j)) 36 | random.shuffle(potential_edges) 37 | connected_genes = set() 38 | for gene_i, gene_j in potential_edges: 39 | if {gene_i, gene_j} < connected_genes: 40 | continue 41 | elif len(connected_genes) < num_dangles: 42 | connected_genes.add(gene_i) 43 | connected_genes.add(gene_j) 44 | sim[(gene_i, gene_j)] = weight 45 | 46 | sim[(gene, dangles[0])] = weight 47 | for gene_i, gene_j in potential_edges: 48 | if random.random() < edge_percentage: 49 | sim[(gene_i, gene_j)] = weight 50 | 51 | return sim 52 | 53 | 54 | def main(experiment_name, phenotypes, data_directory, anchor_genes, 55 | num_replicates=1, percent=0.4, num_anchors=50, min_dangle_size=3, 56 | max_dangle_size=10, test_ratio=0.5): 57 | assert isinstance(phenotypes, list) 58 | alphas = random.choices(range(min_dangle_size, max_dangle_size), 59 | k=int(num_anchors * 
test_ratio)) 60 | assert len(alphas) < len(anchor_genes) 61 | anchor_train_groups = [] 62 | anchor_test_groups = [] 63 | backbones = [] 64 | # Create all backbones 65 | for rep_id in range(num_replicates): 66 | random.shuffle(anchor_genes) 67 | candidates = anchor_genes[:int(num_anchors)] 68 | genes_of_interest_train, genes_of_interest_test = train_test_split( 69 | candidates, 70 | shuffle=True, 71 | test_size=test_ratio) 72 | 73 | anchor_train_groups.append(genes_of_interest_train) 74 | anchor_test_groups.append(genes_of_interest_test) 75 | backbones.append( 76 | build_backbone(anchors=anchor_train_groups[rep_id], alphas=alphas, 77 | weight=1, edge_percentage=percent)) 78 | # Write train anchors to file 79 | with open(os.path.join(experiment_name, 'train_anchors.csv'), 'w') as fout: 80 | for gene_group in anchor_train_groups: 81 | fout.write(','.join(gene_group)) 82 | fout.write("\n") 83 | # Write test anchors to file 84 | with open(os.path.join(experiment_name, 'test_anchors.csv'), 'w') as fout: 85 | for gene_group in anchor_test_groups: 86 | fout.write(','.join(gene_group)) 87 | fout.write("\n") 88 | # Adding the backbones and create the similarity object 89 | for pheno in phenotypes: 90 | file_name = os.path.join(data_directory, "{}.csv".format(pheno)) 91 | for rep_id in range(num_replicates): 92 | sim_file_name = "anchored_{}_{}.csv".format(pheno, str(rep_id)) 93 | out_address = os.path.join(experiment_name, sim_file_name) 94 | similarity = Similarity(file_name, 95 | anchors=anchor_train_groups[rep_id], 96 | alphas=alphas, string_id=True) 97 | similarity.transform() 98 | similarity.apply_threshold(lower_cor=0.2, upper_cor=0.8, 99 | value=0) 100 | similarity.augment(backbones[rep_id]) 101 | 102 | similarity.to_csv(out_address) 103 | 104 | 105 | 106 | if __name__ == '__main__': 107 | parser = argparse.ArgumentParser(description='Generate dangling structures') 108 | parser.add_argument('-c', '--config', metavar='JSON file path', 109 | action='store', required=True, 110 | help='Path to a config file') 111 | args = parser.parse_args() 112 | config_file_address = args.config 113 | with open(config_file_address) as fin: 114 | params = json.load(fin) 115 | homeostasis_genes = pd.read_csv(params['anchor_file_address'], 116 | dtype=str).iloc[:,0].values 117 | main(experiment_name=params['experiment_name'], 118 | phenotypes=params['phenotypes'], 119 | data_directory=params['data_directory'], 120 | anchor_genes=homeostasis_genes, 121 | num_replicates=params['n_replicates'], 122 | percent=params['percentage'], 123 | num_anchors=params['n_anchors'], 124 | min_dangle_size=params['min_dangle_size'], 125 | max_dangle_size=params['max_dangle_size'], 126 | test_ratio=params['test_ratio']) 127 | 128 | -------------------------------------------------------------------------------- /test/test_temp.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from netwalk.temp import * 3 | from netwalk.utils import load 4 | 5 | 6 | class TestTemp(unittest.TestCase): 7 | def test_transform(self): 8 | edge_list = {("gene1", "gene2"): 0.5, 9 | ("gene2", "gene3"): -0.7, 10 | ("gene1", "gene3"): 0.0} 11 | transformed = transform(edge_list) 12 | transformed_dict = {k: val for k, val in transformed.items()} 13 | expected = {("gene1", "gene2"): 0.75, 14 | ("gene2", "gene3"): 0.15, 15 | ("gene1", "gene3"): 0.5} 16 | transformed_vals = list(transformed_dict.values()) 17 | expected_vals = list(expected.values()) 18 | for a, b in zip(transformed_vals, expected_vals): 19 | 
self.assertAlmostEqual(a, b, places=5) 20 | 21 | edge_list = load("data/fake_networks/network_2.csv", sep=",") 22 | transformed = transform(edge_list) 23 | expected = {("g1", "g2"): 0.9, 24 | ("g2", "g3"): 0.55, 25 | ("g2", "g1"): 0.9, 26 | ("g3", "g2"): 0.55, 27 | ("g2", "g4"): 0.75, 28 | ("g4", "g3"): 0.55, 29 | ("g3", "g1"): 0.75, 30 | ("g1", "g3"): 0.75, 31 | ("g1", "g4"): 0.10, 32 | ("g4", "g1"): 0.10, 33 | ("g4", "g2"): 0.75, 34 | ("g3", "g4"): 0.55} 35 | 36 | transformed_dict = {k: val for k, val in transformed.items()} 37 | transformed_vals = list(transformed_dict.values()) 38 | expected_vals = list(expected.values()) 39 | for a, b in zip(transformed_vals, expected_vals): 40 | self.assertAlmostEqual(a, b, places=5) 41 | 42 | def test_filter(self): 43 | edge_list = {("gene1", "gene2"): 0.5, 44 | ("gene2", "gene3"): -0.7, 45 | ("gene1", "gene3"): 0.0} 46 | transformed = transform(edge_list) 47 | transformed_dict = {k: val for k, val in transformed.items()} 48 | filtered = filter(transformed_dict, exclude=(0.3, 0.6)) 49 | filtered_dict = {k: val for k, val in filtered.items()} 50 | 51 | expected = {("gene1", "gene2"): 0.75, 52 | ("gene2", "gene3"): 0.15} 53 | 54 | filtered_vals = list(filtered_dict.values()) 55 | expected_vals = list(expected.values()) 56 | for a, b in zip(filtered_vals, expected_vals): 57 | self.assertAlmostEqual(a, b, places=5) 58 | 59 | edge_list = {("gene1", "gene2"): 0.5, 60 | ("gene2", "gene3"): -0.7, 61 | ("gene1", "gene3"): 0.0} 62 | 63 | filtered = filter(edge_list, exclude=(0, 0.7)) 64 | filtered_dict = {k: val for k, val in filtered.items()} 65 | 66 | expected = {("gene1", "gene2"): -0.7, 67 | ("gene2", "gene3"): 0.0} 68 | 69 | filtered_vals = list(filtered_dict.values()) 70 | expected_vals = list(expected.values()) 71 | for a, b in zip(filtered_vals, expected_vals): 72 | self.assertAlmostEqual(a, b, places=5) 73 | 74 | def test_overlay_networks(self): 75 | original_edge_list_1 = Similarity({("gene1", "gene2"): 0.5, 76 | ("gene2", "gene3"): 0.7, 77 | ("gene1", "gene3"): 0.6, 78 | ("gene1", "gene4"): 0.3, 79 | ("gene2", "gene4"): 0.1, 80 | ("gene3", "gene4"): 0.0, 81 | ("gene1", "gene3"): 0.0}) 82 | 83 | original_edge_list_2 = Similarity({("gene1", "gene2"): 0.5, 84 | ("gene2", "gene3"): 0.7, 85 | ("gene1", "gene3"): 0.0, 86 | ("gene1", "gene4"): 0.0, 87 | ("gene2", "gene4"): 0.3, 88 | ("gene4", "gene3"): 0.7, 89 | ("gene1", "gene3"): 0.2}) 90 | 91 | edge_list_1 = Similarity({("gene1", "gene2"): 0.5, 92 | ("gene2", "gene3"): 0.7, 93 | ("gene1", "gene3"): 0.6, 94 | ("gene1", "gene4"): 0.3}) 95 | 96 | edge_list_2 = Similarity({("gene1", "gene2"): 0.5, 97 | ("gene2", "gene3"): 0.7, 98 | ("gene2", "gene4"): 0.3}) 99 | 100 | net_1, net_2 = overlay_networks(net_a_similarity=edge_list_1, net_b_similarity=edge_list_2, 101 | original_net_a=original_edge_list_1, original_net_b=original_edge_list_2) 102 | 103 | assert set(net_1.symmetric_key_set()) == set(net_2.symmetric_key_set()) 104 | net_1_dict = {k: val for k, val in net_1.items()} 105 | net_2_dict = {k: val for k, val in net_2.items()} 106 | assert set(net_1_dict.values()) == {0.5, 0.7, 0.6, 0.3, 0.1} 107 | assert set(net_2_dict.values()) == {0.5, 0.7, 0.3, 0.2, 0.0} 108 | 109 | def test_create_spine(self): 110 | net1 = Similarity.load("../data/fake_networks/network_1.csv", sep=",") 111 | net2 = Similarity.load("../data/fake_networks/network_2.csv", sep=",") 112 | 113 | expected_spine = ["g1", "g4"] 114 | expected_pseudo_spine = ["pseudo_g1", "pseudo_g4"] 115 | expected_similarity = {("pseudo_g1", "pseudo_g10"): 
0.5, 116 | ("pseudo_g1", "pseudo_g11"): 0.5, 117 | ("pseudo_g1", "pseudo_g4"): -0.8, 118 | ("pseudo_g4", "pseudo_g40"): 0.5, 119 | ("pseudo_g4", "pseudo_g41"): 0.5} 120 | 121 | spine, pseudo_spine, backbone = create_spine(spine=["g1", "g4"], net_a_tsfmd_similarity=net1, 122 | net_b_tsfmd_similarity=net2, 123 | prefix='pseudo_', alpha=2, weight=0.5) 124 | assert set(spine) == set(expected_spine) 125 | assert set(pseudo_spine) == set(expected_pseudo_spine) 126 | 127 | self.assertEqual(len(backbone), len(expected_similarity)) 128 | 129 | for key, val in backbone.items(): 130 | assert key in expected_similarity.keys() 131 | assert expected_similarity[key] == val 132 | 133 | expected_spine = ["g1", "g4", "g3"] 134 | expected_pseudo_spine = ["pseudo_g1", "pseudo_g4", "pseudo_g3"] 135 | expected_similarity = {("pseudo_g1", "pseudo_g3"): 0.5, 136 | ("pseudo_g1", "pseudo_g10"): 0.5, 137 | ("pseudo_g1", "pseudo_g11"): 0.5, 138 | ("pseudo_g4", "pseudo_g1"): -0.8, 139 | ("pseudo_g3", "pseudo_g30"): 0.5, 140 | ("pseudo_g3", "pseudo_g31"): 0.5, 141 | ("pseudo_g4", "pseudo_g40"): 0.5, 142 | ("pseudo_g4", "pseudo_g41"): 0.5} 143 | 144 | spine, pseudo_spine, backbone = create_spine(spine=["g1", "g4", "g3"], net_a_tsfmd_similarity=net1, 145 | net_b_tsfmd_similarity=net2, 146 | prefix='pseudo_', alpha=2, weight=0.5) 147 | 148 | assert set(spine) == set(expected_spine) 149 | assert set(pseudo_spine) == set(expected_pseudo_spine) 150 | 151 | self.assertEqual(len(backbone), len(expected_similarity)) 152 | 153 | for key, val in backbone.items(): 154 | assert key in expected_similarity.keys() 155 | assert expected_similarity[key] == val 156 | 157 | def test_add_anchor(self): 158 | net1 = Similarity.load("../data/fake_networks/network_1.csv", sep=",") 159 | net2 = Similarity.load("../data/fake_networks/network_2.csv", sep=",") 160 | 161 | spine_similarity_1 = {("pseudo_g1", "pseudo_g3"): 0.5, 162 | ("pseudo_g1", "pseudo_g10"): 0.5, 163 | ("pseudo_g1", "pseudo_g11"): 0.5, 164 | ("pseudo_g4", "pseudo_g1"): -0.8, 165 | ("pseudo_g3", "pseudo_g30"): 0.5, 166 | ("pseudo_g3", "pseudo_g31"): 0.5, 167 | ("pseudo_g4", "pseudo_g40"): 0.5, 168 | ("pseudo_g4", "pseudo_g41"): 0.5} 169 | 170 | spine_similarity_2 = {("pseudo_g1", "pseudo_g10"): 0.5, 171 | ("pseudo_g1", "pseudo_g11"): 0.5, 172 | ("pseudo_g4", "pseudo_g1"): -0.8, 173 | ("pseudo_g4", "pseudo_g40"): 0.5, 174 | ("pseudo_g4", "pseudo_g41"): 0.5} 175 | 176 | anchored_net1 = add_anchor(net1, pseudo_similarity=backbone, spine_genes=["g1", "g4", "g3"], 177 | pseudo_spine_genes=["pseudo_g1", "pseudo_g4", "pseudo_g3"], 178 | weight=0.5) 179 | 180 | anchored_net2 = add_anchor(net2, pseudo_similarity=backbone, spine_genes=["g1", "g4"], 181 | pseudo_spine_genes=["pseudo_g1", "pseudo_g4"], 182 | weight=0.7) 183 | 184 | for key, val in anchored_net1.items(): 185 | assert key in spine_similarity_1.keys() 186 | for key, val in anchored_net2.items(): 187 | assert key in spine_similarity_2.keys() 188 | 189 | result_keys = [anchored_net1[x] for x in anchored_net1.keys()] 190 | self.assertListEqual(result_keys, net1.update(spine_similarity_1)) 191 | 192 | result_keys = [anchored_net2[x] for x in anchored_net2.keys()] 193 | self.assertListEqual(result_keys, net2.update(spine_similarity_2)) 194 | -------------------------------------------------------------------------------- /dimensionality_reduction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import json 4 | import argparse 5 | import random 6 | from 
sklearn.decomposition import PCA 7 | from sklearn.manifold import TSNE 8 | from matplotlib import pyplot 9 | from gensim.models import Word2Vec 10 | import numpy as np 11 | import pandas as pd 12 | import seaborn as sns 13 | from netwalk.translator import ID2NameTranslator, IDCovertor 14 | from scipy.spatial.distance import cdist 15 | from itertools import combinations 16 | 17 | def pca_visualization(model, out_file_name): 18 | x = model[model.wv.vocab] 19 | pca = PCA(n_components=2) 20 | result = pca.fit_transform(x) 21 | # create a scatter plot of the projection 22 | pyplot.scatter(result[:, 0], result[:, 1]) 23 | #pyplot.xlim(-25, 25) 24 | #pyplot.ylim(-25, 25) 25 | words = list(model.wv.vocab) 26 | #for i, word in enumerate(words): 27 | #pyplot.annotate(word, xy=(result[i, 0], result[i, 1])) 28 | pyplot.savefig(out_file_name) 29 | 30 | 31 | def tsne_plot(model, out_file_name, perplexity=30, components=2, init='pca', 32 | num_iter=500, rand_state=0): 33 | labels = [] 34 | tokens = [] 35 | 36 | for word in model.wv.vocab: 37 | tokens.append(model[word]) 38 | labels.append(word) 39 | 40 | tsne_model = TSNE(perplexity=perplexity, n_components=components, 41 | init=init, n_iter=num_iter, random_state=rand_state) 42 | new_values = tsne_model.fit_transform(tokens) 43 | 44 | x = [] 45 | y = [] 46 | for value in new_values: 47 | x.append(value[0]) 48 | y.append(value[1]) 49 | print(labels) 50 | print(labels["pseudo" in labels]) 51 | c=["royalblue" if "pseudo" in x else "orangered" for x in labels] 52 | pyplot.figure(figsize=(16, 16)) 53 | for i in range(len(x)): 54 | pyplot.scatter(x[i], y[i], color=c[i], s=30) 55 | 56 | pyplot.savefig(out_file_name) 57 | 58 | 59 | def tsne_visualize(model, gene, list_names, vocab_length, num_components, out_file_name, converter, translate): 60 | """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query gene, 61 | its list of most similar genes, and a list of other genes. 
62 | vv """ 63 | arrays = np.empty((0, vocab_length), dtype='f') 64 | gene_labels = [gene] 65 | color_list = ['red'] 66 | 67 | # adds the vector of the query gene 68 | arrays = np.append(arrays, model.wv.__getitem__([gene]), axis=0) 69 | 70 | # gets list of most similar genes 71 | close_genes = model.wv.most_similar([gene]) 72 | 73 | # adds the vector for each of the closest genes to the array 74 | for gne_score in close_genes: 75 | gne_vector = model.wv.__getitem__([gne_score[0]]) 76 | gene_labels.append(gne_score[0]) 77 | color_list.append('blue') 78 | arrays = np.append(arrays, gne_vector, axis=0) 79 | 80 | # adds the vector for each of the genes from list_names to the array 81 | for gne in list_names: 82 | gne_vector = model.wv.__getitem__([gne]) 83 | gene_labels.append(gne) 84 | color_list.append('green') 85 | arrays = np.append(arrays, gne_vector, axis=0) 86 | 87 | # Reduces the dimensionality from 300 to 50 dimensions with PCA 88 | reduc = PCA(n_components=num_components).fit_transform(arrays) 89 | # Finds t-SNE coordinates for 2 dimensions 90 | np.set_printoptions(suppress=True) 91 | Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc) 92 | 93 | gene_labels = ints_to_names(gene_labels, translate, converter) 94 | 95 | # Sets everything up to plot 96 | df = pd.DataFrame({'x': [x for x in Y[:, 0]], 97 | 'y': [y for y in Y[:, 1]], 98 | 'genes': gene_labels, 99 | 'color': color_list}) 100 | 101 | fig, _ = pyplot.subplots() 102 | fig.set_size_inches(9, 9) 103 | 104 | # Basic plot 105 | sns.set_style("ticks") 106 | p1 = sns.regplot(data=df, 107 | x="x", 108 | y="y", 109 | fit_reg=False, 110 | marker="o", 111 | scatter_kws={'s': 40, 112 | 'facecolors': df['color'] 113 | } 114 | ) 115 | 116 | # Adds annotations one by one with a loop 117 | for line in range(0, df.shape[0]): 118 | p1.text(df["x"][line], 119 | df['y'][line], 120 | ' ' + df["genes"][line].title(), 121 | horizontalalignment='left', 122 | verticalalignment='bottom', size='medium', 123 | color=df['color'][line], 124 | weight='normal' 125 | ).set_size(15) 126 | 127 | pyplot.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50) 128 | pyplot.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50) 129 | 130 | pyplot.title('t-SNE visualization for {}'.format(gene.title())) 131 | pyplot.savefig(out_file_name) 132 | 133 | def names_to_ints(names, trans, convertor): 134 | ids, names = trans.names2ids(names) 135 | ints = [str(i) for i in convertor.ids2ints(ids)] 136 | return ints, ids, names 137 | 138 | def ints_to_names(ints, trans, convertor): 139 | #int to id 140 | ids = [i for i in convertor.ints2ids(ints)] 141 | #id to name 142 | names = [trans.id2name(x) for x in ids] 143 | return names 144 | 145 | if __name__ == '__main__': 146 | parser = argparse.ArgumentParser(description='Generate datasets') 147 | parser.add_argument('-c', '--config', metavar='JSON file path', 148 | action='store', required=True, 149 | help='Path to a config file') 150 | args = parser.parse_args() 151 | # read training config 152 | with open(args.config) as fin: 153 | params = json.load(fin) 154 | 155 | ensemble_id_name_file = params['vocab'] 156 | convertor_file = os.path.join(params['experiment_name'],'IDConvertor.json') 157 | trans = ID2NameTranslator(ensemble_id_name_file, sep=',') 158 | convertor = IDCovertor.load(convertor_file) 159 | 160 | for rep_id in range(params['n_replicates']): 161 | for pheno in params['phenotypes']: 162 | path = os.path.join(params['experiment_name'], 163 | '{}_{}.model'.format(pheno, rep_id)) 164 | viz_path = 
os.path.join(params['experiment_name'], 165 | '{}_{}_tsne.pdf'.format(pheno, 166 | rep_id)) 167 | pca_path = os.path.join(params['experiment_name'], 168 | '{}_{}_pca.pdf'.format(pheno, 169 | rep_id)) 170 | # load model 171 | model = Word2Vec.load(path) 172 | genes = list(model.wv.vocab) 173 | # make tsne 174 | #tsne_plot(model, out_file_name=viz_path) 175 | #pca_visualization(model, out_file_name=pca_path) 176 | names = params['select_genes'] 177 | ints, ids, names = names_to_ints(names, trans, convertor) 178 | for ind, g, name in zip(ints, ids, names): 179 | sim_path = os.path.join(params['experiment_name'], 180 | '{}_{}_most_similar_to_{}.pdf'.format(pheno, 181 | rep_id, name)) 182 | rand_path = os.path.join(params['experiment_name'], 183 | '{}_{}_random_compared_to_{}.pdf'.format(pheno, 184 | rep_id, g)) 185 | # make a visualization of select genes and their most similar genes 186 | if ind in model.wv.vocab: 187 | negative_ints = [i[0] for i in model.wv.most_similar(negative=[ind])] 188 | tsne_visualize(model, ind, list_names=negative_ints, vocab_length=params['embd_dim'], num_components=2, out_file_name=sim_path, translate=trans,converter=convertor) 189 | #sampled_ints = random.choices(genes, k=20) 190 | #tsne_visualize(model, ind, list_names=sampled_ints, vocab_length=params['embd_dim'], num_components=2, out_file_name=rand_path, translate=trans,converter=convertor) 191 | for pheno_1, pheno_2 in combinations(params['phenotypes'], 2): 192 | path_1 = os.path.join(params['experiment_name'], 193 | '{}_{}.model'.format(pheno_1, rep_id)) 194 | path_2 = os.path.join(params['experiment_name'], 195 | '{}_{}.model'.format(pheno_2, rep_id)) 196 | model_1 = Word2Vec.load(path_1) 197 | model_2 = Word2Vec.load(path_2) 198 | ints, ids, names = names_to_ints(names, trans, convertor) 199 | targets_in_model1 = [ (gene_i, name_i) for gene_i, name_i in zip(ints, names) if gene_i in model_1.wv.vocab] 200 | targets_in_model2 = [ (gene_i, name_i) for gene_i, name_i in zip(ints, names) if gene_i in model_2.wv.vocab] 201 | x1 = np.array([model_1.wv[gene_i] for gene_i, _ in targets_in_model1]) 202 | x2 = np.array([model_2.wv[gene_i] for gene_i, _ in targets_in_model2]) 203 | # Calculate the distance between elements of x1 and x2 as a distance matrix 204 | dist = cdist(x1, x2, metric='cosine')/2 205 | df = pd.DataFrame(dist) 206 | 207 | df.columns = [name_i for _, name_i in targets_in_model2] 208 | df.index = [name_i for _, name_i in targets_in_model1] 209 | matrix_path = os.path.join(params['experiment_name'], 210 | '{}_{}_{}_matrix.pdf'.format(pheno_1, pheno_2, rep_id)) 211 | pyplot.figure() 212 | print(df.index, df.columns) 213 | print(df.shape) 214 | ax = sns.heatmap(df, square=True, vmin=0, vmax=1) 215 | #ax.tick_params(left=False, bottom=False) 216 | ax.set_yticklabels(list(df.index)) 217 | ax.set_xticklabels(list(df.columns)) 218 | figure = ax.get_figure() 219 | figure.savefig(matrix_path) 220 | pyplot.close() 221 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Juxtapose 2 | 3 | 4 |
5 | **Table of Contents**
6 |
7 | 1. About The Project
8 | 2. Getting Started
9 |    - Prerequisites
10 |    - Installation
11 |    - Adding more tests
12 | 3. Usage
13 |    - Small network embedding
14 |    - Large networks
15 |    - Translating IDs to integers
16 | 4. Contributing
17 | 5. Versioning
18 | 6. Contact
19 |
33 | ## About The Project
34 |
37 | 38 | Welcome to Juxtapose, a Python tool that can be used to compare gene co-expression networks (GCNs). Juxtapose, together with different similarity measures, can be utilized for comparative transcriptomics between a set of organisms. While we focus on its application to comparing co-expression networks across species in evolutionary studies, Juxtapose is also generalizable to co-expression network comparisons across tissues or conditions within the same species. A word embedding strategy commonly used in natural language processing is used to generate gene embeddings based on walks made through the GCNs. 39 | 40 | You may also suggest changes by forking this repo and creating a pull request or opening an issue. 41 | 42 | 43 | ## Getting Started 44 | 45 | The following steps will guide you through the process of running Juxtapose on your local machine or an [AWS spot instance](https://aws.amazon.com/ec2/spot/?cards.sort-by=item.additionalFields.startDateTime&cards.sort-order=asc). 46 | 47 | ### Prerequisites 48 | 49 | The main dependencies of Juxtapose are gensim, scikit-learn, numpy, pandas, and scipy. See requirements.txt for the complete list of requirements. 50 | 51 | ### Installation 52 | 53 | It is good practice to use a virtual environment for deploying Python programs. Using conda, we can create an environment named juxtapose. The environment name is arbitrary. 54 | ```sh 55 | conda create -n juxtapose python=3.6 56 | ``` 57 | 58 | After downloading the Juxtapose repository, the following command can be run to install the requirements. 59 | ```sh 60 | make setup 61 | ``` 62 | ### Adding more tests 63 | 64 | New tests should be added under the test directory as modules whose names start with test_. 65 | 66 | 67 | ## Usage 68 | 69 | To run Juxtapose, two JSON files are required that contain the desired parameters for (1) creating an anchored network using a set of genes and making walks through this network, and (2) running an embedding method to obtain pairwise local distances between genes as well as a global similarity between networks, and producing results, including visualizations from biclustering the local pairwise distances. 70 | 71 | ### Small network embedding 72 | Let us take an example of embedding a simple line network. 73 | 74 |
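Each network is given to Juxtapose as a weighted edge list in CSV form, as described below. If you would like to generate a toy network of your own in that format, the short sketch that follows shows one way to do it; it is only an illustration, and the output file name my_line.csv is hypothetical. The pre-made line_1.csv and line_2.csv used in this example can be taken as-is.

```python
# A minimal sketch (hypothetical output file name) that writes a line network in the
# three-column edge-list format used by line_1.csv: source,target,weight, with each
# edge listed in both directions.
import csv

n_nodes = 11  # nodes 0..10, as in the line_1.csv example shown below

with open("my_line.csv", "w", newline="") as handle:
    writer = csv.writer(handle)
    for i in range(n_nodes - 1):
        writer.writerow([i, i + 1, 1])  # edge i -> i+1 with weight 1
        writer.writerow([i + 1, i, 1])  # and the reverse direction
```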
77 | 78 | We require a csv file that contains the edge list representation of the network. In our case, we have line_1.csv and line_2.csv. 79 | 80 | Content of line_1.csv and line_2.csv: 81 | ```sh 82 | 0,1,1 83 | 1,0,1 84 | 1,2,1 85 | 2,1,1 86 | 2,3,1 87 | 3,2,1 88 | 3,4,1 89 | 4,3,1 90 | 4,5,1 91 | 5,4,1 92 | 5,6,1 93 | 6,5,1 94 | 6,7,1 95 | 7,6,1 96 | 7,8,1 97 | 8,7,1 98 | 8,9,1 99 | 9,8,1 100 | 9,10,1 101 | 10,9,1 102 | ``` 103 | We have the config/JSON files stored in the test/data folder. 104 | The example contents of line-config.json used for adding anchors to the networks: 105 | ```sh 106 | { 107 | "n_replicates": 10, 108 | "percentage": 0.4, 109 | "n_anchors": 6, 110 | "anchor_test_ratio": 0.5, 111 | "min_dangle_size": 3, 112 | "max_dangle_size": 10, 113 | "anchor_file_address": "test/data/line_anchors.csv", 114 | "phenotypes": ["1", "2"], 115 | "experiment_name": "Line", 116 | "data_directory": "test/data" 117 | } 118 | ``` 119 | 120 | The example contents of Line-train-config.json used for model training and visualizations: 121 | ```sh 122 | { 123 | "experiment_name": "Line", 124 | "phenotypes": ["1", "2"], 125 | "walk_per_node": 1000, 126 | "walk_length": 50, 127 | "n_iter": 1, 128 | "n_workers": 20, 129 | "embd_dim": 10, 130 | "window": 2, 131 | "min_count": 2, 132 | "negatives": 5, 133 | "alpha": 0.01, 134 | "n_replicates": 1, 135 | "min_alpha": 0.001 136 | } 137 | ``` 138 | To run the anchoring step, we also require the genes/nodes of the network that will be used as the anchor points in the networks that are going to be compared. Because the networks will be compared, these synthetic structures attached to the real networks should be the same in both. We have provided line_anchors.csv for this example, but this list can be tailored or limited to any set of nodes a user would like to select as potential anchors. The name of the file used needs to be set in the config file using the "anchor_file_address" parameter. 139 | 140 | To add anchor nodes, run the following command. 141 | ```sh 142 | python3 dangle.py --config test/data/line-config.json 143 | ``` 144 | 145 | To generate the intermediate walk files, if necessary, run the following command. 146 | ```sh 147 | python3 dataset_generator.py --config test/data/Line-train-config.json 148 | ``` 149 | 150 | Next, runner.py will train the models for each network, calculate the local and global similarity measures between genes, and bicluster the local similarity results. If a full co-expression network is used and it is not possible to generate the complete matrix, there is also an option to select only a percentage of each bicluster in order to make a representative visualization. 151 | ```sh 152 | python3 runner.py --config test/data/Line-train-config.json 153 | ``` 154 | It should also be noted that if the models for the networks have already been trained and only the similarity measures and biclustering need to be run, the "--no-train" option can be specified as below. 155 | ```sh 156 | python3 runner.py --config test/data/Line-train-config.json --no-train 157 | ``` 158 | We have provided other datasets (circle, cross, heart, and brain) of various sizes and complexity/density that can be used for further testing. All can be found in the test data folder. 159 | 160 | ### Large networks 161 | It will not always be possible to compare larger networks on many machines due to the large memory requirements as the number of edges in the networks increases.
As such, we recommend an AWS spot instance for more affordable resources if no other resources are available to you. In order to set up an instance that will work for a larger network, e.g. 10,000+ genes, one option would be to go to the [EC2 Dashboard](https://aws.amazon.com/ec2/getting-started/) and make a spot request. 162 | 163 | Make a spot request. 164 |
167 | 168 | Selecting an AMI. We recommend using Ubuntu Server 18.04 LTS (HVM), SSD Volume Type. 169 | 170 | Selecting an instance type. 171 |
174 | 175 | This request will also require a [key pair](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair) and [enabling inbound SSH traffic from your IP address to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/authorizing-access-to-an-instance.html). If these have not been set up already, there is the option to create a new key pair and security group below where they are requested in the spot instance template. The remaining parts of the spot request are optional and can be changed according to your needs. 176 | 177 |
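If you prefer to script this part instead of using the console, the key pair can also be created programmatically. The sketch below is only an illustration and assumes boto3 is installed and AWS credentials are configured; the key name and region are placeholders, not something Juxtapose requires.

```python
# Minimal sketch: create an EC2 key pair and save the private key so it can be used
# later with "ssh -i keypair.pem ...". Assumes boto3 and configured AWS credentials;
# the key name and region are illustrative.
import os
import boto3

ec2 = boto3.client("ec2", region_name="us-east-1")
response = ec2.create_key_pair(KeyName="juxtapose-key")

with open("keypair.pem", "w") as handle:
    handle.write(response["KeyMaterial"])

os.chmod("keypair.pem", 0o400)  # ssh refuses private keys that are world-readable
```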
181 | 182 | Once the instance is created, use ssh to connect to the instance. A generic example is provided below. 183 | ```sh 184 | ssh -i "keypair.pem" ubuntu@ec2-52-23-241-60.compute-1.amazonaws.com 185 | ``` 186 | The actual command to ssh to the instance can be obtained using the Amazon EC2 console. Go to Instances and click the Connect button, which will provide the required command. 187 | 188 |
191 | 192 | Then the following will need to be run to set up Python on the instance. 193 | ```sh 194 | sudo apt update 195 | sudo apt install python3-pip 196 | python3 -m pip install --user numpy scipy matplotlib 197 | pip3 install --upgrade gensim 198 | pip3 install seaborn 199 | pip3 install -U scikit-learn 200 | pip3 install torch torchvision 201 | ``` 202 | A [volume](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-creating-volume.html) is also required to store data and results. The volume will then need to be attached to the instance. 203 |
207 | 208 | Once attached, the volume created to store the data as well as Juxtapose can be mounted as follows. 209 | ```sh 210 | lsblk # the volume we made is called xvdf 211 | mkdir experiment # make a directory 212 | chmod -R 744 experiment/ # change its permissions 213 | sudo mount /dev/xvdf experiment/ # mount the volume to the new directory 214 | cd experiment/ # go to the directory and start working 215 | ``` 216 | After the volume is attached to the spot instance, the code can be downloaded into the folder where the volume is mounted, and Juxtapose can be run as was done above with the line network example. 217 | 218 | ### Translating IDs to integers 219 | There are also options to translate node IDs to integers if they are not already integers in the original networks. Converting the names to integers can save a lot of memory and result in a quicker analysis. The following command can be used to convert names to integers after the anchoring procedure has been completed and all nodes that will be part of the networks are included in the anchored network files, i.e. experiment_anchored_*.csv. 220 | 221 | ```sh 222 | python3 test_translator.py --config test/data/line-config.json 223 | ``` 224 | 225 | This will produce files named like the original anchored networks, prefixed with translated_ (translated_anchored_*.csv). Also, a JSON file, IDConvertor.json, will be produced to save the translation between the integers and the original IDs. This allows for easy translation back to the original names or IDs for downstream analyses. 226 | 227 | Other means of visualization that have not been covered here are included in dimensionality_reduction.py, including t-SNE and PCA visualizations of the embedded datasets. 228 | A tutorial on how to visualize specific genes, as well as the genes closest to or furthest from them in the embedding space, will be added in the future. 229 | 230 | ## Versioning 231 | 232 | We use [Semantic Versioning 2.0.0](http://semver.org/) for versioning.
233 | 234 | 235 | 236 | ## Contact 237 | 238 | **Katie Ovens** - katie.ovens@usask.ca 239 | Project Link: [https://github.com/klovens/juxtapose](https://github.com/klovens/juxtapose) 240 | -------------------------------------------------------------------------------- /runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import json 4 | import argparse 5 | import copy 6 | import warnings 7 | import random 8 | from scipy import stats 9 | from itertools import combinations 10 | from sklearn import linear_model 11 | from scipy.stats.mstats import gmean 12 | from network_stats import alignment_permutation_test 13 | import numpy as np 14 | import pandas as pd 15 | import seaborn as sns 16 | import matplotlib.pyplot as plt 17 | from sklearn.cluster import SpectralBiclustering 18 | from dimensionality_reduction import * 19 | from gensim.models import Word2Vec 20 | from scipy.optimize import linear_sum_assignment 21 | from sklearn.metrics.pairwise import cosine_similarity 22 | from netwalk.translator import IDCovertor 23 | import logging 24 | from collections import Counter 25 | from multiprocessing import Pool 26 | from scipy.spatial.distance import cdist 27 | logging.basicConfig(level=logging.DEBUG, filemode='w', filename='Experiment.log') 28 | 29 | 30 | warnings.filterwarnings("ignore", category=DeprecationWarning) 31 | 32 | 33 | def cdist2(x1, x2, metric): 34 | if metric == 'angular': 35 | d = 1 - cdist(x1, x2, metric='cosine') 36 | return np.clip(a=d, a_min=-1, a_max=1) / np.pi 37 | else: 38 | return cdist(x1, x2, metric) 39 | 40 | def linear_transform(path_1, path_2, mbd_dim): 41 | model_1 = Word2Vec.load(path_1) 42 | model_2 = Word2Vec.load(path_2) 43 | genes_1 = sorted(model_1.wv.vocab) 44 | vocab_1_size = len(genes_1) 45 | genes_2 = sorted(model_2.wv.vocab) 46 | vocab_2_size = len(genes_2) 47 | 48 | backbone = [g for g in genes_1 if "pseudo" in g] 49 | 50 | # transform with linear regression 51 | model = linear_model.LinearRegression() 52 | model.fit(model_1.wv[backbone], model_2.wv[backbone]) 53 | 54 | # transform network 1 model 55 | for g in genes_1: 56 | model_1.wv[g] = np.array(model.predict([model_1.wv[g]])) 57 | # transform network 2 model 58 | for g in genes_2: 59 | model_2.wv[g] = np.array(model.predict([model_2.wv[g]])) 60 | 61 | dist1 = np.zeros((vocab_1_size, vocab_2_size)) 62 | for i, gene_i in enumerate(genes_1): 63 | for j, gene_j in enumerate(genes_2): 64 | dist1[i, j] = cosine_similarity( 65 | model_1[gene_i].reshape(1, mbd_dim), 66 | model_2[gene_j].reshape(1, mbd_dim)) 67 | return dist1, genes_1, genes_2 68 | 69 | 70 | def angular_dist(sim_matrix, genes1, genes2): 71 | vocab_size1 = len(genes1) 72 | vocab_size2 = len(genes2) 73 | dist = np.zeros((vocab_size1, vocab_size2)) 74 | for i, gene_i in enumerate(genes1): 75 | for j, gene_j in enumerate(genes2): 76 | dist[i, j] = np.arccos(np.clip(a=sim_matrix[i, j], a_min=-1, a_max=1)) / np.pi 77 | return dist 78 | 79 | 80 | def match_dims(sim_matrix): 81 | numrows, numcols = sim_matrix.shape 82 | max_sim = np.amax(sim_matrix) + 1 83 | if numrows > numcols: 84 | slack = numrows - numcols 85 | new_cols = np.ones((numrows, slack)) * max_sim 86 | sim_matrix = np.concatenate((sim_matrix, new_cols), axis=1) 87 | elif numrows < numcols: 88 | slack = numcols - numrows 89 | new_rows = np.ones((slack, numcols)) * max_sim 90 | sim_matrix = np.concatenate((sim_matrix, new_rows), axis=0) 91 | 92 | return sim_matrix 93 | 94 | 95 | def compare_anchors(dist_matrix, 
genes_1, genes_2, train_anchors, convertor, substr='pseudo_'): 96 | gene_ids_1 = convertor.ints2ids(genes_1) 97 | gene_ids_2 = convertor.ints2ids(genes_2) 98 | sub = [substr+x for x in train_anchors] 99 | 100 | for prefix in sub: 101 | anchor_dist = [] 102 | dangle_1 = [s for s in gene_ids_1 if prefix in s] 103 | dangle_2 = [s for s in gene_ids_2 if prefix in s] 104 | # convert dangle ids to ints 105 | dangle_1 = convertor.ids2ints(dangle_1) 106 | dangle_2 = convertor.ids2ints(dangle_2) 107 | idx_1 = [] 108 | idx_2 = [] 109 | for i, item in enumerate(genes_1): 110 | for d in dangle_1: 111 | if int(item) == d: 112 | idx_1.append(i) 113 | for i, item in enumerate(genes_2): 114 | for d in dangle_2: 115 | if int(item) == d: 116 | idx_2.append(i) 117 | 118 | for i,j in zip(idx_1, idx_2): 119 | anchor_dist.append(dist_matrix[i, j]) 120 | 121 | pvalue = 0 122 | for i in range(0,1000): 123 | rand_dist = [] 124 | for i in range(0,len(dangle_1)): 125 | rand_int = random.randint(0, min(len(genes_1)-1, len(genes_2)-1)) 126 | rand_dist.append(dist_matrix[rand_int, rand_int]) 127 | pvalue = pvalue + (sum(anchor_dist) > sum(rand_dist)) 128 | # get the test statistic and p-value 129 | # statistic, pvalue = stats.ttest_ind(anchor_dist, rand_dist) 130 | print(pvalue/1000) 131 | 132 | return pvalue 133 | 134 | 135 | def biclustering(dist, genes_1, genes_2, x_label, y_label, out_file, experiment, id_convertor, n_clusters=3, precent_visualize=0.1): 136 | model = SpectralBiclustering(n_clusters=n_clusters, n_components=12, n_best=6, 137 | init='random', random_state=1) 138 | 139 | m, n = dist.shape 140 | assert m == len(genes_1) and n == len(genes_2) 141 | model.fit(dist) 142 | rows = [(idx, clust_id) for idx, clust_id in enumerate(model.row_labels_)] 143 | selected_rows = random.choices(rows, k=int(precent_visualize * len(rows))) 144 | selected_rows_name = [genes_1[idx] for idx, _ in selected_rows] 145 | selected_rows_clust_ids = [clust_id for _, clust_id in selected_rows] 146 | selected_rows_indices = [idx for idx, _ in selected_rows] 147 | # Slect columns 148 | cols = [(idx, clust_id) for idx, clust_id in enumerate(model.column_labels_)] 149 | selected_cols = random.choices(cols, k=int(precent_visualize * len(cols))) 150 | selected_cols_names = [genes_2[idx] for idx, _ in selected_cols] 151 | selected_cols_clust_ids = [clust_id for _, clust_id in selected_cols] 152 | selected_cols_indices = [idx for idx, _ in selected_cols] 153 | # Selected dist 154 | selected_dist = dist[selected_rows_indices] [:, selected_cols_indices] 155 | # Sort rows 156 | sorted_rows_indices = np.argsort(selected_rows_clust_ids) 157 | selected_dist = selected_dist[sorted_rows_indices, :] 158 | selected_row_names = [selected_rows_name[i] for i in sorted_rows_indices] 159 | #selected_row_names = selected_rows_name[sorted_rows_indices] 160 | # sort columns 161 | sorted_cols_indices = np.argsort(selected_cols_clust_ids) 162 | selected_dist = selected_dist[:, sorted_cols_indices] 163 | selected_cols_names = [selected_cols_names[i] for i in sorted_cols_indices] 164 | 165 | result = pd.DataFrame(selected_dist, columns=selected_cols_names, index=selected_rows_name) 166 | 167 | ax = sns.heatmap(result, cmap="Greens_r", square=True) 168 | plt.title("Biclustering Results") 169 | ax.set_yticklabels([]) 170 | ax.set_xticklabels([]) 171 | ax.tick_params(left=False, bottom=False) 172 | ax.set_ylabel('{} genes'.format(x_label)) 173 | ax.set_xlabel('{} genes'.format(y_label)) 174 | figure = ax.get_figure() 175 | figure.savefig(out_file) 176 | 
plt.close() 177 | 178 | for bic in range(n_clusters*n_clusters): 179 | #print(bic) 180 | r = list(model.rows_[bic]) 181 | rows = [i for (i, b) in zip(genes_1, r) if b] 182 | 183 | c = list(model.columns_[bic]) 184 | columns = [i for (i, b) in zip(genes_2, c) if b] 185 | 186 | rows = id_convertor.ints2ids([int(k) for k in rows]) 187 | columns = id_convertor.ints2ids([int(k) for k in columns]) 188 | 189 | cluster_path = os.path.join(experiment, f'{bic}_{x_label}_{y_label}_biclustering.csv') 190 | with open(cluster_path, 'w') as fout: 191 | fout.write(','.join(rows)) 192 | fout.write("\n") 193 | fout.write(','.join(columns)) 194 | 195 | def get_distance(path_1, path_2, mbd_dim): 196 | model_1 = Word2Vec.load(path_1) 197 | model_2 = Word2Vec.load(path_2) 198 | genes_1 = sorted(model_1.wv.vocab) 199 | vocab_1_size = len(genes_1) 200 | genes_2 = sorted(model_2.wv.vocab) 201 | vocab_2_size = len(genes_2) 202 | logging.info(f'Read {vocab_1_size} gene from the model in {path_1}') 203 | logging.info(f'Read {vocab_2_size} gene from the model in {path_2}') 204 | x1 = np.array([model_1.wv[gene_i] for gene_i in genes_1]) 205 | x2 = np.array([model_2.wv[gene_i] for gene_i in genes_2]) 206 | dist = cdist2(x1, x2, 'cosine')/2 207 | return dist, genes_1, genes_2 208 | 209 | 210 | def make_heatmap(dist, image_path): 211 | df = pd.DataFrame(dist) 212 | plt.figure() 213 | ax = sns.heatmap(df, cmap='Greens_r', square=True) 214 | ax.tick_params(left=False, bottom=False) 215 | ax.set_yticklabels([]) 216 | ax.set_xticklabels([]) 217 | figure = ax.get_figure() 218 | figure.savefig(image_path) 219 | plt.close() 220 | 221 | 222 | def read_anchors(anchor_path, non_anchor_path): 223 | with open(anchor_path, 'r') as f: 224 | anchors = list(csv.reader(f, delimiter=',')) 225 | with open(non_anchor_path, 'r') as f: 226 | non_anchors = list(csv.reader(f, delimiter=',')) 227 | return anchors, non_anchors 228 | 229 | 230 | def train(params): 231 | for pheno in params['phenotypes']: 232 | for rep_id in range(params['n_replicates']): 233 | rep_id = str(rep_id) 234 | walks_path = os.path.join(params['experiment_name'], 235 | '{}_{}_walks.csv'.format(pheno, rep_id)) 236 | with open(walks_path) as fin: 237 | walks = list(csv.reader(fin)) 238 | model = Word2Vec(sentences=walks, 239 | size=params['embd_dim'], 240 | window=params['window'], 241 | min_count=params['min_count'], 242 | workers=params['n_workers'], 243 | iter=params['n_iter'], 244 | negative=params['negatives'], 245 | alpha=params['alpha'], 246 | sg = 1, 247 | min_alpha=params['min_alpha']) 248 | # Write model to file 249 | model.save(os.path.join(params['experiment_name'], 250 | '{}_{}.model'.format(pheno, rep_id))) 251 | 252 | 253 | def visualize(params): 254 | for rep_id in range(params['n_replicates']): 255 | rep_id = str(rep_id) 256 | for pheno_1, pheno_2 in combinations(params['phenotypes'], 2): 257 | path_1 = os.path.join(params['experiment_name'], 258 | '{}_{}.model'.format(pheno_1, rep_id)) 259 | path_2 = os.path.join(params['experiment_name'], 260 | '{}_{}.model'.format(pheno_2, rep_id)) 261 | viz_path = os.path.join(params['experiment_name'], 262 | '{}_vs_{}_{}.pdf'.format(pheno_1, pheno_2, 263 | rep_id)) 264 | make_heatmap(path_1, path_2, 265 | params['embd_dim'], 266 | image_path=viz_path) 267 | 268 | 269 | if __name__ == '__main__': 270 | anchor_stats = [] 271 | parser = argparse.ArgumentParser(description='Generate datasets') 272 | parser.add_argument('-c', '--config', metavar='JSON file path', 273 | action='store', required=True, 274 | help='Path to a 
config file') 275 | parser.add_argument('-n', '--no-train', dest='no_train', 276 | action='store_true', default=False, 277 | help='Skip training and only produce visualizations.') 278 | args = parser.parse_args() 279 | # read training config 280 | with open(args.config) as fin: 281 | params = json.load(fin) 282 | logging.info('Read parameters') 283 | # Train models for all replicates 284 | if args.no_train is False: 285 | train(params) 286 | logging.info('Start training ...') 287 | 288 | # Generate visualizations 289 | train_anchor_path = os.path.join(params['experiment_name'], 'train_anchors.csv') 290 | test_anchor_path = os.path.join(params['experiment_name'], 'test_anchors.csv') 291 | 292 | train_anchors, test_anchors = read_anchors(anchor_path=train_anchor_path, non_anchor_path=test_anchor_path) 293 | logging.info('Using {} potential anchors for training and {} for testing'.format(len(train_anchors), 294 | len(test_anchors))) 295 | convertor_file = os.path.join(params['experiment_name'],'IDConvertor.json') 296 | id_convertor = IDCovertor.load(convertor_file) 297 | 298 | for rep_id in range(params['n_replicates']): 299 | logging.info(f'Start working on replicate {rep_id}') 300 | rep_id = str(rep_id) 301 | for pheno_1, pheno_2 in combinations(params['phenotypes'], 2): 302 | logging.info(f'Start working on {pheno_1} and {pheno_2}') 303 | path_1 = os.path.join(params['experiment_name'], 304 | '{}_{}.model'.format(pheno_1, rep_id)) 305 | path_2 = os.path.join(params['experiment_name'], 306 | '{}_{}.model'.format(pheno_2, rep_id)) 307 | viz_path = os.path.join(params['experiment_name'], 308 | '{}_vs_{}_{}.pdf'.format(pheno_1, pheno_2, 309 | rep_id)) 310 | 311 | dist, gene_names_1, gene_names_2 = get_distance(path_1, path_2, params['embd_dim']) 312 | logging.info('Calculated distance matrix') 313 | #ang_dist = angular_dist(dist, gene_names_1, gene_names_2) 314 | ang_dist = dist 315 | del(dist) 316 | logging.info('Calculated angular distance matrix') 317 | #make_heatmap(ang_dist, viz_path) 318 | #pvalue = alignment_permutation_test(vocab1_length=len(gene_names_1), vocab2_length=len(gene_names_2), distance=ang_dist, actual_score=global_cost) 319 | #print(pvalue) 320 | bic_path = os.path.join(params['experiment_name'], 321 | '{}_vs_{}_{}_biclustering.pdf'.format(pheno_1, pheno_2, 322 | rep_id)) 323 | biclustering(ang_dist, gene_names_1, gene_names_2, pheno_1, pheno_2, bic_path, params['experiment_name'], id_convertor, n_clusters=5) 324 | #logging.info('Finsihed biclustering') 325 | 326 | expanded_matrix = match_dims(ang_dist) 327 | #logging.info('Expanded the angular distance matrix') 328 | row_ind, col_ind = linear_sum_assignment(expanded_matrix) 329 | #logging.info('Calculated the Huangarian distance matrix') 330 | globa_scores = expanded_matrix[row_ind, col_ind] 331 | global_cost = globa_scores.sum() 332 | norm = max(len(row_ind), len(col_ind)) 333 | global_cost = global_cost/norm 334 | print(pheno_1, pheno_2, global_cost) 335 | #matches = [i for i, j in zip(row_ind, col_ind) if i == j] 336 | #print(len(matches)/norm) 337 | #pvalue = compare_anchors(ang_dist, gene_names_1, gene_names_2, train_anchors[int(rep_id)], convertor=id_convertor) 338 | #anchor_stats.append(pvalue) 339 | 340 | #with open(os.path.join(params['experiment_name'],'stats.csv'),'w') as fout: 341 | # csv_out = csv.writer(fout) 342 | # csv_out.writerow(['statistic','pval']) 343 | # for row in anchor_stats: 344 | # csv_out.writerow(row) 345 | 346 | 347 | 
--------------------------------------------------------------------------------