├── scripts ├── README.md └── validate_graph.py ├── .gitignore ├── Makefile ├── test ├── Makefile └── main.cpp ├── data └── sample.adjlist.txt ├── README.md ├── LICENSE ├── main.cpp └── deepwalk.h /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | `python validate_graph.py ../data/sample.adjlist.txt` 3 | 4 | `output:` 5 | `(valid) start index: 1, max vertex idx: 34, total 34 vertex.` 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Some Macros 2 | # --------------- 3 | # Compiler Name 4 | CC = g++ 5 | # Compile Flags 6 | CXXFLAGS= -g -Wall -std=c++11 -fopenmp 7 | # Linker Flags 8 | LDFLAGS = 9 | # Include 10 | INCLUDES= 11 | # Libraries 12 | LIBS = 13 | # Object Files 14 | OBJS = main.o 15 | # Name of Executable 16 | TARGET = walk 17 | # --------------- 18 | 19 | all: $(TARGET) 20 | 21 | walk: main.cpp Makefile 22 | $(CC) $(CXXFLAGS) $(INCLUDES) $< -o $@ -L. 
$(LDFLAGS) 23 | 24 | clean: 25 | -rm -f *.o core.* walk 26 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | # Some Macros 2 | # --------------- 3 | # Compiler Name 4 | CC = g++ 5 | # Compile Flags 6 | CXXFLAGS= -g -Wall -std=c++11 -fopenmp 7 | # Linker Flags 8 | LDFLAGS = 9 | # Include 10 | INCLUDES= 11 | # Libraries 12 | LIBS = 13 | # Object Files 14 | OBJS = main.o 15 | # Name of Executable 16 | TARGET = test_walk 17 | # --------------- 18 | 19 | all: $(TARGET) 20 | 21 | test_walk: main.cpp Makefile 22 | $(CC) $(CXXFLAGS) $(INCLUDES) $< -o $@ -L.. $(LDFLAGS) 23 | 24 | clean: 25 | -rm -f *.o core.* test_walk 26 | -------------------------------------------------------------------------------- /data/sample.adjlist.txt: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 9 11 12 13 14 18 20 22 32 2 | 2 1 3 4 8 14 18 20 22 31 3 | 3 1 2 4 8 9 10 14 28 29 33 4 | 4 1 2 3 8 13 14 5 | 5 1 7 11 6 | 6 1 7 11 17 7 | 7 1 5 6 17 8 | 8 1 2 3 4 9 | 9 1 3 31 33 34 10 | 10 3 34 11 | 11 1 5 6 12 | 12 1 13 | 13 1 4 14 | 14 1 2 3 4 34 15 | 15 33 34 16 | 16 33 34 17 | 17 6 7 18 | 18 1 2 19 | 19 33 34 20 | 20 1 2 34 21 | 21 33 34 22 | 22 1 2 23 | 23 33 34 24 | 24 26 28 30 33 34 25 | 25 26 28 32 26 | 26 24 25 32 27 | 27 30 34 28 | 28 3 24 25 34 29 | 29 3 32 34 30 | 30 24 27 33 34 31 | 31 2 9 33 34 32 | 32 1 25 26 29 33 34 33 | 33 3 9 15 16 19 21 23 24 30 31 32 34 34 | 34 9 10 14 15 16 19 20 21 23 24 27 28 29 30 31 32 33 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deepwalk 2 | weighted deepwalk implementation in c++. 3 | 4 | # Build 5 | `make` 6 | 7 | # Usage 8 | 1. `python scripts/validate_graph.py graph_file` 9 | 2. 
`./walk graph_file max_vertex_idx start_index(0/1) num_iter num_step output_file [idx_list_file]` 10 | `graph_file`: something.edge or something.adj or something.edge.directed. Only left -> right is an edge in edge.directed file. 11 | `max_vertex_idx`: max graph node index. 12 | `start_index`: graph node start index, 0 or 1. 13 | `num_iter`: iteration number of all nodes. 14 | `num_step`: max step number of each walk. 15 | `output_file`: output walks file. 16 | `idx_list_file`: graph node name and idx list, `name\tindex\n`. 17 | 18 | ## example 19 | `python scripts/validate_graph.py data/sample.edgelist.weight.txt` 20 | you will get output: `(valid) start index: 0, max vertex idx: 6300, total 6301 vertex.` 21 | `./walk data/sample.edgelist.weight.txt 6300 0 25 40 random_walks.txt` 22 | 23 | # Requirements 24 | `C++11` 25 | `OpenMP` 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 董国盛 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include "deepwalk.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | int main(int argc, const char *argv[]) 8 | { 9 | using namespace deepwalk; 10 | using namespace std; 11 | // args parse 12 | if (argc < 2) { 13 | cout << "./walk graph_file max_vertex_idx start_index(0/1) num_iter num_step output_file [idx_list_file]" << endl; 14 | return -1; 15 | } 16 | const char *filename = argv[1]; 17 | unsigned int n_vertex = atoi(argv[2]); 18 | int start_idx = atoi(argv[3]); 19 | if (start_idx == 0) { 20 | n_vertex ++; 21 | } 22 | 23 | Graph<> g(filename, n_vertex, start_idx); 24 | if (argc == 8) { 25 | const char *idx_list_file = argv[7]; 26 | g.LoadVertexName(idx_list_file); 27 | } 28 | // std::vector*> data = g.GetData(); 29 | int n_iter = atoi(argv[4]); 30 | int n_step = atoi(argv[5]); 31 | struct timeval st; gettimeofday( &st, NULL ); 32 | g.GenRandomWalks(n_iter, n_step); 33 | struct timeval et; gettimeofday( &et, NULL ); 34 | printf("Walks Cost: %ld ms\n", (et.tv_sec - st.tv_sec) * 1000 + (et.tv_usec - st.tv_usec)/1000); 35 | 36 | const char *walks_file = argv[6]; 37 | gettimeofday( &st, NULL ); 38 | g.SaveTxt(walks_file); 39 | gettimeofday( &et, NULL ); 40 | printf("Save Cost: %ld ms\n", (et.tv_sec - st.tv_sec) * 1000 + (et.tv_usec - st.tv_usec)/1000); 41 | 42 | return 0; 43 | } 44 | 45 | -------------------------------------------------------------------------------- /scripts/validate_graph.py: -------------------------------------------------------------------------------- 1 | # -*- coding: gbk -*- 2 
import sys


def _report_invalid(n_line):
    """Print the standard message for a malformed line (1-based line number)."""
    print('(invalid) at line %d' % n_line)


def validate(filename):
    """Check a graph file and print a one-line summary of its vertex ids.

    The format is chosen from the file name: a name containing 'adj' is read
    as an adjacency list (every line needs at least two ids), a name
    containing 'edge' is read as an edge list (two ids plus an optional
    weight per line). Ids must be non-negative integers.

    Prints '(invalid) at line N' for the first malformed line and stops;
    otherwise prints the start index, the largest id and the number of
    distinct ids, plus a second line when ids are missing from the range.
    Returns None in every case.
    """
    n_max = -1
    v_set = set()
    is_adj = 'adj' in filename
    is_edge = 'edge' in filename

    with open(filename) as fin:
        for n_line, line in enumerate(fin, 1):
            v_list = line.split()
            if is_adj and len(v_list) < 2:
                _report_invalid(n_line)
                return
            if is_edge:
                if len(v_list) not in (2, 3):
                    _report_invalid(n_line)
                    return
                # A third column is the edge weight, not a vertex id.
                v_list = v_list[:2]
            for token in v_list:
                try:
                    vid = int(token)
                except ValueError:
                    # A non-numeric id is a malformed line, not a crash.
                    vid = -1
                if vid < 0:
                    _report_invalid(n_line)
                    return
                v_set.add(vid)
                if vid > n_max:
                    n_max = vid

    n_vertex = len(v_set)
    if n_vertex == 0:
        # Nothing was read; reporting "max vertex idx: -1" would be misleading.
        print('(invalid) no vertex found')
        return

    if n_vertex == n_max and 0 not in v_set:
        print('(valid) start index: 1, max vertex idx: %d, total %d vertex.' % (n_max, n_vertex))
    elif n_vertex == n_max + 1 and 0 in v_set:
        print('(valid) start index: 0, max vertex idx: %d, total %d vertex.' % (n_max, n_vertex))
    else:
        print('(valid) start index: 0, max vertex idx: %d, total %d vertex.' % (n_max, n_vertex))
        # With a start index of 0 the id range is 0..n_max, i.e. n_max + 1
        # slots, whether or not id 0 itself appears in the file.
        print('vertex ids are not continuous, lost %d id.' % (n_max + 1 - n_vertex))


def main():
    """Command-line entry point: validate_graph.py graph_file."""
    if len(sys.argv) != 2:
        print('validate_graph.py graph_file')
    else:
        validate(sys.argv[1])


if __name__ == '__main__':
    main()
" << cnt << ", path size: " << path.size() << endl; 55 | break; 56 | } 57 | for (auto id : path) { 58 | cout << id << ","; 59 | } 60 | cout << endl; 61 | } 62 | */ 63 | return 0; 64 | } 65 | 66 | -------------------------------------------------------------------------------- /deepwalk.h: -------------------------------------------------------------------------------- 1 | #ifndef DEEPWALK_H 2 | #define DEEPWALK_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace deepwalk { 18 | const static int THREAD_NUM = omp_get_max_threads(); 19 | template 20 | struct Vertex { 21 | T id; 22 | std::vector > adjacent_list; 23 | std::vector cum_table; 24 | Vertex(T _id) : id(_id) {} 25 | Vertex() {} 26 | }; 27 | 28 | template 29 | class Graph { 30 | public: 31 | Graph() {} 32 | Graph(const char *filename, T n_vertex, int start_idx) { 33 | //TODO: load different type of graph file 34 | data = std::vector*>(n_vertex, NULL); 35 | bool rs = false; 36 | n_edge = 0; 37 | 38 | if (strstr(filename, "adj") != NULL) rs = LoadAdjList(filename, start_idx); 39 | else if (strstr(filename, "edge") != NULL) { 40 | if (strstr(filename, "directed") != NULL) rs = LoadEdgeList(filename, start_idx, true); 41 | else rs = LoadEdgeList(filename, start_idx); 42 | } 43 | else { 44 | std::cerr << "file format not supported yet." << std::endl; 45 | } 46 | if (!rs) std::cerr << "load graph file error." << std::endl; 47 | rng.seed(static_cast(time(NULL))); 48 | MakeCumTable(); 49 | } 50 | virtual ~Graph() { 51 | for (auto ptr_v : data) { 52 | delete ptr_v; 53 | ptr_v = NULL; 54 | } 55 | } 56 | inline bool LoadAdjList(const char *filename, int start_idx) { 57 | std::ifstream fin(filename); 58 | if (fin.fail()) { 59 | std::cerr << filename << " read error." 
<< std::endl; 60 | return false; 61 | } 62 | std::string line; 63 | while (std::getline(fin, line)) { 64 | std::istringstream ss(line); 65 | T tmp_id, v_id; 66 | int cnt = 0; 67 | while (!ss.eof()) { 68 | if (!(ss >> tmp_id)) break; 69 | tmp_id -= start_idx; 70 | assert(tmp_id < data.size()); 71 | if (cnt == 0) { 72 | v_id = tmp_id; 73 | if(data[v_id] == NULL) data[v_id] = new Vertex(v_id); 74 | } 75 | else { 76 | assert(data[v_id] != NULL); 77 | if (data[tmp_id] == NULL) data[tmp_id] = new Vertex(tmp_id); 78 | // adj list with weight not supported yet. 79 | data[v_id]->adjacent_list.push_back(std::make_pair(data[tmp_id], 1.0f)); 80 | n_edge++; 81 | } 82 | cnt++; 83 | } 84 | } 85 | n_edge /= 2; 86 | fin.close(); 87 | std::cout << "Read adjlist complete. Total " << data.size() << " vertex, " << n_edge << " edges." << std::endl; 88 | return true; 89 | } 90 | inline bool LoadEdgeList(const char *filename, int start_idx, bool directed = false) { 91 | std::ifstream fin(filename); 92 | if (fin.fail()) { 93 | std::cerr << filename << " read error." << std::endl; 94 | return false; 95 | } 96 | std::string line; 97 | while (std::getline(fin, line)) { 98 | std::istringstream ss(line); 99 | T id_left, id_right; 100 | ss >> id_left >> id_right; 101 | float weight = 1.0f; 102 | ss >> weight; 103 | id_left -= start_idx; 104 | id_right -= start_idx; 105 | assert(id_left < data.size()); 106 | assert(id_right < data.size()); 107 | if (data[id_left] == NULL) data[id_left] = new Vertex(id_left); 108 | if (data[id_right] == NULL) data[id_right] = new Vertex(id_right); 109 | data[id_left]->adjacent_list.push_back(std::make_pair(data[id_right], weight)); 110 | if (!directed) { 111 | data[id_right]->adjacent_list.push_back(std::make_pair(data[id_left], weight)); 112 | n_edge++; 113 | } 114 | n_edge++; 115 | } 116 | fin.close(); 117 | std::cout << "Read edgelist complete. Total " << data.size() << " vertex, " << n_edge << " edges." 
<< std::endl; 118 | return true; 119 | } 120 | inline bool LoadVertexName(const char *filename) { 121 | std::ifstream fin(filename); 122 | if (fin.fail()) { 123 | std::cerr << filename << " read error." << std::endl; 124 | return false; 125 | } 126 | std::string line; 127 | while (std::getline(fin, line)) { 128 | std::istringstream ss(line); 129 | std::string vname; 130 | T vid; 131 | ss >> vname >> vid; 132 | vidtoname[vid] = vname; 133 | } 134 | std::cout << "Load Vertex List Complete." << std::endl; 135 | fin.close(); 136 | return true; 137 | } 138 | inline void MakeCumTable() { 139 | typedef std::pair*, float> edge_t; 140 | for(auto v_ptr : data) { 141 | if (v_ptr == NULL) continue; // for id not occured in graph file 142 | // sort adjlist, higher weight with lower index 143 | std::sort((v_ptr->adjacent_list).begin(), (v_ptr->adjacent_list).end(), [](const edge_t left, const edge_t right){ return left.second > right.second; }); 144 | (v_ptr->cum_table).reserve(v_ptr->adjacent_list.size()); 145 | float weight_sum = 0; 146 | for(auto edge : v_ptr->adjacent_list) { 147 | weight_sum += edge.second; 148 | (v_ptr->cum_table).push_back(weight_sum); 149 | } 150 | } 151 | } 152 | inline void GenRandomWalks(int n_iter, int n_step) { 153 | paths = std::vector >(n_iter * data.size(), std::vector()); 154 | for (int i = 0; i < n_iter; ++i) { 155 | std::random_shuffle(data.begin(), data.end()); 156 | int n_vertex = static_cast(data.size()); 157 | # pragma omp parallel for num_threads(THREAD_NUM) 158 | for (int j = 0; j < n_vertex; ++ j) { 159 | if (data[j] == NULL) continue; // for id not occured in graph file 160 | if (data[j]->adjacent_list.empty()) continue; 161 | paths[i*data.size() + j] = Walk(data[j], n_step); 162 | // Walk(data[j], n_step, paths[i*data.size() + j]); 163 | } 164 | } 165 | } 166 | inline std::vector Walk(const Vertex* ptr_v, int n_step) { 167 | std::vector path; 168 | Walk(ptr_v, n_step, path); 169 | return path; 170 | } 171 | inline size_t 
BinarySearch(const std::vector &cum_table, float rand_num) { 172 | size_t idx; 173 | size_t left = 0, right = cum_table.size() - 1; 174 | while(left <= right) { 175 | idx = left + (right - left) / 2; 176 | if(idx == 0 || idx == cum_table.size() - 1) break; 177 | if (rand_num > cum_table[idx-1] && rand_num <= cum_table[idx]) break; 178 | else if (rand_num > cum_table[idx] && rand_num <= cum_table[idx+1]) { idx++; break; } 179 | else if (rand_num > cum_table[idx+1]) left = idx + 1; 180 | else right = idx - 1; 181 | } 182 | return idx; 183 | } 184 | inline void Walk(const Vertex* ptr_v, int n_step, std::vector &path) { 185 | // path[0] = ptr_v->id; 186 | path.push_back(ptr_v->id); 187 | float rand_num; 188 | size_t idx; 189 | int i = 1; 190 | const Vertex* ptr_now = ptr_v; 191 | while (i <= n_step) { 192 | assert(ptr_now->adjacent_list.size() == ptr_now->cum_table.size()); 193 | if (ptr_now->adjacent_list.empty()) return; 194 | std::uniform_real_distribution ud(0.0f, ptr_now->cum_table[(ptr_now->cum_table).size() - 1]); 195 | rand_num = ud(rng); 196 | idx = BinarySearch(ptr_now->cum_table, rand_num); 197 | ptr_now = ptr_now->adjacent_list[idx].first; 198 | // path[i] = ptr_now->id; 199 | path.push_back(ptr_now->id); 200 | i++; 201 | } 202 | } 203 | inline bool SaveTxt(const char *filename) { 204 | //TODO: Too Slow ... 205 | std::ofstream fout(filename); 206 | if (fout.fail()) { 207 | std::cerr << filename << " open error." 
<< std::endl; 208 | return false; 209 | } 210 | for (auto path : paths) { 211 | if (path.empty()) continue; 212 | for (auto id : path) { 213 | if (vidtoname.size() > 0) { assert(vidtoname.find(id) != vidtoname.end()); fout << vidtoname[id] << " "; } 214 | else fout << id << " "; 215 | } 216 | fout << std::endl; 217 | } 218 | fout.close(); 219 | return true; 220 | } 221 | inline bool SaveBinary(const char *filename) { 222 | std::ofstream fout(filename, std::ios::out | std::ios::binary); 223 | if (fout.fail()) { 224 | std::cerr << filename << " open error." << std::endl; 225 | return false; 226 | } 227 | for (auto path : paths) { 228 | fout.write(reinterpret_cast(&path[0]), path.size() * sizeof(T)); 229 | } 230 | fout.close(); 231 | } 232 | inline std::vector*> GetData() const { 233 | return data; 234 | } 235 | inline std::vector > GetPaths() const { 236 | return paths; 237 | } 238 | inline Vertex* operator[](const T idx) const { 239 | return data[idx]; 240 | } 241 | inline size_t GetDegree(const T idx) const { 242 | if (data[idx] == NULL) return 0; 243 | else return data[idx]->adjacent_list.size(); 244 | } 245 | private: 246 | std::vector*> data; 247 | unsigned long n_edge; 248 | std::vector > paths; 249 | std::mt19937 rng; 250 | std::unordered_map vidtoname; 251 | 252 | Graph(const Graph &other); 253 | Graph& operator=(const Graph &other); 254 | }; 255 | } 256 | 257 | 258 | #endif /*DEEPWALK_H*/ 259 | --------------------------------------------------------------------------------