├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── cmake
    └── Modules
    │   ├── FindGflags.cmake
    │   └── FindLOG4CXX.cmake
├── configure
├── example
    ├── blog.sh
    ├── flickr.sh
    ├── log4cxx.config
    ├── mat2edge.py
    ├── ppi.sh
    ├── predict.py
    ├── redsvd2emb.py
    └── youtube.sh
└── src
    ├── BinaryGraphWalker.cc
    ├── BinaryGraphWalker.h
    ├── CMakeLists.txt
    ├── GraphWalker.cc
    ├── GraphWalker.h
    ├── WeightGraphWalker.cc
    ├── WeightGraphWalker.h
    ├── config.h.cmake
    ├── netsmf.cc
    └── redsvd
        ├── CMakeLists.txt
        ├── cmdline.h
        ├── fileReader.hpp
        ├── redsvd.hpp
        ├── redsvdFile.cpp
        ├── redsvdFile.hpp
        ├── redsvdIncr.hpp
        ├── redsvdMain.cpp
        ├── redsvdMainIncr.cpp
        ├── util.cpp
        └── util.hpp


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | /.vs
34 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | #
  2 | # CMAKE parameters file for 'configure'
  3 | #
  4 | 
  5 | SET(CMAKE_LEGACY_CYGWIN_WIN32 0) # Remove when CMake >= 2.8.4 is required
  6 | 
  7 | CMAKE_MINIMUM_REQUIRED(VERSION 3.5)
  8 | 
  9 | #
 10 | # Misc parameters
 11 | #
 12 | 
 13 | # output parameters for 'make'
 14 | SET(CMAKE_COLOR_MAKEFILE "ON")
 15 | 
 16 | # project declaration
 17 | PROJECT(NETSMF)
 18 | 
 19 | # Release/Debug
 20 | IF(NOT CMAKE_BUILD_TYPE)
 21 |   SET(CMAKE_DEFAULT_BUILD_TYPE "Release")
 22 |   SET(CMAKE_BUILD_TYPE "Release")
 23 | ENDIF(NOT CMAKE_BUILD_TYPE)
 24 | MESSAGE(STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE})
 25 | 
 26 | # path for binary
 27 | SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin/${CMAKE_BUILD_TYPE})
 28 | 
 29 | #
 30 | # compiler flags
 31 | #
 32 | SET(CMAKE_CXX_FLAGS_RELEASE "-O3")
 33 | SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3 -fPIC")
 34 | IF(NOT APPLE)
 35 |   ADD_DEFINITIONS(-march=native)
 36 | ENDIF(NOT APPLE)
 37 | 
 38 | # c++ 11
 39 | set(CMAKE_CXX_STANDARD 11)
 40 | 
 41 | #
 42 | # Check for required libraries
 43 | # FIND_PACKAGE( xxx REQUIRED ) : REQUIRED removed so that MESSAGE is written
 44 | #
 45 | 
 46 | # gflags
 47 | list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules")
 48 | find_package(Gflags REQUIRED)
 49 | find_package(LOG4CXX REQUIRED)
 50 | #include_directories("${GFLAGS_INCLUDE_DIR}")
 51 | #include_directories("${LOG4CXX_INCLUDE_DIR}")
 52 | 
 53 | if (GFLAGS_FOUND)
 54 |  MESSAGE(STATUS "Compiling with gflags support")
 55 |  SET (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -lgflags")
 56 |  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lgflags")
 57 |  SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lgflags")
 58 | ENDIF(GFLAGS_FOUND)
 59 | 
 60 | if (LOG4CXX_FOUND)
 61 |  MESSAGE(STATUS "Compiling with log4cxx support")
 62 |  SET (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog4cxx")
 63 |  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog4cxx")
 64 |  SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -llog4cxx")
 65 | ENDIF(LOG4CXX_FOUND)
 66 | 
 67 | 
 68 | find_package (Eigen3 REQUIRED NO_MODULE)
 69 | 
 70 | # Threads
 71 | FIND_PACKAGE( Threads )
 72 | IF ( NOT THREADS_FOUND )
 73 |   MESSAGE(FATAL_ERROR "Package Threads required, but not found!")
 74 | ENDIF( NOT THREADS_FOUND )
 75 | 
 76 | # Zlib
 77 | FIND_PACKAGE( ZLIB )
 78 | IF ( NOT ZLIB_FOUND )
 79 |   MESSAGE(FATAL_ERROR "Package ZLIB required, but not found!")
 80 | ENDIF( NOT ZLIB_FOUND )
 81 | 
 82 | # Blas
 83 | FIND_PACKAGE(BLAS)
 84 | IF(BLAS_FOUND)
 85 |   SET(USE_BLAS 1)
 86 | ENDIF(BLAS_FOUND)
 87 | 
 88 | # Lapack
 89 | FIND_PACKAGE(LAPACK)
 90 | IF(LAPACK_FOUND)
 91 |   SET(USE_LAPACK 1)
 92 | ENDIF(LAPACK_FOUND)
 93 | 
 94 | # OpenMP
 95 | FIND_PACKAGE(OpenMP)
 96 | if (OPENMP_FOUND)
 97 |  MESSAGE(STATUS "Compiling with OpenMP support")
 98 |  SET (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
 99 |  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
100 |  SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
101 | ENDIF(OPENMP_FOUND)
102 | 
103 | #
104 | # Default Options
105 | #
106 | 
107 | # should we use VERBOSE functions?
108 | OPTION (VERBOSE  "Use VERBOSE option" OFF)
109 | 
110 | # should we use VERBOSE functions?
111 | OPTION (EIGEN_USE_MKL_ALL  "Use MKL Library option" OFF)
112 | 
113 | #
114 | # Management
115 | #
116 | 
117 | # configure a header file to pass some of the CMake settings
118 | # to the source code
119 | CONFIGURE_FILE (
120 |   ${CMAKE_SOURCE_DIR}/src/config.h.cmake
121 |   ${CMAKE_SOURCE_DIR}/src/config.h
122 | )
123 | 
124 | # specify the cross compiler
125 | SET(CMAKE_CXX_COMPILER g++)
126 | 
127 | 
128 | ADD_SUBDIRECTORY(src)
129 | # INCLUDE_DIRECTORIES(data)
130 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Jiezhong Qiu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NetSMF
 2 | 
 3 | NetSMF: Large-Scale Network Embedding as Sparse Matrix Factorization [[arxiv](https://arxiv.org/abs/1906.11156)]
 4 | 
 5 | Please cite our paper if you use this code in your own work:
 6 | 
 7 | ```
 8 | @inproceedings{qiu2019netsmf,
 9 |  author = {Qiu, Jiezhong and Dong, Yuxiao and Ma, Hao and Li, Jian and Wang, Chi and Wang, Kuansan and Tang, Jie},
10 |  title = {NetSMF: Large-Scale Network Embedding As Sparse Matrix Factorization},
11 |  booktitle = {The World Wide Web Conference},
12 |  series = {WWW '19},
13 |  year = {2019},
14 |  publisher = {ACM}
15 | } 
16 | ```
17 | 
18 | # HOWTO
19 | 
20 | ## How to install
21 | ```
22 | sudo apt-get install cmake
23 | sudo apt-get install libgflags-dev
24 | sudo apt-get install liblog4cxx-dev
25 | sudo apt-get install libomp-dev
26 | sudo apt-get install libeigen3-dev
27 | git clone https://github.com/xptree/NetSMF.git
28 | cd NetSMF
29 | mkdir build
30 | ./configure
31 | cd build
32 | make
33 | ```
34 | 
35 | The code has been tested with the following dependency versions:
36 | 
37 | | Dependence 	| Version     	|
38 | |------------	|-------------	|
39 | | g++        	| 5.4.0       	|
40 | | cmake      	| 3.5.1-1     	|
41 | | gflags     	| 2.1.2-3     	|
42 | | log4cxx    	| 0.10.0-10   	|
43 | | openmp     	| 3.7.0-3     	|
44 | | eigen3     	| 3.3~beta1-2 	|
45 | 
46 | **Note: Using eigen3 3.2.5 may cause problems. Please update your eigen3 to 3.3 or above.**
47 | 
48 | ## How to run
49 | 
50 | ### Input
51 | 
52 | Support undirected networks with edgelist format.
53 | 
54 | For unweighted networks, each edge should appear twice `a b` and `b a`.
55 | 
56 | For weighted networks, each edge should appear twice `a b w` and `b a w`.
57 | 
58 | You may want to use `example/mat2edge.py` to translate mat to edgelist.
59 | 
60 | `.mat` files can be downloaded here:
61 | 
62 | * BlogCatalog [Source](http://socialcomputing.asu.edu/datasets/BlogCatalog3) [Preprocessed](http://leitang.net/code/social-dimension/data/blogcatalog.mat)
63 | * Protein-Protein Interaction [Source](http://thebiogrid.org/download.php) [Preprocessed](http://snap.stanford.edu/node2vec/Homo_sapiens.mat)
64 | * [Flickr](http://leitang.net/code/social-dimension/data/flickr.mat)
65 | * [YouTube](http://leitang.net/code/social-dimension/data/youtube.mat)
66 | 
67 | 
68 | 
69 | ### Run NetSMF
70 | 
71 | For unweighted networks, see `example/blog.sh` for an example.
72 | 
73 | `blog.sh` takes three arguments: the first one indicates the input edgelist file, the second one indicates the output file, and the third one indicates the original `.mat` file containing the network and labels.
74 | 
75 | For example, running `./blog.sh blogcatalog.edgelist blogcatalog.netsmf blogcatalog.mat` will
76 | 
77 | * check if `blogcatalog.edgelist` is a valid file. If not, it calls `mat2edge.py` to translate mat file `blogcatalog.mat` to edgelist `blogcatalog.edgelist`.
78 | * call NetSMF algorithm, and store the 128-dim embedding at `blogcatalog.netsmf_128.npy`.
79 | * call `predict.py` to evaluate NetSMF at the label classification task.
80 | 
81 | You can use `-weight` to switch to weighted networks and use `-noweight` to switch to unweighted network (default unweighted).
82 | 
83 | ### About truncated logarithm
84 | 
85 | We propose to use truncated logarithm in our WWW'19 paper.
86 | 
87 | In the code, we provide a new option `log1p`, i.e., `log(1+x)`. You can use  `-log1p` to turn it on and `-nolog1p` to turn it off (default off). Empirically speaking, `log1p` sometimes achieves better performance, for example in wiki dataset.
88 | 
89 | 
90 | ## Acknowledgement
91 | 
92 | The implementation of randomized singular value decomposition is by [redsvd](https://code.google.com/p/redsvd/) and [HPCA](https://github.com/idiap/hpca).
93 | 


--------------------------------------------------------------------------------
/cmake/Modules/FindGflags.cmake:
--------------------------------------------------------------------------------
 1 | # - Try to find GFLAGS
 2 | #
 3 | # The following variables are optionally searched for defaults
 4 | #  GFLAGS_ROOT_DIR:            Base directory where all GFLAGS components are found
 5 | #
 6 | # The following are set after configuration is done:
 7 | #  GFLAGS_FOUND
 8 | #  GFLAGS_INCLUDE_DIRS
 9 | #  GFLAGS_LIBRARIES
10 | #  GFLAGS_LIBRARYRARY_DIRS
11 | 
12 | include(FindPackageHandleStandardArgs)
13 | 
14 | set(GFLAGS_ROOT_DIR "" CACHE PATH "Folder contains Gflags")
15 | 
16 | # We are testing only a couple of files in the include directories
17 | if(WIN32)
18 |     find_path(GFLAGS_INCLUDE_DIR gflags/gflags.h
19 |         PATHS ${GFLAGS_ROOT_DIR}/src/windows)
20 | else()
21 |     find_path(GFLAGS_INCLUDE_DIR gflags/gflags.h
22 |         PATHS ${GFLAGS_ROOT_DIR})
23 | endif()
24 | 
25 | if(MSVC)
26 |     find_library(GFLAGS_LIBRARY_RELEASE
27 |         NAMES libgflags
28 |         PATHS ${GFLAGS_ROOT_DIR}
29 |         PATH_SUFFIXES Release)
30 | 
31 |     find_library(GFLAGS_LIBRARY_DEBUG
32 |         NAMES libgflags-debug
33 |         PATHS ${GFLAGS_ROOT_DIR}
34 |         PATH_SUFFIXES Debug)
35 | 
36 |     set(GFLAGS_LIBRARY optimized ${GFLAGS_LIBRARY_RELEASE} debug ${GFLAGS_LIBRARY_DEBUG})
37 | else()
38 |     find_library(GFLAGS_LIBRARY gflags)
39 | endif()
40 | 
41 | find_package_handle_standard_args(GFlags DEFAULT_MSG GFLAGS_INCLUDE_DIR GFLAGS_LIBRARY)
42 | 
43 | 
44 | if(GFLAGS_FOUND)
45 |     set(GFLAGS_INCLUDE_DIRS ${GFLAGS_INCLUDE_DIR})
46 |     set(GFLAGS_LIBRARIES ${GFLAGS_LIBRARY})
47 |     message(STATUS "Found gflags  (include: ${GFLAGS_INCLUDE_DIR}, library: ${GFLAGS_LIBRARY})")
48 |     mark_as_advanced(GFLAGS_LIBRARY_DEBUG GFLAGS_LIBRARY_RELEASE
49 |                      GFLAGS_LIBRARY GFLAGS_INCLUDE_DIR GFLAGS_ROOT_DIR)
50 | endif()
51 | 


--------------------------------------------------------------------------------
/cmake/Modules/FindLOG4CXX.cmake:
--------------------------------------------------------------------------------
 1 | # - Try to find LOG4CXX
 2 | #
 3 | # The following variables are optionally searched for defaults
 4 | #  LOG4CXX_ROOT_DIR:            Base directory where all LOG4CXX components are found
 5 | #
 6 | # The following are set after configuration is done:
 7 | #  LOG4CXX_FOUND
 8 | #  LOG4CXX_INCLUDE_DIRS
 9 | #  LOG4CXX_LIBRARIES
10 | #  LOG4CXX_LIBRARYRARY_DIRS
11 | 
12 | include(FindPackageHandleStandardArgs)
13 | 
14 | set(LOG4CXX_ROOT_DIR "" CACHE PATH "Folder contains LOG4CXX")
15 | 
16 | # We are testing only a couple of files in the include directories
17 | if(WIN32)
18 |     find_path(LOG4CXX_INCLUDE_DIR log4cxx/logger.h
19 |         PATHS ${LOG4CXX_ROOT_DIR}/src/windows)
20 | else()
21 |     find_path(LOG4CXX_INCLUDE_DIR log4cxx/logger.h
22 |         PATHS ${LOG4CXX_ROOT_DIR})
23 | endif()
24 | 
25 | if(MSVC)
26 |     find_library(LOG4CXX_LIBRARY_RELEASE
27 |         NAMES liblog4cxx
28 |         PATHS ${LOG4CXX_ROOT_DIR}
29 |         PATH_SUFFIXES Release)
30 | 
31 |     find_library(LOG4CXX_LIBRARY_DEBUG
32 |         NAMES liblog4cxx-debug
33 |         PATHS ${LOG4CXX_ROOT_DIR}
34 |         PATH_SUFFIXES Debug)
35 | 
36 |     set(LOG4CXX_LIBRARY optimized ${LOG4CXX_LIBRARY_RELEASE} debug ${LOG4CXX_LIBRARY_DEBUG})
37 | else()
38 |     find_library(LOG4CXX_LIBRARY log4cxx)
39 | endif()
40 | 
41 | find_package_handle_standard_args(LOG4CXX DEFAULT_MSG LOG4CXX_INCLUDE_DIR LOG4CXX_LIBRARY)
42 | 
43 | 
44 | if(LOG4CXX_FOUND)
45 |     set(LOG4CXX_INCLUDE_DIRS ${LOG4CXX_INCLUDE_DIR})
46 |     set(LOG4CXX_LIBRARIES ${LOG4CXX_LIBRARY})
47 |     message(STATUS "Found LOG4CXX  (include: ${LOG4CXX_INCLUDE_DIR}, library: ${LOG4CXX_LIBRARY})")
48 |     mark_as_advanced(LOG4CXX_LIBRARY_DEBUG LOG4CXX_LIBRARY_RELEASE
49 |                      LOG4CXX_LIBRARY LOG4CXX_INCLUDE_DIR LOG4CXX_ROOT_DIR)
50 | endif()
51 | 


--------------------------------------------------------------------------------
/configure:
--------------------------------------------------------------------------------
#! /bin/bash
# Create the out-of-source build directory and run cmake inside it.
# All command-line arguments are forwarded to cmake unchanged.
build_dir=build
mkdir -p "$build_dir"
(
    # The subshell keeps the caller's working directory untouched.
    cd "$build_dir" >/dev/null 2>&1 && cmake .. "$@"
)
4 | 


--------------------------------------------------------------------------------
/example/blog.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -x
 3 | 
 4 | NETSMF=../bin/Release/netsmf
 5 | if [ -z "$1" ]; then
 6 |     INPUT="blogcatalog.edge"
 7 | else
 8 |     INPUT=$1
 9 | fi
10 | 
11 | if [ -z "$2" ]; then
12 |     OUTPUT="blogcatalog.netsmf"
13 | else
14 |     OUTPUT=$2
15 | fi
16 | 
17 | if [ -z "$3" ]; then
18 | 	LABEL=blogcatalog.mat
19 | else
20 | 	LABEL=$3
21 | fi
22 | 
23 | [ ! -f $INPUT ] && python mat2edge.py $LABEL $INPUT
24 | 
25 | (/usr/bin/time -p $NETSMF -T 10 \
26 |     -filename $INPUT \
27 |     -machine $HOSTNAME \
28 |     -output_svd $OUTPUT \
29 |     -rank 512 \
30 |     -num_threads_sampling 40 \
31 |     -num_threads_svd 40 \
32 |     -rounds 10000 \
33 |     -check_point 50 \
34 |     -noweight \
35 |     -nolog1p \
36 |     -log4cxx log4cxx.config) |& tee blog.log
37 | 
38 | python redsvd2emb.py --name $OUTPUT --dim 128
39 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 1 --start-train-ratio 10 --stop-train-ratio 90 --num-train-ratio 9
40 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 10 --start-train-ratio 10 --stop-train-ratio 90 --num-train-ratio 9
41 | 


--------------------------------------------------------------------------------
/example/flickr.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -x
 4 | 
 5 | NETSMF=../bin/Release/netsmf
 6 | if [ -z "$1" ]; then
 7 |     INPUT="flickr.edge"
 8 | else
 9 |     INPUT=$1
10 | fi
11 | 
12 | if [ -z "$2" ]; then
13 |     #mkdir -p flickr
14 |     OUTPUT="flickr.netsmf"
15 | else
16 |     OUTPUT=$2
17 | fi
18 | 
19 | if [ -z "$3" ]; then
20 | 	LABEL=flickr.mat
21 | else
22 | 	LABEL=$3
23 | fi
24 | 
25 | [ ! -f $INPUT ] && python mat2edge.py $LABEL $INPUT
26 | 
27 | (/usr/bin/time -p $NETSMF -T 10 \
28 |     -filename $INPUT \
29 |     -machine $HOSTNAME \
30 |     -output_svd $OUTPUT \
31 |     -rank 512 \
32 |     -num_threads_sampling 20 \
33 |     -num_threads_svd 40 \
34 |     -rounds 1000 \
35 |     -check_point 20 \
36 |     -noweight \
37 |     -nolog1p \
38 |     -log4cxx log4cxx.config) |& tee -a flickr.log
39 | 
40 | python redsvd2emb.py --name $OUTPUT --dim 128
41 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 1 --start-train-ratio 1 --stop-train-ratio 10 --num-train-ratio 10
42 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 10 --start-train-ratio 1 --stop-train-ratio 10 --num-train-ratio 10
43 | 


--------------------------------------------------------------------------------
/example/log4cxx.config:
--------------------------------------------------------------------------------
# Root logger: level DEBUG, output sent to the appender named A1.
log4j.rootLogger=DEBUG, A1
# A1 writes to the console and formats records with a pattern layout.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
 
# Print the date in ISO 8601 format
# Pattern fields: %d date, %t thread, %-5p level (left-justified, width 5),
# %c logger name, %m message, %n line separator.
log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c - %m%n
7 | 


--------------------------------------------------------------------------------
/example/mat2edge.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | # File Name: mat2edge.py
 4 | # Author: Jiezhong Qiu
 5 | # Create Time: 2019/03/18 12:01
 6 | # TODO:
 7 | 
 8 | import scipy.io
 9 | import math
10 | import sys
11 | 
def load_adjacency_matrix(file, variable_name="network"):
    """Return the variable ``variable_name`` stored in the MATLAB file ``file``.

    The value is whatever ``scipy.io.loadmat`` produced for that key; a
    ``KeyError`` is raised when the variable is missing.
    """
    data = scipy.io.loadmat(file)
    return data[variable_name]

def mat2edge(file, output):
    """Write the adjacency matrix of ``file`` (a ``.mat`` file) as an edgelist.

    Each stored non-zero entry ``(x, y)`` becomes one line in ``output``:
    ``x<TAB>y`` when every non-zero value is (close to) 1, otherwise
    ``x<TAB>y<TAB>value``. The matrix must be symmetric.

    Raises:
        ValueError: the matrix has no non-zero entries.
        AssertionError: an entry has no matching symmetric counterpart.
    """
    # Local import: only scipy.io is imported at module level.
    import scipy.sparse as sparse

    print("mat2edgelist from %s to %s" % (file, output))
    A = load_adjacency_matrix(file)
    if not sparse.issparse(A):
        # A dense variable has no eliminate_zeros()/todok(); convert it first.
        A = sparse.csc_matrix(A)
    A.eliminate_zeros()
    if A.nnz == 0:
        raise ValueError("adjacency matrix in %s has no non-zero entries" % file)
    # Vectorized reductions instead of the builtin min/max, which would
    # iterate over the array element by element in Python.
    min_v, max_v = A.data.min(), A.data.max()
    print("minimum non-zero value=%.2f maximum non-zero value=%.2f" \
            % (min_v, max_v))
    unweighted = math.isclose(min_v, 1.0) and math.isclose(max_v, 1.0)
    print("unweighted graph" if unweighted else "weighted graph")
    A = A.todok()
    with open(output, "w") as f:
        for (x, y), v in A.items():
            # Raised explicitly (rather than via `assert`) so the symmetry
            # check is not stripped when Python runs with -O.
            if not math.isclose(A[y, x], v):
                raise AssertionError(
                        "matrix is not symmetric at (%d, %d)" % (x, y))
            if unweighted:
                f.write("%d\t%d\n" % (x, y))
            else:
                f.write("%d\t%d\t%f\n" % (x, y, v))
30 | 
if __name__ == "__main__":
    # CLI usage: mat2edge.py <input.mat> <output edgelist>
    mat_path, edge_path = sys.argv[1], sys.argv[2]
    mat2edge(mat_path, edge_path)
34 | 


--------------------------------------------------------------------------------
/example/ppi.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -x
 4 | 
 5 | NETSMF=../bin/Release/netsmf
 6 | if [ -z "$1" ]; then
 7 |     INPUT="ppi.edge"
 8 | else
 9 |     INPUT=$1
10 | fi
11 | 
12 | if [ -z "$2" ]; then
13 |     #mkdir -p ppi
14 |     OUTPUT="ppi.netsmf"
15 | else
16 |     OUTPUT=$2
17 | fi
18 | 
19 | if [ -z "$3" ]; then
20 | 	LABEL=Homo_sapiens.mat
21 | else
22 | 	LABEL=$3
23 | fi
24 | 
25 | [ ! -f $INPUT ] && python mat2edge.py $LABEL $INPUT
26 | 
27 | (/usr/bin/time -p $NETSMF -T 10 \
28 |     -filename $INPUT \
29 |     -machine $HOSTNAME \
30 |     -output_svd $OUTPUT \
31 |     -rank 256 \
32 |     -num_threads_sampling 40 \
33 |     -num_threads_svd 40 \
34 |     -rounds 1000 \
35 |     -check_point 50 \
36 |     -noweight \
37 |     -nolog1p \
38 |     -log4cxx log4cxx.config) |& tee -a ppi.log
39 | 
40 | python redsvd2emb.py --name $OUTPUT --dim 128
41 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 1 --start-train-ratio 10 --stop-train-ratio 90 --num-train-ratio 9
42 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 10 --start-train-ratio 10 --stop-train-ratio 90 --num-train-ratio 9
43 | 


--------------------------------------------------------------------------------
/example/predict.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | # File Name: predict.py
  4 | # Author: Jiezhong Qiu
  5 | # Create Time: 2017/07/17 21:57
  6 | # TODO:
  7 | 
  8 | import warnings
  9 | warnings.filterwarnings("ignore")
 10 | 
 11 | import os
 12 | import pickle as pkl
 13 | import numpy as np
 14 | import scipy.sparse as sp
 15 | import scipy.io
 16 | import argparse
 17 | import logging
 18 | from sklearn.linear_model import LogisticRegression
 19 | from sklearn.model_selection import ShuffleSplit
 20 | from sklearn.multiclass import OneVsRestClassifier
 21 | from sklearn.metrics import f1_score
 22 | #from sklearn.exceptions import UndefinedMetricWarning
 23 | #warnings.filterwarnings("ignore", category=UserWarning)
 24 | #warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
 25 | 
 26 | logger = logging.getLogger(__name__)
 27 | 
 28 | def construct_indicator(y_score, y):
 29 |     # rank the labels by the scores directly
 30 |     num_label = y.sum(axis=1, dtype=np.int32)
 31 |     # num_label = np.sum(y, axis=1, dtype=np.int)
 32 |     y_sort = np.fliplr(np.argsort(y_score, axis=1))
 33 |     #y_pred = np.zeros_like(y_score, dtype=np.int32)
 34 |     row, col = [], []
 35 |     for i in range(y_score.shape[0]):
 36 |         row += [i] * num_label[i, 0]
 37 |         col += y_sort[i, :num_label[i, 0]].tolist()
 38 |         #for j in range(num_label[i, 0]):
 39 |         #    y_pred[i, y_sort[i, j]] = 1
 40 |     y_pred = sp.csr_matrix(
 41 |             ([1] * len(row), (row, col)),
 42 |             shape=y.shape, dtype=np.bool_)
 43 |     return y_pred
 44 | 
 45 | def load_w2v_feature(file):
 46 |     with open(file, "rb") as f:
 47 |         nu = 0
 48 |         for line in f:
 49 |             content = line.strip().split()
 50 |             nu += 1
 51 |             if nu == 1:
 52 |                 n, d = int(content[0]), int(content[1])
 53 |                 feature = [[] for i in range(n)]
 54 |                 continue
 55 |             index = int(content[0])
 56 |             for x in content[1:]:
 57 |                 feature[index].append(float(x))
 58 |             if nu % 10000000 == 0:
 59 |                 logger.info("read %d line from w2v feature file", nu)
 60 | 
 61 | #    for item in feature:
 62 | #        assert len(item) == d
 63 |     return np.array(feature, dtype=np.float32)
 64 | 
 65 | 
 66 | def load_label(file, variable_name="group"):
 67 |     if file.endswith(".tsv") or file.endswith(".txt"):
 68 |         data = np.loadtxt(file).astype(np.int32)
 69 |         label = sp.csr_matrix(([1] * data.shape[0], (data[:, 0], data[:, 1])), dtype=np.bool_)
 70 |         sp.save_npz("label.npz", label)
 71 |         return label
 72 |     elif file.endswith(".npz"):
 73 |         return sp.load_npz(file)
 74 |     else:
 75 |         data = scipy.io.loadmat(file)
 76 |         logger.info("loading mat file %s", file)
 77 | 
 78 |         label = data[variable_name].tocsr().astype(np.bool_)
 79 |         print(label.shape, label.dtype)
 80 |         return label
 81 | 
 82 |     label = data[variable_name].todense().astype(np.int32)
 83 |     label = np.array(label)
 84 |     return label
 85 | 
 86 | def predict_cv(X, y, train_ratio=0.2, n_splits=10, random_state=0, C=1., num_workers=1):
 87 |     micro, macro = [], []
 88 |     shuffle = ShuffleSplit(n_splits=n_splits, test_size=1-train_ratio,
 89 |             random_state=random_state)
 90 |     for train_index, test_index in shuffle.split(X):
 91 |         #print(train_index.shape, test_index.shape)
 92 |         #assert len(set(train_index) & set(test_index)) == 0
 93 |         #assert len(train_index) + len(test_index) == X.shape[0]
 94 |         X_train, X_test = X[train_index], X[test_index]
 95 |         y_train, y_test = y[train_index], y[test_index]
 96 |         clf = OneVsRestClassifier(
 97 |                 LogisticRegression(
 98 |                     C=C,
 99 |                     solver="liblinear",
100 |                     multi_class="ovr"),
101 |                 n_jobs=num_workers)
102 |         clf.fit(X_train, y_train)
103 |         y_score = clf.predict_proba(X_test)
104 |         y_pred = construct_indicator(y_score, y_test)
105 |         mi = f1_score(y_test, y_pred, average="micro")
106 |         ma = f1_score(y_test, y_pred, average="macro")
107 |         logger.info("micro f1 %f macro f1 %f", mi, ma)
108 |         micro.append(mi)
109 |         macro.append(ma)
110 |     logger.info("%d fold validation, training ratio %f", len(micro), train_ratio)
111 |     logger.info("Average micro %.2f, Average macro %.2f",
112 |             np.mean(micro) * 100,
113 |             np.mean(macro) * 100)
114 |     return np.mean(micro)*100, np.mean(macro)*100
115 | 
116 | 
# Command-line entry point: load labels and an embedding, then report micro
# and macro F1 for a range of training ratios.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--label", type=str, required=True,
            help="input file path for labels (.mat)")
    parser.add_argument("--embedding", type=str, required=True,
            help="input file path for embedding (.npy)")
    # NOTE(review): the help text mentions the adjacency matrix, but the value
    # is passed to load_label() as the name of the label variable.
    parser.add_argument("--matfile-variable-name", type=str, default='group',
            help='variable name of adjacency matrix inside a .mat file.')
    parser.add_argument("--seed", type=int, required=True,
            help="seed used for random number generator when randomly split data into training/test set.")
    # Train ratios are given in percent; they are divided by 100 below.
    parser.add_argument("--start-train-ratio", type=float, default=10,
            help="the start value of the train ratio (inclusive).")
    parser.add_argument("--stop-train-ratio", type=float, default=90,
            help="the end value of the train ratio (inclusive).")
    parser.add_argument("--num-train-ratio", type=int, default=9,
            help="the number of train ratio choosed from [train-ratio-start, train-ratio-end].")
    parser.add_argument("--C", type=float, default=1.0,
            help="inverse of regularization strength used in logistic regression.")
    parser.add_argument("--num-split", type=int, default=10,
            help="The number of re-shuffling & splitting for each train ratio.")
    parser.add_argument("--num-workers", type=int, default=60,
            help="Number of process")
    args = parser.parse_args()
    logging.basicConfig(
            filename="%s.log" % args.embedding, filemode="a", # append log records to <embedding>.log
            level=logging.INFO,
            format='%(asctime)s %(message)s') # include timestamp
    logger.info("C=%f", args.C)
    logger.info("Loading label from %s...", args.label)
    label = load_label(file=args.label, variable_name=args.matfile_variable_name)
    logger.info("Label loaded!")

    # The embedding format is chosen from the file extension.
    logger.info("Loading network embedding from %s...", args.embedding)
    ext = os.path.splitext(args.embedding)[1]
    if ext == ".npy":
        embedding = np.load(args.embedding)
    elif ext == ".pkl":
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use it on trusted embeddings.
        with open(args.embedding, "rb") as f:
            embedding = pkl.load(f)
    else:
        # Load word2vec format
        # and cache the parsed array next to the input as <embedding>.npy
        embedding = load_w2v_feature(args.embedding)
        np.save("%s.npy" % args.embedding, embedding, allow_pickle=False)
    logger.info("Network embedding loaded!")

    logger.info("Embedding has shape %d, %d", embedding.shape[0], embedding.shape[1])
    logger.info("Label has shape %d, %d", label.shape[0], label.shape[1])

    # When the row counts differ, both are truncated to the smaller one.
    if label.shape[0] != embedding.shape[0]:
        logger.info("Different shape ....")
        num_instance = min(label.shape[0], embedding.shape[0])
        label, embedding = label[:num_instance], embedding[:num_instance]

    # Unlabeled instances are only counted and logged; they are not removed.
    num_label = label.sum(axis=1, dtype=np.int32)
    idx = np.argwhere(num_label == 0)
    logger.info("%d instances with no label" % len(idx))
    #  if len(idx):
    #      embedding = embedding[label.getnnz(1)>0]
    #      label = label[label.getnnz(1)>0]
    #  logger.info("After deleting ...")
    logger.info("Embedding has shape %d, %d", embedding.shape[0], embedding.shape[1])
    logger.info("Label has shape %d, %d", label.shape[0], label.shape[1])

    # Evenly spaced train ratios (in percent), both end points included.
    train_ratios = np.linspace(args.start_train_ratio, args.stop_train_ratio,
            args.num_train_ratio)


    # One (micro, macro) pair per train ratio.
    f1 = list()
    for tr in train_ratios:
        res = predict_cv(embedding, label, train_ratio=tr/100.,
                n_splits=args.num_split, C=args.C, random_state=args.seed,
                num_workers=args.num_workers)
        f1.append(res)
    # Print and log all micro scores on one line, then all macro scores.
    micro, macro = zip(*f1)
    print(" ".join([str(x) for x in micro]))
    logger.info(" ".join([str(x) for x in micro]))
    print(" ".join([str(x) for x in macro]))
    logger.info(" ".join([str(x) for x in macro]))
195 | 


--------------------------------------------------------------------------------
/example/redsvd2emb.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | # File Name: redsvd2emb.py
 4 | # Author: Jiezhong Qiu
 5 | # Create Time: 2018/10/22 03:37
 6 | # TODO:
 7 | 
 8 | 
 9 | import scipy.sparse as sp
10 | import numpy as np
11 | import logging
12 | import argparse
13 | 
14 | logger = logging.getLogger(__name__)
15 | 
def redsvd2emb(u, s):
    """Scale column ``j`` of ``u`` by ``sqrt(s[j])``.

    Args:
        u: 2-D array with one column per entry of ``s``.
        s: 1-D array of non-negative values.

    Returns:
        The scaled matrix, with the same shape as ``u``.
    """
    # Multiply from the left on the transposed matrix, then transpose back.
    scale = sp.diags(np.sqrt(s))
    scaled_t = scale.dot(u.T)
    return scaled_t.T
18 | 
19 | 
if __name__ == "__main__":
    # Reads <name>.U and <name>.S, keeps the first --dim columns / values,
    # and stores the resulting embedding as <name>_<dim>.npy.
    cli = argparse.ArgumentParser()
    cli.add_argument("--name", type=str, required=True, help="file name")
    cli.add_argument("--dim", type=int, required=True, help="dimension")
    args = cli.parse_args()
    logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s %(message)s') # include timestamp
    u_mat = np.loadtxt("%s.U" % args.name)[:, :args.dim]
    s_vec = np.loadtxt("%s.S" % args.name)[:args.dim]
    embedding = redsvd2emb(u_mat, s_vec)
    out_path = "%s_%d.npy" % (args.name, args.dim)
    logger.info("save embedding to %s_%d.npy", args.name, args.dim)
    np.save(out_path, embedding, allow_pickle=False)
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/example/youtube.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # NetSMF example pipeline for the YouTube data set:
 4 | #   1. build the edge list from the .mat file (only when the edge list is missing),
 5 | #   2. run netsmf, appending its output and timing to youtube.log,
 6 | #   3. convert the SVD output into a 128-dimensional embedding,
 7 | #   4. evaluate the embedding with predict.py for C=1 and C=10.
 8 | #
 9 | # Usage: ./youtube.sh [INPUT_EDGE_LIST] [OUTPUT_PREFIX] [LABEL_MAT]
10 | # All paths are relative, so run it from the directory holding the helper scripts.
11 | 
12 | set -x
13 | 
14 | NETSMF=../bin/Release/netsmf
15 | 
16 | # Positional arguments; a missing or empty argument falls back to its default.
17 | INPUT="${1:-youtube.edge}"
18 | OUTPUT="${2:-youtube.netsmf}"
19 | LABEL="${3:-youtube.mat}"
20 | 
21 | # Every expansion below is quoted so that a path containing spaces
22 | # is passed on as a single word instead of being split.
23 | [ ! -f "$INPUT" ] && python mat2edge.py "$LABEL" "$INPUT"
24 | 
25 | # `|&` pipes stderr as well as stdout, so the /usr/bin/time report reaches the log too.
26 | (/usr/bin/time -p "$NETSMF" -T 10 \
27 |     -filename "$INPUT" \
28 |     -machine "$HOSTNAME" \
29 |     -output_svd "$OUTPUT" \
30 |     -rank 256 \
31 |     -num_threads_sampling 10 \
32 |     -num_threads_svd 32 \
33 |     -rounds 2000 \
34 |     -check_point 10 \
35 |     -noweight \
36 |     -nolog1p \
37 |     -log4cxx log4cxx.config) |& tee -a youtube.log
38 | 
39 | # Post-process the SVD output, then score the embedding with two values of C.
40 | python redsvd2emb.py --name "$OUTPUT" --dim 128
41 | python predict.py --label "$LABEL" --embedding "${OUTPUT}_128.npy" --seed 0 --C 1 --start-train-ratio 1 --stop-train-ratio 10 --num-train-ratio 10
42 | python predict.py --label "$LABEL" --embedding "${OUTPUT}_128.npy" --seed 0 --C 10 --start-train-ratio 1 --stop-train-ratio 10 --num-train-ratio 10
43 | 


--------------------------------------------------------------------------------
/src/BinaryGraphWalker.cc:
--------------------------------------------------------------------------------
  1 | #include "BinaryGraphWalker.h"
  2 | 
  3 | #include <cassert>
  4 | #include <numeric>
  5 | #include <fstream>
  6 | #include <cassert>
  7 | #include <functional>
  8 | #include <algorithm>
  9 | #include <cmath>
 10 | #include <ctime>
 11 | #include <omp.h>
 12 | 
 13 | // include gflags
 14 | #include <gflags/gflags.h>
 15 | DECLARE_int32(num_threads_svd);
 16 | DECLARE_int32(rank);
 17 | DECLARE_int32(negative);
 18 | DECLARE_string(output_svd);
 19 | 
 20 | using namespace log4cxx;
 21 | // Passes the CSR arrays, the degrees and T on to GraphWalker and logs that the graph is unweighted.
 22 | BinaryGraphWalker::BinaryGraphWalker(const std::vector<VertexId>& indices,
 23 |         const std::vector<VertexId>& indptr, int T, const std::vector<float>& degree)
 24 |     : GraphWalker(indices, indptr, degree, T) {
 25 |     LOG4CXX_INFO(logger, "unweighted network");
 26 | }
 27 | 
 28 | // Build a walker for an unweighted graph from a whitespace-separated "src dst" edge list.
 29 | // Each pair read is stored as one directed CSR entry src -> dst; self-loops are dropped and
 30 | // reading stops at the first token that does not parse as a vertex id. The vertex count is
 31 | // (largest id seen) + 1, so a file without edges yields a single isolated vertex.
 32 | BinaryGraphWalker* BinaryGraphWalker::getWalker(const std::string& fname, int T) {
 33 |     std::vector<VertexId> out_degree;
 34 |     std::vector<VertexId> edge_pair; // flattened (src, dst) pairs in file order
 35 | 
 36 |     VertexId max_vertex_id = 0;
 37 |     std::ifstream fin(fname);
 38 |     assert(fin.is_open()); // NOTE: compiled out under NDEBUG; a missing file then reads as empty
 39 |     VertexId src, dst;
 40 |     while (fin >> src >> dst) {
 41 |         // grow the degree table whenever an id at least as large as the current maximum shows up
 42 |         if (src >= max_vertex_id || dst >= max_vertex_id) {
 43 |             max_vertex_id = std::max(src, dst);
 44 |             out_degree.resize(max_vertex_id + 1, 0);
 45 |         }
 46 |         if (src == dst) continue; // drop self-loops
 47 |         ++out_degree[src];
 48 |         edge_pair.push_back(src);
 49 |         edge_pair.push_back(dst);
 50 |     }
 51 |     const VertexId num_vertex = max_vertex_id + 1;
 52 |     // No pair was read: size the table anyway so that degree.size() + 1 == indptr.size(),
 53 |     // which the GraphWalker constructor asserts.
 54 |     out_degree.resize(num_vertex, 0);
 55 | 
 56 |     // indptr[v] is the first slot of v's neighbor list; degree is a float copy of the counts
 57 |     std::vector<VertexId> indptr(num_vertex + 1, 0);
 58 |     std::partial_sum(out_degree.begin(), out_degree.end(), indptr.begin() + 1);
 59 |     std::vector<float> degree(out_degree.begin(), out_degree.end());
 60 | 
 61 |     EdgeId edge_cnt = edge_pair.size() >> 1;
 62 |     std::vector<VertexId> indices(edge_cnt, 0);
 63 |     // out_degree now counts the free slots left in each row, so rows are filled back to front
 64 |     for (EdgeId e = 0; e < edge_cnt; ++e) {
 65 |         const VertexId from = edge_pair[e << 1];
 66 |         const VertexId to = edge_pair[(e << 1) + 1];
 67 |         indices[indptr[from] + (--out_degree[from])] = to;
 68 |     }
 69 |     return new BinaryGraphWalker(indices, indptr, T, degree);
 70 | }
 71 | 
 72 | 
 73 | // Take up to `step` pseudo-random hops from u; the walk stops early at a vertex with no out-edges.
 74 | VertexId BinaryGraphWalker::randomWalk(VertexId u, int step, unsigned* seed) const {
 75 |     for (;step--;) {
 76 |         const VertexId deg = indptr[u+1] - indptr[u]; // u's neighbors: indices[indptr[u]:indptr[u+1]]
 77 |         if (deg == 0) break; // dead end: `rand_r(seed) % 0` would be undefined behavior
 78 |         u = indices[indptr[u] + rand_r(seed) % deg];
 79 |     }
 80 |     return u;
 81 | }
 82 | 
 83 | // Draw one sample for the edge (u, v) and path length r: pick a split k in [1, r], walk
 84 | // k - 1 hops from u and r - k hops from v, and append the two end points, smaller id first.
 85 | void BinaryGraphWalker::samplePath(const VertexId u, const VertexId v, int r, unsigned* seed,
 86 |         std::vector<VertexPair>& sampled_pairs) const {
 87 |     const int split = rand_r(seed) % r + 1;
 88 |     // the RNG is consumed in a fixed order: split first, then the u-side walk, then the v-side walk
 89 |     const VertexId end_u = randomWalk(u, split - 1, seed);
 90 |     const VertexId end_v = randomWalk(v, r - split, seed);
 91 | 
 92 |     // add record (min, max, 1)
 93 |     sampled_pairs.push_back(
 94 |             std::make_pair(std::min(end_u, end_v), std::max(end_u, end_v)));
 95 | }
 96 | 
 97 | // Draw samples with `num_threads` OpenMP threads and store the merged counter in counter_merged.
 98 | //   round        total passes over the edge list; every thread runs ceil(round / num_threads) of them
 99 | //   machine      mixed into each thread's RNG seed together with the thread index
100 | //   check_point  a thread folds its raw samples into its counter every `check_point` passes
101 | //                and after its last pass; 0 means "after the last pass only"
102 | void BinaryGraphWalker::sampling(int round, int num_threads,
103 |         const std::string& machine, int check_point) {
104 |     omp_set_num_threads(num_threads);
105 |     std::vector<std::vector<ValuedVertexPair>*> counters;
106 |     for (int i = 0; i < num_threads; ++i) {
107 |         counters.push_back(new std::vector<ValuedVertexPair>);
108 |     }
109 | 
110 |     #pragma omp parallel default(shared)
111 |     {
112 |         int this_thread = omp_get_thread_num();
113 |         std::string thread_name = std::string("machine_") + machine
114 |             + std::string("_thread_") + std::to_string(this_thread);
115 |         LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " thread name is " << thread_name );
116 |         unsigned seed = std::hash<std::string>{}(thread_name); // seed derived from the thread name
117 |         LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " set seed " << seed);
118 |         std::vector<VertexPair> sampled_pairs;
119 |         std::vector<ValuedVertexPair> *&counter = counters[this_thread];
120 |         std::vector<ValuedVertexPair> *counter_tmp = new std::vector<ValuedVertexPair>;
121 |         int my_round= ceil((double)round / num_threads);
122 |         for (int i=0; i<my_round; ++i) {
123 |             // one pass: for every stored edge (u, v) draw one sample per path length r = 1..T
124 |             for (VertexId u=0; u+1 < indptr.size(); ++u) {
125 |                 for (size_t j=indptr[u]; j<indptr[u+1]; ++j) {
126 |                     VertexId v = indices[j];
127 |                     for (int r=1; r<=T; ++r) {
128 |                         samplePath(u, v, r, &seed, sampled_pairs);
129 |                     }
130 |                 }
131 |             }
132 |             // `% 0` is undefined, so check_point == 0 is treated as "no intermediate folds"
133 |             const bool at_check_point = check_point != 0 && (i + 1) % check_point == 0;
134 |             if (at_check_point || i + 1 == my_round) {
135 |                 float max_val = merge(*counter, *counter_tmp, sampled_pairs);
136 |                 std::swap(counter, counter_tmp);
137 |                 sampled_pairs.clear();
138 |                 counter_tmp->clear();
139 |                 LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " complete " << i + 1 << " rounds, size of counter=" << counter->size() << " counter.max_val=" << max_val);
140 |             }
141 |         }
142 |         LOG4CXX_INFO(logger, "[thread " << this_thread << "] finish job");
143 |         delete counter_tmp;
144 |     }
145 | 
146 |     // merge the per-thread counters pairwise until a single one is left
147 |     while (counters.size() > 1) {
148 |         LOG4CXX_INFO(logger, counters.size() << " counters to merge.");
149 |         size_t n_half = (counters.size() + 1) >> 1;
150 |         const int n_pairs = counters.size() >> 1;
151 |         omp_set_num_threads(n_pairs);
152 |         // A work-shared loop merges every pair even if the runtime grants fewer than n_pairs
153 |         // threads; a plain parallel region would leave pairs unmerged and resize() would drop them.
154 |         #pragma omp parallel for default(shared) schedule(static, 1)
155 |         for (int p = 0; p < n_pairs; ++p) {
156 |             LOG4CXX_INFO(logger, "merge counter " << p << " and " << n_half + p);
157 |             std::vector<ValuedVertexPair> *counter_tmp = merge_counters(*counters[p], *counters[n_half + p]);
158 |             delete counters[p];
159 |             delete counters[n_half + p];
160 |             counters[p] = counter_tmp;
161 |         }
162 |         counters.resize(n_half);
163 |     }
164 |     counter_merged = counters[0];
165 | }
166 | 
167 | // Fold the raw samples into the sorted `counter` and append the result to `tmp`.
168 | // `sampled_pairs` is sorted in place and every run of equal pairs adds its length to that
169 | // pair's value. Returns the largest value appended to `tmp` (0 if nothing was appended).
170 | float BinaryGraphWalker::merge(const std::vector<ValuedVertexPair>& counter,
171 |         std::vector<ValuedVertexPair>& tmp, std::vector<VertexPair>& sampled_pairs) {
172 |     std::sort(sampled_pairs.begin(), sampled_pairs.end());
173 |     float largest = 0;
174 |     const size_t n_samples = sampled_pairs.size();
175 |     std::vector<ValuedVertexPair>::const_iterator old_it = counter.cbegin();
176 |     for (size_t run_begin = 0, run_end = 0; run_begin < n_samples; run_begin = run_end) {
177 |         const VertexPair& key = sampled_pairs[run_begin];
178 |         run_end = run_begin + 1;
179 |         while (run_end < n_samples && sampled_pairs[run_end] == key) ++run_end;
180 |         // entries of `counter` that sort before the key are copied through unchanged
181 |         while (old_it != counter.end() && old_it->first < key) {
182 |             largest = std::max(largest, old_it->second);
183 |             tmp.push_back(*old_it++);
184 |         }
185 |         float total = float(run_end - run_begin);
186 |         if (old_it != counter.end() && old_it->first == key) total += (old_it++)->second;
187 |         largest = std::max(largest, total);
188 |         tmp.push_back(std::make_pair(key, total));
189 |     }
190 |     for (; old_it != counter.end(); ++old_it) {
191 |         largest = std::max(largest, old_it->second);
192 |         tmp.push_back(*old_it);
193 |     }
194 |     return largest;
195 | }
196 | 
197 | 
198 | 
199 | 
200 | 


--------------------------------------------------------------------------------
/src/BinaryGraphWalker.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "GraphWalker.h"
 4 | 
 5 | // Walker for unweighted graphs: every sampled pair contributes a count of 1.
 6 | class BinaryGraphWalker : public GraphWalker {
 7 | public:
 8 |     // Reads a "src dst" edge list from `name` and returns a walker allocated with new.
 9 |     static BinaryGraphWalker* getWalker(const std::string& name, int T);
10 |     BinaryGraphWalker(const std::vector<VertexId>& indices,
11 |             const std::vector<VertexId>& indptr, int T,
12 |             const std::vector<float>& degree);
13 | 
14 |     // Appends one sampled end-point pair for edge (u, v) and path length r to sampled_pair.
15 |     void samplePath(VertexId u, VertexId v, int r, unsigned* seed,
16 |             std::vector<VertexPair>& sampled_pair) const;
17 |     // Returns the vertex reached after `step` random hops starting at u.
18 |     VertexId randomWalk(VertexId u, int step, unsigned* seed) const;
19 |     // Implements GraphWalker::sampling; the merged result is stored in counter_merged.
20 |     void sampling(int round, int num_threads,
21 |             const std::string& machine,
22 |             int check_point);
23 |     // Folds sampled_pairs (sorted in place) into `counter`, writes to `tmp`, returns the max value.
24 |     static float merge(const std::vector<ValuedVertexPair>& counter,
25 |             std::vector<ValuedVertexPair>& tmp,
26 |             std::vector<VertexPair>& sampled_pairs);
27 | };
28 | 
29 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # redsvd/ is expected to define the `redsvd` target that netsmf links against below.
 2 | add_subdirectory(redsvd)
 3 | 
 4 | # The math library is linked explicitly on UNIX; elsewhere MATH_LIB is left empty.
 5 | if(UNIX)
 6 |   set(MATH_LIB m)
 7 | else()
 8 |   set(MATH_LIB)
 9 | endif()
10 | 
11 | # executable declaration
12 | add_executable(netsmf
13 |   netsmf.cc
14 |   BinaryGraphWalker.cc
15 |   GraphWalker.cc
16 |   WeightGraphWalker.cc
17 | )
18 | 
19 | # Linking. PRIVATE because an executable exports no usage requirements to other targets.
20 | target_link_libraries(netsmf PRIVATE
21 |   redsvd
22 |   ${MATH_LIB}
23 |   ${ZLIB_LIBRARIES}
24 |   ${GFLAGS_LIBRARIES}
25 |   ${LOG4CXX_LIBRARIES}
26 | )
27 | 
28 | # Installing
29 | install(TARGETS netsmf DESTINATION bin)
30 | 


--------------------------------------------------------------------------------
/src/GraphWalker.cc:
--------------------------------------------------------------------------------
  1 | #include "GraphWalker.h"
  2 | #include <numeric> // std::partial_sum
  3 | #include <omp.h>
  4 | 
  5 | // include gflags
  6 | #include <gflags/gflags.h>
  7 | DECLARE_int32(num_threads_svd);
  8 | DECLARE_int32(rank);
  9 | DECLARE_int32(negative);
 10 | DECLARE_string(output_svd);
 11 | DECLARE_bool(log1p);
 12 | 
 13 | // include redsvd headers
 14 | #include "redsvd/util.hpp"
 15 | #include "redsvd/redsvd.hpp"
 16 | #include "redsvd/redsvdFile.hpp"
 17 | 
 18 | using namespace log4cxx;
 19 | LoggerPtr GraphWalker::logger(Logger::getLogger("GraphWalker"));
 20 | // Copies the CSR arrays and degrees, allocates the two (empty) sparsifier buffers and leaves counter_merged NULL.
 21 | GraphWalker::GraphWalker(const std::vector<VertexId>& indices_,
 22 |         const std::vector<VertexId>& indptr_,
 23 |         const std::vector<float>& degree_,
 24 |         int T_)
 25 |     : indices(indices_), indptr(indptr_), degree(degree_), T(T_) {
 26 |     assert(indptr.size() == degree.size() + 1);
 27 |     sparsifier_lower = new std::vector<ValuedVertexPair>();
 28 |     sparsifier_upper = new std::vector<ValuedVertexPair>();
 29 |     counter_merged = NULL;
 30 | }
 31 | 
 32 | // Turn the merged sample counts into the sparse matrix entries consumed by redsvd().
 33 | // A count c of pair (src, dst) becomes f(c * factor / degree[src] / degree[dst]) with f = log1p or
 34 | // log (chosen by FLAGS_log1p); diagonal counts are doubled first and only results > 0 are kept.
 35 | // Kept entries go to sparsifier_upper, their mirror images to sparsifier_lower; counter_merged is freed.
 36 | void GraphWalker::transformation() {
 37 |     if (counter_merged == NULL) { // nothing to transform: sampling() has not stored a counter
 38 |         LOG4CXX_ERROR(logger, "transformation() needs the counter produced by sampling()");
 39 |         return;
 40 |     }
 41 |     LOG4CXX_INFO(logger, "transformation ...");
 42 |     double M = 0;
 43 |     for (auto iter = counter_merged->cbegin(); iter != counter_merged->cend(); ++iter) {
 44 |         M += iter->second * 2;
 45 |     }
 46 |     LOG4CXX_INFO(logger, "total number of samples=" << M);
 47 |     double num_edges = (double)indices.size();
 48 |     double vol = 0.0;
 49 |     for (auto const& val : degree) {
 50 |         vol += val;
 51 |     }
 52 |     LOG4CXX_INFO(logger, "vol(G)=" << vol);
 53 |     double factor = vol * num_edges / M / FLAGS_negative;
 54 |     VertexId src, dst;
 55 |     double val;
 56 |     std::vector<VertexId> nnz_lower_row(degree.size(), 0); // mirrored entries per row (= dst)
 57 |     size_t nnz_lower = 0;
 58 |     sparsifier_upper->clear();
 59 |     sparsifier_lower->clear();
 60 |     if (FLAGS_log1p) {
 61 |         LOG4CXX_INFO(logger, "using log1p...");
 62 |     } else {
 63 |         LOG4CXX_INFO(logger, "using truncated logarithm...");
 64 |     }
 65 |     std::function<double (double)> log1p_func = log1p;
 66 |     std::function<double (double)> log_func = log;
 67 |     auto mylog = FLAGS_log1p ? log1p_func : log_func;
 68 |     for (auto iter = counter_merged->cbegin(); iter != counter_merged->cend(); ++iter) {
 69 |         src = iter->first.first;
 70 |         dst = iter->first.second;
 71 |         val = src != dst ? iter->second : iter->second * 2;
 72 |         val = mylog(val * factor / degree[src] / degree[dst]);
 73 |         if (val > 0) {
 74 |             sparsifier_upper->push_back(std::make_pair(iter->first, (float)val));
 75 |             if (src != dst) {
 76 |                 ++nnz_lower_row[dst];
 77 |                 ++nnz_lower;
 78 |             }
 79 |         }
 80 |     }
 81 |     LOG4CXX_INFO(logger, "after log, #nnz in upper triangle and diagonal reduces to " << sparsifier_upper->size() << " (from " << counter_merged->size() << ")");
 82 |     delete counter_merged;
 83 |     counter_merged = NULL; // do not leave a dangling pointer behind
 84 | 
 85 |     LOG4CXX_INFO(logger, "constructing lower triangle ...");
 86 |     // now, sparsifier_upper stores upper triangle + diagonal;
 87 |     // sparsifier_lower receives the mirrored entries, grouped by row via lower_indptr
 88 |     std::vector<VertexId> lower_indptr(degree.size() + 1, 0);
 89 |     std::partial_sum(nnz_lower_row.begin(), nnz_lower_row.end(), lower_indptr.begin() + 1);
 90 |     sparsifier_lower->resize(nnz_lower);
 91 |     LOG4CXX_INFO(logger, "lower triangle has " << nnz_lower << " nnz.");
 92 |     // walking the upper entries backwards while filling each row from its last free slot
 93 |     for (auto riter = sparsifier_upper->crbegin(); riter != sparsifier_upper->crend(); ++riter) {
 94 |         src = riter->first.first;
 95 |         dst = riter->first.second;
 96 |         if (src == dst) continue; // the diagonal has no mirror image
 97 |         auto iter = sparsifier_lower->begin() + lower_indptr[dst] + (--nnz_lower_row[dst]);
 98 |         iter->first.first = dst;
 99 |         iter->first.second = src;
100 |         iter->second = riter->second;
101 |     }
102 |     LOG4CXX_INFO(logger, "lower triangle constructed.");
103 | }
103 | // Assemble the sparse matrix from both sparsifier buffers, delete the buffers, run the randomized SVD and write the result out.
104 | void GraphWalker::redsvd() {
105 |     Eigen::setNbThreads(FLAGS_num_threads_svd);
106 |     LOG4CXX_INFO(logger, "prepare svd ...");
107 |     REDSVD::SMatrixXf A;
108 |     // square matrix with one row and one column per vertex
109 |     A.resize(degree.size(), degree.size());
110 |     // reserve room for every stored entry of both buffers
111 |     A.reserve(sparsifier_upper->size() + sparsifier_lower->size());
112 |     auto iter_lower = sparsifier_lower->cbegin();
113 |     auto iter_upper = sparsifier_upper->cbegin();
114 |     for (size_t i = 0; i < degree.size(); ++i) {
115 |         A.startVec(i); // entries with first index i: those from sparsifier_lower first, then sparsifier_upper
116 |         for (;iter_lower != sparsifier_lower->cend() && iter_lower->first.first == i; ++iter_lower) {
117 |             A.insertBack(i, iter_lower->first.second) = iter_lower->second;
118 |         }
119 |         for (;iter_upper != sparsifier_upper->cend() && iter_upper->first.first == i; ++iter_upper) {
120 |             A.insertBack(i, iter_upper->first.second) = iter_upper->second;
121 |         }
122 |     }
123 |     A.finalize();
124 |     sparsifier_upper->clear();
125 |     sparsifier_lower->clear();
126 |     delete sparsifier_upper;
127 |     delete sparsifier_lower;
128 | 
129 |     LOG4CXX_INFO(logger, "running randomized SVD...");
130 |     const double start = REDSVD::Util::getSec();
131 |     REDSVD::RedSVD svdOfA(A, FLAGS_rank < degree.size() ? FLAGS_rank : degree.size());
132 |     LOG4CXX_INFO(logger, "done in " << REDSVD::Util::getSec() - start);
133 | 
134 |     // NOTE(review): presumably writes files prefixed by FLAGS_output_svd (redsvd2emb.py reads <name>.U / <name>.S) - confirm in redsvdFile.hpp
135 | 	REDSVD::writeMatrix(FLAGS_output_svd, svdOfA);
136 | }
137 | 
138 | // Merge two counters into a counter allocated with new; pairs present in both inputs appear
139 | // once with their values added. Expects both inputs to be sorted by vertex pair.
140 | std::vector<ValuedVertexPair>* GraphWalker::merge_counters(const std::vector<ValuedVertexPair>& counter,
141 |         const std::vector<ValuedVertexPair>& counter_other) {
142 |     typedef std::vector<ValuedVertexPair>::const_iterator Iter;
143 |     Iter lhs = counter.cbegin();
144 |     const Iter lhs_end = counter.cend();
145 |     Iter rhs = counter_other.cbegin();
146 |     const Iter rhs_end = counter_other.cend();
147 |     std::vector<ValuedVertexPair> *merged = new std::vector<ValuedVertexPair>;
148 | 
149 |     // two-way merge; entries with equal keys are combined into a single entry
150 |     while (lhs != lhs_end && rhs != rhs_end) {
151 |         if (rhs->first < lhs->first) {
152 |             merged->push_back(*rhs++);
153 |         } else if (lhs->first < rhs->first) {
154 |             merged->push_back(*lhs++);
155 |         } else {
156 |             merged->push_back(std::make_pair(lhs->first, lhs->second + rhs->second));
157 |             ++lhs;
158 |             ++rhs;
159 |         }
160 |     }
161 | 
162 |     // at most one of the inputs still has entries; append them unchanged
163 |     for (; lhs != lhs_end; ++lhs) merged->push_back(*lhs);
164 |     for (; rhs != rhs_end; ++rhs) merged->push_back(*rhs);
165 |     return merged;
166 | }
167 | 
168 | 


--------------------------------------------------------------------------------
/src/GraphWalker.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <vector>
 4 | #include <iostream>
 5 | #include <string>
 6 | #include "log4cxx/logger.h"
 7 | 
 8 | 
 9 | //using VertexId = unsigned long; //uint32_t;
10 | using VertexId = unsigned int; //uint32_t;
11 | using EdgeId = unsigned int; //uint32_t;
12 | using VertexPair = std::pair<VertexId, VertexId>;
13 | using VertexPairCount = std::pair<VertexPair, unsigned int>;
14 | using ValuedVertexPair = std::pair<std::pair<VertexId, VertexId>, float>;
15 | 
16 | 
17 | /* indices, indptr, data
18 |  * indices is array of column indices
19 |  * data is array of corresponding nonzero values
20 |  * indptr points to row starts in indices and data
21 |  * length is n_row + 1, last item = number of values = length of both indices and data
22 |  * nonzero values of the i-th row are data[indptr[i]:indptr[i+1]] with column indices indices[indptr[i]:indptr[i+1]]
23 |  * item (i, j) can be accessed as data[indptr[i]+k], where k is position of j in indices[indptr[i]:indptr[i+1]]
24 |  */
25 | 
26 | // Shared state and post-processing of the graph walkers; subclasses implement sampling().
27 | class GraphWalker {
28 | public:
29 |     static log4cxx::LoggerPtr logger;
30 |     GraphWalker(const std::vector<VertexId>& indices_, const std::vector<VertexId>& indptr_,
31 |         const std::vector<float>& degree_, int T);
32 |     // Virtual so that deleting a subclass through a GraphWalker* is well defined. Left empty on
33 |     // purpose: transformation() and redsvd() already delete the buffers declared below.
34 |     virtual ~GraphWalker() {}
35 | 
36 |     const std::vector<VertexId> indices; // column indices (layout described above)
37 |     const std::vector<VertexId> indptr;  // row starts; one entry more than degree
38 |     const std::vector<float> degree;
39 |     int T;
40 | 
41 |     std::vector<ValuedVertexPair> *sparsifier_upper, *sparsifier_lower;
42 |     std::vector<ValuedVertexPair> *counter_merged; // set by sampling(), consumed by transformation()
43 | 
44 |     virtual void sampling(int round, int num_threads,
45 |             const std::string& machine, int check_point) = 0;
46 |     void transformation();
47 |     void redsvd();
48 | 
49 |     static std::vector<ValuedVertexPair>* merge_counters(
50 |         const std::vector<ValuedVertexPair>& counter, const std::vector<ValuedVertexPair>& counter_other);
51 | };
52 | 
53 | 
54 | 


--------------------------------------------------------------------------------
/src/WeightGraphWalker.cc:
--------------------------------------------------------------------------------
  1 | #include "WeightGraphWalker.h"
  2 | 
  3 | #include <cassert>
  4 | #include <numeric>
  5 | #include <fstream>
  6 | #include <cassert>
  7 | #include <algorithm>
  8 | #include <cmath>
  9 | #include <omp.h>
 10 | 
 11 | // include gflags
 12 | #include <gflags/gflags.h>
 13 | DECLARE_int32(num_threads_svd); DECLARE_int32(rank);
 14 | DECLARE_int32(negative);
 15 | DECLARE_string(output_svd);
 16 | 
 17 | using namespace log4cxx;
 18 | // Passes the CSR arrays, degrees and T on to GraphWalker, copies the edge weights and their per-row prefix sums, and logs that the graph is weighted.
 19 | WeightGraphWalker::WeightGraphWalker(const std::vector<VertexId>& indices,
 20 |         const std::vector<VertexId>& indptr, int T,
 21 |         const std::vector<float>& data_,
 22 |         const std::vector<float>& prefix_sum_,
 23 |         const std::vector<float>& degree)
 24 |     : GraphWalker(indices, indptr, degree, T), data(data_), prefix_sum(prefix_sum_) {
 25 |     LOG4CXX_INFO(logger, "weighted network");
 26 | }
 27 | 
 28 | // Build a walker for a weighted graph from a whitespace-separated "src dst weight" edge list.
 29 | // Each triple read is stored as one directed CSR entry src -> dst; self-loops are dropped.
 30 | // The vertex count is (largest id seen) + 1; a vertex's degree is the sum of its edge weights.
 31 | WeightGraphWalker* WeightGraphWalker::getWalker(const std::string& fname, int T) {
 32 |     std::vector<VertexId> out_degree;
 33 |     std::vector<VertexId> edge_pair; // flattened (src, dst) pairs in file order
 34 |     std::vector<float> weight;       // weight[e] belongs to the e-th stored pair
 35 |     std::vector<float> generalized_out_degree;
 36 | 
 37 |     VertexId max_vertex_id = 0;
 38 |     std::ifstream fin(fname);
 39 |     assert(fin.is_open()); // NOTE: compiled out under NDEBUG; a missing file then reads as empty
 40 |     VertexId src, dst;
 41 |     double w;
 42 |     while (fin >> src >> dst >> w) {
 43 |         // grow the per-vertex tables whenever an id at least as large as the current maximum shows up
 44 |         if (src >= max_vertex_id || dst >= max_vertex_id) {
 45 |             max_vertex_id = std::max(src, dst);
 46 |             out_degree.resize(max_vertex_id + 1, 0);
 47 |             generalized_out_degree.resize(max_vertex_id + 1, 0);
 48 |         }
 49 |         if (src == dst) continue; // drop self-loops
 50 |         ++out_degree[src];
 51 |         generalized_out_degree[src] += w;
 52 |         edge_pair.push_back(src);
 53 |         edge_pair.push_back(dst);
 54 |         weight.push_back(w);
 55 |     }
 56 |     const VertexId num_vertex = max_vertex_id + 1;
 57 |     // No triple was read: size the tables anyway so the GraphWalker size assertion holds.
 58 |     out_degree.resize(num_vertex, 0);
 59 |     generalized_out_degree.resize(num_vertex, 0);
 60 | 
 61 |     std::vector<VertexId> indptr(num_vertex + 1, 0);
 62 |     std::partial_sum(out_degree.begin(), out_degree.end(), indptr.begin() + 1);
 63 | 
 64 |     EdgeId edge_cnt = edge_pair.size() >> 1;
 65 |     std::vector<VertexId> indices(edge_cnt, 0);
 66 |     std::vector<float> data(edge_cnt, 0.0);
 67 | 
 68 |     // out_degree now counts the free slots left in each row, so rows are filled back to front
 69 |     for (EdgeId e = 0; e < edge_cnt; ++e) {
 70 |         const VertexId from = edge_pair[e << 1];
 71 |         const VertexId to = edge_pair[(e << 1) + 1];
 72 |         const EdgeId idx = indptr[from] + (--out_degree[from]);
 73 |         indices[idx] = to;
 74 |         data[idx] = weight[e];
 75 |     }
 76 | 
 77 |     // prefix_sum restarts at every row: running sum of the weights in that vertex's neighbor list.
 78 |     // The bound is num_vertex; `v < max_vertex_id` left the last vertex's prefix sums at 0.
 79 |     std::vector<float> prefix_sum(edge_cnt, 0.0);
 80 |     for (VertexId v = 0; v < num_vertex; ++v) {
 81 |         std::partial_sum(data.begin() + indptr[v], data.begin() + indptr[v + 1], prefix_sum.begin() + indptr[v]);
 82 |     }
 83 |     return new WeightGraphWalker(indices, indptr, T, data, prefix_sum, generalized_out_degree);
 84 | }
 85 | 
 86 | // Walk up to `step` hops from u, picking each neighbor with probability proportional to its edge
 87 | // weight, and add 1 / weight of every traversed edge to Z. Stops early at a vertex with no out-edges.
 88 | VertexId WeightGraphWalker::randomWalk(VertexId u, int step, double& Z, unsigned* seed) const {
 89 |     for (;step--;) {
 90 |         // u's neighbors are indices[indptr[u]:indptr[u+1]]
 91 |         int head = indptr[u], tail = indptr[u+1] - 1, pos = tail;
 92 |         if (tail < head) break; // empty row: prefix_sum[tail] would not belong to u
 93 |         double ratio = (double)rand_r(seed) / RAND_MAX;
 94 |         double generalized_out_degree = prefix_sum[tail];
 95 |         // Find the first slot whose running weight reaches ratio * total. Both bounds are
 96 |         // inclusive, so head == tail must still be tested; with `head < tail` that slot was
 97 |         // skipped and, e.g., the first of three neighbors could never be chosen.
 98 |         for (;head <= tail;) {
 99 |             int mid = (head + tail) >> 1;
100 |             if (prefix_sum[mid] >= ratio * generalized_out_degree) { pos = mid; tail = mid - 1; }
101 |             else { head = mid + 1; }
102 |         }
103 |         u = indices[pos];
104 |         Z += 1. / data[pos];
105 |     }
106 |     return u;
107 | }
108 | 
109 | // Draw one sample for the edge (u, v) of weight w and path length r: split r at a random k in
110 | // [1, r], walk k - 1 hops from u and r - k hops from v, and append the end points (smaller id
111 | // first) with value r / Z, where Z is 1 / w plus 1 / weight of every edge the walks traverse.
112 | void WeightGraphWalker::samplePath(VertexId u, VertexId v, double w, int r, unsigned* seed,
113 |         std::vector<ValuedVertexPair>& sampled_pair) const {
114 |     const int split = rand_r(seed) % r + 1;
115 |     double inv_weight_sum = 1. / w;
116 |     const VertexId end_u = randomWalk(u, split - 1, inv_weight_sum, seed);
117 |     const VertexId end_v = randomWalk(v, r - split, inv_weight_sum, seed);
118 | 
119 |     const VertexPair ends = end_u > end_v ? std::make_pair(end_v, end_u) : std::make_pair(end_u, end_v);
120 |     sampled_pair.push_back(std::make_pair(ends, float(r / inv_weight_sum)));
121 | }
122 | 
123 | // Weighted sampling with `num_threads` OpenMP threads; the merged counter ends up in counter_merged.
124 | //   round        total passes over the edge list; every thread runs ceil(round / num_threads) of them
125 | //   machine      mixed into each thread's RNG seed together with the thread index
126 | //   check_point  a thread folds its raw samples into its counter every `check_point` passes
127 | //                and after its last pass; 0 means "after the last pass only"
128 | void WeightGraphWalker::sampling(int round, int num_threads,
129 |         const std::string& machine, int check_point) {
130 |     omp_set_num_threads(num_threads);
131 |     std::vector<std::vector<ValuedVertexPair>*> counters;
132 |     for (int i = 0; i < num_threads; ++i) {
133 |         counters.push_back(new std::vector<ValuedVertexPair>);
134 |     }
135 | 
136 |     #pragma omp parallel default(shared)
137 |     {
138 |         int this_thread = omp_get_thread_num();
139 |         std::string thread_name = std::string("machine_") + machine
140 |             + std::string("_thread_") + std::to_string(this_thread);
141 |         LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " thread name is " << thread_name);
142 |         unsigned seed = std::hash<std::string>{}(thread_name); // seed derived from the thread name
143 |         LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " set seed " << seed);
144 |         std::vector<ValuedVertexPair> sampled_pairs;
145 |         std::vector<ValuedVertexPair> *&counter = counters[this_thread];
146 |         std::vector<ValuedVertexPair> *counter_tmp = new std::vector<ValuedVertexPair>;
147 |         int my_round= ceil((double)round / num_threads);
148 |         for (int i=0; i<my_round; ++i) {
149 |             // one pass: per stored edge one sample for every r = 1..T (was r < T, unlike BinaryGraphWalker)
150 |             for (VertexId u=0; u+1 < indptr.size(); ++u) {
151 |                 for (size_t j=indptr[u]; j<indptr[u+1]; ++j) {
152 |                     VertexId v = indices[j];
153 |                     for (int r=1; r<=T; ++r) {
154 |                         samplePath(u, v, data[j], r, &seed, sampled_pairs);
155 |                     }
156 |                 }
157 |             }
158 |             // `% 0` is undefined, so check_point == 0 is treated as "no intermediate folds"
159 |             const bool at_check_point = check_point != 0 && (i + 1) % check_point == 0;
160 |             if (at_check_point || i + 1 == my_round) {
161 |                 float max_val = merge(*counter, *counter_tmp, sampled_pairs);
162 |                 std::swap(counter, counter_tmp);
163 |                 sampled_pairs.clear(); // already folded; keeping them would count them again next time
164 |                 counter_tmp->clear();
165 |                 LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " complete " << i + 1 << " rounds, size of counter=" << counter->size() << " counter.max_val=" << max_val);
166 |             }
167 |         }
168 |         LOG4CXX_INFO(logger, "[thread " << this_thread << "] finish job");
169 |         delete counter_tmp;
170 |     }
171 | 
172 |     // merge the per-thread counters pairwise until a single one is left
173 |     while (counters.size() > 1) {
174 |         LOG4CXX_INFO(logger, counters.size() << " counters to merge.");
175 |         size_t n_half = (counters.size() + 1) >> 1;
176 |         const int n_pairs = counters.size() >> 1;
177 |         omp_set_num_threads(n_pairs);
178 |         // A work-shared loop merges every pair even if the runtime grants fewer than n_pairs
179 |         // threads; a plain parallel region would leave pairs unmerged and resize() would drop them.
180 |         #pragma omp parallel for default(shared) schedule(static, 1)
181 |         for (int p = 0; p < n_pairs; ++p) {
182 |             LOG4CXX_INFO(logger, "merge counter " << p << " and " << n_half + p);
183 |             std::vector<ValuedVertexPair> *counter_tmp = merge_counters(*counters[p], *counters[n_half + p]);
184 |             delete counters[p];
185 |             delete counters[n_half + p];
186 |             counters[p] = counter_tmp;
187 |         }
188 |         counters.resize(n_half);
189 |     }
190 |     counter_merged = counters[0];
191 | }
192 | 
193 | // Fold the weighted samples into the sorted `counter` and append the result to `tmp`; samples
194 | // are sorted in place and summed per vertex pair. Returns the largest value appended (0 if none).
195 | float WeightGraphWalker::merge(const std::vector<ValuedVertexPair>& counter,
196 |         std::vector<ValuedVertexPair>& tmp,
197 |         std::vector<ValuedVertexPair>& sampled_pairs) {
198 |     std::sort(sampled_pairs.begin(), sampled_pairs.end());
199 |     float largest = 0;
200 |     const size_t n_samples = sampled_pairs.size();
201 |     std::vector<ValuedVertexPair>::const_iterator old_it = counter.cbegin();
202 |     size_t run_end = 0;
203 |     for (size_t run_begin = 0; run_begin < n_samples; run_begin = run_end) {
204 |         const VertexPair& key = sampled_pairs[run_begin].first;
205 |         float total = sampled_pairs[run_begin].second;
206 |         for (run_end = run_begin + 1; run_end < n_samples && sampled_pairs[run_end].first == key; ++run_end) {
207 |             total += sampled_pairs[run_end].second;
208 |         }
209 |         // entries of `counter` that sort before the key are copied through unchanged
210 |         while (old_it != counter.end() && old_it->first < key) {
211 |             largest = std::max(largest, old_it->second);
212 |             tmp.push_back(*old_it++);
213 |         }
214 |         if (old_it != counter.end() && old_it->first == key) {
215 |             total += old_it->second;
216 |             ++old_it;
217 |         }
218 |         largest = std::max(largest, total);
219 |         tmp.push_back(std::make_pair(key, total));
220 |     }
221 |     for (; old_it != counter.end(); ++old_it) {
222 |         largest = std::max(largest, old_it->second);
223 |         tmp.push_back(*old_it);
224 |     }
225 |     return largest;
226 | }
227 | 
228 | 
229 | 
230 | 
231 | 
232 | 


--------------------------------------------------------------------------------
/src/WeightGraphWalker.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "GraphWalker.h"
 4 | 
 5 | // Walker for weighted graphs; random steps use per-row prefix sums of the edge weights.
 6 | class WeightGraphWalker : public GraphWalker {
 7 | public:
 8 |     static WeightGraphWalker* getWalker(const std::string& name, int T); // reads "src dst weight" triples
 9 |     WeightGraphWalker(const std::vector<VertexId>& indices,
10 |             const std::vector<VertexId>& indptr, int T,
11 |             const std::vector<float>& data,
12 |             const std::vector<float>& prefix_sum,
13 |             const std::vector<float>& degree);
14 |     // samplePath appends one valued sample; randomWalk adds 1 / weight of every hop to Z
15 |     void samplePath(VertexId u, VertexId v, double w, int r, unsigned* seed,
16 |             std::vector<ValuedVertexPair>& sampled_pairs) const;
17 |     VertexId randomWalk(VertexId u, int step, double& Z, unsigned* seed) const;
18 |     void sampling(int round, int num_threads,
19 |             const std::string& machine,
20 |             int check_point);
21 |     // merge folds sampled_pairs (sorted in place) into counter, writes to tmp, returns the max value
22 |     static float merge(const std::vector<ValuedVertexPair>& counter,
23 |             std::vector<ValuedVertexPair>& tmp,
24 |             std::vector<ValuedVertexPair>& sampled_pairs);
25 | 
26 |     const std::vector<float> data;       // edge weights, stored in the same slots as indices
27 |     const std::vector<float> prefix_sum; // running sum of data, restarting at every row
28 | };
29 | 


--------------------------------------------------------------------------------
/src/config.h.cmake:
--------------------------------------------------------------------------------
// Macros defined at compilation time.
 2 | //
 3 | // Copyright (c) 2015 Idiap Research Institute, http://www.idiap.ch/
 4 | // Written by Rémi Lebret <remi@lebret.ch>
 5 | //
 6 | // This file is part of HPCA.
 7 | //
 8 | // HPCA is free software: you can redistribute it and/or modify
 9 | // it under the terms of the GNU General Public License version 3 as
10 | // published by the Free Software Foundation.
11 | //
12 | // HPCA is distributed in the hope that it will be useful,
13 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | // GNU General Public License for more details.
16 | //
17 | // You should have received a copy of the GNU General Public License
18 | // along with HPCA. If not, see <http://www.gnu.org/licenses/>.
19 | 
// Include guard for the header generated from this template.
#ifndef CONFIG_H
#define CONFIG_H

/*---- the configured options and settings for HPCA ---------------*/
/*---- the cmake configuration for NetSMF is adapted from HPCA ----*/

// Should we use Intel MKL through Eigen?
// The directive below is a CMake template marker: when this file is run
// through CMake's configure_file(), it is rewritten to a plain #define if the
// CMake variable EIGEN_USE_MKL_ALL is true, and to a commented-out #undef
// otherwise.
// NOTE(review): the configure_file() call itself lives in a CMakeLists.txt
// that is not shown here -- confirm it exists and that the generated header is
// on the include path.
#cmakedefine EIGEN_USE_MKL_ALL

#endif // CONFIG_H
30 | 


--------------------------------------------------------------------------------
/src/netsmf.cc:
--------------------------------------------------------------------------------
 1 | #include "BinaryGraphWalker.h"
 2 | #include "WeightGraphWalker.h"
 3 | #include <gflags/gflags.h>
 4 | #include <cstdio>
 5 | #include <cstdlib>
 6 | // include log4cxx header files.
 7 | #include "log4cxx/logger.h"
 8 | #include "log4cxx/basicconfigurator.h"
 9 | #include "log4cxx/propertyconfigurator.h"
10 | #include "log4cxx/helpers/exception.h"
11 | 
// Pull the log4cxx names (LoggerPtr, Logger, PropertyConfigurator, ...) into
// this translation unit.
using namespace log4cxx;
using namespace log4cxx::helpers;

// File-level logger registered under the name "main".
LoggerPtr logger(Logger::getLogger("main"));


// Command-line flags. Each DEFINE_* creates a FLAGS_<name> variable; main()
// below reads T, filename, machine, rounds, num_threads_sampling,
// check_point, weight and log4cxx directly.
// NOTE(review): output_svd, rank, negative, num_threads_svd and log1p are not
// referenced in this file -- presumably consumed by other translation units.
DEFINE_int32(T, 10, "Window size.");
DEFINE_string(filename, "edgelist", "Filename for edgelist file.");
DEFINE_string(machine, "localhost", "machine name for generating random seed by hash.");
// DEFINE_string(output_samples, "sample", "Filename for sampled pairs.");
DEFINE_string(output_svd, "sample", "Filename for svd results.");
DEFINE_int32(rank, 256, "embedding dimension.");
DEFINE_int32(negative, 1, "number of negative sampling.");
DEFINE_int32(num_threads_sampling, 32, "Number of threads.");
DEFINE_int32(num_threads_svd, 32, "Number of threads for svd.");
DEFINE_int32(rounds, 1000, "Number of rounds.");
DEFINE_int32(check_point, 2, "Check point every ? rounds.");
// DEFINE_int32(max_mem_GB, 200, "Maximum cached data.");
// Selects WeightGraphWalker (true) or BinaryGraphWalker (false) in main().
DEFINE_bool(weight, false, "Weighted graph");
DEFINE_bool(log1p, false, "Using log1p instead of truncated logarithm");
// Path handed to PropertyConfigurator::configure() in main().
DEFINE_string(log4cxx, "log4cxx.config", "Log4cxx config file");
33 | 
34 | int main(int argc, char** argv) {
35 |     gflags::ParseCommandLineFlags(&argc, &argv, true);
36 |     //BasicConfigurator::configure();
37 |     PropertyConfigurator::configure(FLAGS_log4cxx.c_str());
38 |     LOG4CXX_INFO(logger, "Entering application.");
39 | 
40 |     GraphWalker *walker = FLAGS_weight ?
41 |         (GraphWalker*)WeightGraphWalker::getWalker(FLAGS_filename.c_str(), FLAGS_T) :
42 |         (GraphWalker*)BinaryGraphWalker::getWalker(FLAGS_filename.c_str(), FLAGS_T);
43 | 
44 |     walker->sampling(FLAGS_rounds, FLAGS_num_threads_sampling,  FLAGS_machine, FLAGS_check_point);
45 |     walker->transformation();
46 |     walker->redsvd();
47 |     LOG4CXX_INFO(logger, "Exiting application.");
48 |     return 0;
49 | }
50 | 


--------------------------------------------------------------------------------
/src/redsvd/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | FILE(
 2 |   GLOB
 3 |   redsvd_files
 4 |   *.cpp
 5 |   *.hpp
 6 |   *.h
 7 | )
 8 | # Add "redsvd" library
 9 | ADD_LIBRARY(redsvd ${redsvd_files})
10 | IF(USE_BLAS)
11 |     TARGET_LINK_LIBRARIES(redsvd ${BLAS_LIBRARIES} Eigen3::Eigen)
12 | ENDIF(USE_BLAS)
13 | IF(USE_LAPACK)
14 |     TARGET_LINK_LIBRARIES(redsvd ${LAPACK_LIBRARIES} Eigen3::Eigen)
15 | ENDIF(USE_LAPACK)
16 | 


--------------------------------------------------------------------------------
/src/redsvd/cmdline.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2009, Hideyuki Tanaka
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without
  6 | modification, are permitted provided that the following conditions are met:
  7 |     * Redistributions of source code must retain the above copyright
  8 |       notice, this list of conditions and the following disclaimer.
  9 |     * Redistributions in binary form must reproduce the above copyright
 10 |       notice, this list of conditions and the following disclaimer in the
 11 |       documentation and/or other materials provided with the distribution.
 12 |     * Neither the name of the <organization> nor the
 13 |       names of its contributors may be used to endorse or promote products
 14 |       derived from this software without specific prior written permission.
 15 | 
 16 | THIS SOFTWARE IS PROVIDED BY <copyright holder> ''AS IS'' AND ANY
 17 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 | DISCLAIMED. IN NO EVENT SHALL <copyright holder> BE LIABLE FOR ANY
 20 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 | */
 27 | 
 28 | #pragma once
 29 | 
 30 | #include <iostream>
 31 | #include <sstream>
 32 | #include <vector>
 33 | #include <map>
 34 | #include <string>
 35 | #include <stdexcept>
 36 | #include <typeinfo>
 37 | #include <cstring>
 38 | #include <algorithm>
 39 | #include <cxxabi.h>
 40 | #include <cstdlib>
 41 | 
 42 | namespace cmdline{
 43 | 
 44 | namespace detail{
 45 | 
 46 | template <typename Target, typename Source, bool Same>
 47 | class lexical_cast_t{
 48 | public:
 49 |   static Target cast(const Source &arg){
 50 |     Target ret;
 51 |     std::stringstream ss;
 52 |     if (!(ss<<arg && ss>>ret && ss.eof()))
 53 |       throw std::bad_cast();
 54 |     
 55 |     return ret;
 56 |   }
 57 | };
 58 | 
 59 | template <typename Target, typename Source>
 60 | class lexical_cast_t<Target, Source, true>{
 61 | public:
 62 |   static Target cast(const Source &arg){
 63 |     return arg;
 64 |   }  
 65 | };
 66 | 
 67 | template <typename Source>
 68 | class lexical_cast_t<std::string, Source, false>{
 69 | public:
 70 |   static std::string cast(const Source &arg){
 71 |     std::ostringstream ss;
 72 |     ss<<arg;
 73 |     return ss.str();
 74 |   }
 75 | };
 76 | 
 77 | template <typename Target>
 78 | class lexical_cast_t<Target, std::string, false>{
 79 | public:
 80 |   static Target cast(const std::string &arg){
 81 |     Target ret;
 82 |     std::istringstream ss(arg);
 83 |     if (!(ss>>ret && ss.eof()))
 84 |       throw std::bad_cast();
 85 |     return ret;
 86 |   }
 87 | };
 88 | 
 89 | template <typename T1, typename T2>
 90 | struct is_same {
 91 |   static const bool value = false;
 92 | };
 93 | 
 94 | template <typename T>
 95 | struct is_same<T, T>{
 96 |   static const bool value = true;
 97 | };
 98 | 
 99 | template<typename Target, typename Source>
100 | Target lexical_cast(const Source &arg)
101 | {
102 |   return lexical_cast_t<Target, Source, detail::is_same<Target, Source>::value>::cast(arg);
103 | }
104 | 
// Returns the human-readable form of a mangled C++ type name.
// abi::__cxa_demangle() reports failure through `status` and a NULL result;
// in that case the input is returned unchanged rather than constructing a
// std::string from a null pointer (undefined behaviour).
static inline std::string demangle(const std::string &name)
{
  int status=0;
  char *p=abi::__cxa_demangle(name.c_str(), 0, 0, &status);
  if (status!=0 || p==NULL){
    free(p); // free(NULL) is a no-op; covers a non-NULL buffer on error too
    return name;
  }
  std::string ret(p);
  free(p); // the buffer is malloc()-allocated by __cxa_demangle
  return ret;
}

// Readable name of T, used in usage and description strings.
template <class T>
std::string readable_typename()
{
  return demangle(typeid(T).name());
}

// std::string is shown as "string" instead of its full demangled name.
// An explicit specialisation of a function template is an ordinary function,
// so it must be `inline` to live in a header included by several
// translation units.
template <>
inline std::string readable_typename<std::string>()
{
  return "string";
}
125 | 
126 | } // detail
127 | 
128 | //-----
129 | 
// Exception type raised by the command-line helpers; what() returns the
// message supplied at construction.
class cmdline_error : public std::exception {
public:
  cmdline_error(const std::string &msg)
    : text_(msg)
  {
  }

  ~cmdline_error() throw()
  {
  }

  // The returned pointer stays valid for as long as this object lives.
  const char *what() const throw()
  {
    return text_.c_str();
  }

private:
  std::string text_;
};
138 | 
139 | template <class T>
140 | struct default_reader{
141 |   T operator()(const std::string &str){
142 |     return detail::lexical_cast<T>(str);
143 |   }
144 | };
145 | 
146 | template <class T>
147 | struct range_reader{
148 |   range_reader(const T &low, const T &high): low(low), high(high) {}
149 |   T operator()(const std::string &s) const {
150 |     T ret=default_reader<T>()(s);
151 |     if (!(ret>=low && ret<=high)) throw cmdline::cmdline_error("range_error");
152 |     return ret;
153 |   }
154 | private:
155 |   T low, high;
156 | };
157 | 
158 | template <class T>
159 | range_reader<T> range(const T &low, const T &high)
160 | {
161 |   return range_reader<T>(low, high);
162 | }
163 | 
164 | template <class T>
165 | struct oneof_reader{
166 |   T operator()(const std::string &s){
167 |     T ret=default_reader<T>()(s);
168 |     if (std::find(alt.begin(), alt.end(), s)==alt.end())
169 |       throw cmdline_error("");
170 |     return ret;
171 |   }
172 |   void add(const T &v){ alt.push_back(v); }
173 | private:
174 |   std::vector<T> alt;
175 | };
176 | 
177 | template <class T>
178 | oneof_reader<T> oneof(T a1)
179 | {
180 |   oneof_reader<T> ret;
181 |   ret.add(a1);
182 |   return ret;
183 | }
184 | 
185 | template <class T>
186 | oneof_reader<T> oneof(T a1, T a2)
187 | {
188 |   oneof_reader<T> ret;
189 |   ret.add(a1);
190 |   ret.add(a2);
191 |   return ret;
192 | }
193 | 
194 | template <class T>
195 | oneof_reader<T> oneof(T a1, T a2, T a3)
196 | {
197 |   oneof_reader<T> ret;
198 |   ret.add(a1);
199 |   ret.add(a2);
200 |   ret.add(a3);
201 |   return ret;
202 | }
203 | 
204 | template <class T>
205 | oneof_reader<T> oneof(T a1, T a2, T a3, T a4)
206 | {
207 |   oneof_reader<T> ret;
208 |   ret.add(a1);
209 |   ret.add(a2);
210 |   ret.add(a3);
211 |   ret.add(a4);
212 |   return ret;
213 | }
214 | 
215 | template <class T>
216 | oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5)
217 | {
218 |   oneof_reader<T> ret;
219 |   ret.add(a1);
220 |   ret.add(a2);
221 |   ret.add(a3);
222 |   ret.add(a4);
223 |   ret.add(a5);
224 |   return ret;
225 | }
226 | 
227 | template <class T>
228 | oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6)
229 | {
230 |   oneof_reader<T> ret;
231 |   ret.add(a1);
232 |   ret.add(a2);
233 |   ret.add(a3);
234 |   ret.add(a4);
235 |   ret.add(a5);
236 |   ret.add(a6);
237 |   return ret;
238 | }
239 | 
240 | template <class T>
241 | oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7)
242 | {
243 |   oneof_reader<T> ret;
244 |   ret.add(a1);
245 |   ret.add(a2);
246 |   ret.add(a3);
247 |   ret.add(a4);
248 |   ret.add(a5);
249 |   ret.add(a6);
250 |   ret.add(a7);
251 |   return ret;
252 | }
253 | 
254 | template <class T>
255 | oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8)
256 | {
257 |   oneof_reader<T> ret;
258 |   ret.add(a1);
259 |   ret.add(a2);
260 |   ret.add(a3);
261 |   ret.add(a4);
262 |   ret.add(a5);
263 |   ret.add(a6);
264 |   ret.add(a7);
265 |   ret.add(a8);
266 |   return ret;
267 | }
268 | 
269 | template <class T>
270 | oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9)
271 | {
272 |   oneof_reader<T> ret;
273 |   ret.add(a1);
274 |   ret.add(a2);
275 |   ret.add(a3);
276 |   ret.add(a4);
277 |   ret.add(a5);
278 |   ret.add(a6);
279 |   ret.add(a7);
280 |   ret.add(a8);
281 |   ret.add(a9);
282 |   return ret;
283 | }
284 | 
285 | template <class T>
286 | oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10)
287 | {
288 |   oneof_reader<T> ret;
289 |   ret.add(a1);
290 |   ret.add(a2);
291 |   ret.add(a3);
292 |   ret.add(a4);
293 |   ret.add(a5);
294 |   ret.add(a6);
295 |   ret.add(a7);
296 |   ret.add(a8);
297 |   ret.add(a9);
298 |   ret.add(a10);
299 |   return ret;
300 | }
301 | 
302 | //-----
303 | 
304 | class parser{
305 | public:
306 |   parser(){
307 |   }
308 |   ~parser(){
309 |     for (std::map<std::string, option_base*>::iterator p=options.begin();
310 | 	 p!=options.end(); p++)
311 |       delete p->second;
312 |   }
313 | 
314 |   void add(const std::string &name,
315 | 	   char short_name=0,
316 | 	   const std::string &desc=""){
317 |     if (options.count(name)) throw cmdline_error("multiple definition: "+name);
318 |     options[name]=new option_without_value(name, short_name, desc);
319 |     ordered.push_back(options[name]);
320 |   }
321 | 
322 |   template <class T>
323 |   void add(const std::string &name,
324 | 	   char short_name=0,
325 | 	   const std::string &desc="",
326 | 	   bool need=true,
327 | 	   const T def=T()){
328 |     add(name, short_name, desc, need, def, default_reader<T>());
329 |   }
330 | 
331 |   template <class T, class F>
332 |   void add(const std::string &name,
333 | 	   char short_name=0,
334 | 	   const std::string &desc="",
335 | 	   bool need=true,
336 | 	   const T def=T(),
337 | 	   F reader=F()){
338 |     if (options.count(name)) throw cmdline_error("multiple definition: "+name);
339 |     options[name]=new option_with_value_with_reader<T, F>(name, short_name, need, def, desc, reader);
340 |     ordered.push_back(options[name]);
341 |   }
342 | 
343 |   void footer(const std::string &f){
344 |     ftr=f;
345 |   }
346 | 
347 |   void set_program_name(const std::string &name){
348 |     prog_name=name;
349 |   }
350 | 
351 |   bool exist(const std::string &name) const {
352 |     if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name);
353 |     return options.find(name)->second->has_set();
354 |   }
355 | 
356 |   template <class T>
357 |   const T &get(const std::string &name) const {
358 |     if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name);
359 |     const option_with_value<T> *p=dynamic_cast<const option_with_value<T>*>(options.find(name)->second);
360 |     if (p==NULL) throw cmdline_error("type mismatch flag '"+name+"'");
361 |     return p->get();
362 |   }
363 | 
364 |   const std::vector<std::string> &rest() const {
365 |     return others;
366 |   }
367 | 
368 |   bool parse(const std::string &arg){
369 |     std::vector<std::string> args;
370 | 
371 |     std::string buf;
372 |     bool in_quote=false;
373 |     for (std::string::size_type i=0; i<arg.length(); i++){
374 |       if (arg[i]=='\"'){
375 | 	in_quote=!in_quote;
376 | 	continue;
377 |       }
378 | 
379 |       if (arg[i]==' ' && !in_quote){
380 | 	args.push_back(buf);
381 | 	buf="";
382 | 	continue;
383 |       }
384 | 
385 |       if (arg[i]=='\\'){
386 | 	i++;
387 | 	if (i>=arg.length()){
388 | 	  errors.push_back("unexpected occurrence of '\\' at end of string");
389 | 	  return false;
390 | 	}
391 |       }
392 | 
393 |       buf+=arg[i];
394 |     }
395 | 
396 |     if (in_quote){
397 |       errors.push_back("quote is not closed");
398 |       return false;
399 |     }
400 | 
401 |     if (buf.length()>0)
402 |       args.push_back(buf);
403 | 
404 |     for (size_t i=0; i<args.size(); i++)
405 |       std::cout<<"\""<<args[i]<<"\""<<std::endl;
406 | 
407 |     return parse(args);
408 |   }
409 | 
410 |   bool parse(const std::vector<std::string> &args){
411 |     int argc=static_cast<int>(args.size());
412 |     std::vector<const char*> argv(argc);
413 | 
414 |     for (int i=0; i<argc; i++)
415 |       argv[i]=args[i].c_str();
416 | 
417 |     return parse(argc, &argv[0]);
418 |   }
419 | 
420 |   bool parse(int argc, const char * const argv[]){
421 |     errors.clear();
422 |     others.clear();
423 | 
424 |     if (argc<1){
425 |       errors.push_back("argument number must be longer than 0");
426 |       return false;
427 |     }
428 |     if (prog_name=="")
429 |       prog_name=argv[0];
430 | 
431 |     std::map<char, std::string> lookup;
432 |     for (std::map<std::string, option_base*>::iterator p=options.begin();
433 | 	 p!=options.end(); p++){
434 |       if (p->first.length()==0) continue;
435 |       char initial=p->second->short_name();
436 |       if (initial){
437 | 	if (lookup.count(initial)>0){
438 | 	  lookup[initial]="";
439 | 	  errors.push_back(std::string("short option '")+initial+"' is ambiguous");
440 | 	  return false;
441 | 	}
442 | 	else lookup[initial]=p->first;
443 |       }
444 |     }
445 | 
446 |     for (int i=1; i<argc; i++){
447 |       if (strncmp(argv[i], "--", 2)==0){
448 | 	const char *p=strchr(argv[i]+2, '=');
449 | 	if (p){
450 | 	  std::string name(argv[i]+2, p);
451 | 	  std::string val(p+1);
452 | 	  set_option(name, val);
453 | 	}
454 | 	else{
455 | 	  std::string name(argv[i]+2);
456 |           if (options.count(name)==0){
457 |             errors.push_back("undefined option: --"+name);
458 |             continue;
459 |           }
460 |           if (options[name]->has_value()){
461 |             if (i+1>=argc){
462 |               errors.push_back("option needs value: --"+name);
463 |               continue;
464 |             }
465 |             else{
466 |               i++;
467 |               set_option(name, argv[i]);
468 |             }
469 |           }
470 |           else{
471 |             set_option(name);
472 |           }
473 | 	}
474 |       }
475 |       else if (strncmp(argv[i], "-", 1)==0){
476 | 	if (!argv[i][1]) continue;
477 | 	char last=argv[i][1];
478 | 	for (int j=2; argv[i][j]; j++){
479 | 	  last=argv[i][j];
480 | 	  if (lookup.count(argv[i][j-1])==0){
481 | 	    errors.push_back(std::string("undefined short option: -")+argv[i][j-1]);
482 | 	    continue;
483 | 	  }
484 | 	  if (lookup[argv[i][j-1]]==""){
485 | 	    errors.push_back(std::string("ambiguous short option: -")+argv[i][j-1]);
486 | 	    continue;
487 | 	  }
488 | 	  set_option(lookup[argv[i][j-1]]);
489 | 	}
490 | 
491 | 	if (lookup.count(last)==0){
492 | 	  errors.push_back(std::string("undefined short option: -")+last);
493 | 	  continue;
494 | 	}
495 | 	if (lookup[last]==""){
496 | 	  errors.push_back(std::string("ambiguous short option: -")+last);
497 | 	  continue;
498 | 	}
499 | 
500 | 	if (i+1<argc && options[lookup[last]]->has_value()){
501 | 	  set_option(lookup[last], argv[i+1]);
502 | 	  i++;
503 | 	}
504 | 	else{
505 | 	  set_option(lookup[last]);
506 | 	}
507 |       }
508 |       else{
509 | 	others.push_back(argv[i]);
510 |       }
511 |     }
512 | 
513 |     for (std::map<std::string, option_base*>::iterator p=options.begin();
514 | 	 p!=options.end(); p++)
515 |       if (!p->second->valid())
516 | 	errors.push_back("need option: --"+std::string(p->first));
517 | 
518 |     return errors.size()==0;
519 |   }
520 | 
521 |   std::string error() const{
522 |     return errors.size()>0?errors[0]:"";
523 |   }
524 | 
525 |   std::string error_full() const{
526 |     std::ostringstream oss;
527 |     for (size_t i=0; i<errors.size(); i++)
528 |       oss<<errors[i]<<std::endl;
529 |     return oss.str();
530 |   }
531 | 
532 |   std::string usage() const {
533 |     std::ostringstream oss;
534 |     oss<<"usage: "<<prog_name<<" ";
535 |     for (size_t i=0; i<ordered.size(); i++){
536 |       if (ordered[i]->must())
537 | 	oss<<ordered[i]->short_description()<<" ";
538 |     }
539 |     
540 |     oss<<"[options] ... "<<ftr<<std::endl;
541 |     oss<<"options:"<<std::endl;
542 | 
543 |     size_t max_width=0;
544 |     for (size_t i=0; i<ordered.size(); i++){
545 |       max_width=std::max(max_width, ordered[i]->name().length());
546 |     }
547 |     for (size_t i=0; i<ordered.size(); i++){
548 |       if (ordered[i]->short_name()){
549 | 	oss<<"  -"<<ordered[i]->short_name()<<", ";
550 |       }
551 |       else{
552 | 	oss<<"      ";
553 |       }
554 | 
555 |       oss<<"--"<<ordered[i]->name();
556 |       for (size_t j=ordered[i]->name().length(); j<max_width+4; j++)
557 | 	oss<<' ';
558 |       oss<<ordered[i]->description()<<std::endl;
559 |     }
560 |     return oss.str();
561 |   }
562 | 
563 | private:
564 | 
565 |   void set_option(const std::string &name){
566 |     if (options.count(name)==0){
567 |       errors.push_back("undefined option: --"+name);
568 |       return;
569 |     }
570 |     if (!options[name]->set()){
571 |       errors.push_back("option needs value: --"+name);
572 |       return;
573 |     }
574 |   }
575 | 
576 |   void set_option(const std::string &name, const std::string &value){
577 |     if (options.count(name)==0){
578 |       errors.push_back("undefined option: --"+name);
579 |       return;
580 |     }
581 |     if (!options[name]->set(value)){
582 |       errors.push_back("option value is invalid: --"+name+"="+value);
583 |       return;
584 |     }
585 |   }
586 | 
587 |   class option_base{
588 |   public:
589 |     virtual ~option_base(){}
590 | 
591 |     virtual bool has_value() const=0;
592 |     virtual bool set()=0;
593 |     virtual bool set(const std::string &value)=0;
594 |     virtual bool has_set() const=0;
595 |     virtual bool valid() const=0;
596 |     virtual bool must() const=0;
597 | 
598 |     virtual const std::string &name() const=0;
599 |     virtual char short_name() const=0;
600 |     virtual const std::string &description() const=0;
601 |     virtual std::string short_description() const=0;
602 |   };
603 | 
604 |   class option_without_value : public option_base {
605 |   public:
606 |     option_without_value(const std::string &name,
607 | 			 char short_name,
608 | 			 const std::string &desc)
609 |       :nam(name), snam(short_name), desc(desc), has(false){
610 |     }
611 |     ~option_without_value(){}
612 | 
613 |     bool has_value() const { return false; }
614 | 
615 |     bool set(){
616 |       has=true;
617 |       return true;
618 |     }
619 | 
620 |     bool set(const std::string &){
621 |       return false;
622 |     }
623 | 
624 |     bool has_set() const {
625 |       return has;
626 |     }
627 | 
628 |     bool valid() const{
629 |       return true;
630 |     }
631 | 
632 |     bool must() const{
633 |       return false;
634 |     }
635 | 
636 |     const std::string &name() const{
637 |       return nam;
638 |     }
639 | 
640 |     char short_name() const{
641 |       return snam;
642 |     }
643 | 
644 |     const std::string &description() const {
645 |       return desc;
646 |     }
647 | 
648 |     std::string short_description() const{
649 |       return "--"+nam;
650 |     }
651 | 
652 |   private:
653 |     std::string nam;
654 |     char snam;
655 |     std::string desc;
656 |     bool has;
657 |   };
658 | 
659 |   template <class T>
660 |   class option_with_value : public option_base {
661 |   public:
662 |     option_with_value(const std::string &name,
663 | 		      char short_name,
664 | 		      bool need,
665 | 		      const T &def,
666 | 		      const std::string &desc)
667 |       : nam(name), snam(short_name), need(need), has(false)
668 |       , def(def), actual(def) {
669 |       this->desc=full_description(desc);
670 |     }
671 |     ~option_with_value(){}
672 | 
673 |     const T &get() const {
674 |       return actual;
675 |     }
676 | 
677 |     bool has_value() const { return true; }
678 | 
679 |     bool set(){
680 |       return false;
681 |     }
682 | 
683 |     bool set(const std::string &value){
684 |       try{
685 | 	actual=read(value);
686 | 	has=true;
687 |       }
688 |       catch(const std::exception &e){
689 | 	return false;
690 |       }
691 |       return true;
692 |     }
693 | 
694 |     bool has_set() const{
695 |       return has;
696 |     }
697 | 
698 |     bool valid() const{
699 |       if (need && !has) return false;
700 |       return true;
701 |     }
702 | 
703 |     bool must() const{
704 |       return need;
705 |     }
706 | 
707 |     const std::string &name() const{
708 |       return nam;
709 |     }
710 | 
711 |     char short_name() const{
712 |       return snam;
713 |     }
714 | 
715 |     const std::string &description() const {
716 |       return desc;
717 |     }
718 | 
719 |     std::string short_description() const{
720 |       return "--"+nam+"="+detail::readable_typename<T>();
721 |     }
722 | 
723 |   protected:
724 |     std::string full_description(const std::string &desc){
725 |       return
726 | 	desc+" ("+detail::readable_typename<T>()+
727 | 	(need?"":" [="+detail::lexical_cast<std::string>(def)+"]")
728 | 	+")";
729 |     }
730 | 
731 |     virtual T read(const std::string &s)=0;
732 | 
733 |     std::string nam;
734 |     char snam;
735 |     bool need;
736 |     std::string desc;
737 | 
738 |     bool has;
739 |     T def;
740 |     T actual;
741 |   };
742 | 
743 |   template <class T, class F>
744 |   class option_with_value_with_reader : public option_with_value<T> {
745 |   public:
746 |     option_with_value_with_reader(const std::string &name,
747 | 				  char short_name,
748 | 				  bool need,
749 | 				  const T def,
750 | 				  const std::string &desc,
751 | 				  F reader)
752 |       : option_with_value<T>(name, short_name, need, def, desc), reader(reader){
753 |     }
754 | 
755 |   private:
756 |     T read(const std::string &s){
757 |       return reader(s);
758 |     }
759 | 
760 |     F reader;
761 |   };
762 | 
763 |   std::map<std::string, option_base*> options;
764 |   std::vector<option_base*> ordered;
765 |   std::string ftr;
766 | 
767 |   std::string prog_name;
768 |   std::vector<std::string> others;
769 | 
770 |   std::vector<std::string> errors;
771 | };
772 | 
773 | } // cmdline
774 | 


--------------------------------------------------------------------------------
/src/redsvd/fileReader.hpp:
--------------------------------------------------------------------------------
 1 | /* 
 2 |  *  Copyright (c) 2010 Daisuke Okanohara
 3 |  * 
 4 |  *   Redistribution and use in source and binary forms, with or without
 5 |  *   modification, are permitted provided that the following conditions
 6 |  *   are met:
 7 |  * 
 8 |  *   1. Redistributions of source code must retain the above Copyright
 9 |  *      notice, this list of conditions and the following disclaimer.
10 |  *
11 |  *   2. Redistributions in binary form must reproduce the above Copyright
12 |  *      notice, this list of conditions and the following disclaimer in the
13 |  *      documentation and/or other materials provided with the distribution.
14 |  *
15 |  *   3. Neither the name of the authors nor the names of its contributors
16 |  *      may be used to endorse or promote products derived from this
17 |  *      software without specific prior written permission.
18 |  */
19 | 
20 | #ifndef FILEREADER_HPP_
21 | #define FILEREADER_HPP_
22 | 
23 | #include <fstream>
24 | #include <sstream>
25 | #include "util.hpp"
26 | 
27 | namespace REDSVD{
28 | 
29 | class FileReader {
30 | public:
31 |   FileReader() : rows_(0), cols_(0) {}
32 |   ~FileReader() {}
33 | 
34 |   void OpenFile(const char* inputFileName){
35 |     inputFileName_ = inputFileName;
36 |     ifs_.close();
37 |     ifs_.clear();
38 |     ifs_.open(inputFileName_.c_str(), std::ifstream::in);
39 |     if (!ifs_){
40 |       throw std::string("open error ") + inputFileName_;
41 |     }
42 |   }
43 | 
44 |   void Rewind(){
45 |     ifs_.clear();
46 |     ifs_.seekg(0);
47 |   }
48 | 
49 |   void GetStat(){
50 |     rows_ = 0;
51 |     cols_ = 0;
52 |     for (fv_t fv; ReadRow(fv) != -1; ++rows_){
53 |       if (fv.size() == 0) continue;
54 |       cols_ = std::max(fv.back().first+1, cols_);
55 |     }
56 |     ifs_.clear();
57 |     ifs_.seekg(0);
58 |   }
59 | 
60 |   int ReadRow(fv_t& fv){
61 |     std::string line;
62 |     if (!getline(ifs_, line)){
63 |       return -1;
64 |     }
65 |     std::istringstream is(line);
66 |     
67 |     int id;
68 |     char sep;
69 |     float val;
70 |     while (is >> id >> sep >> val){
71 |       fv.push_back(std::make_pair(id, val));
72 |     }
73 |     sort(fv.begin(), fv.end());
74 |     fv.erase(unique(fv.begin(), fv.end()), fv.end());
75 |     
76 |     return 0;
77 |   }
78 | 
79 |   int rows() const {
80 |     return rows_;
81 |   }
82 | 
83 |   int cols() const {
84 |     return cols_;
85 |   }
86 | 
87 | private:
88 |   std::ifstream ifs_;
89 |   std::string inputFileName_;
90 |   int rows_;
91 |   int cols_;
92 | };
93 | 
94 | }
95 | 
96 | #endif // FILEREADER_HPP_
97 | 


--------------------------------------------------------------------------------
/src/redsvd/redsvd.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2010 Daisuke Okanohara
  3 |  *
  4 |  *   Redistribution and use in source and binary forms, with or without
  5 |  *   modification, are permitted provided that the following conditions
  6 |  *   are met:
  7 |  *
  8 |  *   1. Redistributions of source code must retain the above Copyright
  9 |  *      notice, this list of conditions and the following disclaimer.
 10 |  *
 11 |  *   2. Redistributions in binary form must reproduce the above Copyright
 12 |  *      notice, this list of conditions and the following disclaimer in the
 13 |  *      documentation and/or other materials provided with the distribution.
 14 |  *
 15 |  *   3. Neither the name of the authors nor the names of its contributors
 16 |  *      may be used to endorse or promote products derived from this
 17 |  *      software without specific prior written permission.
 18 |  */
 19 | 
 20 | #ifndef REDSVD_HPP__
 21 | #define REDSVD_HPP__
 22 | 
 23 | #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
 24 | 
 25 | #include <vector>
 26 | #include <eigen3/Eigen/Sparse>
 27 | #include <eigen3/Eigen/Dense>
 28 | #include <eigen3/Eigen/Eigenvalues>
 29 | #include "util.hpp"
 30 | #include "log4cxx/logger.h"
 31 | 
 32 | using namespace log4cxx;
 33 | 
 34 | namespace REDSVD {
 35 | 
 36 | // Randomized low-rank SVD. After run(), A ~= matrixU() * diag(singularValues()) * matrixV()^T.
 37 | class RedSVD {
 38 | public:
 39 |   RedSVD(){}
 40 |   // Decomposes A at rank min(A.rows(), A.cols()).
 41 |   template <class Mat>
 42 |   RedSVD(Mat& A){
 43 |     int r = (A.rows() < A.cols()) ? A.rows() : A.cols();
 44 |     run(A, r);
 45 |   }
 46 |   // Decomposes A keeping at most `rank` singular triplets.
 47 |   template <class Mat>
 48 |   RedSVD(Mat& A, const int rank){
 49 |     run(A, rank);
 50 |   }
 51 | 
 52 |   // Computes the approximate SVD of A and stores the factors in this object.
 53 |   //   A    : dense or sparse Eigen matrix; an empty A leaves the stored factors untouched.
 54 |   //   rank : requested number of singular triplets, clamped to min(A.rows(), A.cols()).
 55 |   // A square A is taken to be symmetric (A^T = A), so A * O stands in for A^T * O and
 56 |   // no transpose is formed; a non-square A is transposed explicitly.
 57 |   template <class Mat>
 58 |   void run(Mat& A, const int rank){
 59 |     LoggerPtr logger(Logger::getLogger("redsvd"));
 60 |     if (A.cols() == 0 || A.rows() == 0) return;
 61 |     int r = (rank < A.cols()) ? rank : A.cols();
 62 |     r = (r < A.rows()) ? r : A.rows();
 63 | 
 64 |     // Gaussian random matrix O (rows x r) used to sample A^T
 65 |     Eigen::MatrixXf O(A.rows(), r);
 66 |     Util::sampleGaussianMat(O);
 67 |     LOG4CXX_INFO(logger, "sampling gaussian random matrix O for A^T done.");
 68 | 
 69 |     // Sample matrix Y = A^T * O (cols x r). A * O is only dimensionally valid for a
 70 |     // square A, so the shortcut is restricted to that case.
 71 |     Eigen::MatrixXf Y;
 72 |     if (A.rows() == A.cols()) {
 73 |       Y = A * O;
 74 |       LOG4CXX_INFO(logger, "compute sample matrix of Y = A^T * O  = A * O done (because A^T = A)");
 75 |     } else {
 76 |       Y = A.transpose() * O;
 77 |       LOG4CXX_INFO(logger, "compute sample matrix of Y = A^T * O done.");
 78 |     }
 79 | 
 80 |     // Orthonormalize the columns of Y
 81 |     Util::processGramSchmidt(Y);
 82 |     LOG4CXX_INFO(logger, "orthonormalize Y done.");
 83 | 
 84 |     // B = A * Y (rows x r): A applied to the sampled subspace
 85 |     Eigen::MatrixXf B = A * Y;
 86 |     LOG4CXX_INFO(logger, "B = A * Y done, Range(B) = Range(A^T).");
 87 | 
 88 |     // Second Gaussian random matrix P (r x r)
 89 |     Eigen::MatrixXf P(B.cols(), r);
 90 |     Util::sampleGaussianMat(P);
 91 |     LOG4CXX_INFO(logger, "sample another gaussian random matrix P done.");
 92 | 
 93 |     // Sample matrix of B
 94 |     Eigen::MatrixXf Z = B * P;
 95 |     LOG4CXX_INFO(logger, "compute sample matrix of Z = B * P done.");
 96 | 
 97 |     // Orthonormalize the columns of Z
 98 |     Util::processGramSchmidt(Z);
 99 |     LOG4CXX_INFO(logger, "orthonormalize Z done.");
100 | 
101 |     // C = Z^T * B is only r x r, so an exact SVD of it is cheap
102 |     Eigen::MatrixXf C = Z.transpose() * B;
103 |     LOG4CXX_INFO(logger, "C = Z^T * B done, Range(C) = Range(B).");
104 |     Eigen::JacobiSVD<Eigen::MatrixXf> svdOfC(C, Eigen::ComputeThinU | Eigen::ComputeThinV);
105 |     LOG4CXX_INFO(logger, "JacabiSVD for C done.");
106 | 
107 |     // With C = U * S * V^T the factors of A follow from
108 |     // A ~= Z * U * S * V^T * Y^T
109 |     matU_ = Z * svdOfC.matrixU();
110 |     matS_ = svdOfC.singularValues();
111 |     matV_ = Y * svdOfC.matrixV();
112 |     LOG4CXX_INFO(logger, "compute U S V done.");
113 |   }
114 | 
115 |   const Eigen::MatrixXf& matrixU() const { return matU_; }
116 |   const Eigen::VectorXf& singularValues() const { return matS_; }
117 |   const Eigen::MatrixXf& matrixV() const { return matV_; }
118 | private:
119 |   Eigen::MatrixXf matU_;
120 |   Eigen::VectorXf matS_;
121 |   Eigen::MatrixXf matV_;
122 | };
123 | 
124 | // Randomized eigendecomposition intended for symmetric matrices (symmetry is not
125 | // checked); the small projected problem goes to Eigen::SelfAdjointEigenSolver.
126 | class RedSymEigen {
127 | public:
128 |   RedSymEigen(){}
129 | 
130 |   // Runs the decomposition on construction; see run().
131 |   template <class Mat>
132 |   RedSymEigen(Mat& A, const int rank){ run(A, rank); }
133 | 
134 |   // Approximates up to `rank` eigenpairs of A (rank is clamped to the dimensions
135 |   // of A). An empty A leaves the stored results untouched.
136 |   template <class Mat>
137 |   void run(Mat& A, const int rank){
138 |     if (A.rows() == 0 || A.cols() == 0) return;
139 |     int r = rank;
140 |     if (!(r < A.cols())) r = A.cols();
141 |     if (!(r < A.rows())) r = A.rows();
142 | 
143 |     // Gaussian test matrix with one row per row of A.
144 |     Eigen::MatrixXf omega(A.rows(), r);
145 |     Util::sampleGaussianMat(omega);
146 | 
147 |     // Sample A^T with the test matrix and orthonormalize the sample's columns.
148 |     Eigen::MatrixXf basis = A.transpose() * omega;
149 |     Util::processGramSchmidt(basis);
150 | 
151 |     // Project A onto the basis and solve the resulting r x r eigenproblem.
152 |     Eigen::MatrixXf projected = basis.transpose() * A * basis;
153 |     Eigen::SelfAdjointEigenSolver<Eigen::MatrixXf> solver(projected);
154 | 
155 |     // Map the small eigenvectors back through the basis.
156 |     eigenValues_ = solver.eigenvalues();
157 |     eigenVectors_ = basis * solver.eigenvectors();
158 |   }
159 | 
160 |   const Eigen::MatrixXf& eigenVectors() const { return eigenVectors_; }
161 | 
162 |   const Eigen::VectorXf& eigenValues() const { return eigenValues_; }
163 | 
164 | private:
165 |   Eigen::VectorXf eigenValues_;
166 |   Eigen::MatrixXf eigenVectors_;
167 | };
168 | 
169 | // Principal component analysis built on top of RedSVD.
170 | class RedPCA {
171 | public:
172 |   RedPCA(){}
173 | 
174 |   // Runs the analysis on construction; see run().
175 |   template <class Mat>
176 |   RedPCA(const Mat& A, const int rank) { run(A, rank); }
177 | 
178 |   // Decomposes A with RedSVD at the given rank and keeps
179 |   //   principalComponents() = V   and   scores() = U * diag(S).
180 |   // NOTE(review): A is used as given, without mean-centring - presumably callers
181 |   // centre the data beforehand; confirm.
182 |   template <class Mat>
183 |   void run(const Mat& A, const int rank) {
184 |     RedSVD svd(A, rank);
185 |     principalComponents_ = svd.matrixV();
186 |     scores_              = svd.matrixU() * svd.singularValues().asDiagonal();
187 |   }
188 | 
189 |   // The V factor of the decomposition.
190 |   const Eigen::MatrixXf& principalComponents() const { return principalComponents_; }
191 | 
192 |   // The U factor with each column scaled by its singular value.
193 |   const Eigen::MatrixXf& scores() const { return scores_; }
194 | 
195 |  private:
196 |   Eigen::MatrixXf principalComponents_;
197 |   Eigen::MatrixXf scores_;
198 | };
199 | 
200 | }
201 | 
202 | #endif // REDSVD_HPP__
203 | 


--------------------------------------------------------------------------------
/src/redsvd/redsvdFile.cpp:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  *  Copyright (c) 2010 Daisuke Okanohara
  3 |  * 
  4 |  *   Redistribution and use in source and binary forms, with or without
  5 |  *   modification, are permitted provided that the following conditions
  6 |  *   are met:
  7 |  * 
  8 |  *   1. Redistributions of source code must retain the above Copyright
  9 |  *      notice, this list of conditions and the following disclaimer.
 10 |  *
 11 |  *   2. Redistributions in binary form must reproduce the above Copyright
 12 |  *      notice, this list of conditions and the following disclaimer in the
 13 |  *      documentation and/or other materials provided with the distribution.
 14 |  *
 15 |  *   3. Neither the name of the authors nor the names of its contributors
 16 |  *      may be used to endorse or promote products derived from this
 17 |  *      software without specific prior written permission.
 18 |  */
 19 | 
 20 | #include <string>
 21 | #include <fstream>
 22 | #include <sstream>
 23 | #include <iostream>
 24 | 
 25 | #include "redsvdFile.hpp"
 26 | #include "redsvd.hpp"
 27 | #include "redsvdIncr.hpp"
 28 | 
 29 | using namespace std;
 30 | using namespace Eigen;
 31 | 
 32 | namespace REDSVD{
 33 | 
 34 | namespace {
 35 | 
 36 | // Writes M to the text file `fn`: one matrix row per line, every entry printed
 37 | // with "%+f " (so each line ends with a space before the newline).
 38 | // Throws std::string when `fn` cannot be opened for writing.
 39 | void writeMatrix_(const string& fn, const MatrixXf& M){
 40 |   cout << "write " << fn << endl;
 41 |   FILE* const out = fopen(fn.c_str(), "wb");
 42 |   if (!out) throw string("cannot open ") + fn;
 43 | 
 44 |   for (int row = 0; row < M.rows(); ++row){
 45 |     for (int col = 0; col < M.cols(); ++col){
 46 |       fprintf(out, "%+f ",  M(row, col));
 47 |     }
 48 |     fprintf(out, "\n");
 49 |   }
 50 |   fclose(out);
 51 | }
 52 | 
 53 | // Writes V to the text file `fn`, one "%+f"-formatted entry per line.
 54 | // Throws std::string when `fn` cannot be opened for writing.
 55 | void writeVector_(const string& fn, const VectorXf& V){
 56 |   cout << "write " << fn << endl;
 57 |   FILE* const out = fopen(fn.c_str(), "wb");
 58 |   if (!out) throw string("cannot open ") + fn;
 59 | 
 60 |   for (int idx = 0; idx < V.rows(); ++idx){
 61 |     fprintf(out, "%+f\n", V(idx));
 62 |   }
 63 | 
 64 |   fclose(out);
 65 | }
 66 | 
 67 | // Parses one sparse-format row: a sequence of <int id><one separator char><float value>
 68 | // items. Parsed pairs are appended to fv, which is then sorted and stripped of exact
 69 | // duplicate (id, value) pairs. Parsing stops at the first item that does not fit.
 70 | void readLine(const string& line, fv_t& fv){
 71 |   istringstream tokens(line);
 72 |   int column;
 73 |   char delimiter;
 74 |   float value;
 75 |   while (tokens >> column >> delimiter >> value) fv.push_back(make_pair(column, value));
 76 | 
 77 |   sort(fv.begin(), fv.end());
 78 |   fv.erase(unique(fv.begin(), fv.end()), fv.end());
 79 | }
 80 | 
 81 | }
 82 | 
 83 | 
 84 | 
 85 | // Reads a sparse matrix from the text file `fn`, one row per line in readLine()'s
 86 | // format. Lines without entries become empty rows, so row i is always line i+1.
 87 | // Throws std::string when `fn` cannot be opened.
 88 | void readMatrix(const std::string& fn, SMatrixXf& A){
 89 |   ifstream ifs(fn.c_str());
 90 |   if (!ifs) throw string("failed to open ") + fn;  // trailing space separates text and path
 91 | 
 92 |   vector<fv_t> fvs;
 93 |   for (string line; getline(ifs, line); ){
 94 |     fv_t fv;
 95 |     readLine(line, fv);
 96 |     fvs.push_back(fv);
 97 |   }
 98 |   Util::convertFV2Mat(fvs, A);
 99 | }
100 | 
101 | // Reads a dense matrix from the whitespace-separated text file `fn`, one row per line.
102 | // The column count is taken from the first line. Rows with a different number of
103 | // entries trigger a warning on stderr; surplus entries are dropped and missing ones
104 | // are left at zero. A file without any line leaves A untouched.
105 | // Throws std::string when `fn` cannot be opened.
106 | void readMatrix(const std::string& fn, MatrixXf& A){
107 |   ifstream ifs(fn.c_str());
108 |   if (!ifs) throw string("failed to open " ) + fn;
109 | 
110 |   // Parse every line into a vector of floats.
111 |   vector< vector<float> > vs;
112 |   for (string line; getline(ifs, line); ){
113 |     istringstream is(line);
114 |     vector<float> v;
115 |     for (float val; is >> val; ) v.push_back(val);
116 |     vs.push_back(v);
117 |   }
118 | 
119 |   const size_t rowN = vs.size();
120 |   if (rowN == 0) return;
121 |   const size_t colN = vs[0].size();
122 |   // Zero-fill: resize() alone would leave the tail of short rows uninitialized.
123 |   A.setZero(rowN, colN);
124 | 
125 |   for (size_t i = 0; i < rowN; ++i){
126 |     if (colN != vs[i].size()){
127 |       cerr << "warning: " << i+1 << "-th row has " << vs[i].size() << " entries. "
128 |            << colN << " entries are expected" << endl;
129 |     }
130 |     const size_t colNmin = min(colN, vs[i].size());
131 |     for (size_t j = 0; j < colNmin; ++j) A(i, j) = vs[i][j];
132 |   }
133 | }
134 | // Writes the SVD factors of A to the files fn + ".U", fn + ".S" and fn + ".V".
135 | void writeMatrix(const string& fn, const REDSVD::RedSVD& A){
136 |   writeMatrix_(fn + ".U", A.matrixU());
137 |   writeVector_(fn + ".S", A.singularValues());
138 |   writeMatrix_(fn + ".V", A.matrixV());
139 | }
140 | // Writes the incremental-SVD factors of A to fn + ".U", fn + ".S" and fn + ".V".
141 | void writeMatrix(const string& fn, const REDSVD::RedSVDIncr& A){
142 |   writeMatrix_(fn + ".U", A.matrixU());
143 |   writeVector_(fn + ".S", A.singularValues());
144 |   writeMatrix_(fn + ".V", A.matrixV());
145 | }
146 | 
147 | // Writes the PCA result of A: principal components to fn + ".pc", scores to fn + ".score".
148 | void writeMatrix(const string& fn, const REDSVD::RedPCA& A){
149 |   writeMatrix_(fn + ".pc",    A.principalComponents());
150 |   writeMatrix_(fn + ".score", A.scores());
151 | }
152 | // Writes the eigendecomposition of A: eigenvectors to fn + ".evec", eigenvalues to fn + ".eval".
153 | void writeMatrix(const string& fn, const REDSVD::RedSymEigen& A){
154 |   writeMatrix_(fn + ".evec", A.eigenVectors());
155 |   writeVector_(fn + ".eval", A.eigenValues());
156 | }
157 | 
158 | }
159 | 


--------------------------------------------------------------------------------
/src/redsvd/redsvdFile.hpp:
--------------------------------------------------------------------------------
 1 | /* 
 2 |  *  Copyright (c) 2010 Daisuke Okanohara
 3 |  * 
 4 |  *   Redistribution and use in source and binary forms, with or without
 5 |  *   modification, are permitted provided that the following conditions
 6 |  *   are met:
 7 |  * 
 8 |  *   1. Redistributions of source code must retain the above Copyright
 9 |  *      notice, this list of conditions and the following disclaimer.
10 |  *
11 |  *   2. Redistributions in binary form must reproduce the above Copyright
12 |  *      notice, this list of conditions and the following disclaimer in the
13 |  *      documentation and/or other materials provided with the distribution.
14 |  *
15 |  *   3. Neither the name of the authors nor the names of its contributors
16 |  *      may be used to endorse or promote products derived from this
17 |  *      software without specific prior written permission.
18 |  */
19 | 
20 | #ifndef REDSVDFILE_HPP__
21 | #define REDSVDFILE_HPP__
22 | 
23 | #include <string>
24 | #include <iostream>
25 | #include <fstream>
26 | #include "util.hpp"
27 | 
28 | namespace REDSVD{
29 | 
30 | class RedSVD;
31 | class RedPCA;
32 | class RedSymEigen;
33 | class RedSVDIncr;
34 | 
35 | void readMatrix(const std::string& fn, SMatrixXf& A);
36 | void readMatrix(const std::string& fn, Eigen::MatrixXf& A);
37 | 
38 | void writeMatrix(const std::string& fn, const RedSVD& A);
39 | void writeMatrix(const std::string& fn, const RedPCA& A);
40 | void writeMatrix(const std::string& fn, const RedSymEigen& A);
41 | void writeMatrix(const std::string& fn, const RedSVDIncr& A);
42 | 
43 | // Reads a matrix of type Mat from inputFileName, decomposes it with RetMat at the given
44 | // rank, writes the result under the outputFileName prefix and prints phase timings to stdout.
45 | template<class Mat, class RetMat>
46 | void fileProcess(const std::string& inputFileName,
47 |                  const std::string& outputFileName, int rank){
48 |   double startSec = Util::getSec();
49 |   std::cout << "read matrix from " << inputFileName << " ... " << std::flush;
50 |   Mat A;
51 |   readMatrix(inputFileName, A);  // readMatrix takes a std::string; no c_str() round trip
52 |   std::cout << Util::getSec() - startSec << " sec." << std::endl;
53 |   std::cout << "rows:\t" << A.rows() << std::endl
54 |             << "cols:\t" << A.cols() << std::endl
55 |             << "rank:\t" << rank  << std::endl;
56 | 
57 |   std::cout << "compute ... " << std::flush;
58 |   startSec = Util::getSec();
59 |   RetMat retMat(A, rank);  // the decomposition runs inside the constructor
60 |   std::cout << Util::getSec() - startSec << " sec." << std::endl;
61 |   startSec = Util::getSec();
62 |   writeMatrix(outputFileName, retMat);
63 |   std::cout << Util::getSec() - startSec << " sec." << std::endl
64 |             << "finished." << std::endl;
65 | }
66 | 
67 | }
68 | 
69 | #endif // REDSVDFILE_HPP__
70 | 


--------------------------------------------------------------------------------
/src/redsvd/redsvdIncr.hpp:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  *  Copyright (c) 2011 Daisuke Okanohara
  3 |  * 
  4 |  *   Redistribution and use in source and binary forms, with or without
  5 |  *   modification, are permitted provided that the following conditions
  6 |  *   are met:
  7 |  * 
  8 |  *   1. Redistributions of source code must retain the above Copyright
  9 |  *      notice, this list of conditions and the following disclaimer.
 10 |  *
 11 |  *   2. Redistributions in binary form must reproduce the above Copyright
 12 |  *      notice, this list of conditions and the following disclaimer in the
 13 |  *      documentation and/or other materials provided with the distribution.
 14 |  *
 15 |  *   3. Neither the name of the authors nor the names of its contributors
 16 |  *      may be used to endorse or promote products derived from this
 17 |  *      software without specific prior written permission.
 18 |  */
 19 | 
 20 | #ifndef REDSVD_INCR_HPP__
 21 | #define REDSVD_INCR_HPP__
 22 | 
 23 | #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
 24 | 
 25 | #include <vector>
 26 | #include <eigen3/Eigen/Sparse>
 27 | #include <eigen3/Eigen/Dense>
 28 | #include <eigen3/Eigen/Eigenvalues>
 29 | #include "util.hpp"
 30 | 
 31 | namespace REDSVD {
 32 | // Randomized SVD that reads its input one row at a time through a reader object (two passes).
 33 | class RedSVDIncr {
 34 | public:
 35 |   RedSVDIncr(){}
 36 |   // Runs the decomposition on construction; see run().
 37 |   template <class Reader>
 38 |   RedSVDIncr(Reader& reader, const int rank){
 39 |     run(reader, rank);
 40 |   }
 41 |   // Computes the factors from two sequential passes over `reader` (ReadRow per row, Rewind in between).
 42 |   template <class Reader>
 43 |   void run(Reader& reader, const int rank){
 44 |     int r = (rank < reader.cols()) ? rank : reader.cols();  // clamp rank to the column count
 45 |     Eigen::MatrixXf O(reader.rows(), r);
 46 |     Util::sampleGaussianMat(O);
 47 |     // Pass 1: accumulate Y = A^T * O from each row's (column, value) entries.
 48 |     Eigen::MatrixXf Y = Eigen::MatrixXf::Zero(reader.cols(), r);
 49 |     for (int row = 0; row < reader.rows(); ++row){
 50 |       fv_t fv;
 51 |       reader.ReadRow(fv);
 52 |       for (size_t i = 0; i < fv.size(); ++i){
 53 |         int column = fv[i].first;  // NOTE(review): assumes column < reader.cols() - confirm against the reader
 54 |         float val = fv[i].second;
 55 |         for (int j = 0; j < r; ++j){
 56 |           Y(column, j) += O(row, j) * val;
 57 |         }
 58 |       }
 59 |     }
 60 |     Util::processGramSchmidt(Y);
 61 |     // Start over from the first row for the second pass.
 62 |     reader.Rewind();
 63 |     // Pass 2: accumulate B = A * Y row by row.
 64 |     Eigen::MatrixXf B = Eigen::MatrixXf::Zero(reader.rows(), r);
 65 |     for (int row = 0; row < reader.rows(); ++row){
 66 |       fv_t fv;
 67 |       reader.ReadRow(fv);
 68 |       for (size_t i = 0; i < fv.size(); ++i){
 69 |         int column = fv[i].first;
 70 |         float val = fv[i].second;
 71 |         for (int j = 0; j < r; ++j){
 72 |           B(row, j) += val * Y(column, j);
 73 |         }
 74 |       }
 75 |     }
 76 | 
 77 |     // Gaussian Random Matrix
 78 |     Eigen::MatrixXf P(B.cols(), r);
 79 |     Util::sampleGaussianMat(P);
 80 |     
 81 |     // Compute Sample Matrix of B
 82 |     Eigen::MatrixXf Z = B * P;
 83 |     
 84 |     // Orthonormalize Z
 85 |     Util::processGramSchmidt(Z);
 86 |     
 87 |     // C = Z^T * B is r x r; its exact SVD is computed below
 88 |     Eigen::MatrixXf C = Z.transpose() * B; 
 89 |     
 90 |     Eigen::JacobiSVD<Eigen::MatrixXf> svdOfC(C, Eigen::ComputeThinU | Eigen::ComputeThinV);
 91 |     
 92 |     // With C = U * S * V^T the factors of A follow from
 93 |     // A ~= Z * U * S * V^T * Y^T
 94 |     matU_ = Z * svdOfC.matrixU();
 95 |     matS_ = svdOfC.singularValues();
 96 |     matV_ = Y * svdOfC.matrixV();
 97 |   }
 98 |   // Read-only access to the factors stored by run().
 99 |   const Eigen::MatrixXf& matrixU() const {
100 |     return matU_;
101 |   }
102 | 
103 |   const Eigen::VectorXf& singularValues() const {
104 |     return matS_;
105 |   }
106 | 
107 |   const Eigen::MatrixXf& matrixV() const {
108 |     return matV_;
109 |   }
110 | 
111 | private:
112 |   Eigen::MatrixXf matU_;
113 |   Eigen::VectorXf matS_;
114 |   Eigen::MatrixXf matV_;
115 | };
116 | 
117 | }
118 | 
119 | #endif // REDSVD_INCR_HPP__
120 | 


--------------------------------------------------------------------------------
/src/redsvd/redsvdMain.cpp:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  *  Copyright (c) 2010 Daisuke Okanohara
  3 |  * 
  4 |  *   Redistribution and use in source and binary forms, with or without
  5 |  *   modification, are permitted provided that the following conditions
  6 |  *   are met:
  7 |  * 
  8 |  *   1. Redistributions of source code must retain the above Copyright
  9 |  *      notice, this list of conditions and the following disclaimer.
 10 |  *
 11 |  *   2. Redistributions in binary form must reproduce the above Copyright
 12 |  *      notice, this list of conditions and the following disclaimer in the
 13 |  *      documentation and/or other materials provided with the distribution.
 14 |  *
 15 |  *   3. Neither the name of the authors nor the names of its contributors
 16 |  *      may be used to endorse or promote products derived from this
 17 |  *      software without specific prior written permission.
 18 |  */
 19 | 
 20 | #include <string>
 21 | #include <fstream>
 22 | #include <sstream>
 23 | 
 24 | #include "cmdline.h"
 25 | #include "redsvd.hpp"
 26 | #include "redsvdFile.hpp"
 27 | 
 28 | using namespace std;
 29 | 
 30 | namespace {
 31 | 
 32 | // Installs the usage footer (input formats, examples); file suffixes match what writeMatrix() emits.
 33 | void setFooter(cmdline::parser& p){
 34 |   p.footer(
 35 |    "\n\nredsvd supports the following format types (one line for each row)\n\n"
 36 |    "[format=dense] (<value>+\\n)+\n"
 37 |    "[format=sparse] ((column_id:value)+\\n)+\n"
 38 |    "Example:\n"
 39 |    ">redsvd -i imat -o omat -r 10 -f dense\n"
 40 |    "compute SVD for a dense matrix in imat and output omat.U omat.V, and omat.S\n"
 41 |    "with the 10 largest eigen values/vectors\n"
 42 |    ">redsvd -i imat -o omat -r 3 -f sparse -m PCA\n"
 43 |    "compute PCA for a sparse matrix in imat and output omat.pc omat.score\n"
 44 |    "with the 3 largest principal components\n"
 45 |   );
 46 | }
 47 | }
 48 | 
 49 | // Entry point of redsvd: parses the command line, validates rank/format/method and runs
 50 | // the requested decomposition. Returns 0 on success (or bare usage), -1 on any error.
 51 | int main(int argc, char* argv[]){
 52 |   cmdline::parser p;
 53 |   p.add<string>("input",  'i', "input file", true);
 54 |   p.add<string>("output", 'o', "output file's prefix", true);
 55 |   p.add<int>   ("rank",   'r', "rank      ", false, 10);
 56 |   p.add<string>("format", 'f', "format type (dense|sparse) See example. ", false, "dense");
 57 |   p.add<string>("method", 'm', "method (SVD|PCA|SymEigen)", false, "SVD");
 58 |   p.set_program_name("redsvd");
 59 |   setFooter(p);
 60 | 
 61 |   // No arguments at all: show the usage text and exit successfully.
 62 |   if (argc == 1){
 63 |     cerr << p.usage() << endl;
 64 |     return 0;
 65 |   }
 66 | 
 67 |   if (p.parse(argc, argv) == 0){
 68 |     cerr << "Error:" << p.error() << endl
 69 |          << p.usage() << endl;
 70 |     return -1;
 71 |   }
 72 | 
 73 |   const string input  = p.get<string>("input");
 74 |   const string output = p.get<string>("output");
 75 |   const string format = p.get<string>("format");
 76 |   const int    rank   = p.get<int>   ("rank");
 77 |   const string method = p.get<string>("method");
 78 | 
 79 |   if (rank <= 0){
 80 |     cerr << "rank=" << rank << endl
 81 |          << "rank should be positive integer" << endl;
 82 |     return -1;
 83 |   }
 84 | 
 85 |   // Only the two documented formats are accepted.
 86 |   if (format != "dense" && format != "sparse"){
 87 |     cerr << "unknown format:" << format << endl;
 88 |     return -1;
 89 |   }
 90 |   const bool isInputSparse = (format == "sparse");
 91 | 
 92 |   cout << "compute " << method << endl;
 93 |   // The file readers and writers report failures by throwing std::string.
 94 |   try {
 95 |     if (method == "SVD"){
 96 |       if (isInputSparse){
 97 |         REDSVD::fileProcess<REDSVD::SMatrixXf, REDSVD::RedSVD>(input, output, rank);
 98 |       } else {
 99 |         REDSVD::fileProcess<Eigen::MatrixXf, REDSVD::RedSVD>(input, output, rank);
100 |       }
101 |     } else if (method == "PCA"){
102 |       if (isInputSparse){
103 |         REDSVD::fileProcess<REDSVD::SMatrixXf, REDSVD::RedPCA>(input, output, rank);
104 |       } else {
105 |         REDSVD::fileProcess<Eigen::MatrixXf, REDSVD::RedPCA>(input, output, rank);
106 |       }
107 |     } else if (method == "SymEigen"){
108 |       if (isInputSparse){
109 |         REDSVD::fileProcess<REDSVD::SMatrixXf, REDSVD::RedSymEigen>(input, output, rank);
110 |       } else {
111 |         REDSVD::fileProcess<Eigen::MatrixXf, REDSVD::RedSymEigen>(input, output, rank);
112 |       }
113 |     } else {
114 |       cerr << "unknown method:" << method << endl;
115 |       return -1;
116 |     }
117 |   } catch (const string& error){
118 |     cerr << "Error: " << error << endl;
119 |     return -1;
120 |   }
121 |   return 0;
122 | }
123 | 


--------------------------------------------------------------------------------
/src/redsvd/redsvdMainIncr.cpp:
--------------------------------------------------------------------------------
 1 | /* 
 2 |  *  Copyright (c) 2010 Daisuke Okanohara
 3 |  * 
 4 |  *   Redistribution and use in source and binary forms, with or without
 5 |  *   modification, are permitted provided that the following conditions
 6 |  *   are met:
 7 |  * 
 8 |  *   1. Redistributions of source code must retain the above Copyright
 9 |  *      notice, this list of conditions and the following disclaimer.
10 |  *
11 |  *   2. Redistributions in binary form must reproduce the above Copyright
12 |  *      notice, this list of conditions and the following disclaimer in the
13 |  *      documentation and/or other materials provided with the distribution.
14 |  *
15 |  *   3. Neither the name of the authors nor the names of its contributors
16 |  *      may be used to endorse or promote products derived from this
17 |  *      software without specific prior written permission.
18 |  */
19 | 
20 | #include <string>
21 | #include <fstream>
22 | #include <sstream>
23 | 
24 | #include "cmdline.h"
25 | #include "fileReader.hpp"
26 | #include "redsvdFile.hpp"
27 | #include "redsvd.hpp"
28 | #include "redsvdIncr.hpp"
29 | #include "util.hpp"
30 | 
31 | using namespace std;
32 | using namespace REDSVD;
33 | 
34 | // Runs the incremental SVD: opens inputFileName with a FileReader, decomposes it
35 | // with RedSVDIncr at the given rank and writes the factors under the outputFileName
36 | // prefix. Progress and per-phase timings go to stdout.
37 | void IncrRun(const string& inputFileName,
38 |              const string& outputFileName, int rank){
39 |   FileReader reader;
40 |   std::cout << "read matrix from " << inputFileName << " ... " << std::flush;
41 |   double phaseStart = Util::getSec();
42 |   reader.OpenFile(inputFileName.c_str());
43 |   std::cout << Util::getSec() - phaseStart << " sec." << std::endl;
44 | 
45 |   reader.GetStat();
46 |   std::cout << "rows:\t" << reader.rows() << std::endl;
47 |   std::cout << "cols:\t" << reader.cols() << std::endl;
48 |   std::cout << "rank:\t" << rank  << std::endl;
49 | 
50 |   std::cout << "compute ... " << std::flush;
51 |   phaseStart = Util::getSec();
52 |   RedSVDIncr redsvd_incr(reader, rank);
53 |   std::cout << Util::getSec() - phaseStart << " sec." << std::endl;
54 | 
55 |   phaseStart = Util::getSec();
56 |   writeMatrix(outputFileName, redsvd_incr);
57 |   std::cout << Util::getSec() - phaseStart << " sec." << std::endl;
58 |   std::cout << "finished." << std::endl;
59 | }
60 | 
61 | // Entry point of redsvd_incr: parses the input/output/rank options, checks that rank
62 | // is positive and delegates to IncrRun. Returns 0 on success (or bare usage), -1 on error.
63 | int main(int argc, char* argv[]){
64 |   cmdline::parser p;
65 |   p.add<string>("input",  'i', "input file", true);
66 |   p.add<string>("output", 'o', "output file's prefix", true);
67 |   p.add<int>   ("rank",   'r', "rank      ", false, 10);
68 |   p.set_program_name("redsvd_incr");
69 | 
70 |   if (argc == 1){
71 |     cerr << p.usage() << endl;
72 |     return 0;
73 |   }
74 |   if (p.parse(argc, argv) == 0){
75 |     cerr << "Error:" << p.error() << endl << p.usage() << endl;
76 |     return -1;
77 |   }
78 | 
79 |   const string input  = p.get<string>("input");
80 |   const string output = p.get<string>("output");
81 |   const int    rank   = p.get<int>   ("rank");
82 |   if (rank <= 0){
83 |     cerr << "rank=" << rank << endl;
84 |     cerr << "rank should be positive integer" << endl;
85 |     return -1;
86 |   }
87 | 
88 |   int status = 0;
89 |   try {
90 |     IncrRun(input, output, rank);
91 |   } catch (const string& error){
92 |     cerr << "Error: " << error << endl;
93 |     status = -1;
94 |   }
95 |   return status;
96 | }
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/src/redsvd/util.cpp:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  *  Copyright (c) 2010 Daisuke Okanohara
  3 |  * 
  4 |  *   Redistribution and use in source and binary forms, with or without
  5 |  *   modification, are permitted provided that the following conditions
  6 |  *   are met:
  7 |  * 
  8 |  *   1. Redistributions of source code must retain the above Copyright
  9 |  *      notice, this list of conditions and the following disclaimer.
 10 |  *
 11 |  *   2. Redistributions in binary form must reproduce the above Copyright
 12 |  *      notice, this list of conditions and the following disclaimer in the
 13 |  *      documentation and/or other materials provided with the distribution.
 14 |  *
 15 |  *   3. Neither the name of the authors nor the names of its contributors
 16 |  *      may be used to endorse or promote products derived from this
 17 |  *      software without specific prior written permission.
 18 |  */
 19 | 
 20 | #include <iostream>
 21 | #include <sys/time.h>
 22 | 
 23 | #include "util.hpp"
 24 | 
 25 | using namespace std;
 26 | using namespace Eigen;
 27 | 
 28 | namespace REDSVD {
 29 | 
 30 | const float SVD_EPS = 0.0001f;
 31 | // Returns the current gettimeofday() time as seconds, including the microsecond fraction.
 32 | double Util::getSec(){
 33 |   timeval now;
 34 |   gettimeofday(&now, NULL);
 35 |   return static_cast<double>(now.tv_usec)*1e-6 + now.tv_sec;
 36 | }
 37 | // Draws two Gaussian samples at once with the Box-Muller transform and stores them in f1 and f2.
 38 | void Util::sampleTwoGaussian(float& f1, float& f2){
 39 |   float v1 = (float)(rand() + 1.f) / ((float)RAND_MAX+2.f);  // uniform sample; the +1 keeps it above 0 so log(v1) is finite
 40 |   float v2 = (float)(rand() + 1.f) / ((float)RAND_MAX+2.f);  // second uniform sample, used for the angle
 41 |   float len = sqrt(-2.f * log(v1));  // radius of the sample pair
 42 |   f1 = len * cos(2.f * M_PI * v2);   // the two outputs share the radius and the angle 2*pi*v2
 43 |   f2 = len * sin(2.f * M_PI * v2);
 44 | }
 45 | // Fills every entry of mat with a Gaussian sample, row by row, drawing the samples in pairs.
 46 | void Util::sampleGaussianMat(MatrixXf& mat){
 47 |   for (int row = 0; row < mat.rows(); ++row){
 48 |     int col = 0;
 49 |     while (col+1 < mat.cols()){
 50 |       float first, second;
 51 |       sampleTwoGaussian(first, second);
 52 |       mat(row, col++) = first;
 53 |       mat(row, col++) = second;
 54 |     }
 55 |     if (col < mat.cols()){  // odd column count: one entry is left, the second sample is discarded
 56 |       float last, unused;
 57 |       sampleTwoGaussian(last, unused);
 58 |       mat(row, col) = last;
 59 |     }
 60 |   }
 61 | }
 62 | 
 63 | // Orthonormalizes the columns of mat in place, left to right (Gram-Schmidt).
 64 | void Util::processGramSchmidt(MatrixXf& mat){
 65 |   for (int cur = 0; cur < mat.cols(); ++cur){
 66 |     // Remove the components along the already-normalized columns, one at a time.
 67 |     for (int prev = 0; prev < cur; ++prev){
 68 |       const float proj = mat.col(cur).dot(mat.col(prev));
 69 |       mat.col(cur) -= proj * mat.col(prev);
 70 |     }
 71 |     const float len = mat.col(cur).norm();
 72 |     if (len < SVD_EPS){
 73 |       // Nothing significant is left of this column: zero it and all later ones, then stop.
 74 |       for (int rest = cur; rest < mat.cols(); ++rest) mat.col(rest).setZero();
 75 |       return;
 76 |     }
 77 |     mat.col(cur) *= (1.f / len);
 78 |   }
 79 | }
 80 | // Builds the sparse matrix A from per-row feature vectors: row i of A holds the (column id,
 81 | // value) pairs of fvs[i]. A gets fvs.size() rows and (largest column id + 1) columns.
 82 | void Util::convertFV2Mat(const vector<fv_t>& fvs, REDSVD::SMatrixXf& A){
 83 |   int colN = 0;
 84 |   size_t entryN = 0;
 85 |   for (size_t row = 0; row < fvs.size(); ++row){
 86 |     const fv_t& entries = fvs[row];
 87 |     entryN += entries.size();
 88 |     for (size_t k = 0; k < entries.size(); ++k) colN = max(entries[k].first+1, colN);
 89 |   }
 90 |   A.resize(fvs.size(), colN);
 91 |   A.reserve(entryN);
 92 |   // Append each row's entries in the order they are stored in fvs.
 93 |   for (size_t row = 0; row < fvs.size(); ++row){
 94 |     A.startVec(row);
 95 |     const fv_t& entries = fvs[row];
 96 |     for (size_t k = 0; k < entries.size(); ++k){
 97 |       A.insertBack(row, entries[k].first) = entries[k].second;
 98 |     }
 99 |   }
100 |   A.finalize();
101 | }
102 | 
103 | 
104 | }
105 | 


--------------------------------------------------------------------------------
/src/redsvd/util.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2011 Daisuke Okanohara
 3 |  *
 4 |  *   Redistribution and use in source and binary forms, with or without
 5 |  *   modification, are permitted provided that the following conditions
 6 |  *   are met:
 7 |  *
 8 |  *   1. Redistributions of source code must retain the above Copyright
 9 |  *      notice, this list of conditions and the following disclaimer.
10 |  *
11 |  *   2. Redistributions in binary form must reproduce the above Copyright
12 |  *      notice, this list of conditions and the following disclaimer in the
13 |  *      documentation and/or other materials provided with the distribution.
14 |  *
15 |  *   3. Neither the name of the authors nor the names of its contributors
16 |  *      may be used to endorse or promote products derived from this
17 |  *      software without specific prior written permission.
18 |  */
19 | 
20 | #ifndef REDSVD_UTIL_HPP__
21 | #define REDSVD_UTIL_HPP__
22 | 
23 | #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
24 | 
25 | #include <cstddef>
26 | #include <vector>
27 | #include <eigen3/Eigen/Sparse>
28 | #include <eigen3/Eigen/Dense>
29 | #include <eigen3/Eigen/Eigenvalues>
30 | 
31 | namespace REDSVD {
32 | 
33 | typedef Eigen::SparseMatrix<float, Eigen::RowMajor, std::ptrdiff_t> SMatrixXf;
34 | typedef std::vector<std::pair<int, float> > fv_t;
35 | // Static helper routines shared by the REDSVD decompositions and file tools.
36 | class Util{
37 | public:
38 |   static void convertFV2Mat(const std::vector<fv_t>& fvs, SMatrixXf& A);  // rows of (column id, value) pairs -> sparse matrix A
39 |   static void sampleGaussianMat(Eigen::MatrixXf& x);     // fills x with Gaussian random samples
40 |   static void processGramSchmidt(Eigen::MatrixXf& mat);  // orthonormalizes the columns of mat in place
41 |   static double getSec();                                // current time in seconds
42 | 
43 | private:
44 |   static void sampleTwoGaussian(float& f1, float& f2);   // writes two Gaussian samples to f1 and f2
45 | };
46 | 
47 | }
48 | 
49 | #endif // REDSVD_UTIL_HPP__
50 | 


--------------------------------------------------------------------------------