├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── cmake
    └── Modules
    │   ├── FindGflags.cmake
    │   └── FindLOG4CXX.cmake
├── configure
├── example
    ├── blog.sh
    ├── flickr.sh
    ├── log4cxx.config
    ├── mat2edge.py
    ├── ppi.sh
    ├── predict.py
    ├── redsvd2emb.py
    └── youtube.sh
└── src
    ├── BinaryGraphWalker.cc
    ├── BinaryGraphWalker.h
    ├── CMakeLists.txt
    ├── GraphWalker.cc
    ├── GraphWalker.h
    ├── WeightGraphWalker.cc
    ├── WeightGraphWalker.h
    ├── config.h.cmake
    ├── netsmf.cc
    └── redsvd
        ├── CMakeLists.txt
        ├── cmdline.h
        ├── fileReader.hpp
        ├── redsvd.hpp
        ├── redsvdFile.cpp
        ├── redsvdFile.hpp
        ├── redsvdIncr.hpp
        ├── redsvdMain.cpp
        ├── redsvdMainIncr.cpp
        ├── util.cpp
        └── util.hpp


/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 | 
4 | # Compiled Object files
5 | *.slo
6 | *.lo
7 | *.o
8 | *.obj
9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | /.vs
34 | 
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # CMAKE parameters file for 'configure'
3 | #
4 | 
5 | # Must be the first command: it sets the policy defaults for everything below.
6 | cmake_minimum_required(VERSION 3.5)
7 | 
8 | #
9 | # Misc parameters
10 | #
11 | 
12 | # output parameters for 'make'
13 | set(CMAKE_COLOR_MAKEFILE "ON")
14 | 
15 | # project declaration
16 | # NOTE: the compiler is detected here. To build with a specific compiler pass
17 | # it at configure time (e.g. ./configure -DCMAKE_CXX_COMPILER=g++);
18 | # assigning CMAKE_CXX_COMPILER after project() is not supported by CMake.
19 | project(NETSMF)
20 | 
21 | # Release/Debug: default to Release when no build type was requested
22 | if(NOT CMAKE_BUILD_TYPE)
23 |   set(CMAKE_DEFAULT_BUILD_TYPE "Release")
24 |   set(CMAKE_BUILD_TYPE "Release")
25 | endif()
26 | message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
27 | 
28 | # path for binary
29 | set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin/${CMAKE_BUILD_TYPE})
30 | 
31 | #
32 | # compiler flags
33 | #
34 | set(CMAKE_CXX_FLAGS_RELEASE "-O3")
35 | set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3 -fPIC")
36 | if(NOT APPLE)
37 |   # -march=native is a compiler option, not a preprocessor definition
38 |   add_compile_options(-march=native)
39 | endif()
40 | 
41 | # c++ 11
42 | set(CMAKE_CXX_STANDARD 11)
43 | 
44 | #
45 | # Check for required libraries
46 | #
47 | 
48 | # gflags / log4cxx: located by the Find modules shipped in cmake/Modules
49 | list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules")
50 | find_package(Gflags REQUIRED)
51 | find_package(LOG4CXX REQUIRED)
52 | 
53 | # Verify the result variables explicitly instead of trusting REQUIRED alone,
54 | # so a missing library stops the configuration here with a clear message.
55 | if(NOT GFLAGS_FOUND)
56 |   message(FATAL_ERROR "Package Gflags required, but not found!")
57 | endif()
58 | if(NOT LOG4CXX_FOUND)
59 |   message(FATAL_ERROR "Package LOG4CXX required, but not found!")
60 | endif()
61 | 
62 | # Libraries belong on the link line after the object files, not in the
63 | # compiler flags. The targets are declared in src/, so the include paths and
64 | # libraries are registered at directory scope before add_subdirectory().
65 | message(STATUS "Compiling with gflags support")
66 | include_directories(${GFLAGS_INCLUDE_DIRS})
67 | link_libraries(${GFLAGS_LIBRARIES})
68 | 
69 | message(STATUS "Compiling with log4cxx support")
70 | include_directories(${LOG4CXX_INCLUDE_DIRS})
71 | link_libraries(${LOG4CXX_LIBRARIES})
72 | 
73 | find_package(Eigen3 REQUIRED NO_MODULE)
74 | 
75 | # Threads
76 | find_package(Threads REQUIRED)
77 | 
78 | # Zlib
79 | find_package(ZLIB REQUIRED)
80 | 
81 | # Blas (optional)
82 | find_package(BLAS)
83 | if(BLAS_FOUND)
84 |   set(USE_BLAS 1)
85 | endif()
86 | 
87 | # Lapack (optional)
88 | find_package(LAPACK)
89 | if(LAPACK_FOUND)
90 |   set(USE_LAPACK 1)
91 | endif()
92 | 
93 | # OpenMP (optional)
94 | find_package(OpenMP)
95 | if(OPENMP_FOUND)
96 |   message(STATUS "Compiling with OpenMP support")
97 |   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
98 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
99 |   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
100 | endif()
101 | 
102 | #
103 | # Default Options
104 | #
105 | 
106 | # should we use VERBOSE functions?
107 | option(VERBOSE "Use VERBOSE option" OFF)
108 | 
109 | # should Eigen use the MKL library?
110 | option(EIGEN_USE_MKL_ALL "Use MKL Library option" OFF)
111 | 
112 | #
113 | # Management
114 | #
115 | 
116 | # configure a header file to pass some of the CMake settings
117 | # to the source code (the header is written next to the sources in src/)
118 | configure_file(
119 |   ${PROJECT_SOURCE_DIR}/src/config.h.cmake
120 |   ${PROJECT_SOURCE_DIR}/src/config.h
121 | )
122 | 
123 | add_subdirectory(src)
124 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 Jiezhong Qiu
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NetSMF 2 | 3 | NetSMF: Large-Scale Network Embedding as Sparse Matrix Factorization [[arxiv](https://arxiv.org/abs/1906.11156)] 4 | 5 | Please cite our paper if you use this code in your own work: 6 | 7 | ``` 8 | @inproceedings{qiu2019netsmf, 9 | author = {Qiu, Jiezhong and Dong, Yuxiao and Ma, Hao and Li, Jian and Wang, Chi and Wang, Kuansan and Tang, Jie}, 10 | title = {NetSMF: Large-Scale Network Embedding As Sparse Matrix Factorization}, 11 | booktitle = {The World Wide Web Conference}, 12 | series = {WWW '19}, 13 | year = {2019}, 14 | publisher = {ACM} 15 | } 16 | ``` 17 | 18 | # HOWTO 19 | 20 | ## How to install 21 | ``` 22 | sudo apt-get install cmake 23 | sudo apt-get install libgflags-dev 24 | sudo apt-get install liblog4cxx-dev 25 | sudo apt-get install libomp-dev 26 | sudo apt-get install libeigen3-dev 27 | git clone https://github.com/xptree/NetSMF.git 28 | cd NetSMF 29 | mkdir build 30 | ./configure 31 | cd build 32 | make 33 | ``` 34 | 35 | The dependency versions the code has been tested with: 36 | 37 | | Dependency | Version | 38 | |------------ |------------- | 39 | | g++ | 5.4.0 | 40 | | cmake | 3.5.1-1 | 41 | | gflags | 2.1.2-3 | 42 | | log4cxx | 0.10.0-10 | 43 | | openmp | 3.7.0-3 | 44 | | eigen3 | 3.3~beta1-2 | 45 | 46 | **Note: Using eigen3 3.2.5 may cause problems.
Please update your eigen3 to 3.3 or above.** 47 | 48 | ## How to run 49 | 50 | ### Input 51 | 52 | Supports undirected networks in edgelist format. 53 | 54 | For unweighted networks, each edge should appear twice `a b` and `b a`. 55 | 56 | For weighted networks, each edge should appear twice `a b w` and `b a w`. 57 | 58 | You may want to use `example/mat2edge.py` to translate mat to edgelist. 59 | 60 | `.mat` files can be downloaded here: 61 | 62 | * BlogCatalog [Source](http://socialcomputing.asu.edu/datasets/BlogCatalog3) [Preprocessed](http://leitang.net/code/social-dimension/data/blogcatalog.mat) 63 | * Protein-Protein Interaction [Source](http://thebiogrid.org/download.php) [Preprocessed](http://snap.stanford.edu/node2vec/Homo_sapiens.mat) 64 | * [Flickr](http://leitang.net/code/social-dimension/data/flickr.mat) 65 | * [YouTube](http://leitang.net/code/social-dimension/data/youtube.mat) 66 | 67 | 68 | 69 | ### Run NetSMF 70 | 71 | For unweighted networks, see `example/blog.sh` for an example. 72 | 73 | `blog.sh` takes three arguments: the first one indicates the input edgelist file, the second one indicates the output file, and the third one indicates the original `.mat` file containing network and labels. 74 | 75 | For example, running `./blog.sh blogcatalog.edgelist blogcatalog.netsmf blogcatalog.mat` will 76 | 77 | * check if `blogcatalog.edgelist` is a valid file. If not, it calls `mat2edge.py` to translate mat file `blogcatalog.mat` to edgelist `blogcatalog.edgelist`. 78 | * call NetSMF algorithm, and store the 128-dim embedding at `blogcatalog.netsmf_128.npy`. 79 | * call `predict.py` to evaluate NetSMF at the label classification task. 80 | 81 | You can use `-weight` to switch to weighted networks and use `-noweight` to switch to unweighted network (default unweighted). 82 | 83 | ### About truncated logarithm 84 | 85 | We propose to use truncated logarithm in our WWW'19 paper. 86 | 87 | In the code, we provide a new option `log1p`, i.e., `log(1+x)`.
You can use `-log1p` to turn it on and `-nolog1p` to turn it off (default off). Empirically speaking, `log1p` sometimes achieves better performance, for example in wiki dataset.
88 | 
89 | 
90 | ## Acknowledgement
91 | 
92 | The implementation of randomized singular value decomposition is by [redsvd](https://code.google.com/p/redsvd/) and [HPCA](https://github.com/idiap/hpca).
93 | 
--------------------------------------------------------------------------------
/cmake/Modules/FindGflags.cmake:
--------------------------------------------------------------------------------
1 | # - Try to find GFLAGS
2 | #
3 | # The following variables are optionally searched for defaults
4 | #  GFLAGS_ROOT_DIR:            Base directory where all GFLAGS components are found
5 | #
6 | # The following are set after configuration is done:
7 | #  GFLAGS_FOUND
8 | #  GFLAGS_INCLUDE_DIRS
9 | #  GFLAGS_LIBRARIES
10 | 
11 | include(FindPackageHandleStandardArgs)
12 | 
13 | set(GFLAGS_ROOT_DIR "" CACHE PATH "Folder contains Gflags")
14 | 
15 | # We are testing only a couple of files in the include directories
16 | if(WIN32)
17 |     find_path(GFLAGS_INCLUDE_DIR gflags/gflags.h
18 |         PATHS ${GFLAGS_ROOT_DIR}/src/windows)
19 | else()
20 |     find_path(GFLAGS_INCLUDE_DIR gflags/gflags.h
21 |         PATHS ${GFLAGS_ROOT_DIR})
22 | endif()
23 | 
24 | if(MSVC)
25 |     find_library(GFLAGS_LIBRARY_RELEASE
26 |         NAMES libgflags
27 |         PATHS ${GFLAGS_ROOT_DIR}
28 |         PATH_SUFFIXES Release)
29 | 
30 |     find_library(GFLAGS_LIBRARY_DEBUG
31 |         NAMES libgflags-debug
32 |         PATHS ${GFLAGS_ROOT_DIR}
33 |         PATH_SUFFIXES Debug)
34 | 
35 |     set(GFLAGS_LIBRARY optimized ${GFLAGS_LIBRARY_RELEASE} debug ${GFLAGS_LIBRARY_DEBUG})
36 | else()
37 |     # honour GFLAGS_ROOT_DIR for the library as well, not only for the header
38 |     find_library(GFLAGS_LIBRARY gflags
39 |         PATHS ${GFLAGS_ROOT_DIR}
40 |         PATH_SUFFIXES lib lib64)
41 | endif()
42 | 
43 | # The first argument must be spelled exactly like the name given to
44 | # find_package(Gflags ...): the REQUIRED/QUIET handling looks up
45 | # <name>_FIND_REQUIRED / <name>_FIND_QUIETLY. GFLAGS_FOUND is still set.
46 | find_package_handle_standard_args(Gflags DEFAULT_MSG GFLAGS_INCLUDE_DIR GFLAGS_LIBRARY)
47 | 
48 | # hide the cache entries from the default GUI view whether or not gflags was found
49 | mark_as_advanced(GFLAGS_LIBRARY_DEBUG GFLAGS_LIBRARY_RELEASE
50 |                  GFLAGS_LIBRARY GFLAGS_INCLUDE_DIR GFLAGS_ROOT_DIR)
51 | 
52 | if(GFLAGS_FOUND)
53 |     set(GFLAGS_INCLUDE_DIRS ${GFLAGS_INCLUDE_DIR})
54 |     set(GFLAGS_LIBRARIES ${GFLAGS_LIBRARY})
55 |     message(STATUS "Found gflags  (include: ${GFLAGS_INCLUDE_DIR}, library: ${GFLAGS_LIBRARY})")
56 | endif()
57 | 
--------------------------------------------------------------------------------
/cmake/Modules/FindLOG4CXX.cmake:
--------------------------------------------------------------------------------
1 | # - Try to find LOG4CXX
2 | #
3 | # The following variables are optionally searched for defaults
4 | #  LOG4CXX_ROOT_DIR:            Base directory where all LOG4CXX components are found
5 | #
6 | # The following are set after configuration is done:
7 | #  LOG4CXX_FOUND
8 | #  LOG4CXX_INCLUDE_DIRS
9 | #  LOG4CXX_LIBRARIES
10 | 
11 | include(FindPackageHandleStandardArgs)
12 | 
13 | set(LOG4CXX_ROOT_DIR "" CACHE PATH "Folder contains LOG4CXX")
14 | 
15 | # We are testing only a couple of files in the include directories
16 | if(WIN32)
17 |     find_path(LOG4CXX_INCLUDE_DIR log4cxx/logger.h
18 |         PATHS ${LOG4CXX_ROOT_DIR}/src/windows)
19 | else()
20 |     find_path(LOG4CXX_INCLUDE_DIR log4cxx/logger.h
21 |         PATHS ${LOG4CXX_ROOT_DIR})
22 | endif()
23 | 
24 | if(MSVC)
25 |     find_library(LOG4CXX_LIBRARY_RELEASE
26 |         NAMES liblog4cxx
27 |         PATHS ${LOG4CXX_ROOT_DIR}
28 |         PATH_SUFFIXES Release)
29 | 
30 |     find_library(LOG4CXX_LIBRARY_DEBUG
31 |         NAMES liblog4cxx-debug
32 |         PATHS ${LOG4CXX_ROOT_DIR}
33 |         PATH_SUFFIXES Debug)
34 | 
35 |     set(LOG4CXX_LIBRARY optimized ${LOG4CXX_LIBRARY_RELEASE} debug ${LOG4CXX_LIBRARY_DEBUG})
36 | else()
37 |     # honour LOG4CXX_ROOT_DIR for the library as well, not only for the header
38 |     find_library(LOG4CXX_LIBRARY log4cxx
39 |         PATHS ${LOG4CXX_ROOT_DIR}
40 |         PATH_SUFFIXES lib lib64)
41 | endif()
42 | 
43 | find_package_handle_standard_args(LOG4CXX DEFAULT_MSG LOG4CXX_INCLUDE_DIR LOG4CXX_LIBRARY)
44 | 
45 | # hide the cache entries from the default GUI view whether or not log4cxx was found
46 | mark_as_advanced(LOG4CXX_LIBRARY_DEBUG LOG4CXX_LIBRARY_RELEASE
47 |                  LOG4CXX_LIBRARY LOG4CXX_INCLUDE_DIR LOG4CXX_ROOT_DIR)
48 | 
49 | if(LOG4CXX_FOUND)
50 |     set(LOG4CXX_INCLUDE_DIRS ${LOG4CXX_INCLUDE_DIR})
51 |     set(LOG4CXX_LIBRARIES ${LOG4CXX_LIBRARY})
52 |     message(STATUS "Found LOG4CXX  (include: ${LOG4CXX_INCLUDE_DIR}, library: ${LOG4CXX_LIBRARY})")
53 | endif()
54 | 
--------------------------------------------------------------------------------
/configure:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | mkdir -p build
3 | (cd build >/dev/null 2>&1 && cmake .. "$@")
4 | 
--------------------------------------------------------------------------------
/example/blog.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 | 
4 | NETSMF=../bin/Release/netsmf
5 | if [ -z "$1" ]; then
6 |     INPUT="blogcatalog.edge"
7 | else
8 |     INPUT=$1
9 | fi
10 | 
11 | if [ -z "$2" ]; then
12 |     OUTPUT="blogcatalog.netsmf"
13 | else
14 |     OUTPUT=$2
15 | fi
16 | 
17 | if [ -z "$3" ]; then
18 |     LABEL=blogcatalog.mat
19 | else
20 |     LABEL=$3
21 | fi
22 | 
23 | [ ! -f $INPUT ] && python mat2edge.py $LABEL $INPUT
24 | 
25 | (/usr/bin/time -p $NETSMF -T 10 \
26 |     -filename $INPUT \
27 |     -machine $HOSTNAME \
28 |     -output_svd $OUTPUT \
29 |     -rank 512 \
30 |     -num_threads_sampling 40 \
31 |     -num_threads_svd 40 \
32 |     -rounds 10000 \
33 |     -check_point 50 \
34 |     -noweight \
35 |     -nolog1p \
36 |     -log4cxx log4cxx.config) |& tee blog.log
37 | 
38 | python redsvd2emb.py --name $OUTPUT --dim 128
39 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 1 --start-train-ratio 10 --stop-train-ratio 90 --num-train-ratio 9
40 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 10 --start-train-ratio 10 --stop-train-ratio 90 --num-train-ratio 9
41 | 
--------------------------------------------------------------------------------
/example/flickr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -x
4 | 
5 | NETSMF=../bin/Release/netsmf
6 | if [ -z "$1" ]; then
7 |     INPUT="flickr.edge"
8 | else
9 |     INPUT=$1
10 | fi
11 | 
12 | if [ -z "$2" ]; then
13 |     #mkdir -p flickr
14 | OUTPUT="flickr.netsmf" 15 | else 16 | OUTPUT=$2 17 | fi 18 | 19 | if [ -z "$3" ]; then 20 | LABEL=flickr.mat 21 | else 22 | LABEL=$3 23 | fi 24 | 25 | [ ! -f $INPUT ] && python mat2edge.py $LABEL $INPUT 26 | 27 | (/usr/bin/time -p $NETSMF -T 10 \ 28 | -filename $INPUT \ 29 | -machine $HOSTNAME \ 30 | -output_svd $OUTPUT \ 31 | -rank 512 \ 32 | -num_threads_sampling 20 \ 33 | -num_threads_svd 40 \ 34 | -rounds 1000 \ 35 | -check_point 20 \ 36 | -noweight \ 37 | -nolog1p \ 38 | -log4cxx log4cxx.config) |& tee -a flickr.log 39 | 40 | python redsvd2emb.py --name $OUTPUT --dim 128 41 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 1 --start-train-ratio 1 --stop-train-ratio 10 --num-train-ratio 10 42 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 10 --start-train-ratio 1 --stop-train-ratio 10 --num-train-ratio 10 43 | -------------------------------------------------------------------------------- /example/log4cxx.config: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=DEBUG, A1 2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 4 | 5 | # Print the date in ISO 8601 format 6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c - %m%n 7 | -------------------------------------------------------------------------------- /example/mat2edge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # File Name: mat2edge.py 4 | # Author: Jiezhong Qiu 5 | # Create Time: 2019/03/18 12:01 6 | # TODO: 7 | 8 | import scipy.io 9 | import math 10 | import sys 11 | 12 | def load_adjacency_matrix(file, variable_name="network"): 13 | data = scipy.io.loadmat(file) 14 | return data[variable_name] 15 | 16 | def mat2edge(file, output): 17 | print("mat2edgelist from %s to %s" % (file, output)) 18 | A = 
load_adjacency_matrix(file) 19 | A.eliminate_zeros() 20 | min_v, max_v = min(A.data) , max(A.data) 21 | print("minimum non-zero value=%.2f maximum non-zero value=%.2f" \ 22 | % (min_v, max_v)) 23 | unweighted = math.isclose(min_v, 1.0) and math.isclose(max_v, 1.0) 24 | print("unweighted graph" if unweighted else "weighted graph") 25 | A = A.todok() 26 | with open(output, "w") as f: 27 | for (x, y), v in A.items(): 28 | assert(math.isclose(A[y, x], v)) 29 | print("%d\t%d" % (x, y) if unweighted else "%d\t%d\t%f" % (x, y, v),end="\n", file=f) 30 | 31 | if __name__ == "__main__": 32 | #mat2edge("youtube.mat", "youtube.edge") 33 | mat2edge(sys.argv[1], sys.argv[2]) 34 | -------------------------------------------------------------------------------- /example/ppi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | NETSMF=../bin/Release/netsmf 6 | if [ -z "$1" ]; then 7 | INPUT="ppi.edge" 8 | else 9 | INPUT=$1 10 | fi 11 | 12 | if [ -z "$2" ]; then 13 | #mkdir -p ppi 14 | OUTPUT="ppi.netsmf" 15 | else 16 | OUTPUT=$2 17 | fi 18 | 19 | if [ -z "$3" ]; then 20 | LABEL=Homo_sapiens.mat 21 | else 22 | LABEL=$3 23 | fi 24 | 25 | [ ! 
-f $INPUT ] && python mat2edge.py $LABEL $INPUT 26 | 27 | (/usr/bin/time -p $NETSMF -T 10 \ 28 | -filename $INPUT \ 29 | -machine $HOSTNAME \ 30 | -output_svd $OUTPUT \ 31 | -rank 256 \ 32 | -num_threads_sampling 40 \ 33 | -num_threads_svd 40 \ 34 | -rounds 1000 \ 35 | -check_point 50 \ 36 | -noweight \ 37 | -nolog1p \ 38 | -log4cxx log4cxx.config) |& tee -a ppi.log 39 | 40 | python redsvd2emb.py --name $OUTPUT --dim 128 41 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 1 --start-train-ratio 10 --stop-train-ratio 90 --num-train-ratio 9 42 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 10 --start-train-ratio 10 --stop-train-ratio 90 --num-train-ratio 9 43 | -------------------------------------------------------------------------------- /example/predict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # File Name: predict.py 4 | # Author: Jiezhong Qiu 5 | # Create Time: 2017/07/17 21:57 6 | # TODO: 7 | 8 | import warnings 9 | warnings.filterwarnings("ignore") 10 | 11 | import os 12 | import pickle as pkl 13 | import numpy as np 14 | import scipy.sparse as sp 15 | import scipy.io 16 | import argparse 17 | import logging 18 | from sklearn.linear_model import LogisticRegression 19 | from sklearn.model_selection import ShuffleSplit 20 | from sklearn.multiclass import OneVsRestClassifier 21 | from sklearn.metrics import f1_score 22 | #from sklearn.exceptions import UndefinedMetricWarning 23 | #warnings.filterwarnings("ignore", category=UserWarning) 24 | #warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | def construct_indicator(y_score, y): 29 | # rank the labels by the scores directly 30 | num_label = y.sum(axis=1, dtype=np.int32) 31 | # num_label = np.sum(y, axis=1, dtype=np.int) 32 | y_sort = np.fliplr(np.argsort(y_score, axis=1)) 33 | #y_pred = 
np.zeros_like(y_score, dtype=np.int32) 34 | row, col = [], [] 35 | for i in range(y_score.shape[0]): 36 | row += [i] * num_label[i, 0] 37 | col += y_sort[i, :num_label[i, 0]].tolist() 38 | #for j in range(num_label[i, 0]): 39 | # y_pred[i, y_sort[i, j]] = 1 40 | y_pred = sp.csr_matrix( 41 | ([1] * len(row), (row, col)), 42 | shape=y.shape, dtype=np.bool_) 43 | return y_pred 44 | 45 | def load_w2v_feature(file): 46 | with open(file, "rb") as f: 47 | nu = 0 48 | for line in f: 49 | content = line.strip().split() 50 | nu += 1 51 | if nu == 1: 52 | n, d = int(content[0]), int(content[1]) 53 | feature = [[] for i in range(n)] 54 | continue 55 | index = int(content[0]) 56 | for x in content[1:]: 57 | feature[index].append(float(x)) 58 | if nu % 10000000 == 0: 59 | logger.info("read %d line from w2v feature file", nu) 60 | 61 | # for item in feature: 62 | # assert len(item) == d 63 | return np.array(feature, dtype=np.float32) 64 | 65 | 66 | def load_label(file, variable_name="group"): 67 | if file.endswith(".tsv") or file.endswith(".txt"): 68 | data = np.loadtxt(file).astype(np.int32) 69 | label = sp.csr_matrix(([1] * data.shape[0], (data[:, 0], data[:, 1])), dtype=np.bool_) 70 | sp.save_npz("label.npz", label) 71 | return label 72 | elif file.endswith(".npz"): 73 | return sp.load_npz(file) 74 | else: 75 | data = scipy.io.loadmat(file) 76 | logger.info("loading mat file %s", file) 77 | 78 | label = data[variable_name].tocsr().astype(np.bool_) 79 | print(label.shape, label.dtype) 80 | return label 81 | 82 | label = data[variable_name].todense().astype(np.int32) 83 | label = np.array(label) 84 | return label 85 | 86 | def predict_cv(X, y, train_ratio=0.2, n_splits=10, random_state=0, C=1., num_workers=1): 87 | micro, macro = [], [] 88 | shuffle = ShuffleSplit(n_splits=n_splits, test_size=1-train_ratio, 89 | random_state=random_state) 90 | for train_index, test_index in shuffle.split(X): 91 | #print(train_index.shape, test_index.shape) 92 | #assert len(set(train_index) & 
set(test_index)) == 0 93 | #assert len(train_index) + len(test_index) == X.shape[0] 94 | X_train, X_test = X[train_index], X[test_index] 95 | y_train, y_test = y[train_index], y[test_index] 96 | clf = OneVsRestClassifier( 97 | LogisticRegression( 98 | C=C, 99 | solver="liblinear", 100 | multi_class="ovr"), 101 | n_jobs=num_workers) 102 | clf.fit(X_train, y_train) 103 | y_score = clf.predict_proba(X_test) 104 | y_pred = construct_indicator(y_score, y_test) 105 | mi = f1_score(y_test, y_pred, average="micro") 106 | ma = f1_score(y_test, y_pred, average="macro") 107 | logger.info("micro f1 %f macro f1 %f", mi, ma) 108 | micro.append(mi) 109 | macro.append(ma) 110 | logger.info("%d fold validation, training ratio %f", len(micro), train_ratio) 111 | logger.info("Average micro %.2f, Average macro %.2f", 112 | np.mean(micro) * 100, 113 | np.mean(macro) * 100) 114 | return np.mean(micro)*100, np.mean(macro)*100 115 | 116 | 117 | if __name__ == "__main__": 118 | parser = argparse.ArgumentParser() 119 | parser.add_argument("--label", type=str, required=True, 120 | help="input file path for labels (.mat)") 121 | parser.add_argument("--embedding", type=str, required=True, 122 | help="input file path for embedding (.npy)") 123 | parser.add_argument("--matfile-variable-name", type=str, default='group', 124 | help='variable name of adjacency matrix inside a .mat file.') 125 | parser.add_argument("--seed", type=int, required=True, 126 | help="seed used for random number generator when randomly split data into training/test set.") 127 | parser.add_argument("--start-train-ratio", type=float, default=10, 128 | help="the start value of the train ratio (inclusive).") 129 | parser.add_argument("--stop-train-ratio", type=float, default=90, 130 | help="the end value of the train ratio (inclusive).") 131 | parser.add_argument("--num-train-ratio", type=int, default=9, 132 | help="the number of train ratio choosed from [train-ratio-start, train-ratio-end].") 133 | parser.add_argument("--C", 
type=float, default=1.0, 134 | help="inverse of regularization strength used in logistic regression.") 135 | parser.add_argument("--num-split", type=int, default=10, 136 | help="The number of re-shuffling & splitting for each train ratio.") 137 | parser.add_argument("--num-workers", type=int, default=60, 138 | help="Number of process") 139 | args = parser.parse_args() 140 | logging.basicConfig( 141 | filename="%s.log" % args.embedding, filemode="a", # uncomment this to log to file 142 | level=logging.INFO, 143 | format='%(asctime)s %(message)s') # include timestamp 144 | logger.info("C=%f", args.C) 145 | logger.info("Loading label from %s...", args.label) 146 | label = load_label(file=args.label, variable_name=args.matfile_variable_name) 147 | logger.info("Label loaded!") 148 | 149 | logger.info("Loading network embedding from %s...", args.embedding) 150 | ext = os.path.splitext(args.embedding)[1] 151 | if ext == ".npy": 152 | embedding = np.load(args.embedding) 153 | elif ext == ".pkl": 154 | with open(args.embedding, "rb") as f: 155 | embedding = pkl.load(f) 156 | else: 157 | # Load word2vec format 158 | embedding = load_w2v_feature(args.embedding) 159 | np.save("%s.npy" % args.embedding, embedding, allow_pickle=False) 160 | logger.info("Network embedding loaded!") 161 | 162 | logger.info("Embedding has shape %d, %d", embedding.shape[0], embedding.shape[1]) 163 | logger.info("Label has shape %d, %d", label.shape[0], label.shape[1]) 164 | 165 | if label.shape[0] != embedding.shape[0]: 166 | logger.info("Different shape ....") 167 | num_instance = min(label.shape[0], embedding.shape[0]) 168 | label, embedding = label[:num_instance], embedding[:num_instance] 169 | 170 | num_label = label.sum(axis=1, dtype=np.int32) 171 | idx = np.argwhere(num_label == 0) 172 | logger.info("%d instances with no label" % len(idx)) 173 | # if len(idx): 174 | # embedding = embedding[label.getnnz(1)>0] 175 | # label = label[label.getnnz(1)>0] 176 | # logger.info("After deleting ...") 177 
| logger.info("Embedding has shape %d, %d", embedding.shape[0], embedding.shape[1]) 178 | logger.info("Label has shape %d, %d", label.shape[0], label.shape[1]) 179 | 180 | train_ratios = np.linspace(args.start_train_ratio, args.stop_train_ratio, 181 | args.num_train_ratio) 182 | 183 | 184 | f1 = list() 185 | for tr in train_ratios: 186 | res = predict_cv(embedding, label, train_ratio=tr/100., 187 | n_splits=args.num_split, C=args.C, random_state=args.seed, 188 | num_workers=args.num_workers) 189 | f1.append(res) 190 | micro, macro = zip(*f1) 191 | print(" ".join([str(x) for x in micro])) 192 | logger.info(" ".join([str(x) for x in micro])) 193 | print(" ".join([str(x) for x in macro])) 194 | logger.info(" ".join([str(x) for x in macro])) 195 | -------------------------------------------------------------------------------- /example/redsvd2emb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | # File Name: redsvd2emb.py 4 | # Author: Jiezhong Qiu 5 | # Create Time: 2018/10/22 03:37 6 | # TODO: 7 | 8 | 9 | import scipy.sparse as sp 10 | import numpy as np 11 | import logging 12 | import argparse 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | def redsvd2emb(u, s): 17 | return sp.diags(np.sqrt(s)).dot(u.T).T 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("--name", type=str, required=True, 23 | help="file name") 24 | parser.add_argument("--dim", type=int, required=True, 25 | help="dimension") 26 | args = parser.parse_args() 27 | logging.basicConfig(level=logging.INFO, 28 | format='%(asctime)s %(message)s') # include timestamp 29 | u = np.loadtxt("%s.U" % args.name)[:, :args.dim] 30 | s = np.loadtxt("%s.S" % args.name)[:args.dim] 31 | embedding = redsvd2emb(u, s) 32 | logger.info("save embedding to %s_%d.npy", args.name, args.dim) 33 | np.save("%s_%d.npy" % (args.name, args.dim), embedding, allow_pickle=False) 34 | 35 
| 36 | -------------------------------------------------------------------------------- /example/youtube.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | NETSMF=../bin/Release/netsmf 6 | if [ -z "$1" ]; then 7 | INPUT="youtube.edge" 8 | else 9 | INPUT=$1 10 | fi 11 | 12 | if [ -z "$2" ]; then 13 | # mkdir -p youtube 14 | OUTPUT="youtube.netsmf" 15 | else 16 | OUTPUT=$2 17 | fi 18 | 19 | if [ -z "$3" ]; then 20 | LABEL=youtube.mat 21 | else 22 | LABEL=$3 23 | fi 24 | 25 | [ ! -f $INPUT ] && python mat2edge.py $LABEL $INPUT 26 | 27 | (/usr/bin/time -p $NETSMF -T 10 \ 28 | -filename $INPUT \ 29 | -machine $HOSTNAME \ 30 | -output_svd $OUTPUT \ 31 | -rank 256 \ 32 | -num_threads_sampling 10 \ 33 | -num_threads_svd 32 \ 34 | -rounds 2000 \ 35 | -check_point 10 \ 36 | -noweight \ 37 | -nolog1p \ 38 | -log4cxx log4cxx.config) |& tee -a youtube.log 39 | 40 | python redsvd2emb.py --name $OUTPUT --dim 128 41 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 1 --start-train-ratio 1 --stop-train-ratio 10 --num-train-ratio 10 42 | python predict.py --label $LABEL --embedding ${OUTPUT}_128.npy --seed 0 --C 10 --start-train-ratio 1 --stop-train-ratio 10 --num-train-ratio 10 43 | -------------------------------------------------------------------------------- /src/BinaryGraphWalker.cc: -------------------------------------------------------------------------------- 1 | #include "BinaryGraphWalker.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | // include gflags 14 | #include 15 | DECLARE_int32(num_threads_svd); 16 | DECLARE_int32(rank); 17 | DECLARE_int32(negative); 18 | DECLARE_string(output_svd); 19 | 20 | using namespace log4cxx; 21 | 22 | BinaryGraphWalker::BinaryGraphWalker(const std::vector& indices, 23 | const std::vector& indptr, int T, const std::vector& degree) 24 | : 
GraphWalker(indices, indptr, degree, T) { 25 | LOG4CXX_INFO(logger, "unweighted network"); 26 | } 27 | 28 | BinaryGraphWalker* BinaryGraphWalker::getWalker(const std::string& fname, int T) { 29 | std::vector edges; 30 | std::vector out_degree; 31 | std::vector edge_pair; 32 | 33 | VertexId max_vertex_id = 0; 34 | std::ifstream fin(fname); 35 | assert(fin.is_open()); 36 | VertexId src, dst; 37 | while (fin >> src >> dst) { 38 | // vertexid overflow 39 | if (src >= max_vertex_id || dst >= max_vertex_id) { 40 | max_vertex_id = std::max(src, dst); 41 | out_degree.resize(max_vertex_id + 1, 0); 42 | } 43 | if (src == dst) { 44 | continue; 45 | } 46 | ++out_degree[src]; 47 | edge_pair.push_back(src); 48 | edge_pair.push_back(dst); 49 | } 50 | int num_vertex = max_vertex_id + 1; 51 | 52 | std::vector indptr(num_vertex + 1, 0); 53 | std::partial_sum(out_degree.begin(), out_degree.end(), indptr.begin() + 1); 54 | std::vector degree; 55 | for (auto const& val : out_degree) { 56 | degree.push_back(float(val)); 57 | } 58 | 59 | EdgeId edge_cnt = edge_pair.size() >> 1; 60 | std::vector indices(edge_cnt, 0); 61 | 62 | for (EdgeId e = 0; e < edge_cnt; ++e) { 63 | VertexId src = edge_pair[e << 1]; 64 | VertexId dst = edge_pair[(e << 1) + 1]; 65 | 66 | EdgeId idx = indptr[src] + (--out_degree[src]); 67 | indices[idx] = dst; 68 | } 69 | return new BinaryGraphWalker(indices, indptr, T, degree); 70 | } 71 | 72 | 73 | VertexId BinaryGraphWalker::randomWalk(VertexId u, int step, 74 | unsigned* seed) const { 75 | for (;step--;) { 76 | // u's neighbors are indices[indptr[i]:indptr[i+1]] 77 | int offset = rand_r(seed) % (indptr[u+1] - indptr[u]); 78 | u = indices[indptr[u] + offset]; 79 | } 80 | return u; 81 | } 82 | 83 | void BinaryGraphWalker::samplePath(const VertexId u, const VertexId v, int r, unsigned* seed, 84 | std::vector& sampled_pairs) const { 85 | int k = rand_r(seed) % r + 1; 86 | VertexId u_ = randomWalk(u, k - 1, seed); 87 | VertexId v_ = randomWalk(v, r - k, seed); 88 | // 
add record (u_, v_, 1) 89 | 90 | if (u_ > v_) { 91 | std::swap(u_, v_); 92 | } 93 | 94 | sampled_pairs.push_back(std::make_pair(u_, v_)); 95 | } 96 | 97 | void BinaryGraphWalker::sampling(int round, int num_threads, 98 | const std::string& machine, 99 | int check_point) { 100 | omp_set_num_threads(num_threads); 101 | 102 | std::vector*> counters; 103 | for (int i = 0; i < num_threads; ++i) { 104 | counters.push_back(new std::vector); 105 | } 106 | 107 | #pragma omp parallel default(shared) 108 | { 109 | int this_thread = omp_get_thread_num(); 110 | std::string thread_name = std::string("machine_") + machine 111 | + std::string("_thread_") + std::to_string(this_thread); // + std::string("_time_") + std::to_string(time(0)); 112 | 113 | LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " thread name is " << thread_name ); 114 | unsigned seed = std::hash{}(thread_name); 115 | 116 | std::vector sampled_pairs; 117 | std::vector *&counter = counters[this_thread]; 118 | std::vector *counter_tmp = new std::vector; 119 | 120 | LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " set seed " << seed); 121 | int my_round= ceil((double)round / num_threads); 122 | 123 | for (int i=0; iclear(); 138 | LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " complete " << i + 1 << " rounds, size of counter=" << counter->size() << " counter.max_val=" << max_val); 139 | } 140 | } 141 | LOG4CXX_INFO(logger, "[thread " << this_thread << "] finish job"); 142 | delete counter_tmp; 143 | } 144 | 145 | // now we have a list of counters, we want to merge them in a binary tree way --- from leaf to root 146 | while (counters.size() > 1) { 147 | LOG4CXX_INFO(logger, counters.size() << " counters to merge."); 148 | size_t n_half = (counters.size() + 1) >> 1; 149 | omp_set_num_threads(counters.size() >> 1); 150 | 151 | #pragma omp parallel default(shared) 152 | { 153 | int this_thread = omp_get_thread_num(); 154 | LOG4CXX_INFO(logger, "merge counter " << this_thread << " and " << 
n_half + this_thread); 155 | std::vector *counter_tmp = merge_counters(*counters[this_thread], *counters[n_half + this_thread]); 156 | 157 | delete counters[this_thread]; 158 | delete counters[n_half + this_thread]; 159 | counters[this_thread] = counter_tmp; 160 | } 161 | 162 | counters.resize(n_half); 163 | } 164 | counter_merged = counters[0]; 165 | } 166 | 167 | float BinaryGraphWalker::merge(const std::vector& counter, 168 | std::vector& tmp, 169 | std::vector& sampled_pairs) { 170 | float max_val = 0; 171 | std::sort(sampled_pairs.begin(), sampled_pairs.end()); 172 | 173 | std::vector::const_iterator iter = counter.cbegin(); 174 | for (size_t i = 0, j = 0; i < sampled_pairs.size(); i = j) { 175 | for (j = i + 1; j < sampled_pairs.size() && sampled_pairs[j] == sampled_pairs[i]; ++j); 176 | for (;iter != counter.end() && iter->first < sampled_pairs[i]; ++iter) { 177 | max_val = std::max(max_val, iter->second); 178 | tmp.push_back(*iter); 179 | } 180 | if (iter != counter.end() && iter->first == sampled_pairs[i]) { 181 | max_val = std::max(max_val, j - i + iter->second); 182 | tmp.push_back( 183 | std::make_pair(iter->first, j - i + iter->second)); 184 | ++iter; 185 | } else { 186 | max_val = std::max(max_val, float(j - i)); 187 | tmp.push_back(std::make_pair(sampled_pairs[i], float(j - i))); 188 | } 189 | } 190 | for (;iter != counter.end(); ++iter) { 191 | max_val = std::max(max_val, iter->second); 192 | tmp.push_back(*iter); 193 | } 194 | return max_val; 195 | } 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /src/BinaryGraphWalker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "GraphWalker.h" 4 | 5 | class BinaryGraphWalker : public GraphWalker { 6 | public: 7 | static BinaryGraphWalker* getWalker(const std::string& name, int T); 8 | BinaryGraphWalker(const std::vector& indices, 9 | const std::vector& indptr, int T, 10 | const 
std::vector& degree); 11 | 12 | void samplePath(VertexId u, VertexId v, int r, unsigned* seed, 13 | std::vector& sampled_pair) const; 14 | VertexId randomWalk(VertexId u, int step, unsigned* seed) const; 15 | void sampling(int round, int num_threads, 16 | const std::string& machine, 17 | int check_point); 18 | // void transformation(); 19 | // void redsvd(); 20 | // void merge_to_sparsifier(const std::vector& counter); 21 | 22 | // static void dump(const std::string& filename, 23 | // const std::vector& counter); 24 | static float merge(const std::vector& counter, 25 | std::vector& tmp, 26 | std::vector& sampled_pairs); 27 | }; 28 | 29 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Add subdirectories 2 | ADD_SUBDIRECTORY(redsvd) 3 | 4 | # Find dependencies libraries for Math 5 | IF(UNIX) 6 | SET(MATH_LIB m) 7 | ELSE(UNIX) 8 | SET(MATH_LIB) 9 | ENDIF(UNIX) 10 | 11 | # executable declaration 12 | ADD_EXECUTABLE(netsmf netsmf.cc BinaryGraphWalker.cc GraphWalker.cc WeightGraphWalker.cc) 13 | 14 | # Linking: PRIVATE because netsmf is an executable and exports no link interface. 15 | # ZLIB::ZLIB is the imported target of find_package(ZLIB); it carries the include directory as well as the library. 16 | TARGET_LINK_LIBRARIES( netsmf PRIVATE 17 | redsvd 18 | ${MATH_LIB} 19 | ZLIB::ZLIB 20 | ${GFLAGS_LIBRARIES} 21 | ${LOG4CXX_LIBRARIES} ) 22 | 23 | # Installing 24 | INSTALL(TARGETS netsmf DESTINATION bin) 25 | -------------------------------------------------------------------------------- /src/GraphWalker.cc: -------------------------------------------------------------------------------- 1 | #include "GraphWalker.h" 2 | #include // std::partial_sum 3 | #include 4 | 5 | // include gflags 6 | #include 7 | DECLARE_int32(num_threads_svd); 8 | DECLARE_int32(rank); 9 | DECLARE_int32(negative); 10 | DECLARE_string(output_svd); 11 | DECLARE_bool(log1p); 12 |
13 | // include redsvd headers 14 | #include "redsvd/util.hpp" 15 | #include "redsvd/redsvd.hpp" 16 | #include "redsvd/redsvdFile.hpp" 17 | 18 | using namespace log4cxx; 19 | LoggerPtr GraphWalker::logger(Logger::getLogger("GraphWalker")); 20 | 21 | GraphWalker::GraphWalker(const std::vector& indices_, 22 | const std::vector& indptr_, 23 | const std::vector& degree_, 24 | int T_) 25 | : indices(indices_), indptr(indptr_), degree(degree_), T(T_) { 26 | assert(indptr.size() == degree.size() + 1); 27 | sparsifier_lower = new std::vector(); 28 | sparsifier_upper = new std::vector(); 29 | counter_merged = NULL; 30 | } 31 | 32 | void GraphWalker::transformation() { 33 | 34 | LOG4CXX_INFO(logger, "transformation ..."); 35 | double M = 0; 36 | for (auto iter = counter_merged->cbegin(); iter != counter_merged->cend(); ++iter) { 37 | M += iter->second * 2; 38 | } 39 | LOG4CXX_INFO(logger, "total number of samples=" << M); 40 | double num_edges = (double)indices.size(); 41 | double vol = 0.0; 42 | for (auto const& val : degree) { 43 | vol += val; 44 | } 45 | LOG4CXX_INFO(logger, "vol(G)=" << vol); 46 | double factor = vol * num_edges / M / FLAGS_negative; 47 | VertexId src, dst; 48 | double val; 49 | std::vector nnz_lower_row(degree.size(), 0); 50 | 51 | size_t nnz_lower = 0; 52 | sparsifier_upper->clear(); 53 | sparsifier_lower->clear(); 54 | if (FLAGS_log1p) { 55 | LOG4CXX_INFO(logger, "using log1p..."); 56 | } else { 57 | LOG4CXX_INFO(logger, "using truncated logarithm..."); 58 | } 59 | 60 | std::function log1p_func = log1p; 61 | std::function log_func = log; 62 | auto mylog = FLAGS_log1p ? log1p_func : log_func; 63 | 64 | for (auto iter = counter_merged->cbegin(); iter != counter_merged->cend(); ++iter) { 65 | src = iter->first.first; 66 | dst = iter->first.second; 67 | val = src != dst ? 
iter->second : iter->second * 2; 68 | val = mylog(val * factor / degree[src] / degree[dst]); 69 | if (val > 0) { 70 | sparsifier_upper->push_back(std::make_pair(iter->first, (float)val)); 71 | if (src != dst) { 72 | ++nnz_lower_row[dst]; 73 | ++nnz_lower; 74 | } 75 | } 76 | } 77 | LOG4CXX_INFO(logger, "after log, #nnz in upper triangle and diagonal reduces to " << sparsifier_upper->size() << " (from " << counter_merged->size() << ")"); 78 | counter_merged->clear(); 79 | delete counter_merged; 80 | 81 | 82 | LOG4CXX_INFO(logger, "constructing lower triangle ..."); 83 | // now, sparsifier stores upper triangle + diagonal 84 | // we will re-use sparsifier_lower to store lower triangle 85 | std::vector lower_indptr(degree.size() + 1, 0); 86 | std::partial_sum(nnz_lower_row.begin(), nnz_lower_row.end(), lower_indptr.begin() + 1); 87 | 88 | sparsifier_lower->resize(nnz_lower); 89 | LOG4CXX_INFO(logger, "lower triangle has " << nnz_lower << " nnz."); 90 | for (auto riter = sparsifier_upper->crbegin(); riter != sparsifier_upper->crend(); ++riter) { 91 | src = riter->first.first; 92 | dst = riter->first.second; 93 | if (src == dst) { 94 | continue; 95 | } 96 | auto iter = sparsifier_lower->begin() + lower_indptr[dst] + (--nnz_lower_row[dst]); 97 | iter->first.first = dst; 98 | iter->first.second = src; 99 | iter->second = riter->second; 100 | } 101 | LOG4CXX_INFO(logger, "lower triangle constructed."); 102 | } 103 | 104 | void GraphWalker::redsvd() { 105 | Eigen::setNbThreads(FLAGS_num_threads_svd); 106 | LOG4CXX_INFO(logger, "prepare svd ..."); 107 | REDSVD::SMatrixXf A; 108 | // matrix size 109 | A.resize(degree.size(), degree.size()); 110 | // number of nnz 111 | A.reserve(sparsifier_upper->size() + sparsifier_lower->size()); 112 | auto iter_lower = sparsifier_lower->cbegin(); 113 | auto iter_upper = sparsifier_upper->cbegin(); 114 | for (size_t i = 0; i < degree.size(); ++i) { 115 | A.startVec(i); 116 | for (;iter_lower != sparsifier_lower->cend() && 
iter_lower->first.first == i; ++iter_lower) { 117 | A.insertBack(i, iter_lower->first.second) = iter_lower->second; 118 | } 119 | for (;iter_upper != sparsifier_upper->cend() && iter_upper->first.first == i; ++iter_upper) { 120 | A.insertBack(i, iter_upper->first.second) = iter_upper->second; 121 | } 122 | } 123 | A.finalize(); 124 | sparsifier_upper->clear(); 125 | sparsifier_lower->clear(); 126 | delete sparsifier_upper; 127 | delete sparsifier_lower; 128 | 129 | LOG4CXX_INFO(logger, "running randomized SVD..."); 130 | const double start = REDSVD::Util::getSec(); 131 | REDSVD::RedSVD svdOfA(A, FLAGS_rank < degree.size() ? FLAGS_rank : degree.size()); 132 | LOG4CXX_INFO(logger, "done in " << REDSVD::Util::getSec() - start); 133 | 134 | // set output name 135 | REDSVD::writeMatrix(FLAGS_output_svd, svdOfA); 136 | } 137 | 138 | std::vector* GraphWalker::merge_counters(const std::vector& counter, 139 | const std::vector& counter_other) { 140 | std::vector::const_iterator iter1 = counter.cbegin(); 141 | std::vector::const_iterator iter2 = counter_other.cbegin(); 142 | 143 | std::vector *counter_tmp = new std::vector; 144 | 145 | while (iter1 != counter.cend() && iter2 != counter_other.cend()) { 146 | if (iter1->first < iter2->first) { 147 | counter_tmp->push_back(*(iter1++)); 148 | } else if (iter1->first > iter2->first) { 149 | counter_tmp->push_back(*(iter2++)); 150 | } else { 151 | counter_tmp->push_back( 152 | std::make_pair(iter1->first, iter1->second + iter2->second)); 153 | ++iter1; 154 | ++iter2; 155 | } 156 | } 157 | 158 | for (;iter1 != counter.cend(); ++iter1) { 159 | counter_tmp->push_back(*iter1); 160 | } 161 | 162 | for (;iter2 != counter_other.cend(); ++iter2) { 163 | counter_tmp->push_back(*iter2); 164 | } 165 | return counter_tmp; 166 | } 167 | 168 | -------------------------------------------------------------------------------- /src/GraphWalker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 
| #include 4 | #include 5 | #include 6 | #include "log4cxx/logger.h" 7 | 8 | 9 | //using VertexId = unsigned long; //uint32_t; 10 | using VertexId = unsigned int; //uint32_t; 11 | using EdgeId = unsigned int; //uint32_t; 12 | using VertexPair = std::pair; 13 | using VertexPairCount = std::pair; 14 | using ValuedVertexPair = std::pair, float>; 15 | 16 | 17 | /* indices, indptr, data 18 | * indices is array of column indices 19 | * data is array of corresponding nonzero values 20 | * indptr points to row starts in indices and data 21 | * length is n_row + 1, last item = number of values = length of both indices and data 22 | * nonzero values of the i-th row are data[indptr[i]:indptr[i+1]] with column indices indices[indptr[i]:indptr[i+1]] 23 | * item (i, j) can be accessed as data[indptr[i]+k], where k is position of j in indices[indptr[i]:indptr[i+1]] 24 | */ 25 | 26 | class GraphWalker { 27 | public: 28 | static log4cxx::LoggerPtr logger; 29 | GraphWalker(const std::vector& indices_, 30 | const std::vector& indptr_, 31 | const std::vector& degree_, 32 | int T); 33 | 34 | const std::vector indices; 35 | const std::vector indptr; 36 | const std::vector degree; 37 | int T; 38 | 39 | std::vector *sparsifier_upper, *sparsifier_lower; 40 | std::vector *counter_merged; 41 | 42 | virtual void sampling(int round, int num_threads, 43 | const std::string& machine, 44 | int check_point) = 0; 45 | void transformation(); 46 | void redsvd(); 47 | 48 | static std::vector* merge_counters( 49 | const std::vector& counter, 50 | const std::vector& counter_other); 51 | }; 52 | 53 | 54 | -------------------------------------------------------------------------------- /src/WeightGraphWalker.cc: -------------------------------------------------------------------------------- 1 | #include "WeightGraphWalker.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | // include gflags 12 | #include 13 | DECLARE_int32(num_threads_svd); 
DECLARE_int32(rank); 14 | DECLARE_int32(negative); 15 | DECLARE_string(output_svd); 16 | 17 | using namespace log4cxx; 18 | 19 | WeightGraphWalker::WeightGraphWalker(const std::vector& indices, 20 | const std::vector& indptr, int T, 21 | const std::vector& data_, 22 | const std::vector& prefix_sum_, 23 | const std::vector& degree) 24 | : GraphWalker(indices, indptr, degree, T), data(data_), prefix_sum(prefix_sum_) { 25 | LOG4CXX_INFO(logger, "weighted network"); 26 | } 27 | 28 | WeightGraphWalker* WeightGraphWalker::getWalker(const std::string& fname, int T) { // reads "src dst weight" triples from fname and builds the indptr/indices/data/prefix_sum arrays 29 | std::vector edges; 30 | std::vector out_degree; 31 | std::vector edge_pair; 32 | std::vector weight; 33 | std::vector generalized_out_degree; 34 | 35 | VertexId max_vertex_id = 0; 36 | std::ifstream fin(fname); 37 | assert(fin.is_open()); 38 | VertexId src, dst; 39 | double w; 40 | while (fin >> src >> dst >> w) { 41 | // grow the per-vertex arrays whenever an endpoint reaches or exceeds the largest vertex id seen so far 42 | if (src >= max_vertex_id || dst >= max_vertex_id) { 43 | max_vertex_id = std::max(src, dst); 44 | out_degree.resize(max_vertex_id + 1, 0); 45 | generalized_out_degree.resize(max_vertex_id + 1, 0); 46 | } 47 | if (src == dst) { // self-loops are skipped and never counted 48 | continue; 49 | } 50 | ++out_degree[src]; 51 | generalized_out_degree[src] += w; 52 | edge_pair.push_back(src); 53 | edge_pair.push_back(dst); 54 | weight.push_back(w); 55 | } 56 | 57 | 58 | int num_vertex = max_vertex_id + 1; 59 | 60 | std::vector indptr(num_vertex + 1, 0); 61 | std::partial_sum(out_degree.begin(), out_degree.end(), indptr.begin() + 1); 62 | 63 | EdgeId edge_cnt = edge_pair.size() >> 1; 64 | std::vector indices(edge_cnt, 0); 65 | 66 | std::vector data(edge_cnt, 0.0); 67 | 68 | for (EdgeId e = 0; e < edge_cnt; ++e) { 69 | VertexId src = edge_pair[e << 1]; 70 | VertexId dst = edge_pair[(e << 1) + 1]; 71 | 72 | EdgeId idx = indptr[src] + (--out_degree[src]); // fill src's row from the back, counting out_degree[src] down 73 | indices[idx] = dst; 74 | data[idx] = weight[e]; 75 | } 76 | 77 | 78 | std::vector prefix_sum(edge_cnt, 0.0); // per-row running sums of data; the bound below is inclusive because vertex ids run 0..max_vertex_id 79 | for (VertexId v = 0; v <= max_vertex_id; ++v) {
80 | std::partial_sum(data.begin() + indptr[v], data.begin() + indptr[v + 1], prefix_sum.begin() + indptr[v]); 81 | } 82 | 83 | return new WeightGraphWalker(indices, indptr, T, data, prefix_sum, generalized_out_degree); 84 | } 85 | 86 | VertexId WeightGraphWalker::randomWalk(VertexId u, int step, double& Z, 87 | unsigned* seed) const { 88 | for (;step--;) { 89 | // u's neighbors are indices[indptr[u]:indptr[u+1]]; pick the first one whose prefix sum reaches ratio * (row total) 90 | double ratio = (double)rand_r(seed) / RAND_MAX; 91 | int head = indptr[u], tail = indptr[u+1] - 1, pos = tail; // NOTE(review): tail < head when u has no out-edges 92 | double generalized_out_degree = prefix_sum[tail]; 93 | for (;head <= tail;) { // inclusive bound: the slot where head == tail must be probed too, otherwise the first neighbor can be missed 94 | int mid = (head + tail) >> 1; 95 | if (prefix_sum[mid] >= ratio * generalized_out_degree) { 96 | tail= mid - 1; 97 | pos = mid; 98 | } else { 99 | head = mid + 1; 100 | } 101 | } 102 | 103 | u = indices[pos]; 104 | Z += 1. / data[pos]; 105 | } 106 | return u; 107 | } 108 | 109 | void WeightGraphWalker::samplePath(VertexId u, VertexId v, double w, int r, unsigned* seed, 110 | std::vector& sampled_pair) const { 111 | int k = rand_r(seed) % r + 1; 112 | double Z_half = 1.
/ w; 113 | VertexId u_ = randomWalk(u, k - 1, Z_half, seed); 114 | VertexId v_ = randomWalk(v, r - k, Z_half, seed); 115 | if (u_ > v_) { 116 | std::swap(u_, v_); 117 | } 118 | 119 | // add record (u_, v_, r / Z_half) 120 | sampled_pair.push_back(std::make_pair(std::make_pair(u_, v_), float(r / Z_half))); 121 | } 122 | 123 | void WeightGraphWalker::sampling(int round, int num_threads, 124 | const std::string& machine, 125 | int check_point) { 126 | omp_set_num_threads(num_threads); 127 | 128 | std::vector*> counters; 129 | for (int i = 0; i < num_threads; ++i) { 130 | counters.push_back(new std::vector); 131 | } 132 | 133 | #pragma omp parallel default(shared) 134 | { 135 | int this_thread = omp_get_thread_num(); 136 | std::string thread_name = std::string("machine_") + machine 137 | + std::string("_thread_") + std::to_string(this_thread); 138 | 139 | LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " thread name is " << thread_name); 140 | unsigned seed = std::hash{}(thread_name); 141 | 142 | std::vector sampled_pairs; 143 | std::vector *&counter = counters[this_thread]; 144 | std::vector *counter_tmp = new std::vector; 145 | 146 | LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " set seed " << seed); 147 | int my_round= ceil((double)round / num_threads); 148 | 149 | for (int i=0; iclear(); 162 | LOG4CXX_INFO(logger, "[thread " << this_thread << "]" << " complete " << i + 1 << " rounds, size of counter=" << counter->size() << " counter.max_val=" << max_val); 163 | } 164 | } 165 | 166 | LOG4CXX_INFO(logger, "[thread " << this_thread << "] finish job"); 167 | delete counter_tmp; 168 | } 169 | 170 | // now we have a list of counters, we want to merge them in a binary tree way --- from leaf to root 171 | while (counters.size() > 1) { 172 | LOG4CXX_INFO(logger, counters.size() << " counters to merge."); 173 | size_t n_half = (counters.size() + 1) >> 1; 174 | omp_set_num_threads(counters.size() >> 1); 175 | 176 | #pragma omp parallel default(shared) 177 
| { 178 | int this_thread = omp_get_thread_num(); 179 | LOG4CXX_INFO(logger, "merge counter " << this_thread << " and " << n_half + this_thread); 180 | std::vector *counter_tmp = merge_counters(*counters[this_thread], *counters[n_half + this_thread]); 181 | 182 | delete counters[this_thread]; 183 | delete counters[n_half + this_thread]; 184 | counters[this_thread] = counter_tmp; 185 | } 186 | 187 | counters.resize(n_half); 188 | } 189 | counter_merged = counters[0]; 190 | 191 | } 192 | 193 | float WeightGraphWalker::merge(const std::vector& counter, 194 | std::vector& tmp, 195 | std::vector& sampled_pairs) { 196 | float max_val = 0; 197 | float w; 198 | std::sort(sampled_pairs.begin(), sampled_pairs.end()); 199 | 200 | std::vector::const_iterator iter = counter.cbegin(); 201 | for (size_t i = 0, j = 0; i < sampled_pairs.size(); i = j) { 202 | w = sampled_pairs[i].second; 203 | for (j = i + 1; j < sampled_pairs.size() 204 | && sampled_pairs[j].first == sampled_pairs[i].first; ++j) { 205 | w += sampled_pairs[j].second; 206 | } 207 | for (;iter != counter.end() && iter->first < sampled_pairs[i].first; ++iter) { 208 | max_val = std::max(max_val, iter->second); 209 | tmp.push_back(*iter); 210 | } 211 | if (iter != counter.end() && iter->first == sampled_pairs[i].first) { 212 | max_val = std::max(max_val, w + iter->second); 213 | tmp.push_back( 214 | std::make_pair(iter->first, w + iter->second)); 215 | ++iter; 216 | } else { 217 | max_val = std::max(max_val, w); 218 | tmp.push_back(std::make_pair(sampled_pairs[i].first, w)); 219 | } 220 | } 221 | for (;iter != counter.end(); ++iter) { 222 | max_val = std::max(max_val, iter->second); 223 | tmp.push_back(*iter); 224 | } 225 | return max_val; 226 | } 227 | 228 | 229 | 230 | 231 | 232 | -------------------------------------------------------------------------------- /src/WeightGraphWalker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "GraphWalker.h" 4 | 
5 | 6 | class WeightGraphWalker : public GraphWalker { 7 | public: 8 | static WeightGraphWalker* getWalker(const std::string& name, int T); 9 | WeightGraphWalker(const std::vector& indices, 10 | const std::vector& indptr, int T, 11 | const std::vector& data, 12 | const std::vector& prefix_sum, 13 | const std::vector& degree); 14 | 15 | void samplePath(VertexId u, VertexId v, double w, int r, unsigned* seed, 16 | std::vector& sampled_pairs) const; 17 | VertexId randomWalk(VertexId u, int step, double& Z, unsigned* seed) const; 18 | void sampling(int round, int num_threads, 19 | const std::string& machine, 20 | int check_point); 21 | 22 | static float merge(const std::vector& counter, 23 | std::vector& tmp, 24 | std::vector& sampled_pairs); 25 | 26 | const std::vector data; 27 | const std::vector prefix_sum; 28 | }; 29 | -------------------------------------------------------------------------------- /src/config.h.cmake: -------------------------------------------------------------------------------- 1 | // Macros define at compilation time. 2 | // 3 | // Copyright (c) 2015 Idiap Research Institute, http://www.idiap.ch/ 4 | // Written by Rémi Lebret 5 | // 6 | // This file is part of HPCA. 7 | // 8 | // HPCA is free software: you can redistribute it and/or modify 9 | // it under the terms of the GNU General Public License version 3 as 10 | // published by the Free Software Foundation. 11 | // 12 | // HPCA is distributed in the hope that it will be useful, 13 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | // GNU General Public License for more details. 16 | // 17 | // You should have received a copy of the GNU General Public License 18 | // along with HPCA. If not, see . 
19 | 20 | #ifndef CONFIG_H 21 | #define CONFIG_H 22 | 23 | /*---- the configured options and settings for HPCA ---------------*/ 24 | /*---- the cmake configuration for NetSMF is adapted from HPCA ----*/ 25 | 26 | // should we use Intel MKL through Eigen? 27 | #cmakedefine EIGEN_USE_MKL_ALL 28 | 29 | #endif // CONFIG_H 30 | -------------------------------------------------------------------------------- /src/netsmf.cc: -------------------------------------------------------------------------------- 1 | #include "BinaryGraphWalker.h" 2 | #include "WeightGraphWalker.h" 3 | #include 4 | #include 5 | #include 6 | // include log4cxx header files. 7 | #include "log4cxx/logger.h" 8 | #include "log4cxx/basicconfigurator.h" 9 | #include "log4cxx/propertyconfigurator.h" 10 | #include "log4cxx/helpers/exception.h" 11 | 12 | using namespace log4cxx; 13 | using namespace log4cxx::helpers; 14 | 15 | LoggerPtr logger(Logger::getLogger("main")); 16 | 17 | 18 | DEFINE_int32(T, 10, "Window size."); 19 | DEFINE_string(filename, "edgelist", "Filename for edgelist file."); 20 | DEFINE_string(machine, "localhost", "machine name for generating random seed by hash."); 21 | // DEFINE_string(output_samples, "sample", "Filename for sampled pairs."); 22 | DEFINE_string(output_svd, "sample", "Filename for svd results."); 23 | DEFINE_int32(rank, 256, "embedding dimension."); 24 | DEFINE_int32(negative, 1, "number of negative sampling."); 25 | DEFINE_int32(num_threads_sampling, 32, "Number of threads."); 26 | DEFINE_int32(num_threads_svd, 32, "Number of threads for svd."); 27 | DEFINE_int32(rounds, 1000, "Number of rounds."); 28 | DEFINE_int32(check_point, 2, "Check point every ? 
rounds."); 29 | // DEFINE_int32(max_mem_GB, 200, "Maximum cached data."); 30 | DEFINE_bool(weight, false, "Weighted graph"); 31 | DEFINE_bool(log1p, false, "Using log1p instead of truncated logarithm"); 32 | DEFINE_string(log4cxx, "log4cxx.config", "Log4cxx config file"); 33 | 34 | int main(int argc, char** argv) { 35 | gflags::ParseCommandLineFlags(&argc, &argv, true); 36 | //BasicConfigurator::configure(); 37 | PropertyConfigurator::configure(FLAGS_log4cxx.c_str()); 38 | LOG4CXX_INFO(logger, "Entering application."); 39 | 40 | GraphWalker *walker = FLAGS_weight ? 41 | (GraphWalker*)WeightGraphWalker::getWalker(FLAGS_filename.c_str(), FLAGS_T) : 42 | (GraphWalker*)BinaryGraphWalker::getWalker(FLAGS_filename.c_str(), FLAGS_T); 43 | 44 | walker->sampling(FLAGS_rounds, FLAGS_num_threads_sampling, FLAGS_machine, FLAGS_check_point); 45 | walker->transformation(); 46 | walker->redsvd(); 47 | LOG4CXX_INFO(logger, "Exiting application."); 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /src/redsvd/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | FILE( 2 | GLOB 3 | redsvd_files 4 | *.cpp 5 | *.hpp 6 | *.h 7 | ) 8 | # Add "redsvd" library 9 | ADD_LIBRARY(redsvd ${redsvd_files}) 10 | # Eigen is linked unconditionally, not only when BLAS/LAPACK is found. 11 | # PUBLIC propagates its include directories to targets that link redsvd. 12 | TARGET_LINK_LIBRARIES(redsvd PUBLIC Eigen3::Eigen) 13 | # BLAS and LAPACK are optional; link them only when the top-level file found them. 14 | IF(USE_BLAS) 15 | TARGET_LINK_LIBRARIES(redsvd PUBLIC ${BLAS_LIBRARIES}) 16 | ENDIF(USE_BLAS) 17 | IF(USE_LAPACK) 18 | TARGET_LINK_LIBRARIES(redsvd PUBLIC ${LAPACK_LIBRARIES}) 19 | ENDIF(USE_LAPACK) 20 | -------------------------------------------------------------------------------- /src/redsvd/cmdline.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2009, Hideyuki Tanaka 3 | All rights reserved.
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of the nor the 13 | names of its contributors may be used to endorse or promote products 14 | derived from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY ''AS IS'' AND ANY 17 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 20 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | #pragma once 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | namespace cmdline{ 43 | 44 | namespace detail{ 45 | 46 | template 47 | class lexical_cast_t{ 48 | public: 49 | static Target cast(const Source &arg){ 50 | Target ret; 51 | std::stringstream ss; 52 | if (!(ss<>ret && ss.eof())) 53 | throw std::bad_cast(); 54 | 55 | return ret; 56 | } 57 | }; 58 | 59 | template 60 | class lexical_cast_t{ 61 | public: 62 | static Target cast(const Source &arg){ 63 | return arg; 64 | } 65 | }; 66 | 67 | template 68 | class lexical_cast_t{ 69 | public: 70 | static std::string cast(const Source &arg){ 71 | std::ostringstream ss; 72 | ss< 78 | class lexical_cast_t{ 79 | public: 80 | static Target cast(const std::string &arg){ 81 | Target ret; 82 | std::istringstream ss(arg); 83 | if (!(ss>>ret && ss.eof())) 84 | throw std::bad_cast(); 85 | return ret; 86 | } 87 | }; 88 | 89 | template 90 | struct is_same { 91 | static const bool value = false; 92 | }; 93 | 94 | template 95 | struct is_same{ 96 | static const bool value = true; 97 | }; 98 | 99 | template 100 | Target lexical_cast(const Source &arg) 101 | { 102 | return lexical_cast_t::value>::cast(arg); 103 | } 104 | 105 | static inline std::string demangle(const std::string &name) 106 | { 107 | int status=0; 108 | char *p=abi::__cxa_demangle(name.c_str(), 0, 0, &status); 109 | std::string ret(p); 110 | free(p); 111 | return ret; 112 | } 113 | 114 | template 115 | std::string readable_typename() 116 | { 117 | return demangle(typeid(T).name()); 118 | } 119 | 120 | template <> 121 | std::string readable_typename() 122 | { 123 | return "string"; 124 | } 125 | 126 | } // detail 127 | 128 | //----- 129 | 130 | class cmdline_error : public std::exception { 131 | public: 132 | cmdline_error(const std::string &msg): msg(msg){} 133 | ~cmdline_error() throw() {} 134 | const char *what() const throw() { 
return msg.c_str(); } 135 | private: 136 | std::string msg; 137 | }; 138 | 139 | template 140 | struct default_reader{ 141 | T operator()(const std::string &str){ 142 | return detail::lexical_cast(str); 143 | } 144 | }; 145 | 146 | template 147 | struct range_reader{ 148 | range_reader(const T &low, const T &high): low(low), high(high) {} 149 | T operator()(const std::string &s) const { 150 | T ret=default_reader()(s); 151 | if (!(ret>=low && ret<=high)) throw cmdline::cmdline_error("range_error"); 152 | return ret; 153 | } 154 | private: 155 | T low, high; 156 | }; 157 | 158 | template 159 | range_reader range(const T &low, const T &high) 160 | { 161 | return range_reader(low, high); 162 | } 163 | 164 | template 165 | struct oneof_reader{ 166 | T operator()(const std::string &s){ 167 | T ret=default_reader()(s); 168 | if (std::find(alt.begin(), alt.end(), s)==alt.end()) 169 | throw cmdline_error(""); 170 | return ret; 171 | } 172 | void add(const T &v){ alt.push_back(v); } 173 | private: 174 | std::vector alt; 175 | }; 176 | 177 | template 178 | oneof_reader oneof(T a1) 179 | { 180 | oneof_reader ret; 181 | ret.add(a1); 182 | return ret; 183 | } 184 | 185 | template 186 | oneof_reader oneof(T a1, T a2) 187 | { 188 | oneof_reader ret; 189 | ret.add(a1); 190 | ret.add(a2); 191 | return ret; 192 | } 193 | 194 | template 195 | oneof_reader oneof(T a1, T a2, T a3) 196 | { 197 | oneof_reader ret; 198 | ret.add(a1); 199 | ret.add(a2); 200 | ret.add(a3); 201 | return ret; 202 | } 203 | 204 | template 205 | oneof_reader oneof(T a1, T a2, T a3, T a4) 206 | { 207 | oneof_reader ret; 208 | ret.add(a1); 209 | ret.add(a2); 210 | ret.add(a3); 211 | ret.add(a4); 212 | return ret; 213 | } 214 | 215 | template 216 | oneof_reader oneof(T a1, T a2, T a3, T a4, T a5) 217 | { 218 | oneof_reader ret; 219 | ret.add(a1); 220 | ret.add(a2); 221 | ret.add(a3); 222 | ret.add(a4); 223 | ret.add(a5); 224 | return ret; 225 | } 226 | 227 | template 228 | oneof_reader oneof(T a1, T a2, T a3, T 
a4, T a5, T a6) 229 | { 230 | oneof_reader ret; 231 | ret.add(a1); 232 | ret.add(a2); 233 | ret.add(a3); 234 | ret.add(a4); 235 | ret.add(a5); 236 | ret.add(a6); 237 | return ret; 238 | } 239 | 240 | template 241 | oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7) 242 | { 243 | oneof_reader ret; 244 | ret.add(a1); 245 | ret.add(a2); 246 | ret.add(a3); 247 | ret.add(a4); 248 | ret.add(a5); 249 | ret.add(a6); 250 | ret.add(a7); 251 | return ret; 252 | } 253 | 254 | template 255 | oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8) 256 | { 257 | oneof_reader ret; 258 | ret.add(a1); 259 | ret.add(a2); 260 | ret.add(a3); 261 | ret.add(a4); 262 | ret.add(a5); 263 | ret.add(a6); 264 | ret.add(a7); 265 | ret.add(a8); 266 | return ret; 267 | } 268 | 269 | template 270 | oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9) 271 | { 272 | oneof_reader ret; 273 | ret.add(a1); 274 | ret.add(a2); 275 | ret.add(a3); 276 | ret.add(a4); 277 | ret.add(a5); 278 | ret.add(a6); 279 | ret.add(a7); 280 | ret.add(a8); 281 | ret.add(a9); 282 | return ret; 283 | } 284 | 285 | template 286 | oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10) 287 | { 288 | oneof_reader ret; 289 | ret.add(a1); 290 | ret.add(a2); 291 | ret.add(a3); 292 | ret.add(a4); 293 | ret.add(a5); 294 | ret.add(a6); 295 | ret.add(a7); 296 | ret.add(a8); 297 | ret.add(a9); 298 | ret.add(a10); 299 | return ret; 300 | } 301 | 302 | //----- 303 | 304 | class parser{ 305 | public: 306 | parser(){ 307 | } 308 | ~parser(){ 309 | for (std::map::iterator p=options.begin(); 310 | p!=options.end(); p++) 311 | delete p->second; 312 | } 313 | 314 | void add(const std::string &name, 315 | char short_name=0, 316 | const std::string &desc=""){ 317 | if (options.count(name)) throw cmdline_error("multiple definition: "+name); 318 | options[name]=new option_without_value(name, short_name, desc); 319 | ordered.push_back(options[name]); 320 | } 321 | 322 | template 323 | 
void add(const std::string &name, 324 | char short_name=0, 325 | const std::string &desc="", 326 | bool need=true, 327 | const T def=T()){ 328 | add(name, short_name, desc, need, def, default_reader()); 329 | } 330 | 331 | template 332 | void add(const std::string &name, 333 | char short_name=0, 334 | const std::string &desc="", 335 | bool need=true, 336 | const T def=T(), 337 | F reader=F()){ 338 | if (options.count(name)) throw cmdline_error("multiple definition: "+name); 339 | options[name]=new option_with_value_with_reader(name, short_name, need, def, desc, reader); 340 | ordered.push_back(options[name]); 341 | } 342 | 343 | void footer(const std::string &f){ 344 | ftr=f; 345 | } 346 | 347 | void set_program_name(const std::string &name){ 348 | prog_name=name; 349 | } 350 | 351 | bool exist(const std::string &name) const { 352 | if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name); 353 | return options.find(name)->second->has_set(); 354 | } 355 | 356 | template 357 | const T &get(const std::string &name) const { 358 | if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name); 359 | const option_with_value *p=dynamic_cast*>(options.find(name)->second); 360 | if (p==NULL) throw cmdline_error("type mismatch flag '"+name+"'"); 361 | return p->get(); 362 | } 363 | 364 | const std::vector &rest() const { 365 | return others; 366 | } 367 | 368 | bool parse(const std::string &arg){ 369 | std::vector args; 370 | 371 | std::string buf; 372 | bool in_quote=false; 373 | for (std::string::size_type i=0; i=arg.length()){ 388 | errors.push_back("unexpected occurrence of '\\' at end of string"); 389 | return false; 390 | } 391 | } 392 | 393 | buf+=arg[i]; 394 | } 395 | 396 | if (in_quote){ 397 | errors.push_back("quote is not closed"); 398 | return false; 399 | } 400 | 401 | if (buf.length()>0) 402 | args.push_back(buf); 403 | 404 | for (size_t i=0; i &args){ 411 | int argc=static_cast(args.size()); 412 | std::vector argv(argc); 413 | 
414 | for (int i=0; i lookup; 432 | for (std::map::iterator p=options.begin(); 433 | p!=options.end(); p++){ 434 | if (p->first.length()==0) continue; 435 | char initial=p->second->short_name(); 436 | if (initial){ 437 | if (lookup.count(initial)>0){ 438 | lookup[initial]=""; 439 | errors.push_back(std::string("short option '")+initial+"' is ambiguous"); 440 | return false; 441 | } 442 | else lookup[initial]=p->first; 443 | } 444 | } 445 | 446 | for (int i=1; idescription()<set()){ 571 | errors.push_back("option needs value: --"+name); 572 | return; 573 | } 574 | } 575 | 576 | void set_option(const std::string &name, const std::string &value){ 577 | if (options.count(name)==0){ 578 | errors.push_back("undefined option: --"+name); 579 | return; 580 | } 581 | if (!options[name]->set(value)){ 582 | errors.push_back("option value is invalid: --"+name+"="+value); 583 | return; 584 | } 585 | } 586 | 587 | class option_base{ 588 | public: 589 | virtual ~option_base(){} 590 | 591 | virtual bool has_value() const=0; 592 | virtual bool set()=0; 593 | virtual bool set(const std::string &value)=0; 594 | virtual bool has_set() const=0; 595 | virtual bool valid() const=0; 596 | virtual bool must() const=0; 597 | 598 | virtual const std::string &name() const=0; 599 | virtual char short_name() const=0; 600 | virtual const std::string &description() const=0; 601 | virtual std::string short_description() const=0; 602 | }; 603 | 604 | class option_without_value : public option_base { 605 | public: 606 | option_without_value(const std::string &name, 607 | char short_name, 608 | const std::string &desc) 609 | :nam(name), snam(short_name), desc(desc), has(false){ 610 | } 611 | ~option_without_value(){} 612 | 613 | bool has_value() const { return false; } 614 | 615 | bool set(){ 616 | has=true; 617 | return true; 618 | } 619 | 620 | bool set(const std::string &){ 621 | return false; 622 | } 623 | 624 | bool has_set() const { 625 | return has; 626 | } 627 | 628 | bool valid() const{ 629 | 
return true; 630 | } 631 | 632 | bool must() const{ 633 | return false; 634 | } 635 | 636 | const std::string &name() const{ 637 | return nam; 638 | } 639 | 640 | char short_name() const{ 641 | return snam; 642 | } 643 | 644 | const std::string &description() const { 645 | return desc; 646 | } 647 | 648 | std::string short_description() const{ 649 | return "--"+nam; 650 | } 651 | 652 | private: 653 | std::string nam; 654 | char snam; 655 | std::string desc; 656 | bool has; 657 | }; 658 | 659 | template 660 | class option_with_value : public option_base { 661 | public: 662 | option_with_value(const std::string &name, 663 | char short_name, 664 | bool need, 665 | const T &def, 666 | const std::string &desc) 667 | : nam(name), snam(short_name), need(need), has(false) 668 | , def(def), actual(def) { 669 | this->desc=full_description(desc); 670 | } 671 | ~option_with_value(){} 672 | 673 | const T &get() const { 674 | return actual; 675 | } 676 | 677 | bool has_value() const { return true; } 678 | 679 | bool set(){ 680 | return false; 681 | } 682 | 683 | bool set(const std::string &value){ 684 | try{ 685 | actual=read(value); 686 | has=true; 687 | } 688 | catch(const std::exception &e){ 689 | return false; 690 | } 691 | return true; 692 | } 693 | 694 | bool has_set() const{ 695 | return has; 696 | } 697 | 698 | bool valid() const{ 699 | if (need && !has) return false; 700 | return true; 701 | } 702 | 703 | bool must() const{ 704 | return need; 705 | } 706 | 707 | const std::string &name() const{ 708 | return nam; 709 | } 710 | 711 | char short_name() const{ 712 | return snam; 713 | } 714 | 715 | const std::string &description() const { 716 | return desc; 717 | } 718 | 719 | std::string short_description() const{ 720 | return "--"+nam+"="+detail::readable_typename(); 721 | } 722 | 723 | protected: 724 | std::string full_description(const std::string &desc){ 725 | return 726 | desc+" ("+detail::readable_typename()+ 727 | (need?"":" [="+detail::lexical_cast(def)+"]") 728 | 
+")"; 729 | } 730 | 731 | virtual T read(const std::string &s)=0; 732 | 733 | std::string nam; 734 | char snam; 735 | bool need; 736 | std::string desc; 737 | 738 | bool has; 739 | T def; 740 | T actual; 741 | }; 742 | 743 | template 744 | class option_with_value_with_reader : public option_with_value { 745 | public: 746 | option_with_value_with_reader(const std::string &name, 747 | char short_name, 748 | bool need, 749 | const T def, 750 | const std::string &desc, 751 | F reader) 752 | : option_with_value(name, short_name, need, def, desc), reader(reader){ 753 | } 754 | 755 | private: 756 | T read(const std::string &s){ 757 | return reader(s); 758 | } 759 | 760 | F reader; 761 | }; 762 | 763 | std::map options; 764 | std::vector ordered; 765 | std::string ftr; 766 | 767 | std::string prog_name; 768 | std::vector others; 769 | 770 | std::vector errors; 771 | }; 772 | 773 | } // cmdline 774 | -------------------------------------------------------------------------------- /src/redsvd/fileReader.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Daisuke Okanohara 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * 8 | * 1. Redistributions of source code must retain the above Copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above Copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the authors nor the names of its contributors 16 | * may be used to endorse or promote products derived from this 17 | * software without specific prior written permission. 
18 | */ 19 | 20 | #ifndef FILEREADER_HPP_ 21 | #define FILEREADER_HPP_ 22 | 23 | #include 24 | #include 25 | #include "util.hpp" 26 | 27 | namespace REDSVD{ 28 | 29 | class FileReader { 30 | public: 31 | FileReader() : rows_(0), cols_(0) {} 32 | ~FileReader() {} 33 | 34 | void OpenFile(const char* inputFileName){ 35 | inputFileName_ = inputFileName; 36 | ifs_.close(); 37 | ifs_.clear(); 38 | ifs_.open(inputFileName_.c_str(), std::ifstream::in); 39 | if (!ifs_){ 40 | throw std::string("open error ") + inputFileName_; 41 | } 42 | } 43 | 44 | void Rewind(){ 45 | ifs_.clear(); 46 | ifs_.seekg(0); 47 | } 48 | 49 | void GetStat(){ 50 | rows_ = 0; 51 | cols_ = 0; 52 | for (fv_t fv; ReadRow(fv) != -1; ++rows_){ 53 | if (fv.size() == 0) continue; 54 | cols_ = std::max(fv.back().first+1, cols_); 55 | } 56 | ifs_.clear(); 57 | ifs_.seekg(0); 58 | } 59 | 60 | int ReadRow(fv_t& fv){ 61 | std::string line; 62 | if (!getline(ifs_, line)){ 63 | return -1; 64 | } 65 | std::istringstream is(line); 66 | 67 | int id; 68 | char sep; 69 | float val; 70 | while (is >> id >> sep >> val){ 71 | fv.push_back(std::make_pair(id, val)); 72 | } 73 | sort(fv.begin(), fv.end()); 74 | fv.erase(unique(fv.begin(), fv.end()), fv.end()); 75 | 76 | return 0; 77 | } 78 | 79 | int rows() const { 80 | return rows_; 81 | } 82 | 83 | int cols() const { 84 | return cols_; 85 | } 86 | 87 | private: 88 | std::ifstream ifs_; 89 | std::string inputFileName_; 90 | int rows_; 91 | int cols_; 92 | }; 93 | 94 | } 95 | 96 | #endif // FILEREADER_HPP_ 97 | -------------------------------------------------------------------------------- /src/redsvd/redsvd.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Daisuke Okanohara 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * 8 | * 1. 
Redistributions of source code must retain the above Copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above Copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the authors nor the names of its contributors 16 | * may be used to endorse or promote products derived from this 17 | * software without specific prior written permission. 18 | */ 19 | 20 | #ifndef REDSVD_HPP__ 21 | #define REDSVD_HPP__ 22 | 23 | #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include "util.hpp" 30 | #include "log4cxx/logger.h" 31 | 32 | using namespace log4cxx; 33 | 34 | namespace REDSVD { 35 | 36 | class RedSVD { 37 | public: 38 | RedSVD(){} 39 | 40 | template 41 | RedSVD(Mat& A){ 42 | int r = (A.rows() < A.cols()) ? A.rows() : A.cols(); 43 | run(A, r); 44 | } 45 | 46 | template 47 | RedSVD(Mat& A, const int rank){ 48 | run(A, rank); 49 | } 50 | 51 | template 52 | void run(Mat& A, const int rank){ 53 | LoggerPtr logger(Logger::getLogger("redsvd")); 54 | if (A.cols() == 0 || A.rows() == 0) return; 55 | int r = (rank < A.cols()) ? rank : A.cols(); 56 | r = (r < A.rows()) ? 
r : A.rows(); 57 | 58 | // Gaussian Random Matrix for A^T 59 | Eigen::MatrixXf O(A.rows(), r); 60 | Util::sampleGaussianMat(O); 61 | 62 | LOG4CXX_INFO(logger, "sampling gaussian random matrix O for A^T done."); 63 | 64 | // Compute Sample Matrix of A^T 65 | // Eigen::MatrixXf Y = A.transpose() * O; 66 | Eigen::MatrixXf Y = A * O; 67 | LOG4CXX_INFO(logger, "compute sample matrix of Y = A^T * O = A * O done (because A^T = A)"); 68 | 69 | // Orthonormalize Y 70 | Util::processGramSchmidt(Y); 71 | LOG4CXX_INFO(logger, "orthonormalize Y done."); 72 | 73 | // Range(B) = Range(A^T) 74 | Eigen::MatrixXf B = A * Y; 75 | LOG4CXX_INFO(logger, "B = A * Y done, Range(B) = Range(A^T)."); 76 | 77 | // Gaussian Random Matrix 78 | Eigen::MatrixXf P(B.cols(), r); 79 | Util::sampleGaussianMat(P); 80 | LOG4CXX_INFO(logger, "sample another gaussian random matrix P done."); 81 | 82 | // Compute Sample Matrix of B 83 | Eigen::MatrixXf Z = B * P; 84 | LOG4CXX_INFO(logger, "compute sample matrix of Z = B * P done.") 85 | 86 | // Orthonormalize Z 87 | Util::processGramSchmidt(Z); 88 | LOG4CXX_INFO(logger, "orthonormalize Z done."); 89 | 90 | // Range(C) = Range(B) 91 | Eigen::MatrixXf C = Z.transpose() * B; 92 | LOG4CXX_INFO(logger, "C = Z^T * B done, Range(C) = Range(B)."); 93 | 94 | Eigen::JacobiSVD svdOfC(C, Eigen::ComputeThinU | Eigen::ComputeThinV); 95 | LOG4CXX_INFO(logger, "JacabiSVD for C done."); 96 | 97 | 98 | // C = USV^T 99 | // A = Z * U * S * V^T * Y^T() 100 | matU_ = Z * svdOfC.matrixU(); 101 | matS_ = svdOfC.singularValues(); 102 | matV_ = Y * svdOfC.matrixV(); 103 | LOG4CXX_INFO(logger, "compute U S V done."); 104 | } 105 | 106 | const Eigen::MatrixXf& matrixU() const { 107 | return matU_; 108 | } 109 | 110 | const Eigen::VectorXf& singularValues() const { 111 | return matS_; 112 | } 113 | 114 | const Eigen::MatrixXf& matrixV() const { 115 | return matV_; 116 | } 117 | 118 | private: 119 | Eigen::MatrixXf matU_; 120 | Eigen::VectorXf matS_; 121 | Eigen::MatrixXf matV_; 122 
| }; 123 | 124 | class RedSymEigen { 125 | public: 126 | RedSymEigen(){} 127 | 128 | template 129 | RedSymEigen(Mat& A, const int rank){ 130 | run(A, rank); 131 | } 132 | 133 | template 134 | void run(Mat& A, const int rank){ 135 | if (A.cols() == 0 || A.rows() == 0) return; 136 | int r = (rank < A.cols()) ? rank : A.cols(); 137 | r = (r < A.rows()) ? r : A.rows(); 138 | 139 | // Gaussian Random Matrix 140 | Eigen::MatrixXf O(A.rows(), r); 141 | Util::sampleGaussianMat(O); 142 | 143 | // Compute Sample Matrix of A 144 | Eigen::MatrixXf Y = A.transpose() * O; 145 | 146 | // Orthonormalize Y 147 | Util::processGramSchmidt(Y); 148 | 149 | Eigen::MatrixXf B = Y.transpose() * A * Y; 150 | Eigen::SelfAdjointEigenSolver eigenOfB(B); 151 | 152 | eigenValues_ = eigenOfB.eigenvalues(); 153 | eigenVectors_ = Y * eigenOfB.eigenvectors(); 154 | } 155 | 156 | const Eigen::MatrixXf& eigenVectors() const { 157 | return eigenVectors_; 158 | } 159 | 160 | const Eigen::VectorXf& eigenValues() const { 161 | return eigenValues_; 162 | } 163 | 164 | private: 165 | Eigen::VectorXf eigenValues_; 166 | Eigen::MatrixXf eigenVectors_; 167 | }; 168 | 169 | class RedPCA { 170 | public: 171 | RedPCA(){} 172 | 173 | template 174 | RedPCA(const Mat& A, const int rank) { 175 | run(A, rank); 176 | } 177 | 178 | template 179 | void run(const Mat& A, const int rank) { 180 | RedSVD redsvd; 181 | redsvd.run(A, rank); 182 | const Eigen::VectorXf& S = redsvd.singularValues(); 183 | principalComponents_ = redsvd.matrixV(); 184 | scores_ = redsvd.matrixU() * S.asDiagonal(); 185 | } 186 | 187 | const Eigen::MatrixXf& principalComponents() const { 188 | return principalComponents_; 189 | } 190 | 191 | const Eigen::MatrixXf& scores() const { 192 | return scores_; 193 | } 194 | 195 | private: 196 | Eigen::MatrixXf principalComponents_; 197 | Eigen::MatrixXf scores_; 198 | }; 199 | 200 | } 201 | 202 | #endif // REDSVD_HPP__ 203 | -------------------------------------------------------------------------------- 
/src/redsvd/redsvdFile.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Daisuke Okanohara 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * 8 | * 1. Redistributions of source code must retain the above Copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above Copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the authors nor the names of its contributors 16 | * may be used to endorse or promote products derived from this 17 | * software without specific prior written permission. 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "redsvdFile.hpp" 26 | #include "redsvd.hpp" 27 | #include "redsvdIncr.hpp" 28 | 29 | using namespace std; 30 | using namespace Eigen; 31 | 32 | namespace REDSVD{ 33 | 34 | namespace { 35 | 36 | void writeMatrix_(const string& fn, const MatrixXf& M){ 37 | cout << "write " << fn << endl; 38 | FILE* outfp = fopen(fn.c_str(), "wb"); 39 | if (outfp == NULL){ 40 | throw string("cannot open ") + fn; 41 | } 42 | 43 | for (int i = 0; i < M.rows(); ++i){ 44 | for (int j = 0; j < M.cols(); ++j){ 45 | fprintf(outfp, "%+f ", M(i, j)); 46 | } 47 | fprintf(outfp, "\n"); 48 | } 49 | 50 | fclose(outfp); 51 | } 52 | 53 | void writeVector_(const string& fn, const VectorXf& V){ 54 | cout << "write " << fn << endl; 55 | FILE* outfp = fopen(fn.c_str(), "wb"); 56 | if (outfp == NULL){ 57 | throw string("cannot open ") + fn; 58 | } 59 | 60 | for (int i = 0; i < V.rows(); ++i){ 61 | fprintf(outfp, "%+f\n", V(i)); 62 | } 63 | 64 | fclose(outfp); 65 | } 66 | 67 | void readLine(const string& line, 68 | fv_t& 
fv){ 69 | istringstream is(line); 70 | 71 | int id; 72 | char sep; 73 | float val; 74 | while (is >> id >> sep >> val){ 75 | fv.push_back(make_pair(id, val)); 76 | } 77 | sort(fv.begin(), fv.end()); 78 | fv.erase(unique(fv.begin(), fv.end()), fv.end()); 79 | } 80 | 81 | } 82 | 83 | 84 | 85 | void readMatrix(const std::string& fn, SMatrixXf& A){ 86 | vector fvs; 87 | ifstream ifs(fn.c_str()); 88 | if (!ifs){ 89 | throw string("failed to open") + fn; 90 | } 91 | 92 | for (string line; getline(ifs, line); ){ 93 | fv_t fv; 94 | readLine(line, fv); 95 | //if (fv.size() == 0) continue; 96 | fvs.push_back(fv); 97 | } 98 | Util::convertFV2Mat(fvs, A); 99 | } 100 | 101 | void readMatrix(const std::string& fn, MatrixXf& A){ 102 | ifstream ifs(fn.c_str()); 103 | if (!ifs){ 104 | throw string("failed to open " ) + fn; 105 | } 106 | 107 | vector< vector > vs; 108 | for (string line; getline(ifs, line); ){ 109 | istringstream is(line); 110 | vector v; 111 | float val; 112 | while (is >> val){ 113 | v.push_back(val); 114 | } 115 | vs.push_back(v); 116 | } 117 | 118 | size_t rowN = vs.size(); 119 | if (rowN == 0) return; 120 | size_t colN = vs[0].size(); 121 | A.resize(rowN, colN); 122 | 123 | for (size_t i = 0; i < rowN; ++i){ 124 | if (colN != vs[i].size()){ 125 | cerr << "warning: " << i+1 << "-th row has " << vs[i].size() << " entries. 
" 126 | << colN << " entries are expected" << endl; 127 | } 128 | size_t colNmin = min(colN, vs[i].size()); 129 | for (size_t j = 0; j < colNmin; ++j){ 130 | A(i, j) = vs[i][j]; 131 | } 132 | } 133 | } 134 | 135 | void writeMatrix(const string& fn, const REDSVD::RedSVD& A){ 136 | writeMatrix_(fn + ".U", A.matrixU()); 137 | writeVector_(fn + ".S", A.singularValues()); 138 | writeMatrix_(fn + ".V", A.matrixV()); 139 | } 140 | 141 | void writeMatrix(const string& fn, const REDSVD::RedSVDIncr& A){ 142 | writeMatrix_(fn + ".U", A.matrixU()); 143 | writeVector_(fn + ".S", A.singularValues()); 144 | writeMatrix_(fn + ".V", A.matrixV()); 145 | } 146 | 147 | 148 | void writeMatrix(const string& fn, const REDSVD::RedPCA& A){ 149 | writeMatrix_(fn + ".pc", A.principalComponents()); 150 | writeMatrix_(fn + ".score", A.scores()); 151 | } 152 | 153 | void writeMatrix(const string& fn, const REDSVD::RedSymEigen& A){ 154 | writeMatrix_(fn + ".evec", A.eigenVectors()); 155 | writeVector_(fn + ".eval", A.eigenValues()); 156 | } 157 | 158 | } 159 | -------------------------------------------------------------------------------- /src/redsvd/redsvdFile.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Daisuke Okanohara 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * 8 | * 1. Redistributions of source code must retain the above Copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above Copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. 
Neither the name of the authors nor the names of its contributors 16 | * may be used to endorse or promote products derived from this 17 | * software without specific prior written permission. 18 | */ 19 | 20 | #ifndef REDSVDFILE_HPP__ 21 | #define REDSVDFILE_HPP__ 22 | 23 | #include 24 | #include 25 | #include 26 | #include "util.hpp" 27 | 28 | namespace REDSVD{ 29 | 30 | class RedSVD; 31 | class RedPCA; 32 | class RedSymEigen; 33 | class RedSVDIncr; 34 | 35 | void readMatrix(const std::string& fn, SMatrixXf& A); 36 | void readMatrix(const std::string& fn, Eigen::MatrixXf& A); 37 | 38 | void writeMatrix(const std::string& fn, const RedSVD& A); 39 | void writeMatrix(const std::string& fn, const RedPCA& A); 40 | void writeMatrix(const std::string& fn, const RedSymEigen& A); 41 | void writeMatrix(const std::string& fn, const RedSVDIncr& A); 42 | 43 | template 44 | void fileProcess(const std::string& inputFileName, 45 | const std::string& outputFileName, 46 | int rank){ 47 | double startSec = Util::getSec(); 48 | std::cout << "read matrix from " << inputFileName << " ... " << std::flush; 49 | Mat A; 50 | readMatrix(inputFileName.c_str(), A); 51 | std::cout << Util::getSec() - startSec << " sec." < 26 | #include 27 | #include 28 | #include 29 | #include "util.hpp" 30 | 31 | namespace REDSVD { 32 | 33 | class RedSVDIncr { 34 | public: 35 | RedSVDIncr(){} 36 | 37 | template 38 | RedSVDIncr(Reader& reader, const int rank){ 39 | run(reader, rank); 40 | } 41 | 42 | template 43 | void run(Reader& reader, const int rank){ 44 | int r = (rank < reader.cols()) ? 
rank : reader.cols(); 45 | Eigen::MatrixXf O(reader.rows(), r); 46 | Util::sampleGaussianMat(O); 47 | 48 | Eigen::MatrixXf Y = Eigen::MatrixXf::Zero(reader.cols(), r); 49 | for (int row = 0; row < reader.rows(); ++row){ 50 | fv_t fv; 51 | reader.ReadRow(fv); 52 | for (size_t i = 0; i < fv.size(); ++i){ 53 | int column = fv[i].first; 54 | float val = fv[i].second; 55 | for (int j = 0; j < r; ++j){ 56 | Y(column, j) += O(row, j) * val; 57 | } 58 | } 59 | } 60 | Util::processGramSchmidt(Y); 61 | 62 | reader.Rewind(); 63 | 64 | Eigen::MatrixXf B = Eigen::MatrixXf::Zero(reader.rows(), r); 65 | for (int row = 0; row < reader.rows(); ++row){ 66 | fv_t fv; 67 | reader.ReadRow(fv); 68 | for (size_t i = 0; i < fv.size(); ++i){ 69 | int column = fv[i].first; 70 | float val = fv[i].second; 71 | for (int j = 0; j < r; ++j){ 72 | B(row, j) += val * Y(column, j); 73 | } 74 | } 75 | } 76 | 77 | // Gaussian Random Matrix 78 | Eigen::MatrixXf P(B.cols(), r); 79 | Util::sampleGaussianMat(P); 80 | 81 | // Compute Sample Matrix of B 82 | Eigen::MatrixXf Z = B * P; 83 | 84 | // Orthonormalize Z 85 | Util::processGramSchmidt(Z); 86 | 87 | // Range(C) = Range(B) 88 | Eigen::MatrixXf C = Z.transpose() * B; 89 | 90 | Eigen::JacobiSVD svdOfC(C, Eigen::ComputeThinU | Eigen::ComputeThinV); 91 | 92 | // C = USV^T 93 | // A = Z * U * S * V^T * Y^T() 94 | matU_ = Z * svdOfC.matrixU(); 95 | matS_ = svdOfC.singularValues(); 96 | matV_ = Y * svdOfC.matrixV(); 97 | } 98 | 99 | const Eigen::MatrixXf& matrixU() const { 100 | return matU_; 101 | } 102 | 103 | const Eigen::VectorXf& singularValues() const { 104 | return matS_; 105 | } 106 | 107 | const Eigen::MatrixXf& matrixV() const { 108 | return matV_; 109 | } 110 | 111 | private: 112 | Eigen::MatrixXf matU_; 113 | Eigen::VectorXf matS_; 114 | Eigen::MatrixXf matV_; 115 | }; 116 | 117 | } 118 | 119 | #endif // REDSVD_INCR_HPP__ 120 | -------------------------------------------------------------------------------- /src/redsvd/redsvdMain.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Daisuke Okanohara 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * 8 | * 1. Redistributions of source code must retain the above Copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above Copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the authors nor the names of its contributors 16 | * may be used to endorse or promote products derived from this 17 | * software without specific prior written permission. 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "cmdline.h" 25 | #include "redsvd.hpp" 26 | #include "redsvdFile.hpp" 27 | 28 | using namespace std; 29 | 30 | namespace { 31 | 32 | void setFooter(cmdline::parser& p){ 33 | p.footer( 34 | "\n\n" 35 | "redsvd supports the following format types (one line for each row)\n\n" 36 | "[format=dense] (+\\n)+\n" 37 | "[format=sparse] ((colum_id:value)+\\n)+\n" 38 | "Example:\n" 39 | ">redsvd -i imat -o omat -r 10 -f dense\n" 40 | "compuate SVD for a dense matrix in imat and output omat.U omat.V, and omat.S\n" 41 | "with the 10 largest eigen values/vectors\n" 42 | ">redsvd -i imat -o omat -r 3 -f sparse -m PCA\n" 43 | "compuate PCA for a sparse matrix in imat and output omat.PC omat.SCORE\n" 44 | "with the 3 largest principal components\n" 45 | ); 46 | } 47 | } 48 | 49 | int main(int argc, char* argv[]){ 50 | cmdline::parser p; 51 | p.add("input", 'i', "input file", true); 52 | p.add("output", 'o', "output file's prefix", true); 53 | p.add ("rank", 'r', "rank ", false, 10); 54 | p.add("format", 'f', "format type (dense|sparse) See 
example. ", false, "dense"); 55 | p.add("method", 'm', "method (SVD|PCA|SymEigen)", false, "SVD"); 56 | p.set_program_name("redsvd"); 57 | setFooter(p); 58 | 59 | if (argc == 1){ 60 | cerr << p.usage() << endl; 61 | return 0; 62 | } 63 | 64 | if (p.parse(argc, argv) == 0){ 65 | cerr << "Error:" << p.error() << endl 66 | << p.usage() << endl; 67 | return -1; 68 | } 69 | 70 | string input = p.get("input"); 71 | string output = p.get("output"); 72 | string format = p.get("format"); 73 | int rank = p.get ("rank"); 74 | string method = p.get("method"); 75 | bool isInputSparse = false; 76 | 77 | if (rank <= 0){ 78 | cerr << "rank=" << rank << endl 79 | << "rank should be positive integer" << endl; 80 | return -1; 81 | } 82 | 83 | 84 | if (format == "dense"){ 85 | isInputSparse = false; 86 | } else if (format == "sparse"){ 87 | isInputSparse = true; 88 | } else { 89 | cerr << "unknwon format:" << format << endl; 90 | return -1; 91 | } 92 | 93 | cout << "compute " << method << endl; 94 | try { 95 | if (method == "SVD"){ 96 | if (isInputSparse){ 97 | REDSVD::fileProcess(input, output, rank); 98 | } else { 99 | REDSVD::fileProcess(input, output, rank); 100 | } 101 | } else if (method == "PCA"){ 102 | if (isInputSparse){ 103 | REDSVD::fileProcess(input, output, rank); 104 | } else { 105 | REDSVD::fileProcess(input, output, rank); 106 | } 107 | } else if (method == "SymEigen"){ 108 | if (isInputSparse){ 109 | REDSVD::fileProcess(input, output, rank); 110 | } else { 111 | REDSVD::fileProcess(input, output, rank); 112 | } 113 | } else { 114 | cerr << "unknown method:" << method << endl; 115 | return -1; 116 | } 117 | } catch (const string& error){ 118 | cerr << "Error: " << error << endl; 119 | return -1; 120 | } 121 | return 0; 122 | } 123 | -------------------------------------------------------------------------------- /src/redsvd/redsvdMainIncr.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Daisuke 
Okanohara 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * 8 | * 1. Redistributions of source code must retain the above Copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above Copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the authors nor the names of its contributors 16 | * may be used to endorse or promote products derived from this 17 | * software without specific prior written permission. 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "cmdline.h" 25 | #include "fileReader.hpp" 26 | #include "redsvdFile.hpp" 27 | #include "redsvd.hpp" 28 | #include "redsvdIncr.hpp" 29 | #include "util.hpp" 30 | 31 | using namespace std; 32 | using namespace REDSVD; 33 | 34 | void IncrRun(const string& inputFileName, 35 | const string& outputFileName, 36 | int rank){ 37 | FileReader fileReader; 38 | std::cout << "read matrix from " << inputFileName << " ... " << std::flush; 39 | double startSec = Util::getSec(); 40 | fileReader.OpenFile(inputFileName.c_str()); 41 | std::cout << Util::getSec() - startSec << " sec." 
<("input", 'i', "input file", true); 64 | p.add("output", 'o', "output file's prefix", true); 65 | p.add ("rank", 'r', "rank ", false, 10); 66 | p.set_program_name("redsvd_incr"); 67 | 68 | if (argc == 1){ 69 | cerr << p.usage() << endl; 70 | return 0; 71 | } 72 | 73 | if (p.parse(argc, argv) == 0){ 74 | cerr << "Error:" << p.error() << endl 75 | << p.usage() << endl; 76 | return -1; 77 | } 78 | 79 | string input = p.get("input"); 80 | string output = p.get("output"); 81 | int rank = p.get ("rank"); 82 | 83 | if (rank <= 0){ 84 | cerr << "rank=" << rank << endl 85 | << "rank should be positive integer" << endl; 86 | return -1; 87 | } 88 | 89 | try { 90 | IncrRun(input, output, rank); 91 | } catch (const string& error){ 92 | cerr << "Error: " << error << endl; 93 | return -1; 94 | } 95 | return 0; 96 | } 97 | 98 | 99 | -------------------------------------------------------------------------------- /src/redsvd/util.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Daisuke Okanohara 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * 8 | * 1. Redistributions of source code must retain the above Copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above Copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the authors nor the names of its contributors 16 | * may be used to endorse or promote products derived from this 17 | * software without specific prior written permission. 
18 | */ 19 | 20 | #include 21 | #include 22 | 23 | #include "util.hpp" 24 | 25 | using namespace std; 26 | using namespace Eigen; 27 | 28 | namespace REDSVD { 29 | 30 | const float SVD_EPS = 0.0001f; 31 | 32 | double Util::getSec(){ 33 | timeval tv; 34 | gettimeofday(&tv, NULL); 35 | return tv.tv_sec + (double)tv.tv_usec*1e-6; 36 | } 37 | 38 | void Util::sampleTwoGaussian(float& f1, float& f2){ 39 | float v1 = (float)(rand() + 1.f) / ((float)RAND_MAX+2.f); 40 | float v2 = (float)(rand() + 1.f) / ((float)RAND_MAX+2.f); 41 | float len = sqrt(-2.f * log(v1)); 42 | f1 = len * cos(2.f * M_PI * v2); 43 | f2 = len * sin(2.f * M_PI * v2); 44 | } 45 | 46 | void Util::sampleGaussianMat(MatrixXf& mat){ 47 | for (int i = 0; i < mat.rows(); ++i){ 48 | int j = 0; 49 | for ( ; j+1 < mat.cols(); j += 2){ 50 | float f1, f2; 51 | sampleTwoGaussian(f1, f2); 52 | mat(i,j ) = f1; 53 | mat(i,j+1) = f2; 54 | } 55 | for (; j < mat.cols(); j ++){ 56 | float f1, f2; 57 | sampleTwoGaussian(f1, f2); 58 | mat(i, j) = f1; 59 | } 60 | } 61 | } 62 | 63 | 64 | void Util::processGramSchmidt(MatrixXf& mat){ 65 | for (int i = 0; i < mat.cols(); ++i){ 66 | for (int j = 0; j < i; ++j){ 67 | float r = mat.col(i).dot(mat.col(j)); 68 | mat.col(i) -= r * mat.col(j); 69 | } 70 | float norm = mat.col(i).norm(); 71 | if (norm < SVD_EPS){ 72 | for (int k = i; k < mat.cols(); ++k){ 73 | mat.col(k).setZero(); 74 | } 75 | return; 76 | } 77 | mat.col(i) *= (1.f / norm); 78 | } 79 | } 80 | 81 | void Util::convertFV2Mat(const vector& fvs, REDSVD::SMatrixXf& A){ 82 | int maxID = 0; 83 | size_t nonZeroNum = 0; 84 | for (size_t i = 0; i < fvs.size(); ++i){ 85 | const fv_t& fv(fvs[i]); 86 | for (size_t j = 0; j < fv.size(); ++j){ 87 | maxID = max(fv[j].first+1, maxID); 88 | } 89 | nonZeroNum += fv.size(); 90 | } 91 | A.resize(fvs.size(), maxID); 92 | A.reserve(nonZeroNum); 93 | for (size_t i = 0; i < fvs.size(); ++i){ 94 | A.startVec(i); 95 | const fv_t& fv(fvs[i]); 96 | for (size_t j = 0; j < fv.size(); ++j){ 97 | 
A.insertBack(i, fv[j].first) = fv[j].second; 98 | } 99 | } 100 | A.finalize(); 101 | } 102 | 103 | 104 | } 105 | -------------------------------------------------------------------------------- /src/redsvd/util.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 Daisuke Okanohara 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * 8 | * 1. Redistributions of source code must retain the above Copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above Copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the authors nor the names of its contributors 16 | * may be used to endorse or promote products derived from this 17 | * software without specific prior written permission. 18 | */ 19 | 20 | #ifndef REDSVD_UTIL_HPP__ 21 | #define REDSVD_UTIL_HPP__ 22 | 23 | #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace REDSVD { 32 | 33 | typedef Eigen::SparseMatrix SMatrixXf; 34 | typedef std::vector > fv_t; 35 | 36 | class Util{ 37 | public: 38 | static void convertFV2Mat(const std::vector& fvs, SMatrixXf& A); 39 | static void sampleGaussianMat(Eigen::MatrixXf& x); 40 | static void processGramSchmidt(Eigen::MatrixXf& mat); 41 | static double getSec(); 42 | 43 | private: 44 | static void sampleTwoGaussian(float& f1, float& f2); 45 | }; 46 | 47 | } 48 | 49 | #endif // REDSVD_UTIL_HPP_ 50 | --------------------------------------------------------------------------------