├── Mizan-0.1bu1 ├── Release │ └── src │ │ └── dataManager │ │ └── dataStructures │ │ └── data │ │ └── subdir.mk └── src │ ├── Mizan.h │ ├── algorithms │ ├── MST.h │ ├── SSSP.h │ ├── WCC.h │ ├── dimEst.h │ └── pageRank.h │ ├── communication │ └── dataStructures │ │ └── general.h │ ├── dataManager │ └── dataStructures │ │ └── data │ │ ├── mMSTEdgeValue.cpp │ │ ├── mMSTEdgeValue.h │ │ ├── mMSTVertexValue.cpp │ │ └── mMSTVertexValue.h │ ├── general.h │ ├── main.cpp │ └── tools │ └── argParser.h ├── README.md ├── benchmark ├── bench-all.sh ├── common │ ├── bench-finish.sh │ ├── bench-init.sh │ ├── cleanup-bench.sh │ ├── get-configs.sh │ ├── get-dirs.sh │ └── ssh-check.sh ├── datasets │ ├── Makefile │ ├── convert-adj.sh │ ├── convert-mst.sh │ ├── load-files.sh │ ├── load-splits.sh │ ├── mst-convert.cpp │ ├── snap-convert.cpp │ ├── snap-revert.cpp │ └── split-input.sh ├── giraph │ ├── benchall.sh │ ├── dimest.sh │ ├── kill-java-job.sh │ ├── mst.sh │ ├── pagerank.sh │ ├── prtolfinder.sh │ ├── recompile-giraph.sh │ ├── sssp.sh │ └── wcc.sh ├── gps │ ├── benchall.sh │ ├── debug-site.sh │ ├── dimest.sh │ ├── disable-dimest-fix.sh │ ├── enable-dimest-fix.sh │ ├── init.sh │ ├── mst.sh │ ├── pagerank.sh │ ├── recompile-gps.sh │ ├── sssp.sh │ ├── start-nodes.sh │ ├── stop-nodes.sh │ └── wcc.sh ├── graphlab │ ├── benchall.sh │ ├── dimest.sh │ ├── init.sh │ ├── pagerank.sh │ ├── recompile-graphlab.sh │ ├── sssp.sh │ └── wcc.sh ├── hadoop │ ├── init.sh │ └── restart-hadoop.sh ├── init-all.sh ├── local-init.sh ├── mizan │ ├── benchall.sh │ ├── dimest.sh │ ├── init.sh │ ├── mst.sh │ ├── pagerank.sh │ ├── premizan.sh │ ├── recompile-mizan.sh │ ├── sssp.sh │ └── wcc.sh ├── parsers │ ├── batch-parser.py │ └── log-checker.sh └── readme.txt ├── ec2 └── uw-ec2.py ├── giraph-1.0.0 ├── findbugs-exclude.xml ├── giraph-core │ └── src │ │ └── main │ │ └── java │ │ └── org │ │ └── apache │ │ └── giraph │ │ └── io │ │ └── formats │ │ ├── JsonLongLongLongLongVertexInputFormat.java │ │ └── JsonLongLongNullLongVertexInputFormat.java └── giraph-examples │ └── src │ └── main │ └── java │ └── org │ └── apache │ └── giraph │ └── examples │ ├── ConnectedComponentsInputFormat.java │ ├── ConnectedComponentsVertex.java │ ├── DiameterEstimationInputFormat.java │ ├── DiameterEstimationVertex.java │ ├── JsonLongLongArrayInputFormat.java │ ├── JsonLongMSTVertexInputFormat.java │ ├── MinimumSpanningTreeInputFormat.java │ ├── MinimumSpanningTreeVertex.java │ ├── PageRankTolFinderVertex.java │ ├── SimplePageRankInputFormat.java │ ├── SimplePageRankVertex.java │ └── SimpleShortestPathsInputFormat.java ├── gps-rev-110 ├── local-master-scripts │ └── make_gps_node_runner_jar.sh └── src │ └── java │ └── gps │ ├── examples │ ├── dimest │ │ └── DiameterEstimationVertex.java │ ├── pagerank │ │ └── PageRankVertex.java │ ├── sssp │ │ ├── SSSPVertex.java │ │ └── SingleSourceAllVerticesShortestPathVertex.java │ └── wcc │ │ └── WeaklyConnectedComponentsVertex.java │ ├── messages │ └── storage │ │ ├── ArrayBackedIncomingMessageStorage.javaDIMEST │ │ └── ArrayBackedIncomingMessageStorage.javaORIGINAL │ ├── node │ ├── GPSNodeRunner.java │ └── worker │ │ └── dynamic │ │ ├── VertexWrapper.java │ │ └── greedy │ │ ├── BaseGreedyDynamicGPSWorkerImpl.java │ │ ├── onesync │ │ ├── OneSyncDynamicMessageSender.java │ │ └── OneSyncLaggingGreedyDynamicGPSWorker.java │ │ └── twosync │ │ └── TwoSyncGreedyDynamicGPSWorker.java │ └── writable │ └── LongArrayWritable.java ├── graphlab-2a063b3829 ├── src │ └── graphlab │ │ └── graph │ │ ├── builtin_parsers.hpp │ │ └── 
distributed_graph.hpp └── toolkits │ └── graph_analytics │ ├── connected_component.cpp │ ├── pagerank.cpp │ └── sssp.cpp └── results └── plots ├── all-plots.tex ├── constants.py ├── data_mem.py ├── data_mem_master.py ├── data_net.py ├── data_net_master.py ├── data_time.py ├── gen-all.sh ├── gen-data.py ├── plot-all.sh ├── plot-paper.sh ├── plot-with-cuts.py └── plot.py /Mizan-0.1bu1/Release/src/dataManager/dataStructures/data/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/dataManager/dataStructures/data/IdataType.cpp \ 8 | ../src/dataManager/dataStructures/data/mArrayIntTagNK.cpp \ 9 | ../src/dataManager/dataStructures/data/mCharArray.cpp \ 10 | ../src/dataManager/dataStructures/data/mCharArrayNoCpy.cpp \ 11 | ../src/dataManager/dataStructures/data/mDouble.cpp \ 12 | ../src/dataManager/dataStructures/data/mDoubleArray.cpp \ 13 | ../src/dataManager/dataStructures/data/mInt.cpp \ 14 | ../src/dataManager/dataStructures/data/mIntCharArrayPair.cpp \ 15 | ../src/dataManager/dataStructures/data/mIntTagDouble.cpp \ 16 | ../src/dataManager/dataStructures/data/mLong.cpp \ 17 | ../src/dataManager/dataStructures/data/mLongArray.cpp \ 18 | ../src/dataManager/dataStructures/data/mMSTVertexValue.cpp \ 19 | ../src/dataManager/dataStructures/data/mMSTEdgeValue.cpp 20 | 21 | OBJS += \ 22 | ./src/dataManager/dataStructures/data/IdataType.o \ 23 | ./src/dataManager/dataStructures/data/mArrayIntTagNK.o \ 24 | ./src/dataManager/dataStructures/data/mCharArray.o \ 25 | ./src/dataManager/dataStructures/data/mCharArrayNoCpy.o \ 26 | ./src/dataManager/dataStructures/data/mDouble.o \ 27 | ./src/dataManager/dataStructures/data/mDoubleArray.o \ 28 | ./src/dataManager/dataStructures/data/mInt.o \ 29 | ./src/dataManager/dataStructures/data/mIntCharArrayPair.o \ 30 | ./src/dataManager/dataStructures/data/mIntTagDouble.o \ 31 | ./src/dataManager/dataStructures/data/mLong.o \ 32 | ./src/dataManager/dataStructures/data/mLongArray.o \ 33 | ./src/dataManager/dataStructures/data/mMSTVertexValue.o \ 34 | ./src/dataManager/dataStructures/data/mMSTEdgeValue.o 35 | 36 | CPP_DEPS += \ 37 | ./src/dataManager/dataStructures/data/IdataType.d \ 38 | ./src/dataManager/dataStructures/data/mArrayIntTagNK.d \ 39 | ./src/dataManager/dataStructures/data/mCharArray.d \ 40 | ./src/dataManager/dataStructures/data/mCharArrayNoCpy.d \ 41 | ./src/dataManager/dataStructures/data/mDouble.d \ 42 | ./src/dataManager/dataStructures/data/mDoubleArray.d \ 43 | ./src/dataManager/dataStructures/data/mInt.d \ 44 | ./src/dataManager/dataStructures/data/mIntCharArrayPair.d \ 45 | ./src/dataManager/dataStructures/data/mIntTagDouble.d \ 46 | ./src/dataManager/dataStructures/data/mLong.d \ 47 | ./src/dataManager/dataStructures/data/mLongArray.d \ 48 | ./src/dataManager/dataStructures/data/mMSTVertexValue.d \ 49 | ./src/dataManager/dataStructures/data/mMSTEdgeValue.d 50 | 51 | 52 | # Each subdirectory must supply rules for building sources it contributes 53 | src/dataManager/dataStructures/data/%.o: ../src/dataManager/dataStructures/data/%.cpp 54 | @echo 'Building file: $<' 55 | @echo 'Invoking: GCC C++ Compiler' 56 | mpic++ -I$(MPI_HOME)/include -I$(BOOST_ROOT)/include -I$(JAVA_HOME)/include 
-I$(JAVA_HOME)/include/linux -I$(HADOOP_HOME)/src/c++/libhdfs -O3 -w -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<" 57 | @echo 'Finished building: $<' 58 | @echo ' ' 59 | 60 | 61 | -------------------------------------------------------------------------------- /Mizan-0.1bu1/src/algorithms/SSSP.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SSSP.h 3 | * 4 | * Created on: Nov 17 2013 5 | * Authors: Jack Jin, Jenny Wang, Young Han 6 | */ 7 | 8 | #ifndef SSSP_H_ 9 | #define SSSP_H_ 10 | 11 | #include "../IsuperStep.h" 12 | #include "../Icombiner.h" 13 | #include "../dataManager/dataStructures/data/mLong.h" 14 | 15 | #define INF mLong(LLONG_MAX) 16 | 17 | // combiner that takes the minimum of all messages 18 | class SSSPCombiner: public Icombiner { 19 | private: 20 | // NOTE: making this into a macro is dangerous!! 21 | mLong min(mLong a, mLong b) { 22 | return (a < b) ? a : b; 23 | } 24 | 25 | public: 26 | void combineMessages(mLong dst, messageIterator * messages, 27 | messageManager * mManager) { 28 | 29 | mLong minDist = INF; 30 | while (messages->hasNext()) { 31 | minDist = min(minDist, messages->getNext()); 32 | } 33 | 34 | // send message if minDist is not INF 35 | if (minDist == INF) { 36 | } else { 37 | mManager->sendMessage(dst, minDist); 38 | } 39 | } 40 | }; 41 | 42 | /* 43 | * Template types are where 44 | * K: ID class 45 | * V1: vertex value class 46 | * M: message value class 47 | * A: aggregation class 48 | * 49 | * For SSSP, vertex and message values are both mLong 50 | */ 51 | class SSSP: public IsuperStep { 52 | private: 53 | mLong srcID; 54 | int maxSuperStep; 55 | 56 | bool isSrc(mLong id) { 57 | return (id == srcID); 58 | } 59 | 60 | mLong min(mLong a, mLong b) { 61 | return (a < b) ? a : b; 62 | } 63 | 64 | public: 65 | /** 66 | * \param srcID The vertex ID of the source. 67 | * \param maxSS The maximum number of supersteps. 68 | */ 69 | SSSP(mLong srcID, int maxSS) : srcID(srcID), maxSuperStep(maxSS) {} 70 | 71 | void initialize(userVertexObject * data) { 72 | // start all vertices with INF distance 73 | data->setVertexValue(INF); 74 | 75 | // TODO: HACK. Mizan does not read in edge values, 76 | // so let's assign everybody 1s 77 | for (int i = 0; i < data->getOutEdgeCount(); i++) { 78 | data->setOutEdgeValue( data->getOutEdgeID(i), mLong(1) ); 79 | } 80 | } 81 | 82 | void compute(messageIterator * messages, 83 | userVertexObject * data, 84 | messageManager * comm) { 85 | 86 | // can use getValue() to convert mLong to long long 87 | mLong currDist = data->getVertexValue(); 88 | 89 | // potential new minimum distance 90 | mLong newDist = isSrc(data->getVertexID()) ? 
mLong(0) : INF;
91 |
92 | while (messages->hasNext()) {
93 | // cout << "receiving msg at ss=" << data->getCurrentSS() << " at id=" << data->getVertexID().getValue() << endl;
94 | newDist = min(newDist, messages->getNext());
95 | }
96 |
97 | // if new distance is smaller, notify out edges
98 | if (newDist < currDist) {
99 | data->setVertexValue(newDist);
100 |
101 | for (int i = 0; i < data->getOutEdgeCount(); i++) {
102 | // cout << "sending msg at ss=" << data->getCurrentSS() << " to id=" << data->getOutEdgeID(i).getValue() << endl;
103 | // (outEdgeValue is the value of an outgoing edge)
104 | comm->sendMessage(data->getOutEdgeID(i),
105 | mLong(newDist.getValue() + data->getOutEdgeValue(i).getValue()));
106 | }
107 | }
108 |
109 | // always vote to halt
110 | data->voteToHalt();
111 | }
112 | };
113 | #endif /* SSSP_H_ */
114 |
-------------------------------------------------------------------------------- /Mizan-0.1bu1/src/algorithms/WCC.h: --------------------------------------------------------------------------------
1 | /*
2 | * WCC.h
3 | *
4 | * Created on: Nov 17 2013
5 | * Authors: Jack Jin, Jenny Wang, Young Han
6 | */
7 |
8 | #ifndef WCC_H_
9 | #define WCC_H_
10 |
11 | #include "../IsuperStep.h"
12 | #include "../Icombiner.h"
13 | #include "../dataManager/dataStructures/data/mLong.h"
14 |
15 | #define INF mLong(LLONG_MAX)
16 |
17 | // combiner that takes the minimum of all messages
18 | class WCCCombiner: public Icombiner {
19 | private:
20 | // NOTE: making this into a macro is dangerous!!
21 | mLong min(mLong a, mLong b) {
22 | return (a < b) ? a : b;
23 | }
24 |
25 | public:
26 | void combineMessages(mLong dst, messageIterator * messages,
27 | messageManager * mManager) {
28 |
29 | mLong minCompID = INF;
30 | while (messages->hasNext()) {
31 | minCompID = min(minCompID, messages->getNext());
32 | }
33 |
34 | // send message if minCompID is not INF
35 | if (minCompID == INF) {
36 | } else {
37 | mManager->sendMessage(dst, minCompID);
38 | }
39 | }
40 | };
41 |
42 | /*
43 | * Template types are where
44 | * K: ID class
45 | * V1: vertex value class
46 | * M: message value class
47 | * A: aggregation class
48 | *
49 | * For WCC, vertex and message values are both mLong
50 | */
51 | class WCC: public IsuperStep {
52 | private:
53 | int maxSuperStep;
54 |
55 | mLong min(mLong a, mLong b) {
56 | return (a < b) ? a : b;
57 | }
58 |
59 | public:
60 | /**
61 | * \param maxSS The maximum number of supersteps.
62 | *
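 * Each vertex starts with its own ID as its component ID and keeps
 * the minimum ID it receives, so a component converges to the
 * smallest vertex ID it contains (assuming edges are symmetric,
 * since messages travel only along out-edges).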
63 | */
64 | WCC(int maxSS) : maxSuperStep(maxSS) {}
65 |
66 | void initialize(userVertexObject * data) {
67 | // all vertices start w/ component IDs being their own vertex ID
68 | data->setVertexValue(data->getVertexID());
69 | }
70 |
71 | void compute(messageIterator * messages,
72 | userVertexObject * data,
73 | messageManager * comm) {
74 |
75 | // can use getValue() to convert mLong to long long
76 | mLong currCompID = data->getVertexValue();
77 | mLong newCompID = currCompID;
78 |
79 | while (messages->hasNext()) {
80 | newCompID = min(newCompID, messages->getNext());
81 | }
82 |
83 | // if new component ID is smaller, notify neighbours
84 | // OR, if this is the first superstep, send message
85 | if (newCompID < currCompID || data->getCurrentSS() == 1) {
86 | data->setVertexValue(newCompID);
87 |
88 | for (int i = 0; i < data->getOutEdgeCount(); i++) {
89 | // (outEdgeValue is the value of an outgoing edge)
90 | comm->sendMessage(data->getOutEdgeID(i), newCompID);
91 | }
92 | }
93 |
94 | // always vote to halt
95 | data->voteToHalt();
96 | }
97 | };
98 | #endif /* WCC_H_ */
99 |
-------------------------------------------------------------------------------- /Mizan-0.1bu1/src/algorithms/dimEst.h: --------------------------------------------------------------------------------
1 | /*
2 | * dimEst.h
3 | *
4 | * Created on: Sep 17, 2012
5 | * Author: refops
6 | *
7 | * Modified by Young
8 | */
9 |
10 | #ifndef DIMEST_H_
11 | #define DIMEST_H_
12 |
13 | #include "../IsuperStep.h"
14 | #include "../dataManager/dataStructures/data/mLongArray.h"
15 | #include "../dataManager/dataStructures/data/mLong.h"
16 | #include "../dataManager/dataStructures/data/mInt.h"
17 | #include <boost/random/mersenne_twister.hpp>
18 | #include <boost/random/uniform_real.hpp>
19 | #include <boost/random/variate_generator.hpp>
20 | #include <cmath>
21 | #include <ctime>
22 | #include <iostream>
23 |
24 | class dimEst: public IsuperStep {
25 | private:
26 | int maxSuperStep;
27 | int k;
28 | boost::mt19937 * generator;
29 | boost::uniform_real<> * uni_dist;
30 | boost::variate_generator<boost::mt19937&, boost::uniform_real<> > * uni;
31 | const static long long v62 = 62;
32 | const static long long v1 = 1;
33 |
34 | public:
35 | dimEst(int inMaxSS) {
36 | k = 8;
37 | maxSuperStep = inMaxSS;
38 |
39 | generator = new boost::mt19937(std::time(0));
40 | uni_dist = new boost::uniform_real<>(0, 1);
41 | uni = new boost::variate_generator<boost::mt19937&,
42 | boost::uniform_real<> >(*generator, *uni_dist);
43 | }
44 | void initialize(
45 | userVertexObject * data) {
46 | mLong * value = new mLong[k];
47 | int finalBitCount = 63;
48 | long rndVal = 0;
49 | for (int j = 0; j < k; j++) {
50 | rndVal = create_random_bm(finalBitCount);
51 | value[j].setValue((v1 << (v62 - rndVal)));
52 | }
53 | mLongArray valueArray(k, value);
54 | data->setVertexValue(valueArray);
55 | }
56 | void compute(messageIterator * messages,
57 | userVertexObject * data,
58 | messageManager * comm) {
59 |
60 | mLong * newBitMask = new mLong[k];
61 | //mLong * oldBitMask = data->getVertexValue().getArray();
62 |
63 | for (int i = 0; i < k; i++) {
64 | // TODO: need to do this, b/c of weird bug where oldBitMask[31] has wrong value
65 | newBitMask[i] = data->getVertexValue().getArray()[i]; //oldBitMask[i];
66 | }
67 |
68 | //std::cout << "value: " << newBitMask[31].getValue() << " " << oldBitMask[31].getValue() << " " << data->getVertexValue().getArray()[31].getValue() << std::endl;
69 |
70 | mLongArray tmpArray;
71 | mLong * tmpBitMask;
72 |
73 | bool isChanged = false;
74 | long long a;
75 | long long b;
76 | long long c;
77 | while (messages->hasNext()) {
78 | tmpArray = messages->getNext();
79 | tmpBitMask = tmpArray.getArray();
80 | for (int i = 0; i < k; i++) {
81 |
a = newBitMask[i].getValue(); 82 | b = tmpBitMask[i].getValue(); 83 | c = a | b; 84 | newBitMask[i].setValue(c); 85 | 86 | // NOTE: unused for now---to terminate when all vertices converge, 87 | // use an aggregator to track # of vertices that have finished 88 | //isChanged = isChanged || (a != c); 89 | } 90 | } 91 | 92 | mLongArray outArray(k, newBitMask); 93 | 94 | // WARNING: we cannot terminate based on LOCAL steady state, 95 | // we need all vertices computing until the very end 96 | if (data->getCurrentSS() >= maxSuperStep) { 97 | data->voteToHalt(); 98 | 99 | } else { 100 | // use outedges to match Giraph and GPS 101 | for (int i = 0; i < data->getOutEdgeCount(); i++) { 102 | comm->sendMessage(data->getOutEdgeID(i), outArray); 103 | } 104 | 105 | data->setVertexValue(outArray); 106 | } 107 | } 108 | 109 | //Src: Pegasus 110 | int create_random_bm(int size_bitmask) { 111 | int j; 112 | 113 | // cur_random is between 0 and 1. 114 | double cur_random = uni->operator ()(); //rand.nextDouble(); //Math.random(); 115 | double threshold = 0; 116 | for (j = 0; j < size_bitmask - 1; j++) { 117 | threshold += pow(2.0, -1 * j - 1); 118 | 119 | if (cur_random < threshold) { 120 | break; 121 | } 122 | } 123 | 124 | return j; 125 | } 126 | }; 127 | #endif /* DIMEST_H_ */ 128 | -------------------------------------------------------------------------------- /Mizan-0.1bu1/src/algorithms/pageRank.h: -------------------------------------------------------------------------------- 1 | /* 2 | * pageRank.h 3 | * 4 | * Created on: Sep 18, 2012 5 | * Author: refops 6 | */ 7 | 8 | #ifndef PAGERANK_H_ 9 | #define PAGERANK_H_ 10 | 11 | #include "../IsuperStep.h" 12 | #include "../Icombiner.h" 13 | #include "../dataManager/dataStructures/data/mLong.h" 14 | #include "../dataManager/dataStructures/data/mDouble.h" 15 | 16 | class pageRankCombiner: public Icombiner { 17 | 18 | void combineMessages(mLong dst, messageIterator * messages, 19 | messageManager * mManager) { 20 | double newVal = 0; 21 | while (messages->hasNext()) { 22 | double tmp = messages->getNext().getValue(); 23 | newVal = newVal + tmp; 24 | } 25 | mDouble messageOut(newVal); 26 | mManager->sendMessage(dst, messageOut); 27 | } 28 | }; 29 | 30 | class pageRank: public IsuperStep { 31 | private: 32 | //int vertexTotal; 33 | int maxSuperStep; 34 | 35 | public: 36 | 37 | pageRank(int maxSS) { 38 | //vertexTotal = 0; 39 | maxSuperStep = maxSS; 40 | } 41 | void initialize(userVertexObject * data) { 42 | // NOTE: We follow GraphLab's alternative way of computing PageRank, 43 | // which is to not divide by |V|. To get the probability value at 44 | // each vertex, take its PageRank value and divide by |V|. 
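// Concretely, with damping factor c = 0.85 the update below computes
//   PR(v) = (1 - c) + c * sum over in-neighbours u of PR(u) / outdeg(u)
// with PR initialized to 1.0; dividing the fixed point by |V| recovers
// the textbook formulation that uses (1 - c)/|V| and starts at 1/|V|.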
45 | 46 | //if (vertexTotal == 0) { 47 | // vertexTotal = data->getGlobalVertexCount(); 48 | //} 49 | 50 | // BUGFIX: this should not exist: vertexTotal++; 51 | 52 | data->setVertexValue(mDouble(1.0)); 53 | //data->setVertexValue(mDouble(1.0 / (double) vertexTotal)); 54 | } 55 | void compute(messageIterator * messages, 56 | userVertexObject * data, 57 | messageManager * comm) { 58 | 59 | double currVal = data->getVertexValue().getValue(); 60 | double newVal = 0; 61 | double c = 0.85; 62 | 63 | if (data->getCurrentSS() > 1) { 64 | while (messages->hasNext()) { 65 | double tmp = messages->getNext().getValue(); 66 | newVal = newVal + tmp; 67 | } 68 | newVal = newVal * c + (1.0 - c); 69 | //newVal = newVal * c + (1.0 - c) / ((double) vertexTotal); 70 | data->setVertexValue(mDouble(newVal)); 71 | } else { 72 | newVal = currVal; 73 | } 74 | 75 | // Termination condition based on max supersteps 76 | if (data->getCurrentSS() <= maxSuperStep) { 77 | mDouble outVal(newVal / ((double) data->getOutEdgeCount())); 78 | for (int i = 0; i < data->getOutEdgeCount(); i++) { 79 | comm->sendMessage(data->getOutEdgeID(i), outVal); 80 | } 81 | } else { 82 | data->voteToHalt(); 83 | } 84 | } 85 | }; 86 | #endif /* PAGERANK_H_ */ 87 | -------------------------------------------------------------------------------- /Mizan-0.1bu1/src/communication/dataStructures/general.h: -------------------------------------------------------------------------------- 1 | /* 2 | * general.h 3 | * 4 | * Created on: Apr 2, 2012 5 | * Author: refops 6 | */ 7 | 8 | #ifndef GENERAL_H_ 9 | #define GENERAL_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include "map" 15 | #include 16 | #include 17 | 18 | using namespace std; 19 | 20 | #include "boost/thread/mutex.hpp" 21 | #include "boost/thread/exceptions.hpp" 22 | #include "mpi.h" 23 | 24 | static int KB = 1024; 25 | static int MB = 1024 * 1024; 26 | static int data_msgsize = 4 * 1024; 27 | static int buffer_msgsize = 4 * KB; //* KB 28 | static queue SYS_cmdQueue; 29 | static queue DATA_cmdQueue; 30 | 31 | enum messageStatus { 32 | m_success, m_fail, 33 | }; 34 | 35 | enum messageCode { 36 | DM, BCast, AllNB, 37 | }; 38 | 39 | enum communicationType { 40 | _pt2pt, _ring, _pt2ptb, 41 | }; 42 | 43 | enum msgHeader { 44 | _SYS, _DATA, _EXIT_PE, 45 | }; 46 | enum SYS_CMDS { 47 | DHT_I, //dht_insert 48 | DHT_U, //dht_update 49 | DHT_A, //dht_ask 50 | DHT_R, //dht_response 51 | InitVertexCount, 52 | FinishInit, 53 | EndofSS, 54 | StartSS, 55 | Terminate, 56 | ENDMSG, 57 | VertexMigrate, 58 | SendSoftVertex, 59 | SendHardVertex, 60 | StealVertex, 61 | SendStolenVertex, 62 | StolenVertexResult, 63 | GraphMutation, 64 | LateStats,LateStatsTerminate, 65 | StealBarrier, 66 | Aggregator, 67 | MigrateBarrier, 68 | }; 69 | 70 | enum DATA_CMDS { 71 | SSdata, InNbrs, OutNbrs, ALLVTX, ENDDMSG, 72 | }; 73 | enum SYS_CMDS_PRIORITY { 74 | NO_PRIORITY, AFTER_DATABUFFER_PRIORITY, INSTANT_PRIORITY 75 | }; 76 | 77 | enum block_type { 78 | INT, DOUBLE, CHAR, LONG_LONG, 79 | }; 80 | 81 | static const char* msgHeader_strings[] = { "_SYS", "_DATA", "_EXIT_PE" }; 82 | static const char* DATA_CMDS_strings[] = { "SSdata", "InNbrs", "OutNbrs", "ALLVTX", "ENDDMSG" }; 83 | static const char* SYS_CMDS_strings[] = { "DHT_I", "DHT_U", "DHT_A", "DHT_R", 84 | "InitVertexCount", "FinishInit", "EndofSS", "StartSS", "Terminate", 85 | "ENDMSG", "SSExecTime", "VertexMigrate", "SendSoftVertex", 86 | "SendHardVertex", "StealVertex", "SendStolenVertex", 87 | "StolenVertexResult" }; 88 | 89 | #endif /* GENERAL_H_ */ 90 | 
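The mMSTEdgeValue code below serializes its three long long fields with a simple length-prefixed framing: each field is written as a one-byte size followed by that many payload bytes, so the decoder can walk the buffer field by field without a schema. A minimal self-contained sketch of that framing (the encodeField/decodeFields helpers are hypothetical illustrations; in Mizan the per-field bytes come from mLong::byteEncode2):

#include <cstring>
#include <iostream>

// Encode one long long as [size byte][payload]; returns bytes written.
static int encodeField(char *buf, long long v) {
    int size = (int) sizeof(long long);          // 8-byte payload
    buf[0] = (char) size;
    std::memcpy(&buf[1], &v, sizeof(long long));
    return size + 1;
}

// Walk the [size][payload] frames in buf[0..size) into out[]; returns count.
static int decodeFields(const char *buf, int size, long long *out, int maxFields) {
    int j = 0, i = 0;
    while (j < size && i < maxFields) {
        int objSize = (int) buf[j];              // size byte
        std::memcpy(&out[i], &buf[j + 1], objSize);
        j += objSize + 1;                        // skip past this frame
        i++;
    }
    return i;
}

int main() {
    char buf[3 * (sizeof(long long) + 1)];
    int j = 0;
    j += encodeField(&buf[j], 42);               // weight
    j += encodeField(&buf[j], 7);                // src
    j += encodeField(&buf[j], 13);               // dst

    long long fields[3];
    decodeFields(buf, j, fields, 3);
    std::cout << fields[0] << " " << fields[1] << " " << fields[2] << "\n"; // 42 7 13
    return 0;
}

The size prefix is what lets byteDecode() below recover field boundaries without knowing the payload widths in advance.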
-------------------------------------------------------------------------------- /Mizan-0.1bu1/src/dataManager/dataStructures/data/mMSTEdgeValue.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * mMSTEdgeValue.cpp 3 | * 4 | * Created on: Dec 27, 2013 5 | * Author: Young Han 6 | */ 7 | 8 | #include "mMSTEdgeValue.h" 9 | #include "mLong.h" 10 | 11 | #define EDGE_VAL_LEN 3 12 | 13 | // indices into value array 14 | #define I_WEIGHT 0 15 | #define I_SRC 1 16 | #define I_DST 2 17 | 18 | /** Constructors/Destructors **/ 19 | mMSTEdgeValue::mMSTEdgeValue() : weight(0), src(0), dst(0) {} 20 | 21 | mMSTEdgeValue::mMSTEdgeValue(long long weight, long long src, long long dst) 22 | : weight(weight), src(src), dst(dst) {} 23 | 24 | // copy constructor (same as implicit one) 25 | mMSTEdgeValue::mMSTEdgeValue(const mMSTEdgeValue& obj) { 26 | weight = obj.weight; 27 | src = obj.src; 28 | dst = obj.dst; 29 | } 30 | 31 | mMSTEdgeValue::~mMSTEdgeValue() {} 32 | 33 | int mMSTEdgeValue::byteSize() { 34 | return sizeof(long long)*EDGE_VAL_LEN; 35 | } 36 | 37 | std::string mMSTEdgeValue::toString() { 38 | // copied from mLongArray.cpp 39 | char outArray[31*EDGE_VAL_LEN]; 40 | sprintf(outArray, "%lld:%lld:%lld:", weight, src, dst); 41 | std::string output(outArray); 42 | return output; 43 | } 44 | 45 | void mMSTEdgeValue::readFromCharArray(char * input) { 46 | // modified from mLongArray.cpp 47 | 48 | // should be constant, but whatever 49 | char delimiter = ':'; 50 | mLong array[EDGE_VAL_LEN]; 51 | 52 | int startPtr = 0; 53 | int endPtr = 0; 54 | for (int i = 0; i < EDGE_VAL_LEN; i++) { 55 | char tmpArray[30]; 56 | while (input[endPtr] != delimiter) { 57 | endPtr++; 58 | } 59 | //12345:668512:999831 60 | strncpy(tmpArray, &input[startPtr], (endPtr - startPtr)); 61 | tmpArray[endPtr - startPtr] = 0; 62 | array[i].readFromCharArray(tmpArray); 63 | endPtr++; 64 | startPtr = endPtr; 65 | } 66 | 67 | weight = array[I_WEIGHT].getValue(); 68 | src = array[I_SRC].getValue(); 69 | dst = array[I_DST].getValue(); 70 | } 71 | 72 | char * mMSTEdgeValue::byteEncode(int &size) { 73 | // modified from mLongArray.cpp.. basic idea is the same 74 | char * output = (char *) calloc(byteSize(), sizeof(char)); 75 | int j = 0; 76 | int tmpSize = 0; 77 | 78 | mLong array[EDGE_VAL_LEN]; 79 | array[I_WEIGHT] = mLong(weight); 80 | array[I_SRC] = mLong(src); 81 | array[I_DST] = mLong(dst); 82 | 83 | for (int i = 0; i < EDGE_VAL_LEN; i++) { 84 | tmpSize = array[i].byteEncode2(&output[j + 1]); 85 | output[j] = ((char) tmpSize); 86 | j = j + tmpSize + 1; 87 | } 88 | size = j; 89 | return output; 90 | } 91 | 92 | int mMSTEdgeValue::byteEncode2(char * buffer) { 93 | // does not use byteEncode()... presumably to save on space? 
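// (byteEncode() calloc's its own output buffer and reports the size
// through the out-parameter; byteEncode2() writes into a buffer the
// caller provides and returns the number of bytes written, so it
// avoids the extra allocation.)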
94 | int j = 0;
95 | int tmpSize = 0;
96 |
97 | mLong array[EDGE_VAL_LEN];
98 | array[I_WEIGHT] = mLong(weight);
99 | array[I_SRC] = mLong(src);
100 | array[I_DST] = mLong(dst);
101 |
102 | for (int i = 0; i < EDGE_VAL_LEN; i++) {
103 | tmpSize = array[i].byteEncode2(&buffer[j + 1]);
104 | buffer[j] = ((char) tmpSize);
105 | j = j + tmpSize + 1;
106 | }
107 | return j;
108 | }
109 |
110 | void mMSTEdgeValue::byteDecode(int size, char * input) {
111 | // modified from mLongArray.cpp
112 | int j = 0;
113 | int objSize = 0;
114 | mLong obj;
115 |
116 | mLong array[EDGE_VAL_LEN];
117 | int i = 0;
118 |
119 | while (j < size) {
120 | if (i >= EDGE_VAL_LEN) {
121 | std::cout << "ERROR in mMSTEdgeValue byteDecode()!!";
122 | break;
123 | }
124 |
125 | objSize = ((int) input[j]);
126 | array[i].byteDecode(objSize, &input[j + 1]);
127 | j = j + objSize + 1;
128 | i++;
129 | }
130 |
131 | weight = array[I_WEIGHT].getValue();
132 | src = array[I_SRC].getValue();
133 | dst = array[I_DST].getValue();
134 | }
135 |
136 | std::size_t mMSTEdgeValue::local_hash_value() const {
137 | // just like mLongArray, do hash of first field.. which is long long
138 | // copied from mLong.cpp
139 | return weight;
140 | }
141 |
142 | mMSTEdgeValue & mMSTEdgeValue::operator=(const mMSTEdgeValue& rhs) {
143 | // same as the implicit assignment, except it must return *this
144 | weight = rhs.weight;
145 | src = rhs.src;
146 | dst = rhs.dst;
147 | return *this;
148 | }
149 | /**
150 | * Objects are == iff all fields are equal, unlike below.
151 | */
152 | bool mMSTEdgeValue::operator==(const IdataType& rhs) const {
153 | return (weight == ((mMSTEdgeValue&) rhs).weight &&
154 | src == ((mMSTEdgeValue&) rhs).src &&
155 | dst == ((mMSTEdgeValue&) rhs).dst);
156 | }
157 |
158 | /**
159 | * Comparison is based on the weight. If weights are the same,
160 | * then comparison is based on the source vertex ID.
161 | * The destination ID does not play a role.
162 | */
163 | bool mMSTEdgeValue::operator<(const IdataType& rhs) const {
164 | if (weight == ((mMSTEdgeValue&) rhs).weight) {
165 | return (src < ((mMSTEdgeValue&) rhs).src);
166 | }
167 | return (weight < ((mMSTEdgeValue&) rhs).weight);
168 | }
169 |
170 | bool mMSTEdgeValue::operator>(const IdataType &rhs) const {
171 | if (weight == ((mMSTEdgeValue&) rhs).weight) {
172 | return (src > ((mMSTEdgeValue&) rhs).src);
173 | }
174 | return (weight > ((mMSTEdgeValue&) rhs).weight);
175 | }
176 |
177 | bool mMSTEdgeValue::operator<=(const IdataType &rhs) const {
178 | return !(*this > rhs);
179 | }
180 |
181 | bool mMSTEdgeValue::operator>=(const IdataType &rhs) const {
182 | return !(*this < rhs);
183 | }
184 |
-------------------------------------------------------------------------------- /Mizan-0.1bu1/src/dataManager/dataStructures/data/mMSTEdgeValue.h: --------------------------------------------------------------------------------
1 | /*
2 | * mMSTEdgeValue.h
3 | *
4 | * Created on: Dec 27, 2013
5 | * Author: Young Han
6 | */
7 |
8 | #ifndef MMSTEDGEVALUE_H_
9 | #define MMSTEDGEVALUE_H_
10 |
11 | #include "IdataType.h"
12 |
13 | /**
14 | * MST edge value representation
15 | */
16 | class mMSTEdgeValue: public IdataType {
17 | private:
18 | long long weight;
19 | long long src; // original source
20 | long long dst; // original destination
21 | public:
22 | mMSTEdgeValue();
23 | mMSTEdgeValue(long long weight, long long src, long long dst);
24 | mMSTEdgeValue(const mMSTEdgeValue& obj);
25 | ~mMSTEdgeValue();
26 | int byteSize();
27 | std::string toString();
28 | void readFromCharArray(char * input);
29 | char * byteEncode(int &size);
30 | int byteEncode2(char * buffer);
31 | void byteDecode(int size, char * input);
32 | std::size_t local_hash_value() const;
33 | mMSTEdgeValue & operator=(const mMSTEdgeValue& rhs);
34 | bool operator==(const IdataType& rhs) const;
35 | bool operator<(const IdataType& rhs) const;
36 | bool operator>(const IdataType &rhs) const;
37 | bool operator<=(const IdataType &rhs) const;
38 | bool operator>=(const IdataType &rhs) const;
39 |
40 | void cleanUp() {}
41 |
42 | //Class specific methods
43 | long long getWeight() { return weight; }
44 | long long getSrc() { return src; }
45 | long long getDst() { return dst;}
46 |
47 | // no setters---edge value should be immutable
48 | };
49 | #endif /* MMSTEDGEVALUE_H_ */
50 |
-------------------------------------------------------------------------------- /Mizan-0.1bu1/src/dataManager/dataStructures/data/mMSTVertexValue.h: --------------------------------------------------------------------------------
1 | /*
2 | * mMSTVertexValue.h
3 | *
4 | * Created on: Dec 27, 2013
5 | * Author: Young Han
6 | */
7 |
8 | #ifndef MMSTVERTEXVALUE_H_
9 | #define MMSTVERTEXVALUE_H_
10 |
11 | #include "IdataType.h"
12 |
13 | /**
14 | * Enum constants
15 | */
16 | // phases of computation
17 | enum MSTPhase {
18 | PHASE_1, // find min-weight edge
19 | PHASE_2A, // question phase
20 | PHASE_2B, // Q /and/ A phase
21 | PHASE_3A, // send supervertex IDs
22 | PHASE_3B, // receive PHASE_3A messages
23 | PHASE_4A, // send edges to supervertex
24 | PHASE_4B // receive/merge edges
25 | };
26 |
27 | // vertex types
28 |
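// (TYPE_POINTS_AT_SUPERVERTEX vs TYPE_POINTS_AT_SUBVERTEX records
// whether a vertex's pointer already leads to its supervertex or
// only to another child; the latter must keep asking during the
// question/answer phases above until its supervertex is known.)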
enum MSTVertexType { 29 | TYPE_UNKNOWN, // initial state in Phase 2A 30 | TYPE_SUPERVERTEX, // supervertex 31 | TYPE_POINTS_AT_SUPERVERTEX, // child of supervertex 32 | TYPE_POINTS_AT_SUBVERTEX // child of child of supervertex 33 | }; 34 | 35 | /** 36 | * MST edge and vertex value representations 37 | */ 38 | class mMSTVertexValue: public IdataType { 39 | private: 40 | long long weight; 41 | long long src; // original source 42 | long long dst; // original destination 43 | 44 | MSTPhase phase; // computation phase 45 | MSTVertexType type; // vertex type 46 | long long pointer; // vertex's (potential) supervertex 47 | 48 | public: 49 | mMSTVertexValue(); 50 | mMSTVertexValue(long long weight, long long src, long long dst, 51 | MSTPhase phase, MSTVertexType type, long long pointer); 52 | 53 | // NOTE: This is only for compatibility when used as a mMSTEdgeValue. 54 | // Once Mizan supports separate edge value types, this should be deleted! 55 | mMSTVertexValue(long long weight, long long src, long long dst); 56 | 57 | mMSTVertexValue(const mMSTVertexValue& obj); 58 | ~mMSTVertexValue(); 59 | int byteSize(); 60 | std::string toString(); 61 | void readFromCharArray(char * input); 62 | char * byteEncode(int &size); 63 | int byteEncode2(char * buffer); 64 | void byteDecode(int size, char * input); 65 | std::size_t local_hash_value() const; 66 | mMSTVertexValue & operator=(const mMSTVertexValue& rhs); 67 | bool operator==(const IdataType& rhs) const; 68 | bool operator<(const IdataType& rhs) const; 69 | bool operator>(const IdataType &rhs) const; 70 | bool operator<=(const IdataType &rhs) const; 71 | bool operator>=(const IdataType &rhs) const; 72 | 73 | void cleanUp() {} 74 | 75 | //Class specific methods 76 | long long getWeight() { return weight; } 77 | long long getSrc() { return src; } 78 | long long getDst() { return dst;} 79 | 80 | MSTPhase getPhase() { return phase; } 81 | MSTVertexType getType() { return type; } 82 | long long getPointer() { return pointer; } 83 | 84 | void setWeight(long long w) { weight = w; } 85 | void setDst(long long d) { dst = d; } 86 | void setSrc(long long s) { src = s; } 87 | 88 | void setPhase(MSTPhase ph) { phase = ph; } 89 | void setType(MSTVertexType t) { type = t; } 90 | void setPointer(long long p) { pointer = p; } 91 | }; 92 | #endif /* MMSTVERTEXVALUE_H_ */ 93 | -------------------------------------------------------------------------------- /Mizan-0.1bu1/src/general.h: -------------------------------------------------------------------------------- 1 | /* 2 | * general.h 3 | * 4 | * Created on: Jun 13, 2012 5 | * Author: refops 6 | */ 7 | 8 | #ifndef GENERALMIZAN_H_ 9 | #define GENERALMIZAN_H_ 10 | #include 11 | 12 | #include "IAggregator.h" 13 | #include "computation/systemWideInfo.h" 14 | #include "communication/sysComm.h" 15 | #include "boost/thread.hpp" 16 | #include "dataManager/dataStructures/general.h" 17 | 18 | template class sysComm; 19 | template class userComm; 20 | 21 | template 22 | struct systemDataPointer { 23 | std::map *> aggContainer; 24 | boost::mutex aggContainerLock; 25 | systemWideInfo sysInfo; 26 | sysComm * sc; 27 | userComm * uc; 28 | }; 29 | 30 | struct MizanArgs { 31 | int algorithm; 32 | int clusterSize; 33 | std::string graphName; 34 | fileSystem fs; 35 | distType partition; 36 | std::string hdfsUserName; 37 | migrationMode migration; 38 | communicationType communication; 39 | int superSteps; 40 | // NOTE: this is "hacked" in... 
a better way is for the 41 | // relevant algorithms to parse a portion of the arguments 42 | long srcID; 43 | }; 44 | 45 | #endif /* GENERALMIZAN_H_ */ 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | graph-processing 2 | ================ 3 | 4 | A comparison of graph processing systems. Please see the [wiki](https://github.com/xvz/graph-processing/wiki/)! 5 | -------------------------------------------------------------------------------- /benchmark/bench-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs all the benchmarks. 4 | # 5 | # The batch-benchmarking scripts are quite primitive, simply because 6 | # when things fail it's usually easier to intervene manually. 7 | # 8 | # We recommend running this in a "screen" so a terminated ssh 9 | # connection doesn't kill it. 10 | # 11 | # Use "screen" to start a screen and run "./bench-all.sh" within it. 12 | # Detach from the screen at any time with C-a d (Ctrl-a d). 13 | # Reattach to the screen anywhere with "screen -R". This can be done 14 | # after a detach or when ssh is inadvertently killed. 15 | 16 | cd "$(dirname "${BASH_SOURCE[0]}")" 17 | source ./common/get-hosts.sh 18 | source ./common/get-dirs.sh 19 | 20 | # start (or restart) Hadoop 21 | ./hadoop/restart-hadoop.sh 22 | hadoop dfsadmin -safemode wait > /dev/null 23 | 24 | echo "Running Giraph experiments..." 25 | ./giraph/benchall.sh ${NUM_MACHINES} 5 26 | 27 | echo "Running GPS experiments..." 28 | ./gps/benchall.sh ${NUM_MACHINES} 5 29 | 30 | echo "Running GraphLab experiments..." 31 | ./graphlab/benchall.sh ${NUM_MACHINES} 5 32 | 33 | echo "Running Mizan experiments..." 34 | ./mizan/benchall.sh ${NUM_MACHINES} 5 -------------------------------------------------------------------------------- /benchmark/common/bench-finish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Finish data logging/collection at the master and all worker machines. 4 | 5 | if [ $# -ne 1 ]; then 6 | echo "usage: $0 log-name-prefix" 7 | exit -1 8 | fi 9 | 10 | source "$(dirname "${BASH_SOURCE[0]}")"/get-hosts.sh 11 | 12 | logname=$1 13 | dir=$PWD 14 | 15 | for ((i = 0; i <= ${NUM_MACHINES}; i++)); do 16 | nbtfile=${logname}_${i}_nbt.txt # network bytes total 17 | 18 | # special case for master, to make it work for local testing too 19 | if [ $i -eq 0 ]; then 20 | name=${HOSTNAME} 21 | else 22 | name=${CLUSTER_NAME}${i} 23 | fi 24 | 25 | # 1. Change to the same directory as master. 26 | # 2. Append final network usage. 27 | # 3. Kill sar and free to stop tracking. 
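# (The initial /proc/net/dev snapshot written by bench-init.sh plus
# this final one can be diffed to compute total bytes sent/received.)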
28 | # 29 | # NOTE: - could use `jobs -p` for kill, but difficult b/c we're ssh-ing 30 | # - must escape $ for things that should be evaluated remotely 31 | ssh ${name} "cd \"$dir\"; cat /proc/net/dev >> ./logs/${nbtfile} & kill \$(pgrep sar) & kill \$(pgrep free)" & 32 | done 33 | wait 34 | 35 | # get worker machines' files in parallel, with compression to speed things up 36 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do 37 | rsync -az ${CLUSTER_NAME}${i}:"$dir"/logs/${logname}_${i}_*.txt ./logs/ & 38 | done 39 | wait -------------------------------------------------------------------------------- /benchmark/common/bench-init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Initiate data logging/collection at the master and all worker machines. 4 | 5 | if [ $# -ne 1 ]; then 6 | echo "usage: $0 log-name-prefix" 7 | exit -1 8 | fi 9 | 10 | source "$(dirname "${BASH_SOURCE[0]}")"/get-hosts.sh 11 | 12 | logname=$1 13 | dir=$PWD 14 | 15 | for ((i = 0; i <= ${NUM_MACHINES}; i++)); do 16 | cpufile=${logname}_${i}_cpu.txt # cpu usage 17 | netfile=${logname}_${i}_net.txt # network usage 18 | memfile=${logname}_${i}_mem.txt # memory usage 19 | nbtfile=${logname}_${i}_nbt.txt # network bytes total 20 | 21 | # special case for master, to make it work for local testing too 22 | if [ $i -eq 0 ]; then 23 | name=${HOSTNAME} 24 | else 25 | name=${CLUSTER_NAME}${i} 26 | fi 27 | 28 | # 1. Change to the same directory as master. 29 | # 2. Start sysstat for cpu and network usage, and free for memory usage (1s intervals). 30 | # 3. Print initial network bytes. 31 | # 32 | # NOTE: - & is like variant of ;, so don't need both 33 | # - grep needs stdbuf correction, otherwise nothing shows up 34 | ssh ${name} "cd \"$dir\"; sar 1 > ./logs/${cpufile} & free -s 1 | stdbuf -o0 grep + > ./logs/${memfile} & sar -n DEV 1 | stdbuf -o0 grep 'lo\|eth0' > ./logs/${netfile} & cat /proc/net/dev > ./logs/${nbtfile}" & 35 | done 36 | wait -------------------------------------------------------------------------------- /benchmark/common/cleanup-bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Cleans up rogue stat programs created by bench-init, 4 | # in the event that bench-finish was unable to run. 5 | # 6 | # Alternatively, one can run bench-finish by passing in 7 | # the correct log name prefix to clean things up and get 8 | # the worker machines' (incomplete) logs. 9 | 10 | source "$(dirname "${BASH_SOURCE[0]}")"/get-hosts.sh 11 | 12 | for ((i = 0; i <= ${NUM_MACHINES}; i++)); do 13 | # special case for master, to make it work for local testing too 14 | if [ $i -eq 0 ]; then 15 | name=${HOSTNAME} 16 | else 17 | name=${CLUSTER_NAME}${i} 18 | fi 19 | 20 | ssh ${name} "kill \$(pgrep sar) & kill \$(pgrep free)" & 21 | done 22 | wait -------------------------------------------------------------------------------- /benchmark/common/get-configs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Specifies system-specific configuration parameters 4 | # used by the various scripts. 5 | # 6 | # NOTE: include/source using "$(dirname "${BASH_SOURCE[0]}")" 7 | # as a part of the directory. 
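# e.g., source "$(dirname "${BASH_SOURCE[0]}")"/get-configs.sh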
8 | 9 | # maximum JVM heap size for Giraph (per machine) 10 | # NOTE: to put changes into effect without re-initializing everything (i.e., ../init-all.sh), 11 | # run ../hadoop/init.sh; ../hadoop/restart-hadoop.sh 1 12 | GIRAPH_XMX=14500M 13 | 14 | # maximum JVM heap size for GPS (per WORKER, not machine) 15 | GPS_WORKER_XMX=7250M 16 | # max JVM heap size for GPS master 17 | GPS_MASTER_XMX=4096M 18 | 19 | 20 | # number of compute/input/output threads per machine 21 | GIRAPH_THREADS=2 22 | 23 | # number of workers per machine (WPM) 24 | GPS_WPM=2 25 | MIZAN_WPM=2 # NOTE: re-run premizan if this is changed -------------------------------------------------------------------------------- /benchmark/common/get-dirs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Specifies the absolute paths of the systems and other things. 4 | # 5 | # If path has spaces, escape the spaces AND quote it. For example, 6 | # SOME_DIR="/home/me/not\ a\ great\ folder\ name/". 7 | # 8 | # NOTE: if the including script will be included in other 9 | # scripts, use "$(dirname "${BASH_SOURCE[0]}")" as a part 10 | # of the directory. 11 | 12 | DIR_PREFIX=/home/ubuntu 13 | #DIR_PREFIX=/home/young/cs848 # for testing on a single machine 14 | 15 | # location of datasets/input graphs 16 | DATASET_DIR="$DIR_PREFIX"/datasets/ 17 | 18 | # $JAVA_DIR/bin/java should be the Java binary that is 19 | # used by all systems (incl. Hadoop) that need Java 20 | JAVA_DIR="$DIR_PREFIX"/jdk1.6.0_30/ 21 | 22 | # HADOOP_DATA is where HDFS files and Hadoop logs are stored 23 | HADOOP_DIR="$DIR_PREFIX"/hadoop-1.0.4/ 24 | HADOOP_DATA_DIR="$DIR_PREFIX"/hadoop_data/ 25 | 26 | GIRAPH_DIR="$DIR_PREFIX"/giraph-1.0.0/ 27 | 28 | # These must match "GPS_DIR" and "GPS_LOG_DIRECTORY" of $GPS_DIR/conf/gps-env.sh 29 | GPS_DIR="$DIR_PREFIX"/gps-rev-110/ 30 | GPS_LOG_DIR="$DIR_PREFIX"/var/tmp/ 31 | 32 | GRAPHLAB_DIR="$DIR_PREFIX"/graphlab-2a063b3829/ 33 | MIZAN_DIR="$DIR_PREFIX"/Mizan-0.1bu1/ -------------------------------------------------------------------------------- /benchmark/common/ssh-check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Simple script to check if worker machines can be ssh'd to. 4 | 5 | cd "$(dirname "${BASH_SOURCE[0]}")" 6 | source ./get-hosts.sh 7 | source ./get-dirs.sh 8 | 9 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do 10 | nc -v -w 1 ${CLUSTER_NAME}${i} -z 22 11 | done 12 | -------------------------------------------------------------------------------- /benchmark/datasets/Makefile: -------------------------------------------------------------------------------- 1 | all: snap-convert snap-revert mst-convert 2 | 3 | clean: 4 | rm -f snap-convert 5 | rm -f snap-revert 6 | rm -f mst-convert 7 | 8 | snap-convert: snap-convert.cpp 9 | g++ -Wall snap-convert.cpp -o snap-convert 10 | 11 | snap-revert: snap-revert.cpp 12 | g++ -Wall snap-revert.cpp -o snap-revert 13 | 14 | mst-convert: mst-convert.cpp 15 | g++ -Wall mst-convert.cpp -o mst-convert -------------------------------------------------------------------------------- /benchmark/datasets/convert-adj.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # second arg is 1 if graph is for MST (SNAP format w/ edge weights) 4 | # and 0 otherwise (regular SNAP format) 5 | if [ $# -ne 2 ]; then 6 | echo "usage: $0 input-graph do-mst?" 
7 | echo "" 8 | echo "do-mst: 0 converts regular SNAP format (src dst)" 9 | echo " 1 converts SNAP with edge weights (src dst weight)" 10 | exit -1 11 | fi 12 | 13 | scriptdir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 14 | graph=$(echo "$1" | sed 's/.txt$//g') 15 | domst=$2 16 | 17 | if [[ ! -f "${graph}.txt" ]]; then 18 | echo "${graph}.txt does not exist." 19 | exit -1 20 | fi 21 | 22 | if [[ -f "${graph}-adj.txt" ]]; then 23 | echo "${graph}-adj.txt already exists. Delete it first." 24 | exit -1 25 | fi 26 | 27 | # convert graph to adjacency format 28 | echo "Converting ${graph}.txt to adjacency format..." 29 | if [[ ${domst} -eq 1 ]]; then 30 | "${scriptdir}"/snap-convert "${graph}.txt" "${graph}-adj.txt" 2 2 31 | else 32 | "${scriptdir}"/snap-convert "${graph}.txt" "${graph}-adj.txt" 1 1 33 | fi 34 | 35 | echo "Done!" -------------------------------------------------------------------------------- /benchmark/datasets/convert-mst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Converts a SNAP graph input into an undirected graph 4 | # with unique edge weights. Output is in SNAP format, 5 | # with an additional column for weights. 6 | # 7 | # Processor and memory arguments below are used for sort. 8 | procs=$(nproc) 9 | mem=4G 10 | 11 | if [ $# -ne 1 ]; then 12 | echo "usage: $0 input-graph" 13 | exit -1 14 | fi 15 | 16 | scriptdir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 17 | graph=$(echo "$1" | sed 's/.txt$//g') 18 | 19 | if [[ ! -f "${graph}.txt" ]]; then 20 | echo "${graph}.txt does not exist." 21 | exit -1 22 | fi 23 | 24 | if [[ -f "${graph}-mst.txt" ]]; then 25 | echo "${graph}-mst.txt already exists. Delete it first." 26 | exit -1 27 | fi 28 | 29 | # sort the input, if it's not already sorted 30 | unsorted=$(sort -nk1 -nk2 --parallel=${procs} -S ${mem} -c "${graph}.txt" |& wc -l) 31 | 32 | if [[ ${unsorted} -eq 0 ]]; then 33 | echo "Input already sorted." 34 | sortedgraph="$graph" 35 | else 36 | echo "Sorting input..." 37 | sort -nk1 -nk2 --parallel=${procs} -S ${mem} "${graph}.txt" > "${graph}-sorted.txt" 38 | sortedgraph="${graph}-sorted" 39 | 40 | echo "Delete unsorted input?" 41 | rm -i "${graph}.txt" 42 | fi 43 | 44 | echo "Converting ${graph}.txt to MST format..." 45 | 46 | "${scriptdir}"/mst-convert "${sortedgraph}.txt" "${graph}-mst-unsorted.txt" 47 | 48 | # sort the output 49 | echo "Sorting output..." 50 | sort -nk1 -nk2 --parallel=${procs} -S ${mem} "${graph}-mst-unsorted.txt" > "${graph}-mst.txt" 51 | 52 | rm -f "${graph}-mst-unsorted.txt" 53 | 54 | echo "Done!" -------------------------------------------------------------------------------- /benchmark/datasets/load-files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Loads the input data, based on the cluster size. 4 | # 5 | # The size can be specified as an argument. Otherwise, 6 | # it will be obtained based on ../common/get-hosts.sh. 
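# e.g., "./load-files.sh 2" uploads the size-2 datasets (livejournal,
# orkut, arabic, twitter) to HDFS under ./input/.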
7 | 8 | commondir=$(dirname "${BASH_SOURCE[0]}")/../common 9 | source "$commondir"/get-dirs.sh 10 | source "$commondir"/get-hosts.sh 11 | 12 | if [ $# -eq 0 ]; then 13 | case ${NUM_MACHINES} in 14 | 4) size=1;; 15 | 8) size=1;; 16 | 16) size=2;; 17 | 32) size=2;; 18 | 64) size=3;; 19 | 128) size=3;; 20 | *) echo "Invalid number of machines."; 21 | echo "usage: $0 size"; 22 | echo ""; 23 | echo "size: 1 for amazon, google, patents"; 24 | echo " 2 for livejournal, orkut, arabic, twitter"; 25 | echo " 3 for livejournal, orkut, arabic, twitter, uk0705"; 26 | exit -1;; 27 | esac 28 | else 29 | size=$1 30 | fi 31 | 32 | cd "$DATASET_DIR" 33 | 34 | hadoop dfsadmin -safemode wait > /dev/null 35 | hadoop dfs -mkdir ./input || true # no problem if it already exists 36 | 37 | case ${size} in 38 | 1) echo "Uploading amazon*.txt..."; hadoop dfs -put amazon*.txt ./input/; 39 | echo "Uploading google*.txt..."; hadoop dfs -put google*.txt ./input/; 40 | echo "Uploading patents*.txt..."; hadoop dfs -put patents*.txt ./input/;; 41 | 2) echo "Uploading livejournal*.txt..."; hadoop dfs -put livejournal*.txt ./input/; 42 | echo "Uploading orkut*.txt..."; hadoop dfs -put orkut*.txt ./input/; 43 | echo "Uploading arabic*.txt..."; hadoop dfs -put arabic*.txt ./input/; 44 | echo "Uploading twitter-adj.txt..."; hadoop dfs -put twitter-adj.txt ./input/;; 45 | 3) echo "Uploading livejournal*.txt..."; hadoop dfs -put livejournal*.txt ./input/; 46 | echo "Uploading orkut*.txt..."; hadoop dfs -put orkut*.txt ./input/; 47 | echo "Uploading arabic*.txt..."; hadoop dfs -put arabic*.txt ./input/; 48 | echo "Uploading twitter*.txt..."; hadoop dfs -put twitter*.txt ./input/; 49 | echo "Uploading uk0705-adj.txt..."; hadoop dfs -put uk0705-adj.txt ./input/; 50 | echo "Uploading uk0705-mst-adj.txt..."; hadoop dfs -put uk0705-mst-adj.txt ./input/;; 51 | *) echo "Invalid size"; exit -1;; 52 | esac 53 | 54 | echo "Done." -------------------------------------------------------------------------------- /benchmark/datasets/load-splits.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Loads split input data, based on the cluster size. 4 | # 5 | # The size can be specified as an argument. Otherwise, 6 | # it will be obtained based on ../common/get-hosts.sh. 
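# Unlike load-files.sh, this first splits each graph into one chunk per
# machine (via split-input.sh) and then uploads the *-split/ directories.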
7 | 8 | commondir=$(dirname "${BASH_SOURCE[0]}")/../common 9 | scriptdir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 10 | source "$commondir"/get-dirs.sh 11 | source "$commondir"/get-hosts.sh 12 | 13 | if [ $# -eq 0 ]; then 14 | case ${NUM_MACHINES} in 15 | 4) size=1;; 16 | 8) size=1;; 17 | 16) size=2;; 18 | 32) size=2;; 19 | 64) size=3;; 20 | 128) size=3;; 21 | *) echo "Invalid number of machines."; 22 | echo "usage: $0 size"; 23 | echo ""; 24 | echo "size: 1 for amazon, google, patents"; 25 | echo " 2 for livejournal, orkut, arabic, twitter"; 26 | echo " 3 for livejournal, orkut, arabic, twitter, uk0705"; 27 | exit -1;; 28 | esac 29 | else 30 | size=$1 31 | fi 32 | 33 | cd "$DATASET_DIR" 34 | 35 | hadoop dfsadmin -safemode wait > /dev/null 36 | hadoop dfs -mkdir ./input || true # no problem if it already exists 37 | 38 | case ${size} in 39 | 1) "${scriptdir}"/split-input.sh amazon-adj.txt ${NUM_MACHINES}; 40 | "${scriptdir}"/split-input.sh google-adj.txt ${NUM_MACHINES}; 41 | "${scriptdir}"/split-input.sh patents-adj.txt ${NUM_MACHINES};; 42 | 2) "${scriptdir}"/split-input.sh livejournal-adj.txt ${NUM_MACHINES}; 43 | "${scriptdir}"/split-input.sh orkut-adj.txt ${NUM_MACHINES}; 44 | "${scriptdir}"/split-input.sh arabic-adj.txt ${NUM_MACHINES}; 45 | "${scriptdir}"/split-input.sh twitter-adj.txt ${NUM_MACHINES};; 46 | 3) "${scriptdir}"/split-input.sh livejournal-adj.txt ${NUM_MACHINES}; 47 | "${scriptdir}"/split-input.sh orkut-adj.txt ${NUM_MACHINES}; 48 | "${scriptdir}"/split-input.sh arabic-adj.txt ${NUM_MACHINES}; 49 | "${scriptdir}"/split-input.sh twitter-adj.txt ${NUM_MACHINES}; 50 | "${scriptdir}"/split-input.sh uk0705-adj.txt ${NUM_MACHINES};; 51 | *) echo "Invalid size"; exit -1;; 52 | esac 53 | 54 | case ${size} in 55 | 1) echo "Uploading amazon-adj-split/..."; hadoop dfs -put amazon-adj-split/ ./input/; 56 | echo "Uploading google-adj-split/..."; hadoop dfs -put google-adj-split/ ./input/; 57 | echo "Uploading patents-adj-split/..."; hadoop dfs -put patents-adj-split/ ./input/;; 58 | 2) echo "Uploading livejournal-adj-split/..."; hadoop dfs -put livejournal-adj-split/ ./input/; 59 | echo "Uploading orkut-adj-split/..."; hadoop dfs -put orkut-adj-split/ ./input/; 60 | echo "Uploading arabic-adj-split/..."; hadoop dfs -put arabic-adj-split/ ./input/; 61 | echo "Uploading twitter-adj-split/..."; hadoop dfs -put twitter-adj-split/ ./input/;; 62 | 3) echo "Uploading livejournal-adj-split/..."; hadoop dfs -put livejournal-adj-split/ ./input/; 63 | echo "Uploading orkut-adj-split/..."; hadoop dfs -put orkut-adj-split/ ./input/; 64 | echo "Uploading arabic-adj-split/..."; hadoop dfs -put arabic-adj-split/ ./input/; 65 | echo "Uploading twitter-adj-split/..."; hadoop dfs -put twitter-adj-split/ ./input/; 66 | echo "Uploading uk0705-adj-split/..."; hadoop dfs -put uk0705-adj-split/ ./input/;; 67 | *) echo "Invalid size"; exit -1;; 68 | esac 69 | 70 | echo "Done." 
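For concreteness, here is how a tiny graph moves through the converters that follow (sample data only, not from the repo). A SNAP edge list such as

0 1
0 2
1 2

converts to adjacency format (out-format 1) as

0 1 2
1 2

and to JSON (out-format 3) as

[0,0,[[1,0],[2,0]]]
[1,0,[[2,0]]]

where vertex values default to 0 and unweighted input gets edge weight 0.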
-------------------------------------------------------------------------------- /benchmark/datasets/snap-convert.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define F_IN_SNAP 1 8 | #define F_IN_SNAPWEIGHT 2 9 | #define F_IN_GEN_UNITY 3 10 | #define F_IN_GEN_SEQ 4 11 | 12 | #define F_TO_ADJ 1 13 | #define F_TO_ADJWEIGHT 2 14 | #define F_TO_JSON 3 15 | 16 | static long counter = 1; 17 | 18 | static void usage(char **argv) { 19 | std::cout << "usage: " << argv[0] << " input-file output-file in-format out-format" << std::endl; 20 | std::cout << std::endl; 21 | std::cout << "in-format: 1. SNAP format (each line is: src dst)" << std::endl; 22 | std::cout << " 2. SNAP with weights (src dst weight)" << std::endl; 23 | std::cout << " 3. Same as 1, but output edge weights of 1." << std::endl; 24 | std::cout << " 4. Same as 1, but output unique sequential edge weights." << std::endl; 25 | std::cout << " (i.e., weights are assigned sequentially in the order" << std::endl; 26 | std::cout << " of how edges are listed in the input file)" << std::endl; 27 | std::cout << std::endl; 28 | std::cout << "out-format: 1. Adjacency list format (src dst1 dst2 ...)" << std::endl; 29 | std::cout << " 2. Adjacency list with weights (src dst1 weight1 dst2 weight2 ...)" << std::endl; 30 | std::cout << " 3. JSON ([src,0,[[dst1,weight1],[dst2,weight2],...]])" << std::endl; 31 | std::cout << std::endl; 32 | std::cout << "Note: edges with the same source ID must appear in a contiguous block!" << std::endl; 33 | std::cout << " e.g., 1 0 but NOT 1 0" << std::endl; 34 | std::cout << " 1 2 2 3" << std::endl; 35 | std::cout << " 2 3 1 2" << std::endl; 36 | } 37 | 38 | static inline void get_edge_weight(std::ifstream &ifs, int in_format, long &edge_weight) { 39 | switch (in_format) { 40 | case F_IN_SNAP: 41 | edge_weight = 0; 42 | break; 43 | 44 | case F_IN_SNAPWEIGHT: 45 | ifs >> edge_weight; 46 | break; 47 | 48 | case F_IN_GEN_UNITY: 49 | edge_weight = 1; 50 | break; 51 | 52 | case F_IN_GEN_SEQ: 53 | edge_weight = counter; 54 | counter++; 55 | break; 56 | 57 | default: 58 | std::cout << "Invalid in-format: " << in_format << "!" << std::endl; 59 | } 60 | } 61 | 62 | /** 63 | * Converts dataset/graph input formats. 64 | * 65 | * NOTE: Does not sort anything! 66 | */ 67 | int main(int argc, char **argv) { 68 | if ( argc < 5 ) { 69 | usage(argv); 70 | return -1; 71 | } 72 | 73 | std::ifstream ifs(argv[1], std::ifstream::in); 74 | std::ofstream ofs(argv[2], std::ofstream::out); 75 | int in_format = atoi(argv[3]); 76 | int out_format = atoi(argv[4]); 77 | 78 | if (!ifs || !ofs || 79 | (in_format < F_IN_SNAP || in_format > F_IN_GEN_SEQ) || 80 | (out_format < F_TO_ADJ || out_format > F_TO_JSON) ) { 81 | usage(argv); 82 | return -1; 83 | } 84 | 85 | std::cout.sync_with_stdio(false); // don't flush on \n 86 | 87 | // longs, just to be safe 88 | long vertex_id, edge_dst, edge_weight; 89 | long curr_id; 90 | 91 | // first pair of reads 92 | ifs >> curr_id; 93 | ifs >> edge_dst; 94 | get_edge_weight(ifs, in_format, edge_weight); 95 | 96 | // NOTE: eof() DOES happen to work here, b/c inner while(ifs >> ...) 97 | // statement breaks when no data is left *and* this failure sets 98 | // EOF flag correctly & in time for eof() to see 99 | switch (out_format) { 100 | case F_TO_ADJ: 101 | while (!ifs.eof()) { 102 | // format: vertex-id edge-dst ... 
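// Read-ahead pattern: the inner while consumes (vertex_id, edge_dst)
// pairs until the source ID changes; the pair that terminated the
// loop belongs to the NEXT vertex, so it is carried over into the
// next outer-loop iteration (hence the contiguity requirement noted
// in usage()).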
103 | ofs << curr_id << " " << edge_dst; 104 | 105 | while (ifs >> vertex_id >> edge_dst) { 106 | get_edge_weight(ifs, in_format, edge_weight); 107 | if (vertex_id != curr_id) { 108 | break; 109 | } 110 | 111 | ofs << " " << edge_dst; 112 | } 113 | 114 | ofs << "\n"; 115 | 116 | // new vertex_id found. carry over edge_dst and edge_weight too. 117 | curr_id = vertex_id; 118 | } 119 | break; 120 | 121 | case F_TO_ADJWEIGHT: 122 | while (!ifs.eof()) { 123 | // format: vertex-id edge-dst edge-val ... 124 | ofs << curr_id << " " << edge_dst << " " << edge_weight; 125 | 126 | while (ifs >> vertex_id >> edge_dst) { 127 | get_edge_weight(ifs, in_format, edge_weight); 128 | if (vertex_id != curr_id) { 129 | break; 130 | } 131 | 132 | ofs << " " << edge_dst << " " << edge_weight; 133 | } 134 | 135 | ofs << "\n"; 136 | 137 | // new vertex_id found. carry over edge_dst and edge_weight too. 138 | curr_id = vertex_id; 139 | } 140 | break; 141 | 142 | case F_TO_JSON: 143 | while (!ifs.eof()) { 144 | // format: [vertex-id, vertex-val, [[edge-dst,edge-val],...]] 145 | ofs << "[" << curr_id << ",0,[[" << edge_dst << "," << edge_weight << "]"; 146 | 147 | while (ifs >> vertex_id >> edge_dst) { 148 | get_edge_weight(ifs, in_format, edge_weight); 149 | if (vertex_id != curr_id) { 150 | break; 151 | } 152 | 153 | ofs << ",[" << edge_dst << "," << edge_weight << "]"; 154 | } 155 | 156 | ofs << "]]\n"; 157 | 158 | // new vertex_id found. carry over edge_dst and edge_weight too. 159 | curr_id = vertex_id; 160 | } 161 | break; 162 | 163 | default: 164 | std::cout << "Invalid out-format: " << out_format << "!" << std::endl; 165 | } 166 | 167 | ifs.close(); 168 | ofs.flush(); 169 | ofs.close(); 170 | return 0; 171 | } 172 | -------------------------------------------------------------------------------- /benchmark/datasets/snap-revert.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define F_IN_ADJ 1 8 | #define F_IN_ADJWEIGHT 2 9 | 10 | #define F_TO_SNAP 1 11 | #define F_TO_SNAPWEIGHT 2 12 | 13 | static void usage(char **argv) { 14 | std::cout << "usage: " << argv[0] << " input-file output-file in-format out-format" << std::endl; 15 | std::cout << std::endl; 16 | std::cout << "in-format: 1. Adjacency list format (src dst1 dst2 ...)" << std::endl; 17 | std::cout << " 2. Adjacency list with weights (src dst1 weight1 dst2 weight2 ...)" << std::endl; 18 | std::cout << std::endl; 19 | std::cout << "out-format: 1. SNAP format (src dst)" << std::endl; 20 | std::cout << " 2. SNAP with weights (src dst weight)" << std::endl; 21 | } 22 | 23 | 24 | static inline void write_output(std::ofstream &ofs, int out_format, 25 | long vertex_id, long edge_dst, long edge_weight) { 26 | switch(out_format) { 27 | case F_TO_SNAP: 28 | ofs << vertex_id << " " << edge_dst << "\n"; 29 | break; 30 | 31 | case F_TO_SNAPWEIGHT: 32 | ofs << vertex_id << " " << edge_dst << " " << edge_weight << "\n"; 33 | break; 34 | 35 | default: 36 | std::cout << "Invalid out-format: " << out_format << "!" << std::endl; 37 | } 38 | } 39 | 40 | 41 | /** 42 | * Converts adjacency format to SNAP. 43 | * 44 | * NOTE: Does not sort anything! 
45 |  */
46 | int main(int argc, char **argv) {
47 |     if ( argc < 5 ) {
48 |         usage(argv);
49 |         return -1;
50 |     }
51 | 
52 |     std::ifstream ifs(argv[1], std::ifstream::in);
53 |     std::ofstream ofs(argv[2], std::ofstream::out);
54 |     int in_format = atoi(argv[3]);
55 |     int out_format = atoi(argv[4]);
56 | 
57 |     if (!ifs || !ofs ||
58 |         (in_format < F_IN_ADJ || in_format > F_IN_ADJWEIGHT) ||
59 |         (out_format < F_TO_SNAP || out_format > F_TO_SNAPWEIGHT)) {
60 |         usage(argv);
61 |         return -1;
62 |     }
63 | 
64 |     std::cout.sync_with_stdio(false); // don't flush on \n
65 | 
66 |     // longs, just to be safe
67 |     long vertex_id, edge_dst, edge_weight;
68 | 
69 |     switch (in_format) {
70 |     case F_IN_ADJ:
71 |         while (ifs >> vertex_id) {
72 |             while ( (ifs.peek() != '\n') && (ifs >> edge_dst) ) {
73 |                 write_output(ofs, out_format, vertex_id, edge_dst, 0);
74 |             }
75 |         }
76 |         break;
77 | 
78 |     case F_IN_ADJWEIGHT:
79 |         while (ifs >> vertex_id) {
80 |             while ( (ifs.peek() != '\n') && (ifs >> edge_dst && ifs >> edge_weight) ) {
81 |                 write_output(ofs, out_format, vertex_id, edge_dst, edge_weight);
82 |             }
83 |         }
84 |         break;
85 | 
86 |     default:
87 |         std::cout << "Invalid in-format: " << in_format << "!" << std::endl;
88 |     }
89 | 
90 |     ifs.close();
91 |     ofs.flush();
92 |     ofs.close();
93 |     return 0;
94 | }
95 | 
-------------------------------------------------------------------------------- /benchmark/datasets/split-input.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | # Split given input-graph into parts, placed in input-graph-split/
4 | 
5 | if [ $# -ne 2 ]; then
6 |     echo "usage: $0 input-graph num-splits"
7 |     exit -1
8 | fi
9 | 
10 | graph=$(echo "$1" | sed 's/\.txt$//')
11 | numsplits=$2
12 | 
13 | if [[ ! -f "${graph}.txt" ]]; then
14 |     echo "${graph}.txt does not exist."
15 |     exit -1
16 | fi
17 | 
18 | if [[ $2 -le 0 ]]; then
19 |     echo "Invalid number of chunks."
20 |     exit -1
21 | fi
22 | 
23 | if [[ -d "${graph}-split" ]]; then
24 |     echo "${graph}-split/ already exists. Delete it first."
25 |     exit -1
26 | fi
27 | 
28 | # split input into specified chunks
29 | mkdir "${graph}-split"
30 | 
31 | echo "Splitting ${graph}.txt..."
32 | split "${graph}.txt" "${graph}-split/${graph}-" -n l/${numsplits}
33 | 
34 | echo "Done!"
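# Example (hypothetical file name): "./split-input.sh google.txt 8" creates
# google-split/ holding 8 line-based chunks google-aa ... google-ah
# (suffix naming assumes GNU coreutils split defaults).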
-------------------------------------------------------------------------------- /benchmark/giraph/benchall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 2 ]; then 4 | echo "usage: $0 machines runs" 5 | echo "" 6 | echo "machines: 4, 8, 16, 32, 64, or 128" 7 | exit -1 8 | fi 9 | 10 | cd "$(dirname "${BASH_SOURCE[0]}")" 11 | 12 | MACHINES=$1 13 | RUNS=$2 14 | 15 | case ${MACHINES} in 16 | 4) GRAPHS=(amazon google patents); 17 | GRAPHS_MST=(amazon google patents); 18 | GRAPHS_MST_HASH=(amazon google patents); 19 | SRC=(0 0 6009554);; # for SSSP 20 | 8) GRAPHS=(amazon google patents); 21 | GRAPHS_MST=(amazon google patents); 22 | GRAPHS_MST_HASH=(amazon google patents); 23 | SRC=(0 0 6009554);; 24 | 16) GRAPHS=(livejournal orkut arabic twitter); 25 | GRAPHS_MST=(livejournal orkut arabic); 26 | GRAPHS_MST_HASH=(livejournal orkut); 27 | SRC=(0 1 3 0);; 28 | 32) GRAPHS=(livejournal orkut arabic twitter); 29 | GRAPHS_MST=(livejournal orkut arabic); 30 | GRAPHS_MST_HASH=(livejournal orkut arabic); 31 | SRC=(0 1 3 0);; 32 | 64) GRAPHS=(livejournal orkut arabic twitter uk0705); 33 | GRAPHS_MST=(livejournal orkut arabic); 34 | GRAPHS_MST_HASH=(livejournal orkut arabic twitter); 35 | SRC=(0 1 3 0 0);; 36 | 128) GRAPHS=(livejournal orkut arabic twitter uk0705); 37 | GRAPHS_MST=(livejournal orkut arabic uk0705); 38 | GRAPHS_MST_HASH=(livejournal orkut arabic twitter); 39 | SRC=(0 1 3 0 0);; 40 | *) echo "Invalid machines"; exit -1;; 41 | esac 42 | 43 | ################## 44 | # Byte array run 45 | ################## 46 | # we split the algs up for clarity 47 | for graph in "${GRAPHS[@]}"; do 48 | for ((i = 1; i <= RUNS; i++)); do 49 | ./pagerank.sh "${graph}-adj.txt" ${MACHINES} 0 50 | done 51 | done 52 | 53 | for j in "${!GRAPHS[@]}"; do 54 | for ((i = 1; i <= RUNS; i++)); do 55 | ./sssp.sh "${GRAPHS[$j]}-adj.txt" ${MACHINES} 0 ${SRC[$j]} 56 | done 57 | done 58 | 59 | for graph in "${GRAPHS[@]}"; do 60 | for ((i = 1; i <= RUNS; i++)); do 61 | ./wcc.sh "${graph}-adj.txt" ${MACHINES} 0 62 | done 63 | done 64 | 65 | # WARNING: this can be VERY slow for large graphs!! 
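# (For a sense of scale: each pass of the loop below runs, e.g.,
# "./mst.sh livejournal-mst-adj.txt 16 0" when MACHINES=16.)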
66 | for graph in "${GRAPHS_MST[@]}"; do 67 | for ((i = 1; i <= RUNS; i++)); do 68 | ./mst.sh "${graph}-mst-adj.txt" ${MACHINES} 0 69 | done 70 | done 71 | 72 | #for graph in "${GRAPHS[@]}"; do 73 | # for ((i = 1; i <= RUNS; i++)); do 74 | # ./dimest.sh "${graph}-adj.txt" ${MACHINES} 0 75 | # done 76 | #done 77 | 78 | 79 | ##################### 80 | # Hash map run 81 | ##################### 82 | for graph in "${GRAPHS[@]}"; do 83 | for ((i = 1; i <= RUNS; i++)); do 84 | ./pagerank.sh "${graph}-adj.txt" ${MACHINES} 1 85 | done 86 | done 87 | 88 | for j in "${!GRAPHS[@]}"; do 89 | for ((i = 1; i <= RUNS; i++)); do 90 | ./sssp.sh "${GRAPHS[$j]}-adj.txt" ${MACHINES} 1 ${SRC[$j]} 91 | done 92 | done 93 | 94 | for graph in "${GRAPHS[@]}"; do 95 | for ((i = 1; i <= RUNS; i++)); do 96 | ./wcc.sh "${graph}-adj.txt" ${MACHINES} 1 97 | done 98 | done 99 | 100 | for graph in "${GRAPHS_MST_HASH[@]}"; do 101 | for ((i = 1; i <= RUNS; i++)); do 102 | ./mst.sh "${graph}-mst-adj.txt" ${MACHINES} 1 103 | done 104 | done 105 | 106 | #for graph in "${GRAPHS[@]}"; do 107 | # for ((i = 1; i <= RUNS; i++)); do 108 | # ./dimest.sh "${graph}-adj.txt" ${MACHINES} 1 109 | # done 110 | #done -------------------------------------------------------------------------------- /benchmark/giraph/dimest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "usage: $0 input-graph machines edge-type" 5 | echo "" 6 | echo "edge-type: 0 for byte array edges" 7 | echo " 1 for hash map edges" 8 | exit -1 9 | fi 10 | 11 | source ../common/get-dirs.sh 12 | source ../common/get-configs.sh 13 | 14 | # place input in /user/${USER}/input/ 15 | # output is in /user/${USER}/giraph-output/ 16 | inputgraph=$(basename $1) 17 | outputdir=/user/${USER}/giraph-output/ 18 | hadoop dfs -rmr "$outputdir" || true 19 | 20 | # Technically this is the number of "workers", which can be more 21 | # than the number of machines. However, using multiple workers per 22 | # machine is inefficient! Use more Giraph threads instead (see below). 
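# Example invocation (hypothetical input name; byte array edges):
#   ./dimest.sh amazon-adj.txt 8 0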
23 | machines=$2
24 | 
25 | edgetype=$3
26 | case ${edgetype} in
27 |     0) edgeclass="";; # byte array edges are used by default
28 |     1) edgeclass="-Dgiraph.inputOutEdgesClass=org.apache.giraph.edge.HashMapEdges \
29 |        -Dgiraph.outEdgesClass=org.apache.giraph.edge.HashMapEdges";;
30 |     *) echo "Invalid edge-type"; exit -1;;
31 | esac
32 | 
33 | ## log names
34 | logname=dimest_${inputgraph}_${machines}_${edgetype}_"$(date +%Y%m%d-%H%M%S)"
35 | logfile=${logname}_time.txt # running time
36 | 
37 | 
38 | ## start logging memory + network usage
39 | ../common/bench-init.sh ${logname}
40 | 
41 | ## start algorithm run
42 | hadoop jar "$GIRAPH_DIR"/giraph-examples/target/giraph-examples-1.0.0-for-hadoop-1.0.2-jar-with-dependencies.jar org.apache.giraph.GiraphRunner \
43 |     ${edgeclass} \
44 |     -Dgiraph.numComputeThreads=${GIRAPH_THREADS} \
45 |     -Dgiraph.numInputThreads=${GIRAPH_THREADS} \
46 |     -Dgiraph.numOutputThreads=${GIRAPH_THREADS} \
47 |     org.apache.giraph.examples.DiameterEstimationVertex \
48 |     -ca DiameterEstimationVertex.maxSS=30 \
49 |     -vif org.apache.giraph.examples.DiameterEstimationInputFormat \
50 |     -vip /user/${USER}/input/${inputgraph} \
51 |     -of org.apache.giraph.examples.DiameterEstimationVertex\$DiameterEstimationVertexOutputFormat \
52 |     -op "$outputdir" \
53 |     -w ${machines} 2>&1 | tee -a ./logs/${logfile}
54 | 
55 | ## finish logging memory + network usage
56 | ../common/bench-finish.sh ${logname}
57 | 
58 | ## clean up step needed for Giraph
59 | ./kill-java-job.sh
-------------------------------------------------------------------------------- /benchmark/giraph/kill-java-job.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Kill all Java instances corresponding to Giraph jobs.
4 | # This is needed as they don't terminate automatically (they hang around consuming memory).
5 | #
6 | # NOTE: this will kill ALL jobs, including ongoing ones!
7 | #
8 | # To clear jobs that were killed here but still show up as "running" in the
9 | # Hadoop web interface, use "hadoop job -kill job_yyyymmddhhmm_aaaa"
10 | 
11 | source "$(dirname "${BASH_SOURCE[0]}")"/../common/get-hosts.sh
12 | 
13 | # do a kill on the master separately---this is useful when testing on a single machine
14 | kill -9 $(ps aux | grep "[j]obcache/job_[0-9]\{12\}_[0-9]\{4\}/" | awk '{print $2}')
15 | 
16 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do
17 |     # [j] is a nifty trick to avoid "grep" showing up as a result
18 |     ssh ${CLUSTER_NAME}$i "kill -9 \$(ps aux | grep \"[j]obcache/job_[0-9]\{12\}_[0-9]\{4\}/\" | awk '{print \$2}')" &
19 | done
20 | wait
-------------------------------------------------------------------------------- /benchmark/giraph/mst.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | if [ $# -ne 3 ]; then
4 |     echo "usage: $0 input-graph machines edge-type"
5 |     echo ""
6 |     echo "edge-type: 0 for byte array edges"
7 |     echo "           1 for hash map edges"
8 |     exit -1
9 | fi
10 | 
11 | source ../common/get-dirs.sh
12 | source ../common/get-configs.sh
13 | 
14 | # place input in /user/${USER}/input/
15 | # output is in /user/${USER}/giraph-output/
16 | inputgraph=$(basename $1)
17 | outputdir=/user/${USER}/giraph-output/
18 | hadoop dfs -rmr "$outputdir" || true
19 | 
20 | # Technically this is the number of "workers", which can be more
21 | # than the number of machines. However, using multiple workers per
22 | # machine is inefficient! Use more Giraph threads instead (see below).
23 | machines=$2 24 | 25 | edgetype=$3 26 | case ${edgetype} in 27 | 0) edgeclass="";; # byte array edges are used by default 28 | 1) edgeclass="-Dgiraph.inputOutEdgesClass=org.apache.giraph.edge.HashMapEdges \ 29 | -Dgiraph.outEdgesClass=org.apache.giraph.edge.HashMapEdges";; 30 | *) echo "Invalid edge-type"; exit -1;; 31 | esac 32 | 33 | ## log names 34 | logname=mst_${inputgraph}_${machines}_${edgetype}_"$(date +%Y%m%d-%H%M%S)" 35 | logfile=${logname}_time.txt # running time 36 | 37 | 38 | ## start logging memory + network usage 39 | ../common/bench-init.sh ${logname} 40 | 41 | ## start algorithm run 42 | # -Dmapred.task.timeout=0 is needed to prevent Giraph job from getting killed after spending 10 mins on one superstep 43 | # Giraph seems to ignore any mapred.task.timeout specified in Hadoop's mapred-site.xml 44 | hadoop jar "$GIRAPH_DIR"/giraph-examples/target/giraph-examples-1.0.0-for-hadoop-1.0.2-jar-with-dependencies.jar org.apache.giraph.GiraphRunner \ 45 | ${edgeclass} \ 46 | -Dgiraph.numComputeThreads=${GIRAPH_THREADS} \ 47 | -Dgiraph.numInputThreads=${GIRAPH_THREADS} \ 48 | -Dgiraph.numOutputThreads=${GIRAPH_THREADS} \ 49 | -Dmapred.task.timeout=0 \ 50 | org.apache.giraph.examples.MinimumSpanningTreeVertex \ 51 | -mc org.apache.giraph.examples.MinimumSpanningTreeVertex\$MinimumSpanningTreeVertexMasterCompute \ 52 | -vif org.apache.giraph.examples.MinimumSpanningTreeInputFormat \ 53 | -vip /user/${USER}/input/${inputgraph} \ 54 | -of org.apache.giraph.examples.MinimumSpanningTreeVertex\$MinimumSpanningTreeVertexOutputFormat \ 55 | -op "$outputdir" \ 56 | -w ${machines} 2>&1 | tee -a ./logs/${logfile} 57 | 58 | # -wc org.apache.giraph.examples.MinimumSpanningTreeVertex\$MinimumSpanningTreeVertexWorkerContext 59 | # see giraph-core/.../utils/ConfigurationUtils.java for command line opts (or -h flag to GiraphRunner) 60 | 61 | ## finish logging memory + network usage 62 | ../common/bench-finish.sh ${logname} 63 | 64 | ## clean up step needed for Giraph 65 | ./kill-java-job.sh -------------------------------------------------------------------------------- /benchmark/giraph/pagerank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "usage: $0 input-graph machines edge-type" 5 | echo "" 6 | echo "edge-type: 0 for byte array edges" 7 | echo " 1 for hash map edges" 8 | exit -1 9 | fi 10 | 11 | source ../common/get-dirs.sh 12 | source ../common/get-configs.sh 13 | 14 | # place input in /user/${USER}/input/ 15 | # output is in /user/${USER}/giraph-output/ 16 | inputgraph=$(basename $1) 17 | outputdir=/user/${USER}/giraph-output/ 18 | hadoop dfs -rmr "$outputdir" || true 19 | 20 | # Technically this is the number of "workers", which can be more 21 | # than the number of machines. However, using multiple workers per 22 | # machine is inefficient! Use more Giraph threads instead (see below). 
23 | machines=$2 24 | 25 | edgetype=$3 26 | case ${edgetype} in 27 | 0) edgeclass="";; # byte array edges are used by default 28 | 1) edgeclass="-Dgiraph.inputOutEdgesClass=org.apache.giraph.edge.HashMapEdges \ 29 | -Dgiraph.outEdgesClass=org.apache.giraph.edge.HashMapEdges";; 30 | *) echo "Invalid edge-type"; exit -1;; 31 | esac 32 | 33 | ## log names 34 | logname=pagerank_${inputgraph}_${machines}_${edgetype}_"$(date +%Y%m%d-%H%M%S)" 35 | logfile=${logname}_time.txt # running time 36 | 37 | 38 | ## start logging memory + network usage 39 | ../common/bench-init.sh ${logname} 40 | 41 | ## start algorithm run 42 | hadoop jar "$GIRAPH_DIR"/giraph-examples/target/giraph-examples-1.0.0-for-hadoop-1.0.2-jar-with-dependencies.jar org.apache.giraph.GiraphRunner \ 43 | ${edgeclass} \ 44 | -Dgiraph.numComputeThreads=${GIRAPH_THREADS} \ 45 | -Dgiraph.numInputThreads=${GIRAPH_THREADS} \ 46 | -Dgiraph.numOutputThreads=${GIRAPH_THREADS} \ 47 | org.apache.giraph.examples.SimplePageRankVertex \ 48 | -c org.apache.giraph.combiner.DoubleSumCombiner \ 49 | -ca SimplePageRankVertex.maxSS=30 \ 50 | -vif org.apache.giraph.examples.SimplePageRankInputFormat \ 51 | -vip /user/${USER}/input/${inputgraph} \ 52 | -of org.apache.giraph.examples.SimplePageRankVertex\$SimplePageRankVertexOutputFormat \ 53 | -op "$outputdir" \ 54 | -w ${machines} 2>&1 | tee -a ./logs/${logfile} 55 | 56 | # mc not needed b/c we don't want aggregators: -mc org.apache.giraph.examples.SimplePageRankVertex\$SimplePageRankVertexMasterCompute 57 | # alternative output format: -of org.apache.giraph.io.formats.IdWithValueTextOutputFormat 58 | 59 | ## finish logging memory + network usage 60 | ../common/bench-finish.sh ${logname} 61 | 62 | ## clean up step needed for Giraph 63 | ./kill-java-job.sh -------------------------------------------------------------------------------- /benchmark/giraph/prtolfinder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 2 ]; then 4 | echo "usage: $0 input-graph machines" 5 | exit -1 6 | fi 7 | 8 | source ../common/get-dirs.sh 9 | source ../common/get-hosts.sh 10 | source ../common/get-configs.sh 11 | 12 | # place input in /user/${USER}/input/ 13 | # output is in /user/${USER}/giraph-output/ 14 | inputgraph=$(basename $1) 15 | outputdir=/user/${USER}/giraph-output/ 16 | hadoop dfs -rmr "$outputdir" || true 17 | 18 | # Technically this is the number of "workers", which can be more 19 | # than the number of machines. However, using multiple workers per 20 | # machine is inefficient! Use more Giraph threads instead (see below). 
21 | machines=$2 22 | 23 | ## log names 24 | logname=prtolfinder_${inputgraph}_${machines}_0_"$(date +%Y%m%d-%H%M%S)" 25 | logfile=${logname}_time.txt # running time 26 | 27 | 28 | ## start logging memory + network usage 29 | #../common/bench-init.sh ${logname} 30 | 31 | ## start algorithm run 32 | # we use default byte array edges (better performance) 33 | # NOTE: this outputs no data to HDFS 34 | hadoop jar "$GIRAPH_DIR"/giraph-examples/target/giraph-examples-1.0.0-for-hadoop-1.0.2-jar-with-dependencies.jar org.apache.giraph.GiraphRunner \ 35 | -Dgiraph.numComputeThreads=${GIRAPH_THREADS} \ 36 | -Dgiraph.numInputThreads=${GIRAPH_THREADS} \ 37 | -Dgiraph.numOutputThreads=${GIRAPH_THREADS} \ 38 | org.apache.giraph.examples.PageRankTolFinderVertex \ 39 | -mc org.apache.giraph.examples.PageRankTolFinderVertex\$PageRankTolFinderVertexMasterCompute \ 40 | -c org.apache.giraph.combiner.DoubleSumCombiner \ 41 | -ca PageRankTolFinderVertex.maxSS=30 \ 42 | -vif org.apache.giraph.examples.SimplePageRankInputFormat \ 43 | -vip /user/${USER}/input/${inputgraph} \ 44 | -of org.apache.giraph.examples.PageRankTolFinderVertex\$PageRankTolFinderVertexOutputFormat \ 45 | -op "$outputdir" \ 46 | -w ${machines} 2>&1 | tee -a ./logs/${logfile} 47 | 48 | # -wc org.apache.giraph.examples.PageRankTolFinderVertex\$PageRankTolFinderVertexWorkerContext 49 | 50 | ## finish logging memory + network usage 51 | #../common/bench-finish.sh ${logname} 52 | 53 | 54 | ## get max deltas (changes in PR value) at each superstep 55 | jobid=$(grep "Running job" ./logs/${logfile} | awk '{print $7}') 56 | 57 | # The master on a cluster will not have anything---this is for local testing 58 | darray[0]=$(cat "$HADOOP_DIR"/logs/userlogs/${jobid}/*/syslog | grep 'max change' | awk '{print $9}' | tr '\n' ' ') 59 | 60 | # NOTE: this is a hack---ZK is located on one of the workers, so just go 61 | # through everyone and we'll get master.compute()'s output exactly once 62 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do 63 | darray[${i}]=$(ssh ${CLUSTER_NAME}${i} "cat \"$HADOOP_DIR\"/logs/userlogs/${jobid}/*/syslog | grep 'max change' | awk '{print \$9}' | tr '\n' ','") 64 | done 65 | 66 | deltas=$(echo "${darray[*]}" | sed -e 's/^ *//' -e 's/ *$//') # join array and strip whitespace 67 | 68 | echo "" >> ./tolerances.txt 69 | echo "$(sed 's/-.*//g' <<< ${inputgraph})_deltas = [${deltas}]" >> ./tolerances.txt 70 | 71 | ## clean up step needed for Giraph 72 | ./kill-java-job.sh -------------------------------------------------------------------------------- /benchmark/giraph/recompile-giraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | commondir=$(dirname "${BASH_SOURCE[0]}")/../common 4 | source "$commondir"/get-hosts.sh 5 | source "$commondir"/get-dirs.sh 6 | 7 | cd "$GIRAPH_DIR" 8 | 9 | # -pl specifies what packages to compile (e.g., giraph-examples,giraph-core) 10 | # -Dfindbugs.skip skips "find bugs" stage (saves quite a bit of time) 11 | mvn clean install -Phadoop_1.0 -DskipTests -pl giraph-examples -Dfindbugs.skip 12 | 13 | # copy compiled jars to worker machines 14 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do 15 | scp ./giraph-examples/target/*.jar ${CLUSTER_NAME}${i}:"$GIRAPH_DIR"/giraph-examples/target/ & 16 | scp ./giraph-core/target/*.jar ${CLUSTER_NAME}${i}:"$GIRAPH_DIR"/giraph-core/target/ & 17 | done 18 | wait 19 | 20 | echo "OK." 
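# Optional sanity check (a sketch, not part of the original workflow): confirm
# a worker actually received the rebuilt jars before benchmarking, e.g.
#   ssh ${CLUSTER_NAME}1 "ls -l \"$GIRAPH_DIR\"/giraph-examples/target/*.jar"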
-------------------------------------------------------------------------------- /benchmark/giraph/sssp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 4 ]; then 4 | echo "usage: $0 input-graph machines edge-type source-vertex" 5 | echo "" 6 | echo "edge-type: 0 for byte array edges" 7 | echo " 1 for hash map edges" 8 | exit -1 9 | fi 10 | 11 | source ../common/get-dirs.sh 12 | source ../common/get-configs.sh 13 | 14 | # place input in /user/${USER}/input/ 15 | # output is in /user/${USER}/giraph-output/ 16 | inputgraph=$(basename $1) 17 | outputdir=/user/${USER}/giraph-output/ 18 | hadoop dfs -rmr "$outputdir" || true 19 | 20 | # Technically this is the number of "workers", which can be more 21 | # than the number of machines. However, using multiple workers per 22 | # machine is inefficient! Use more Giraph threads instead (see below). 23 | machines=$2 24 | 25 | edgetype=$3 26 | case ${edgetype} in 27 | 0) edgeclass="";; # byte array edges are used by default 28 | 1) edgeclass="-Dgiraph.inputOutEdgesClass=org.apache.giraph.edge.HashMapEdges \ 29 | -Dgiraph.outEdgesClass=org.apache.giraph.edge.HashMapEdges";; 30 | *) echo "Invalid edge-type"; exit -1;; 31 | esac 32 | 33 | src=$4 34 | 35 | ## log names 36 | logname=sssp_${inputgraph}_${machines}_${edgetype}_"$(date +%Y%m%d-%H%M%S)" 37 | logfile=${logname}_time.txt # running time 38 | 39 | 40 | ## start logging memory + network usage 41 | ../common/bench-init.sh ${logname} 42 | 43 | ## start algorithm run 44 | hadoop jar "$GIRAPH_DIR"/giraph-examples/target/giraph-examples-1.0.0-for-hadoop-1.0.2-jar-with-dependencies.jar org.apache.giraph.GiraphRunner \ 45 | ${edgeclass} \ 46 | -Dgiraph.numComputeThreads=${GIRAPH_THREADS} \ 47 | -Dgiraph.numInputThreads=${GIRAPH_THREADS} \ 48 | -Dgiraph.numOutputThreads=${GIRAPH_THREADS} \ 49 | org.apache.giraph.examples.SimpleShortestPathsVertex \ 50 | -ca SimpleShortestPathsVertex.sourceId=${src} \ 51 | -vif org.apache.giraph.examples.SimpleShortestPathsInputFormat \ 52 | -vip /user/${USER}/input/${inputgraph} \ 53 | -of org.apache.giraph.io.formats.IdWithValueTextOutputFormat \ 54 | -op "$outputdir" \ 55 | -w ${machines} 2>&1 | tee -a ./logs/${logfile} 56 | 57 | ## finish logging memory + network usage 58 | ../common/bench-finish.sh ${logname} 59 | 60 | ## clean up step needed for Giraph 61 | ./kill-java-job.sh -------------------------------------------------------------------------------- /benchmark/giraph/wcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "usage: $0 input-graph machines edge-type" 5 | echo "" 6 | echo "edge-type: 0 for byte array edges" 7 | echo " 1 for hash map edges" 8 | exit -1 9 | fi 10 | 11 | source ../common/get-dirs.sh 12 | source ../common/get-configs.sh 13 | 14 | # place input in /user/${USER}/input/ 15 | # output is in /user/${USER}/giraph-output/ 16 | inputgraph=$(basename $1) 17 | outputdir=/user/${USER}/giraph-output/ 18 | hadoop dfs -rmr "$outputdir" || true 19 | 20 | # Technically this is the number of "workers", which can be more 21 | # than the number of machines. However, using multiple workers per 22 | # machine is inefficient! Use more Giraph threads instead (see below). 
23 | machines=$2 24 | 25 | edgetype=$3 26 | case ${edgetype} in 27 | 0) edgeclass="";; # byte array edges are used by default 28 | 1) edgeclass="-Dgiraph.inputOutEdgesClass=org.apache.giraph.edge.HashMapEdges \ 29 | -Dgiraph.outEdgesClass=org.apache.giraph.edge.HashMapEdges";; 30 | *) echo "Invalid edge-type"; exit -1;; 31 | esac 32 | 33 | ## log names 34 | logname=wcc_${inputgraph}_${machines}_${edgetype}_"$(date +%Y%m%d-%H%M%S)" 35 | logfile=${logname}_time.txt # running time 36 | 37 | 38 | ## start logging memory + network usage 39 | ../common/bench-init.sh ${logname} 40 | 41 | ## start algorithm run 42 | hadoop jar "$GIRAPH_DIR"/giraph-examples/target/giraph-examples-1.0.0-for-hadoop-1.0.2-jar-with-dependencies.jar org.apache.giraph.GiraphRunner \ 43 | ${edgeclass} \ 44 | -Dgiraph.numComputeThreads=${GIRAPH_THREADS} \ 45 | -Dgiraph.numInputThreads=${GIRAPH_THREADS} \ 46 | -Dgiraph.numOutputThreads=${GIRAPH_THREADS} \ 47 | org.apache.giraph.examples.ConnectedComponentsVertex \ 48 | -vif org.apache.giraph.examples.ConnectedComponentsInputFormat \ 49 | -vip /user/${USER}/input/${inputgraph} \ 50 | -of org.apache.giraph.io.formats.IdWithValueTextOutputFormat \ 51 | -op "$outputdir" \ 52 | -w ${machines} 2>&1 | tee -a ./logs/${logfile} 53 | 54 | ## finish logging memory + network usage 55 | ../common/bench-finish.sh ${logname} 56 | 57 | ## clean up step needed for Giraph 58 | ./kill-java-job.sh -------------------------------------------------------------------------------- /benchmark/gps/benchall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 2 ]; then 4 | echo "usage: $0 machines runs" 5 | echo "" 6 | echo "machines: 4, 8, 16, 32, 64, or 128" 7 | exit -1 8 | fi 9 | 10 | cd "$(dirname "${BASH_SOURCE[0]}")" 11 | 12 | MACHINES=$1 13 | RUNS=$2 14 | 15 | case ${MACHINES} in 16 | 4) GRAPHS=(amazon google patents); 17 | GRAPHS_MST=(amazon google patents); 18 | SRC=(0 0 6009554); # for SSSP 19 | SLEEP_TIME=60;; 20 | 8) GRAPHS=(amazon google patents); 21 | GRAPHS_MST=(amazon google patents); 22 | SRC=(0 0 6009554); 23 | SLEEP_TIME=60;; 24 | 16) GRAPHS=(livejournal orkut arabic twitter); 25 | GRAPHS_MST=(livejournal orkut arabic); 26 | SRC=(0 1 3 0); 27 | SLEEP_TIME=60;; 28 | 32) GRAPHS=(livejournal orkut arabic twitter); 29 | GRAPHS_MST=(livejournal orkut arabic); 30 | SRC=(0 1 3 0); 31 | SLEEP_TIME=60;; 32 | 64) GRAPHS=(livejournal orkut arabic twitter uk0705); 33 | GRAPHS_MST=(livejournal orkut arabic twitter); 34 | SRC=(0 1 3 0 0); 35 | SLEEP_TIME=80;; 36 | 128) GRAPHS=(livejournal orkut arabic twitter uk0705); 37 | GRAPHS_MST=(livejournal orkut arabic twitter uk0705); 38 | SRC=(0 1 3 0 0); 39 | SLEEP_TIME=80;; 40 | *) echo "Invalid machines"; exit -1;; 41 | esac 42 | 43 | ################# 44 | # Normal run 45 | ################# 46 | # we split the algs up for simplicity 47 | for graph in "${GRAPHS[@]}"; do 48 | for ((i = 1; i <= RUNS; i++)); do 49 | ./pagerank.sh "${graph}-adj.txt" ${MACHINES} 0 50 | ./stop-nodes.sh 51 | sleep ${SLEEP_TIME} 52 | done 53 | done 54 | 55 | for j in "${!GRAPHS[@]}"; do 56 | for ((i = 1; i <= RUNS; i++)); do 57 | ./sssp.sh "${GRAPHS[$j]}-adj.txt" ${MACHINES} 0 ${SRC[$j]} 58 | ./stop-nodes.sh 59 | sleep ${SLEEP_TIME} 60 | done 61 | done 62 | 63 | for graph in "${GRAPHS[@]}"; do 64 | for ((i = 1; i <= RUNS; i++)); do 65 | ./wcc.sh "${graph}-adj.txt" ${MACHINES} 0 66 | ./stop-nodes.sh 67 | sleep ${SLEEP_TIME} 68 | done 69 | done 70 | 71 | for graph in "${GRAPHS_MST[@]}"; do 72 | for ((i = 1; i 
<= RUNS; i++)); do 73 | ./mst.sh "${graph}-mst-adj.txt" ${MACHINES} 74 | ./stop-nodes.sh 75 | sleep ${SLEEP_TIME} 76 | done 77 | done 78 | 79 | #./enable-dimest-fix.sh 80 | #for graph in "${GRAPHS[@]}"; do 81 | # for ((i = 1; i <= RUNS; i++)); do 82 | # ./dimest.sh "${graph}-adj.txt" ${MACHINES} 0 83 | # ./stop-nodes.sh 84 | # sleep ${SLEEP_TIME} 85 | # done 86 | #done 87 | #./disable-dimest-fix.sh 88 | 89 | ################# 90 | # LALP Run 91 | ################# 92 | for graph in "${GRAPHS[@]}"; do 93 | for ((i = 1; i <= RUNS; i++)); do 94 | ./pagerank.sh "${graph}-adj.txt" ${MACHINES} 1 95 | ./stop-nodes.sh 96 | sleep ${SLEEP_TIME} 97 | done 98 | done 99 | 100 | for j in "${!GRAPHS[@]}"; do 101 | for ((i = 1; i <= RUNS; i++)); do 102 | ./sssp.sh "${GRAPHS[$j]}-adj.txt" ${MACHINES} 1 ${SRC[$j]} 103 | ./stop-nodes.sh 104 | sleep ${SLEEP_TIME} 105 | done 106 | done 107 | 108 | for graph in "${GRAPHS[@]}"; do 109 | for ((i = 1; i <= RUNS; i++)); do 110 | ./wcc.sh "${graph}-adj.txt" ${MACHINES} 1 111 | ./stop-nodes.sh 112 | sleep ${SLEEP_TIME} 113 | done 114 | done 115 | 116 | # no MST 117 | 118 | #./enable-dimest-fix.sh 119 | #for graph in "${GRAPHS[@]}"; do 120 | # for ((i = 1; i <= RUNS; i++)); do 121 | # ./dimest.sh "${graph}-adj.txt" ${MACHINES} 0 122 | # ./stop-nodes.sh 123 | # sleep ${SLEEP_TIME} 124 | # done 125 | #done 126 | #./disable-dimest-fix.sh 127 | 128 | ################# 129 | # Dynamic Run 130 | ################# 131 | for graph in "${GRAPHS[@]}"; do 132 | for ((i = 1; i <= RUNS; i++)); do 133 | ./pagerank.sh "${graph}-adj.txt" ${MACHINES} 2 134 | ./stop-nodes.sh 135 | sleep ${SLEEP_TIME} 136 | done 137 | done 138 | 139 | for j in "${!GRAPHS[@]}"; do 140 | for ((i = 1; i <= RUNS; i++)); do 141 | ./sssp.sh "${GRAPHS[$j]}-adj.txt" ${MACHINES} 2 ${SRC[$j]} 142 | ./stop-nodes.sh 143 | sleep ${SLEEP_TIME} 144 | done 145 | done 146 | 147 | for graph in "${GRAPHS[@]}"; do 148 | for ((i = 1; i <= RUNS; i++)); do 149 | ./wcc.sh "${graph}-adj.txt" ${MACHINES} 2 150 | ./stop-nodes.sh 151 | sleep ${SLEEP_TIME} 152 | done 153 | done 154 | 155 | # no MST 156 | 157 | #./enable-dimest-fix.sh 158 | #for graph in "${GRAPHS[@]}"; do 159 | # for ((i = 1; i <= RUNS; i++)); do 160 | # ./dimest.sh "${graph}-adj.txt" ${MACHINES} 0 161 | # ./stop-nodes.sh 162 | # sleep ${SLEEP_TIME} 163 | # done 164 | #done 165 | #./disable-dimest-fix.sh -------------------------------------------------------------------------------- /benchmark/gps/debug-site.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # This runs GPS's web interface to view old runs/logs. 
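# The interface is then reachable at http://<master-host>:4444/ (the port is
# set by the -port flag below; change both together if 4444 is taken).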
4 | # 5 | # NOTE: Compile debug_monitoring_runner.jar using $GPS_DIR/make_debug_monitoring_runner_jar.sh 6 | 7 | source "$(dirname "${BASH_SOURCE[0]}")"/../common/get-dirs.sh 8 | 9 | java -jar "$GPS_DIR"/debug_monitoring_runner.jar -hcf "$HADOOP_DIR"/conf/core-site.xml -msfp /user/${USER}/gps/stats-* -port 4444 -------------------------------------------------------------------------------- /benchmark/gps/dimest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "usage: $0 input-graph machines gps-mode" 5 | echo "" 6 | echo "gps-mode: 0 for normal (no lalp, no dynamic repartitioning)" 7 | echo " 1 for LALP" 8 | echo " 2 for dynamic repartitioning" 9 | echo " 3 for LALP and dynamic repartitioning" 10 | exit -1 11 | fi 12 | 13 | source ../common/get-dirs.sh 14 | source ../common/get-configs.sh 15 | 16 | # place input in /user/${USER}/input/ 17 | # output is in /user/${USER}/gps/output/ 18 | inputgraph=$(basename $1) 19 | 20 | # machines should be number of EC2 instances 21 | machines=$2 22 | workers=$(($machines * $GPS_WPM)) 23 | 24 | mode=$3 25 | case ${mode} in 26 | 0) modeflag="";; 27 | 1) modeflag="-lalp 100";; 28 | 2) modeflag="-dynamic";; 29 | 3) modeflag="-lalp 100 -dynamic";; 30 | *) echo "Invalid gps-mode"; exit -1;; 31 | esac 32 | 33 | ## log names 34 | logname=dimest_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)" 35 | logfile=${logname}_time.txt # GPS statistics (incl running time) 36 | 37 | 38 | ## start logging memory + network usage 39 | ../common/bench-init.sh ${logname} 40 | 41 | ## start algorithm run 42 | # max controls max number of supersteps 43 | ./start-nodes.sh ${workers} quick-start \ 44 | ${modeflag} \ 45 | -ifs /user/${USER}/input/${inputgraph} \ 46 | -hcf "$HADOOP_DIR"/conf/core-site.xml \ 47 | -jc gps.examples.dimest.DiameterEstimationVertex###JobConfiguration \ 48 | -mcfg /user/${USER}/gps-machine-config/machine.cfg \ 49 | -log4jconfig "$GPS_DIR"/conf/log4j.config \ 50 | -other -max###30 51 | 52 | ## finish logging memory + network usage 53 | ../common/bench-finish.sh ${logname} 54 | 55 | ## get stats (see debug_site.sh for debug naming convention) 56 | hadoop dfs -get /user/${USER}/gps/output/quick-start-machine-stats ./logs/${logfile} 57 | #hadoop dfs -mv /user/${USER}/gps/output/quick-start-machine-stats /user/${USER}/gps/stats-${logname} -------------------------------------------------------------------------------- /benchmark/gps/disable-dimest-fix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Disables the fix for diameter estimation. 4 | # 5 | # This should be done before running non-diameter estimation algs. 6 | 7 | scriptdir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 8 | source "$scriptdir"/../common/get-dirs.sh 9 | 10 | cd "$GPS_DIR"/src/java/gps/messages/storage 11 | cp -f ArrayBackedIncomingMessageStorage.javaORIGINAL ArrayBackedIncomingMessageStorage.java 12 | 13 | "$scriptdir"/recompile-gps.sh -------------------------------------------------------------------------------- /benchmark/gps/enable-dimest-fix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Enables a fix for diameter estimation. 4 | # 5 | # This fix should be enabled only for diameter estimation, 6 | # and should be disabled when running other algorithms. 
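# Typical sequence (mirroring the commented-out dimest block in benchall.sh):
#   ./enable-dimest-fix.sh; ./dimest.sh <graph> <machines> 0; ./disable-dimest-fix.sh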
7 | 8 | scriptdir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 9 | source "$scriptdir"/../common/get-dirs.sh 10 | 11 | cd "$GPS_DIR"/src/java/gps/messages/storage 12 | cp -f ArrayBackedIncomingMessageStorage.javaDIMEST ArrayBackedIncomingMessageStorage.java 13 | 14 | "$scriptdir"/recompile-gps.sh -------------------------------------------------------------------------------- /benchmark/gps/init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Initiate GPS by creating slaves and machine config files. 4 | # 5 | # NOTE: "slaves" is NOT placed in master-script/, because we use 6 | # our own scripts for starting/stopping GPS workers. 7 | 8 | cd "$(dirname "${BASH_SOURCE[0]}")" 9 | source ../common/get-hosts.sh 10 | source ../common/get-dirs.sh 11 | source ../common/get-configs.sh 12 | 13 | rm -f slaves 14 | rm -f machine.cfg 15 | 16 | # create slaves file 17 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do 18 | for ((j = 1; j <= ${GPS_WPM}; j++)); do 19 | echo "${CLUSTER_NAME}${i}" >> slaves 20 | done 21 | done 22 | 23 | # create machine config file 24 | echo "-1 ${HOSTNAME} 64000" >> machine.cfg # master is special 25 | 26 | w_id=0 # worker counter (needed if workers per machine > 1) 27 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do 28 | # to get multiple workers per machine, use the same name 29 | # but give it a unique id and port 30 | for ((j = 1; j <= ${GPS_WPM}; j++)); do 31 | echo "${w_id} ${CLUSTER_NAME}${i} $((64001 + ${w_id}))" >> machine.cfg 32 | w_id=$((w_id+1)) 33 | done 34 | done 35 | 36 | # upload machine config file to HDFS 37 | hadoop dfsadmin -safemode wait > /dev/null 38 | hadoop dfs -rmr /user/${USER}/gps-machine-config/ || true 39 | hadoop dfs -mkdir /user/${USER}/gps-machine-config/ 40 | hadoop dfs -put machine.cfg /user/${USER}/gps-machine-config/ 41 | 42 | # make GPS log directories if needed 43 | if [[ ! -d "$GPS_LOG_DIR" ]]; then mkdir -p "$GPS_LOG_DIR"; fi 44 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do 45 | ssh ${CLUSTER_NAME}${i} "if [[ ! -d \"$GPS_LOG_DIR\" ]]; then mkdir -p \"$GPS_LOG_DIR\"; fi" & 46 | done 47 | wait -------------------------------------------------------------------------------- /benchmark/gps/mst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 2 ]; then 4 | echo "usage: $0 input-graph machines" 5 | exit -1 6 | fi 7 | 8 | source ../common/get-dirs.sh 9 | source ../common/get-configs.sh 10 | 11 | # place input in /user/${USER}/input/ 12 | # output is in /user/${USER}/gps/output/ 13 | inputgraph=$(basename $1) 14 | 15 | # machines should be number of EC2 instances 16 | machines=$2 17 | workers=$(($machines * $GPS_WPM)) 18 | 19 | ## log names 20 | # MST can only run in "normal" mode (LALP & dynamic repartitioning cannot be used) 21 | logname=mst_${inputgraph}_${machines}_0_"$(date +%Y%m%d-%H%M%S)" 22 | logfile=${logname}_time.txt # GPS statistics (incl running time) 23 | 24 | 25 | ## start logging memory + network usage 26 | ../common/bench-init.sh ${logname} 27 | 28 | ## start algorithm run 29 | # there are 3 versions of MST... according to author, these are: 30 | # 31 | # edgesatrootpjonebyone uses standard Boruvka (no optimizations) 32 | # edgesatselfpjonebyone uses "storing edges at subvertices" (SEAS) 33 | # -> "edge cleaning on demand" (ECOD) is enabled via flag 34 | # edgeshybridpjonebyone uses SEAS for few iterations then default... 
but not published 35 | ./start-nodes.sh ${workers} quick-start \ 36 | -ifs /user/${USER}/input/${inputgraph} \ 37 | -hcf "$HADOOP_DIR"/conf/core-site.xml \ 38 | -jc gps.examples.mst.edgesatrootpjonebyone.JobConfiguration \ 39 | -mcfg /user/${USER}/gps-machine-config/machine.cfg \ 40 | -log4jconfig "$GPS_DIR"/conf/log4j.config 41 | 42 | ## finish logging memory + network usage 43 | ../common/bench-finish.sh ${logname} 44 | 45 | ## get stats (see debug_site.sh for debug naming convention) 46 | hadoop dfs -get /user/${USER}/gps/output/quick-start-machine-stats ./logs/${logfile} 47 | #hadoop dfs -mv /user/${USER}/gps/output/quick-start-machine-stats /user/${USER}/gps/stats-${logname} -------------------------------------------------------------------------------- /benchmark/gps/pagerank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "usage: $0 input-graph machines gps-mode" 5 | echo "" 6 | echo "gps-mode: 0 for normal (no lalp, no dynamic repartitioning)" 7 | echo " 1 for LALP" 8 | echo " 2 for dynamic repartitioning" 9 | echo " 3 for LALP and dynamic repartitioning" 10 | exit -1 11 | fi 12 | 13 | source ../common/get-dirs.sh 14 | source ../common/get-configs.sh 15 | 16 | # place input in /user/${USER}/input/ 17 | # output is in /user/${USER}/gps/output/ 18 | inputgraph=$(basename $1) 19 | 20 | # machines should be number of EC2 instances 21 | machines=$2 22 | workers=$(($machines * $GPS_WPM)) 23 | 24 | mode=$3 25 | case ${mode} in 26 | 0) modeflag="";; 27 | 1) modeflag="-lalp 100";; 28 | 2) modeflag="-dynamic";; 29 | 3) modeflag="-lalp 100 -dynamic";; 30 | *) echo "Invalid gps-mode"; exit -1;; 31 | esac 32 | 33 | ## log names 34 | logname=pagerank_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)" 35 | logfile=${logname}_time.txt # GPS statistics (incl running time) 36 | 37 | 38 | ## start logging memory + network usage 39 | ../common/bench-init.sh ${logname} 40 | 41 | ## start algorithm run 42 | # max controls max number of supersteps; must be 30, to match Giraph 43 | ./start-nodes.sh ${workers} quick-start \ 44 | ${modeflag} \ 45 | -ifs /user/${USER}/input/${inputgraph} \ 46 | -hcf "$HADOOP_DIR"/conf/core-site.xml \ 47 | -jc gps.examples.pagerank.PageRankVertex###JobConfiguration \ 48 | -mcfg /user/${USER}/gps-machine-config/machine.cfg \ 49 | -log4jconfig "$GPS_DIR"/conf/log4j.config \ 50 | -other -max###30 51 | 52 | ## finish logging memory + network usage 53 | ../common/bench-finish.sh ${logname} 54 | 55 | ## get stats (see debug_site.sh for debug naming convention) 56 | hadoop dfs -get /user/${USER}/gps/output/quick-start-machine-stats ./logs/${logfile} 57 | #hadoop dfs -mv /user/${USER}/gps/output/quick-start-machine-stats /user/${USER}/gps/stats-${logname} -------------------------------------------------------------------------------- /benchmark/gps/recompile-gps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | commondir=$(dirname "${BASH_SOURCE[0]}")/../common 4 | source "$commondir"/get-hosts.sh 5 | source "$commondir"/get-dirs.sh 6 | 7 | cd "$GPS_DIR/local-master-scripts/" 8 | ./make_gps_node_runner_jar.sh 9 | 10 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do 11 | scp ../gps_node_runner.jar ${CLUSTER_NAME}${i}:"$GPS_DIR"/gps_node_runner.jar & 12 | done 13 | wait 14 | 15 | echo "OK." 
-------------------------------------------------------------------------------- /benchmark/gps/sssp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 4 ]; then 4 | echo "usage: $0 input-graph machines gps-mode source-vertex" 5 | echo "" 6 | echo "gps-mode: 0 for normal (no lalp, no dynamic repartitioning)" 7 | echo " 1 for LALP" 8 | echo " 2 for dynamic repartitioning" 9 | echo " 3 for LALP and dynamic repartitioning" 10 | exit -1 11 | fi 12 | 13 | source ../common/get-dirs.sh 14 | source ../common/get-configs.sh 15 | 16 | # place input in /user/${USER}/input/ 17 | # output is in /user/${USER}/gps/output/ 18 | inputgraph=$(basename $1) 19 | 20 | # machines should be number of EC2 instances 21 | machines=$2 22 | workers=$(($machines * $GPS_WPM)) 23 | 24 | # NOTE: we can only use LALP for SSSP when ALL edge weights are the 25 | # same for the entire graph. In our case, all edge weights are 1. 26 | mode=$3 27 | case ${mode} in 28 | 0) modeflag="";; 29 | 1) modeflag="-lalp 100";; 30 | 2) modeflag="-dynamic";; 31 | 3) modeflag="-lalp 100 -dynamic";; 32 | *) echo "Invalid gps-mode"; exit -1;; 33 | esac 34 | 35 | src=$4 36 | 37 | ## log names 38 | logname=sssp_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)" 39 | logfile=${logname}_time.txt # GPS statistics (incl running time) 40 | 41 | 42 | ## start logging memory + network usage 43 | ../common/bench-init.sh ${logname} 44 | 45 | ## start algorithm run 46 | # This SSSP assigns edge weight of 1 to all edges, without using 47 | # the boolean trick of SingleSourceAllVerticesShortestPathVertex. 48 | # Input graph must not have edge weights. 49 | ./start-nodes.sh ${workers} quick-start \ 50 | ${modeflag} \ 51 | -ifs /user/${USER}/input/${inputgraph} \ 52 | -hcf "$HADOOP_DIR"/conf/core-site.xml \ 53 | -jc gps.examples.sssp.SSSPVertex###JobConfiguration \ 54 | -mcfg /user/${USER}/gps-machine-config/machine.cfg \ 55 | -log4jconfig "$GPS_DIR"/conf/log4j.config \ 56 | -other -root###${src} 57 | 58 | # gps.examples.edgevaluesssp.EdgeValueSSSPVertex###JobConfiguration 59 | # is for when input graph has edge weights. 60 | # input graph must have edge weights, but no vertex values 61 | 62 | ## finish logging memory + network usage 63 | ../common/bench-finish.sh ${logname} 64 | 65 | ## get stats (see debug_site.sh for debug naming convention) 66 | hadoop dfs -get /user/${USER}/gps/output/quick-start-machine-stats ./logs/${logfile} 67 | #hadoop dfs -mv /user/${USER}/gps/output/quick-start-machine-stats /user/${USER}/gps/stats-${logname} -------------------------------------------------------------------------------- /benchmark/gps/start-nodes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # A modified version of master-scripts/start_gps_nodes.sh made friendlier 4 | # for automation. This incorporates scripts/start_gps_node.sh, so worker 5 | # machines no longer need to be updated with that script. HDFS output paths, 6 | # log paths, etc. remain unchanged. 7 | # 8 | # Note that each machine can have *multiple* workers. Hence, we refer to 9 | # physical machines as "machines" and workers as "workers" or "slaves". 10 | # 11 | # Workers are started asynchronously, which is faster. This script (i.e., 12 | # the master) waits until all workers are done computations before exiting, 13 | # making it easier to script benchmarks. (Although a sleep delay is still 14 | # required---see the batch benching script.) 
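# The gps benchall.sh, for instance, sleeps 60-80 seconds between runs.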
15 | #
16 | # Because of how GPS behaves, the # of workers argument is actually IGNORED.
17 | # Instead, we use # of workers specified in machine slaves/config file.
18 | # Specifically:
19 | #
20 | # >> If argument < # of actual workers, we start # of actual workers.
21 | #    (Otherwise, GPS will hang waiting for the extra workers)
22 | # >> If argument > # of actual workers, we start # of actual workers.
23 | #    (Because no ports are specified for extra non-existent workers)
24 | #
25 | 
26 | #
27 | # To change max JVM heap size for GPS workers, see ../common/get-configs.sh.
28 | #
29 | 
30 | # To use this, pass in arguments like:
31 | #
32 | #./start-nodes.sh ${workers} quick-start \
33 | #    -ifs /user/${USER}/input/${inputgraph} \
34 | #    -hcf "$HADOOP_DIR"/conf/core-site.xml \
35 | #    -jc gps.examples.pagerank.PageRankVertex###JobConfiguration \
36 | #    -mcfg /user/${USER}/gps-machine-config/cs848.cfg \
37 | #    -log4jconfig "$GPS_DIR"/conf/log4j.config \
38 | #    -other -max###30
39 | #
40 | # Note that GPS's default start script requires 3rd argument
41 | # and onwards to be double-quoted, i.e.:
42 | #
43 | #./master-scripts/start_gps_nodes.sh ${workers} quick-start \
44 | #    "-ifs /user/${USER}/input/${inputgraph} \
45 | #    -hcf \"$HADOOP_DIR\"/conf/core-site.xml \
46 | #    -jc gps.examples.pagerank.PageRankVertex###JobConfiguration \
47 | #    -mcfg /user/${USER}/gps-machine-config/cs848.cfg \
48 | #    -log4jconfig \"$GPS_DIR\"/conf/log4j.config \
49 | #    -other -max###30"
50 | #
51 | #
52 | # To start multiple workers per machine, modify the slaves file to be, e.g.
53 | #
54 | # cloud1
55 | # cloud1
56 | # cloud2
57 | # cloud2
58 | #
59 | # and similarly for the machine config file.
60 | #
61 | #
62 | # Side note: one way to get automation when using the original start_gps_nodes.sh
63 | # is by modifying the last slave's start_gps_node.sh to not have the "&". That way,
64 | # since slaves are started sequentially, the last one will return only when the
65 | # computation is complete.
66 | 
67 | if [ $# -lt 3 ]; then
68 |     echo "usage: $0 workers mode gps-args"
69 |     echo ""
70 |     echo "mode: use 'quick-start' (without quotes)"
71 |     echo "gps-args: arguments passed to GPS jar, unquoted"
72 |     exit -1
73 | fi
74 | 
75 | commondir=$(dirname "${BASH_SOURCE[0]}")/../common
76 | source "$commondir"/get-dirs.sh
77 | source "$commondir"/get-configs.sh
78 | 
79 | 
80 | OUTPUT_DIR=/user/${USER}/gps/output/
81 | 
82 | ## start master
83 | MASTER_GPS_ID=-1
84 | GPS_MASTER_XMS=50M # initial heap size (master)
85 | 
86 | echo "Using args: ${@:3}"
87 | 
88 | echo "Starting GPS master -1"
89 | "$JAVA_DIR"/bin/java -Xincgc -Xms${GPS_MASTER_XMS} -Xmx${GPS_MASTER_XMX} -verbose:gc -jar "$GPS_DIR"/gps_node_runner.jar -machineid ${MASTER_GPS_ID} -ofp "$OUTPUT_DIR"/${2}-machine-stats ${@:3} &> "$GPS_LOG_DIR"/${2}-machine${MASTER_GPS_ID}-output.txt &
90 | 
91 | ## start slaves asynchronously (faster this way)
92 | GPS_WORKER_XMS=256M # initial heap size (workers)
93 | 
94 | # read-in effectively ensures # of workers never exceeds # of lines in "slaves"
95 | # the "|| ..."
is a workaround in case the file doesn't end with a newline 96 | w_id=0 97 | while read slave || [ -n "$slave" ]; do 98 | echo "Starting GPS worker ${w_id}" 99 | 100 | # must have -n, otherwise ssh consumes all of stdin (i.e., all of the input file) 101 | # outer & runs ssh in the background 102 | # inner & and stdout/err redirections enable ssh connection to end while remote command continues to run 103 | ssh -n $slave "\"$JAVA_DIR\"/bin/java -Xincgc -Xms${GPS_WORKER_XMS} -Xmx${GPS_WORKER_XMX} -verbose:gc -jar \"$GPS_DIR\"/gps_node_runner.jar -machineid ${w_id} -ofp \"$OUTPUT_DIR\"/${2}-output-${w_id}-of-$((${1}-1)) ${@:3} &> \"$GPS_LOG_DIR\"/${2}-machine${w_id}-output.txt &" & 104 | 105 | w_id=$((w_id+1)) 106 | # no need to check if # workers < # slaves... GPS will hang in that situation 107 | done < "$(dirname "${BASH_SOURCE[0]}")"/slaves 108 | 109 | # ...and wait until computation completes (= master finishes) 110 | wait 111 | echo "Computation complete!" -------------------------------------------------------------------------------- /benchmark/gps/stop-nodes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Does the same thing as master-scripts/stop_gps_nodes.sh, but faster. 4 | # Also removes the need for a separate scripts/stop_nodes.sh. 5 | 6 | kill -9 $(ps aux | grep "[g]ps_node_runner" | awk '{print $2}') 7 | 8 | # the "|| ..." is a workaround in case the file doesn't end with a newline 9 | while read slave || [ -n "$slave" ]; do 10 | # must have -n, otherwise ssh consumes all of stdin (i.e., all of the input file) 11 | ssh -n $slave "kill -9 \$(ps aux | grep \"[g]ps_node_runner\" | awk '{print \$2}')" & 12 | done < "$(dirname "${BASH_SOURCE[0]}")"/slaves 13 | wait -------------------------------------------------------------------------------- /benchmark/gps/wcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "usage: $0 input-graph machines gps-mode" 5 | echo "" 6 | echo "gps-mode: 0 for normal (no lalp, no dynamic repartitioning)" 7 | echo " 1 for LALP" 8 | echo " 2 for dynamic repartitioning" 9 | echo " 3 for LALP and dynamic repartitioning" 10 | exit -1 11 | fi 12 | 13 | source ../common/get-dirs.sh 14 | source ../common/get-configs.sh 15 | 16 | # place input in /user/${USER}/input/ 17 | # output is in /user/${USER}/gps/output/ 18 | inputgraph=$(basename $1) 19 | 20 | # machines should be number of EC2 instances 21 | machines=$2 22 | workers=$(($machines * $GPS_WPM)) 23 | 24 | mode=$3 25 | case ${mode} in 26 | 0) modeflag="";; 27 | 1) modeflag="-lalp 100";; 28 | 2) modeflag="-dynamic";; 29 | 3) modeflag="-lalp 100 -dynamic";; 30 | *) echo "Invalid gps-mode"; exit -1;; 31 | esac 32 | 33 | ## log names 34 | logname=wcc_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)" 35 | logfile=${logname}_time.txt # GPS statistics (incl running time) 36 | 37 | 38 | ## start logging memory + network usage 39 | ../common/bench-init.sh ${logname} 40 | 41 | ## start algorithm run 42 | # NOTE: numMaxIterations can be set but we don't set it 43 | # (to match Giraph and Mizan, neither of which use SS termination) 44 | ./start-nodes.sh ${workers} quick-start \ 45 | ${modeflag} \ 46 | -ifs /user/${USER}/input/${inputgraph} \ 47 | -hcf "$HADOOP_DIR"/conf/core-site.xml \ 48 | -jc gps.examples.wcc.WeaklyConnectedComponentsVertex###JobConfiguration \ 49 | -mcfg /user/${USER}/gps-machine-config/machine.cfg \ 50 | -log4jconfig 
"$GPS_DIR"/conf/log4j.config 51 | 52 | ## finish logging memory + network usage 53 | ../common/bench-finish.sh ${logname} 54 | 55 | ## get stats (see debug_site.sh for debug naming convention) 56 | hadoop dfs -get /user/${USER}/gps/output/quick-start-machine-stats ./logs/${logfile} 57 | #hadoop dfs -mv /user/${USER}/gps/output/quick-start-machine-stats /user/${USER}/gps/stats-${logname} -------------------------------------------------------------------------------- /benchmark/graphlab/benchall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 2 ]; then 4 | echo "usage: $0 machines runs" 5 | echo "" 6 | echo "machines: 4, 8, 16, 32, 64, or 128" 7 | exit -1 8 | fi 9 | 10 | cd "$(dirname "${BASH_SOURCE[0]}")" 11 | 12 | MACHINES=$1 13 | RUNS=$2 14 | 15 | case ${MACHINES} in 16 | 4) GRAPHS=(amazon google patents); 17 | TOL=(0.408805 2.306985 2.220446E-16); # for PageRank 18 | SRC=(0 0 6009554);; # for SSSP 19 | 8) GRAPHS=(amazon google patents); 20 | TOL=(0.408805 2.306985 2.220446E-16); 21 | SRC=(0 0 6009554);; 22 | 16) GRAPHS=(livejournal orkut arabic twitter); 23 | TOL=(0.392500 0.011872 75.448252 0.769316); 24 | SRC=(0 1 3 0);; 25 | 32) GRAPHS=(livejournal orkut arabic twitter); 26 | TOL=(0.392500 0.011872 75.448252 0.769316); 27 | SRC=(0 1 3 0);; 28 | 64) GRAPHS=(livejournal orkut arabic twitter uk0705); 29 | TOL=(0.392500 0.011872 75.448252 0.769316 186.053578); 30 | SRC=(0 1 3 0 0);; 31 | 128) GRAPHS=(livejournal orkut arabic twitter uk0705); 32 | TOL=(0.392500 0.011872 75.448252 0.769316 186.053578); 33 | SRC=(0 1 3 0 0);; 34 | *) echo "Invalid machines"; exit -1;; 35 | esac 36 | 37 | ################# 38 | # Sync run 39 | ################# 40 | # we split the algs up for simplicity 41 | for j in "${!GRAPHS[@]}"; do 42 | for ((i = 1; i <= RUNS; i++)); do 43 | ./pagerank.sh "${GRAPHS[$j]}-adj-split/" ${MACHINES} 0 ${TOL[$j]} 44 | done 45 | done 46 | 47 | for j in "${!GRAPHS[@]}"; do 48 | for ((i = 1; i <= RUNS; i++)); do 49 | ./sssp.sh "${GRAPHS[$j]}-adj-split/" ${MACHINES} 0 ${SRC[$j]} 50 | done 51 | done 52 | 53 | for graph in "${GRAPHS[@]}"; do 54 | for ((i = 1; i <= RUNS; i++)); do 55 | ./wcc.sh "${graph}-adj-split/" ${MACHINES} 56 | done 57 | done 58 | 59 | #for graph in "${GRAPHS[@]}"; do 60 | # for ((i = 1; i <= RUNS; i++)); do 61 | # ./dimest.sh "${graph}-adj-split/" ${MACHINES} 62 | # done 63 | #done 64 | 65 | ################# 66 | # Async Run 67 | ################# 68 | for j in "${!GRAPHS[@]}"; do 69 | for ((i = 1; i <= RUNS; i++)); do 70 | ./pagerank.sh "${GRAPHS[$j]}-adj-split/" ${MACHINES} 1 ${TOL[$j]} 71 | done 72 | done 73 | 74 | for j in "${!GRAPHS[@]}"; do 75 | for ((i = 1; i <= RUNS; i++)); do 76 | ./sssp.sh "${GRAPHS[$j]}-adj-split/" ${MACHINES} 1 ${SRC[$j]} 77 | done 78 | done 79 | 80 | # no WCC 81 | # no dimest -------------------------------------------------------------------------------- /benchmark/graphlab/dimest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 2 ]; then 4 | echo "usage: $0 input-graph machines" 5 | exit -1 6 | fi 7 | 8 | source ../common/get-dirs.sh 9 | 10 | # place input in /user/${USER}/input/ 11 | # output is in /user/${USER}/graphlab-output/ 12 | inputgraph=$(basename $1) 13 | outputdir=/user/${USER}/graphlab-output/ 14 | hadoop dfs -rmr "$outputdir" || true 15 | 16 | hdfspath=$(grep hdfs "$HADOOP_DIR"/conf/core-site.xml | sed -e 's/.*//' -e 's@.*@@') 17 | 18 | machines=$2 19 | 20 | ## log names 
21 | # diameter estimation only supports synchronous mode
22 | logname=dimest_${inputgraph}_${machines}_0_"$(date +%Y%m%d-%H%M%S)"
23 | logfile=${logname}_time.txt
24 | 
25 | 
26 | ## start logging memory + network usage
27 | ../common/bench-init.sh ${logname}
28 | 
29 | ## start algorithm run
30 | mpiexec -f ./machines -n ${machines} \
31 |     "$GRAPHLAB_DIR"/release/toolkits/graph_analytics/approximate_diameter \
32 |     --format adjgps \
33 |     --graph_opts ingress=random \
34 |     --graph "$hdfspath"/user/${USER}/input/${inputgraph} 2>&1 | tee -a ./logs/${logfile}
35 | # NOTE: no saveprefix option, diameters/results are outputted to time log
36 | 
37 | ## finish logging memory + network usage
38 | ../common/bench-finish.sh ${logname}
-------------------------------------------------------------------------------- /benchmark/graphlab/init.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | # Initiate GraphLab by creating machine file.
4 | # The contents actually correspond to physical machines.
5 | 
6 | cd "$(dirname "${BASH_SOURCE[0]}")"
7 | source ../common/get-hosts.sh
8 | 
9 | # create machines file
10 | rm -f machines
11 | 
12 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do
13 |     echo "${CLUSTER_NAME}${i}" >> machines
14 | done
-------------------------------------------------------------------------------- /benchmark/graphlab/pagerank.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | if [ $# -ne 4 ]; then
4 |     echo "usage: $0 input-graph machines engine-mode tolerance"
5 |     echo ""
6 |     echo "engine-mode: 0 for synchronous engine"
7 |     echo "             1 for asynchronous engine"
8 |     exit -1
9 | fi
10 | 
11 | source ../common/get-dirs.sh
12 | 
13 | # place input in /user/${USER}/input/
14 | # output is in /user/${USER}/graphlab-output/
15 | inputgraph=$(basename $1)
16 | outputdir=/user/${USER}/graphlab-output/
17 | hadoop dfs -rmr "$outputdir" || true
18 | 
19 | hdfspath=$(grep hdfs "$HADOOP_DIR"/conf/core-site.xml | sed -e 's/.*<value>//' -e 's@</value>.*@@')
20 | 
21 | machines=$2
22 | 
23 | mode=$3
24 | case ${mode} in
25 |     0) modeflag="sync";;
26 |     1) modeflag="async";;
27 |     *) echo "Invalid engine-mode"; exit -1;;
28 | esac
29 | 
30 | tol=$4
31 | 
32 | ## log names
33 | logname=pagerank_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)"
34 | logfile=${logname}_time.txt
35 | 
36 | 
37 | ## start logging memory + network usage
38 | ../common/bench-init.sh ${logname}
39 | 
40 | ## start algorithm run
41 | mpiexec -f ./machines -n ${machines} \
42 |     "$GRAPHLAB_DIR"/release/toolkits/graph_analytics/pagerank \
43 |     --tol ${tol} \
44 |     --engine ${modeflag} \
45 |     --format adjgps \
46 |     --graph_opts ingress=random \
47 |     --graph "$hdfspath"/user/${USER}/input/${inputgraph} \
48 |     --saveprefix "$hdfspath"/"$outputdir" 2>&1 | tee -a ./logs/${logfile}
49 | 
50 | ## finish logging memory + network usage
51 | ../common/bench-finish.sh ${logname}
-------------------------------------------------------------------------------- /benchmark/graphlab/recompile-graphlab.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | commondir=$(dirname "${BASH_SOURCE[0]}")/../common
4 | source "$commondir"/get-hosts.sh
5 | source "$commondir"/get-dirs.sh
6 | 
7 | # recompile GraphLab
8 | cd "$GRAPHLAB_DIR"/release/toolkits/graph_analytics/
9 | make -j $(nproc)
10 | 
11 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do
12 |     # NOTE: only copy binaries that will actually be used... it takes too long otherwise
13 |     scp ./pagerank ${CLUSTER_NAME}${i}:"$GRAPHLAB_DIR"/release/toolkits/graph_analytics/ &
14 |     scp ./sssp ${CLUSTER_NAME}${i}:"$GRAPHLAB_DIR"/release/toolkits/graph_analytics/ &
15 |     scp ./connected_component ${CLUSTER_NAME}$i:"$GRAPHLAB_DIR"/release/toolkits/graph_analytics/ &
16 |     scp ./approximate_diameter ${CLUSTER_NAME}$i:"$GRAPHLAB_DIR"/release/toolkits/graph_analytics/ &
17 | 
18 |     rsync -avz --exclude '*.make' --exclude '*.cmake' "$GRAPHLAB_DIR"/deps/local/ ${CLUSTER_NAME}${i}:"$GRAPHLAB_DIR"/deps/local
19 | done
20 | wait
21 | 
22 | echo "OK."
-------------------------------------------------------------------------------- /benchmark/graphlab/sssp.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | if [ $# -ne 4 ]; then
4 |     echo "usage: $0 input-graph machines engine-mode source-vertex"
5 |     echo ""
6 |     echo "engine-mode: 0 for synchronous engine"
7 |     echo "             1 for asynchronous engine"
8 |     exit -1
9 | fi
10 | 
11 | source ../common/get-dirs.sh
12 | 
13 | # place input in /user/${USER}/input/
14 | # output is in /user/${USER}/graphlab-output/
15 | inputgraph=$(basename $1)
16 | outputdir=/user/${USER}/graphlab-output/
17 | hadoop dfs -rmr "$outputdir" || true
18 | 
19 | hdfspath=$(grep hdfs "$HADOOP_DIR"/conf/core-site.xml | sed -e 's/.*<value>//' -e 's@</value>.*@@')
20 | 
21 | machines=$2
22 | 
23 | mode=$3
24 | case ${mode} in
25 |     0) modeflag="sync";;
26 |     1) modeflag="async";;
27 |     *) echo "Invalid engine-mode"; exit -1;;
28 | esac
29 | 
30 | src=$4
31 | 
32 | ## log names
33 | logname=sssp_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)"
34 | logfile=${logname}_time.txt
35 | 
36 | 
37 | ## start logging memory + network usage
38 | ../common/bench-init.sh ${logname}
39 | 
40 | ## start algorithm run
41 | mpiexec -f ./machines -n ${machines} \
42 |     "$GRAPHLAB_DIR"/release/toolkits/graph_analytics/sssp \
43 |     --source ${src} \
44 |     --directed 1 \
45 |     --engine ${modeflag} \
46 |     --format adjgps \
47 |     --graph_opts ingress=random \
48 |     --graph "$hdfspath"/user/${USER}/input/${inputgraph} \
49 |     --saveprefix "$hdfspath"/"$outputdir" 2>&1 | tee -a ./logs/${logfile}
50 | 
51 | ## finish logging memory + network usage
52 | ../common/bench-finish.sh ${logname}
-------------------------------------------------------------------------------- /benchmark/graphlab/wcc.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | if [ $# -ne 2 ]; then
4 |     echo "usage: $0 input-graph machines"
5 |     exit -1
6 | fi
7 | 
8 | source ../common/get-dirs.sh
9 | 
10 | # place input in /user/${USER}/input/
11 | # output is in /user/${USER}/graphlab-output/
12 | inputgraph=$(basename $1)
13 | outputdir=/user/${USER}/graphlab-output/
14 | hadoop dfs -rmr "$outputdir" || true
15 | 
16 | hdfspath=$(grep hdfs "$HADOOP_DIR"/conf/core-site.xml | sed -e 's/.*<value>//' -e 's@</value>.*@@')
17 | 
18 | machines=$2
19 | 
20 | ## log names
21 | # WCC only supports synchronous mode
22 | logname=wcc_${inputgraph}_${machines}_0_"$(date +%Y%m%d-%H%M%S)"
23 | logfile=${logname}_time.txt
24 | 
25 | 
26 | ## start logging memory + network usage
27 | ../common/bench-init.sh ${logname}
28 | 
29 | ## start algorithm run
30 | mpiexec -f ./machines -n ${machines} \
31 |     "$GRAPHLAB_DIR"/release/toolkits/graph_analytics/connected_component \
32 |     --format adjgps \
33 |     --graph_opts ingress=random \
34 |     --graph "$hdfspath"/user/${USER}/input/${inputgraph} \
35 |     --saveprefix "$hdfspath"/"$outputdir" 2>&1 | tee -a ./logs/${logfile}
36 | 
37 | ## finish logging memory + network usage
38 | ../common/bench-finish.sh ${logname}
-------------------------------------------------------------------------------- /benchmark/hadoop/init.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | # Initiate Hadoop by preparing the necessary config files
4 | # and copying them to all worker machines.
5 | #
6 | # To change the max JVM heap size for Hadoop mappers
7 | # (which will only affect Giraph), see ./get-configs.sh.
8 | #
9 | # NOTE: if testing on a single machine (i.e., pseudo-distributed),
10 | # slaves will have to be edited manually.
11 | 
12 | commondir=$(dirname "${BASH_SOURCE[0]}")/../common
13 | source "$commondir"/get-hosts.sh
14 | source "$commondir"/get-dirs.sh
15 | source "$commondir"/get-configs.sh
16 | 
17 | cd "$HADOOP_DIR/conf/"
18 | 
19 | 
20 | # masters and slaves
21 | echo "${HOSTNAME}" > masters
22 | 
23 | rm -f slaves
24 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do
25 |     echo "${CLUSTER_NAME}${i}" >> slaves
26 | done
27 | 
28 | 
29 | # core-site.xml
30 | echo "<?xml version=\"1.0\"?>
31 | <?xml-stylesheet type=\"text/xsl\" href=\"configuration.xsl\"?>
32 | 
33 | <!-- Put site-specific property overrides in this file. -->
34 | 
35 | <configuration>
36 | <property>
37 |   <name>hadoop.tmp.dir</name>
38 |   <value>${HADOOP_DATA_DIR}/hadoop_tmp-\${user.name}</value>
39 | </property>
40 | <property>
41 |   <name>fs.default.name</name>
42 |   <value>hdfs://${HOSTNAME}:54310</value>
43 | </property>
44 | <property>
45 |   <name>fs.checkpoint.edits.dir</name>
46 |   <value>${HADOOP_DATA_DIR}/hadoop_checkpoint-\${user.name}</value>
47 | </property>
48 | </configuration>" > core-site.xml
49 | 
50 | 
51 | # hdfs-site.xml (not really needed, but here it is)
52 | echo '<?xml version="1.0"?>
53 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
54 | 
55 | <configuration>
56 | 
57 | <property>
58 |   <name>dfs.replication</name>
59 |   <value>1</value>
60 | </property>
61 | <property>
62 |   <name>dfs.permissions</name>
63 |   <value>false</value>
64 | </property>
65 | </configuration>' > hdfs-site.xml
66 | 
67 | 
68 | # mapred-site.xml
69 | echo "<?xml version=\"1.0\"?>
70 | <?xml-stylesheet type=\"text/xsl\" href=\"configuration.xsl\"?>
71 | 
72 | <!-- Put site-specific property overrides in this file. -->
73 | 
74 | <configuration>
75 | <property>
76 |   <name>mapred.job.tracker</name>
77 |   <value>${HOSTNAME}:54311</value>
78 | </property>
79 | <property>
80 |   <name>mapred.local.dir</name>
81 |   <value>${HADOOP_DATA_DIR}/hadoop_local-\${user.name}</value>
82 | </property>
83 | <property>
84 |   <name>mapred.child.tmp</name>
85 |   <value>${HADOOP_DATA_DIR}/hadoop_child-\${user.name}</value>
86 | </property>
87 | <property>
88 |   <name>mapred.job.tracker.persist.jobstatus.dir</name>
89 |   <value>/home/${USER}/hadoop_jobstatus-\${user.name}</value>
90 | </property>
91 | <property>
92 |   <name>mapred.tasktracker.map.tasks.maximum</name>
93 |   <value>5</value>
94 | </property>
95 | <property>
96 |   <name>mapred.tasktracker.reduce.tasks.maximum</name>
97 |   <value>5</value>
98 | </property>
99 | <property>
100 |   <name>mapred.map.tasks</name>
101 |   <value>5</value>
102 | </property>
103 | <property>
104 |   <name>mapred.reduce.tasks</name>
105 |   <value>10</value>
106 | </property>
107 | <property>
108 |   <name>mapreduce.job.counters.max</name>
109 |   <value>1000000</value>
110 | </property>
111 | <property>
112 |   <name>mapreduce.job.counters.limit</name>
113 |   <value>1000000</value>
114 | </property>
115 | <property>
116 |   <name>mapred.child.java.opts</name>
117 |   <value>-Xmx${GIRAPH_XMX}</value>
118 | </property>
119 | </configuration>" > mapred-site.xml
120 | 
121 | 
122 | # copy configs to worker machines
123 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do
124 |     rsync -avz ./* ${CLUSTER_NAME}${i}:"$HADOOP_DIR"/conf/ &
125 | done
126 | wait
-------------------------------------------------------------------------------- /benchmark/hadoop/restart-hadoop.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Restarts Hadoop and kills any lingering Java processes.
4 | # This is indiscriminate---it will kill ALL Java processes.
5 | #
6 | # NOTE: To programmatically detect when Hadoop is up, use
7 | # "hadoop dfsadmin -safemode wait" or pass in "1" as arg.
8 | #
9 | # usage: ./restart-hadoop.sh [wait?]
10 | #
11 | # wait: 0 for no wait, 1 to wait for Hadoop to start
12 | 
13 | source "$(dirname "${BASH_SOURCE[0]}")"/../common/get-hosts.sh
14 | 
15 | stop-all.sh
16 | 
17 | # do a kill on the master separately---this is useful when testing on a single machine
18 | kill -9 $(pgrep java)
19 | 
20 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do
21 |     ssh ${CLUSTER_NAME}${i} "kill -9 \$(pgrep java)" &
22 | done
23 | wait
24 | 
25 | start-all.sh
26 | 
27 | if [[ $# -eq 1 && $1 -eq 1 ]]; then
28 |     # wait until Hadoop is up
29 |     hadoop dfsadmin -safemode wait
30 | fi
-------------------------------------------------------------------------------- /benchmark/init-all.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | # Initialize Hadoop and all systems.
4 | #
5 | # NOTE: before doing this, ensure:
6 | # 1. All machines have correct hostnames, /etc/hostname, and /etc/hosts
7 | # 2. Master has correct JVM Xmx size set for Giraph and GPS
8 | #
9 | # For (1), see ../ec2/uw-ec2.py init
10 | # For (2), see ./common/get-configs.sh
11 | #
12 | # To check connectivity, use ./common/ssh-check.sh
13 | 
14 | cd "$(dirname "${BASH_SOURCE[0]}")"
15 | source ./common/get-hosts.sh
16 | source ./common/get-dirs.sh
17 | 
18 | # remove known_hosts (kills stale fingerprints)
19 | echo "Removing known_hosts..."
20 | rm -f ~/.ssh/known_hosts
21 | 
22 | echo "Creating known_hosts..."
23 | for ((i = 0; i <= ${NUM_MACHINES}; i++)); do
24 |     ssh -q -o StrictHostKeyChecking=no ${CLUSTER_NAME}${i} "exit" &
25 | done
26 | wait
27 | 
28 | echo "Updating Hadoop configs..."
29 | ./hadoop/init.sh > /dev/null    # quiet
30 | 
31 | 
32 | ###############
33 | # Hadoop
34 | ###############
35 | # remove old HDFS data (on master and worker machines)
36 | # NOTE: removing HDFS folder will kill targets of symlinks in logs/userlogs/
37 | echo "Removing old HDFS data and Hadoop logs..."
38 | 
39 | stop-all.sh > /dev/null    # just in case anything is running
40 | 
41 | for ((i = 0; i <= ${NUM_MACHINES}; i++)); do
42 |     ssh ${CLUSTER_NAME}${i} "rm -rf \"$HADOOP_DATA_DIR\"; rm -rf \"$HADOOP_DIR\"/logs/*" &
43 | done
44 | wait
45 | 
46 | # create new HDFS & start Hadoop
47 | echo "Creating new HDFS..."
48 | hadoop namenode -format
49 | 
50 | echo "Starting up Hadoop..."
51 | start-all.sh
52 | 
53 | # wait until Hadoop starts up (HDFS exits safemode)
54 | echo "Waiting for Hadoop to start..."
55 | hadoop dfsadmin -safemode wait > /dev/null
56 | 
57 | 
58 | ###############
59 | # Systems
60 | ###############
61 | # nothing to do for Giraph
62 | 
63 | echo "Initializing GPS..."
64 | ./gps/init.sh
65 | 
66 | echo "Initializing GraphLab..."
67 | ./graphlab/init.sh
68 | 
69 | echo "Initializing Mizan..."
70 | ./mizan/init.sh
71 | 
72 | 
73 | ###############
74 | # Datasets
75 | ###############
76 | hadoop dfs -mkdir ./input || true
77 | #echo "Loading datasets..."
78 | #./datasets/load-files.sh
-------------------------------------------------------------------------------- /benchmark/local-init.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | # Initialize Hadoop and all systems for local testing.
4 | #
5 | # This is for LOCAL TESTING only!! Ensure that:
6 | # 1. LOCAL_MACHINES is the number of pseudo-machines you want.
7 | # 2. ./common/get-dirs.sh has a correct DIR_PREFIX
8 | # 3. ./common/get-configs.sh has correct JVM Xmx sizes
9 | 
10 | # number of pseudo machines to use
11 | # adjust JVM Xmx accordingly to avoid running out of memory!
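# [Illustrative note, not in the original script:] e.g., with LOCAL_MACHINES=4
# on a 16 GB machine, a mapper heap of roughly 3G (GIRAPH_XMX, which
# hadoop/init.sh writes into mapred.child.java.opts) leaves headroom for the
# OS and the HDFS daemons. The exact numbers here are only an example.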
12 | LOCAL_MACHINES=1 13 | 14 | 15 | cd "$(dirname "${BASH_SOURCE[0]}")" 16 | source ./common/get-dirs.sh 17 | source ./common/get-configs.sh 18 | 19 | echo "Generating get-hosts.sh..." 20 | echo '#!/bin/bash 21 | 22 | # Set the prefix name and number of slaves/worker machines. 23 | # NOTE: This file is automatically generated by local-init.sh! 24 | 25 | HOSTNAME=$(hostname) 26 | CLUSTER_NAME=HOSTNAME 27 | NUM_MACHINES=0' > ./common/get-hosts.sh 28 | 29 | source ./common/get-hosts.sh 30 | 31 | 32 | echo "Updating Hadoop configs..." 33 | ./hadoop/init.sh > /dev/null # quiet 34 | 35 | # for local testing, need to create slave manually 36 | rm -f "$HADOOP_DIR"/conf/slaves 37 | for ((i = 1; i <= ${LOCAL_MACHINES}; i++)); do 38 | echo "localhost" >> "$HADOOP_DIR"/conf/slaves 39 | done 40 | 41 | ############### 42 | # Hadoop 43 | ############### 44 | # remove old HDFS data (on master and worker machines) 45 | # NOTE: removing HDFS folder will kill targets of symlinks in logs/userlogs/ 46 | echo "Removing old HDFS data and Hadoop logs..." 47 | 48 | stop-all.sh > /dev/null # just in case anything is running 49 | 50 | rm -rf "$HADOOP_DATA_DIR" 51 | rm -rf "$HADOOP_DIR"/logs/* 52 | 53 | # create new HDFS & start Hadoop 54 | echo "Creating new HDFS..." 55 | hadoop namenode -format 56 | 57 | echo "Starting up Hadoop..." 58 | start-all.sh 59 | 60 | # wait until Hadoop starts up (HDFS exits safemode) 61 | echo "Waiting for Hadoop to start..." 62 | hadoop dfsadmin -safemode wait > /dev/null 63 | 64 | # NOTE: for some reason HDFS is still not ready after safemode is off, 65 | # so sleep for 30s to ensure GPS init will succeed 66 | sleep 30 67 | 68 | ############### 69 | # Systems 70 | ############### 71 | # NOTE: we're duplicating each system's init.sh file... 72 | # It's a little messy but avoids cluttering up the existing files 73 | 74 | # nothing to do for Giraph 75 | 76 | echo "Initializing GPS..." 77 | rm -f ./gps/slaves 78 | rm -f ./gps/machine.cfg 79 | 80 | # create slaves file 81 | for ((i = 1; i <= ${LOCAL_MACHINES}; i++)); do 82 | for ((j = 1; j <= ${GPS_WPM}; j++)); do 83 | echo "localhost" >> ./gps/slaves 84 | done 85 | done 86 | 87 | # create machine config file 88 | echo "-1 ${HOSTNAME} 64000" >> ./gps/machine.cfg 89 | 90 | w_id=0 # worker counter (needed if workers per pseudo-machine > 1) 91 | for ((i = 1; i <= ${LOCAL_MACHINES}; i++)); do 92 | for ((j = 1; j <= ${GPS_WPM}; j++)); do 93 | echo "${w_id} localhost $((64001 + ${w_id}))" >> ./gps/machine.cfg 94 | w_id=$((w_id+1)) 95 | done 96 | done 97 | 98 | hadoop dfs -rmr /user/${USER}/gps-machine-config/ || true 99 | hadoop dfs -mkdir /user/${USER}/gps-machine-config/ 100 | hadoop dfs -put ./gps/machine.cfg /user/${USER}/gps-machine-config/ 101 | if [[ ! -d "$GPS_LOG_DIR" ]]; then mkdir -p "$GPS_LOG_DIR"; fi 102 | 103 | 104 | echo "Initializing GraphLab..." 105 | rm -f ./graphlab/machines 106 | for ((i = 1; i <= ${LOCAL_MACHINES}; i++)); do 107 | echo "localhost" >> ./graphlab/machines 108 | done 109 | 110 | echo "Initializing Mizan..." 111 | rm -f ./mizan/slaves 112 | for ((i = 1; i <= ${LOCAL_MACHINES}; i++)); do 113 | for ((j = 1; j <= ${MIZAN_WPM}; j++)); do 114 | echo "localhost" >> ./mizan/slaves 115 | done 116 | done 117 | 118 | ############### 119 | # Datasets 120 | ############### 121 | hadoop dfs -mkdir ./input || true 122 | #echo "Loading datasets..." 
123 | #./datasets/load-files.sh -------------------------------------------------------------------------------- /benchmark/mizan/benchall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 2 ]; then 4 | echo "usage: $0 machines runs" 5 | echo "" 6 | echo "machines: 4, 8, 16, 32, 64, or 128" 7 | exit -1 8 | fi 9 | 10 | cd "$(dirname "${BASH_SOURCE[0]}")" 11 | 12 | MACHINES=$1 13 | RUNS=$2 14 | 15 | case ${MACHINES} in 16 | 4) GRAPHS=(amazon google patents); 17 | SRC=(0 0 6009554);; # for SSSP 18 | 8) GRAPHS=(amazon google patents); 19 | SRC=(0 0 6009554);; 20 | 16) GRAPHS=(livejournal orkut arabic); 21 | SRC=(0 1 3);; 22 | 32) GRAPHS=(livejournal orkut arabic); 23 | SRC=(0 1 3);; 24 | 64) GRAPHS=(livejournal orkut arabic); 25 | SRC=(0 1 3);; 26 | 128) GRAPHS=(livejournal orkut arabic twitter); 27 | SRC=(0 1 3 0);; 28 | *) echo "Invalid machines"; exit -1;; 29 | esac 30 | 31 | 32 | ################## 33 | # Premizan 34 | ################## 35 | for graph in "${GRAPHS[@]}"; do 36 | for ((i = 1; i <= RUNS; i++)); do 37 | ./premizan.sh "${graph}.txt" ${MACHINES} 1 38 | done 39 | done 40 | 41 | ################## 42 | # Static run 43 | ################## 44 | # we split the algs up for clarity 45 | for graph in "${GRAPHS[@]}"; do 46 | for ((i = 1; i <= RUNS; i++)); do 47 | ./pagerank.sh "${graph}.txt" ${MACHINES} 0 48 | done 49 | done 50 | 51 | for j in "${!GRAPHS[@]}"; do 52 | for ((i = 1; i <= RUNS; i++)); do 53 | ./sssp.sh "${GRAPHS[$j]}.txt" ${MACHINES} 0 ${SRC[$j]} 54 | done 55 | done 56 | 57 | for graph in "${GRAPHS[@]}"; do 58 | for ((i = 1; i <= RUNS; i++)); do 59 | ./wcc.sh "${graph}.txt" ${MACHINES} 0 60 | done 61 | done 62 | 63 | # MST does not work (issues w/ aggregators + graph mutation in 0.1bu1) 64 | #for graph in "${GRAPHS[@]}"; do 65 | # for ((i = 1; i <= RUNS; i++)); do 66 | # ./mst.sh "${graph}-mst.txt" ${MACHINES} 0 67 | # done 68 | #done 69 | 70 | #for graph in "${GRAPHS[@]}"; do 71 | # for ((i = 1; i <= RUNS; i++)); do 72 | # ./dimest.sh "${graph}.txt" ${MACHINES} 0 73 | # done 74 | #done 75 | 76 | ## Other Mizan modes aren't working correctly, 77 | ## so we cannot test them -------------------------------------------------------------------------------- /benchmark/mizan/dimest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "usage: $0 input-graph machines migration-mode" 5 | echo "" 6 | echo "migration-mode: 0 for static (no dynamic migration)" 7 | echo " 1 for delayed migration" 8 | echo " 2 for mixed migration" 9 | exit -1 10 | fi 11 | 12 | source ../common/get-dirs.sh 13 | source ../common/get-configs.sh 14 | 15 | # place input into /user/${USER}/input/ (this is where preMizan looks) 16 | # output of preMizan is in /user/${USER}/m_output/mizan_${inputgraph}_mhash_${workers}/ 17 | # (or _mrange_${workers} if using range partitioning) 18 | # output of algorithm is in /user/${USER}/mizan-output/ 19 | inputgraph=$(basename $1) 20 | 21 | # we can have multiple workers per machine 22 | machines=$2 23 | workers=$(($machines * $MIZAN_WPM)) 24 | 25 | mode=$3 26 | case ${mode} in 27 | 0) modeflag="1";; 28 | 1) modeflag="2";; 29 | 2) modeflag="3";; 30 | *) echo "Invalid migration-mode"; exit -1;; 31 | esac 32 | 33 | ## log names 34 | logname=dimest_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)" 35 | logfile=${logname}_time.txt # Mizan stats (incl. 
running time) 36 | 37 | 38 | ## start logging memory + network usage 39 | ../common/bench-init.sh ${logname} 40 | 41 | ## start algorithm run 42 | mpirun -f slaves -np ${workers} "$MIZAN_DIR"/Release/Mizan-0.1b \ 43 | -a 3 \ 44 | -s 30 \ 45 | -u ${USER} \ 46 | -g ${inputgraph} \ 47 | -w ${workers} \ 48 | -m ${modeflag} 2>&1 | tee -a ./logs/${logfile} 49 | 50 | ## finish logging memory + network usage 51 | ../common/bench-finish.sh ${logname} -------------------------------------------------------------------------------- /benchmark/mizan/init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Initiate Mizan by creating machine file. 4 | 5 | cd "$(dirname "${BASH_SOURCE[0]}")" 6 | source ../common/get-hosts.sh 7 | source ../common/get-configs.sh 8 | 9 | # create slaves file 10 | rm -f slaves 11 | 12 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do 13 | for ((j = 1; j <= ${MIZAN_WPM}; j++)); do 14 | echo "${CLUSTER_NAME}${i}" >> slaves 15 | done 16 | done -------------------------------------------------------------------------------- /benchmark/mizan/mst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "usage: $0 input-graph machines migration-mode" 5 | echo "" 6 | echo "migration-mode: 0 for static (no dynamic migration)" 7 | echo " 1 for delayed migration" 8 | echo " 2 for mixed migration" 9 | exit -1 10 | fi 11 | 12 | source ../common/get-dirs.sh 13 | source ../common/get-configs.sh 14 | 15 | # place input into /user/${USER}/input/ (this is where preMizan looks) 16 | # output of preMizan is in /user/${USER}/m_output/mizan_${inputgraph}_mhash_${workers}/ 17 | # (or _mrange_${workers} if using range partitioning) 18 | # output of algorithm is in /user/${USER}/mizan-output/ 19 | inputgraph=$(basename $1) 20 | 21 | # we can have multiple workers per machine 22 | machines=$2 23 | workers=$(($machines * $MIZAN_WPM)) 24 | 25 | mode=$3 26 | case ${mode} in 27 | 0) modeflag="1";; 28 | 1) modeflag="2";; 29 | 2) modeflag="3";; 30 | *) echo "Invalid migration-mode"; exit -1;; 31 | esac 32 | 33 | ## log names 34 | logname=mst_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)" 35 | logfile=${logname}_time.txt # Mizan stats (incl. 
running time) 36 | 37 | 38 | ## start logging memory + network usage 39 | ../common/bench-init.sh ${logname} 40 | 41 | ## start algorithm run 42 | mpirun -f slaves -np ${workers} "$MIZAN_DIR"/Release/Mizan-0.1b \ 43 | -a 7 \ 44 | -u ${USER} \ 45 | -g ${inputgraph} \ 46 | -w ${workers} \ 47 | -m ${modeflag} 2>&1 | tee -a ./logs/${logfile} 48 | 49 | ## finish logging memory + network usage 50 | ../common/bench-finish.sh ${logname} -------------------------------------------------------------------------------- /benchmark/mizan/pagerank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "usage: $0 input-graph machines migration-mode" 5 | echo "" 6 | echo "migration-mode: 0 for static (no dynamic migration)" 7 | echo " 1 for delayed migration" 8 | echo " 2 for mixed migration" 9 | exit -1 10 | fi 11 | 12 | source ../common/get-dirs.sh 13 | source ../common/get-configs.sh 14 | 15 | # place input into /user/${USER}/input/ (this is where preMizan looks) 16 | # output of preMizan is in /user/${USER}/m_output/mizan_${inputgraph}_mhash_${workers}/ 17 | # (or _mrange_${workers} if using range partitioning) 18 | # output of algorithm is in /user/${USER}/mizan-output/ 19 | inputgraph=$(basename $1) 20 | 21 | # we can have multiple workers per machine 22 | machines=$2 23 | workers=$(($machines * $MIZAN_WPM)) 24 | 25 | mode=$3 26 | case ${mode} in 27 | 0) modeflag="1";; 28 | 1) modeflag="2";; 29 | 2) modeflag="3";; 30 | *) echo "Invalid migration-mode"; exit -1;; 31 | esac 32 | 33 | ## log names 34 | logname=pagerank_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)" 35 | logfile=${logname}_time.txt # Mizan stats (incl. running time) 36 | 37 | 38 | ## start logging memory + network usage 39 | ../common/bench-init.sh ${logname} 40 | 41 | ## start algorithm run 42 | mpirun -f slaves -np ${workers} "$MIZAN_DIR"/Release/Mizan-0.1b \ 43 | -a 1 \ 44 | -s 30 \ 45 | -u ${USER} \ 46 | -g ${inputgraph} \ 47 | -w ${workers} \ 48 | -m ${modeflag} 2>&1 | tee -a ./logs/${logfile} 49 | 50 | ## finish logging memory + network usage 51 | ../common/bench-finish.sh ${logname} -------------------------------------------------------------------------------- /benchmark/mizan/premizan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Performs Mizan's prepartitioning phase. This is mandatory as 4 | # Mizan expects input to be pre-partitioned in a specific way. 
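# [Illustrative example, not in the original script.] A typical hash-partitioned
# run for 8 machines would be:
#   ./premizan.sh google.txt 8 1
# which, per the path comments below, should leave the partitions in HDFS under
# /user/$USER/m_output/mizan_google.txt_mhash_<workers>/ (the exact suffix
# depends on MIZAN_WPM).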
5 | 
6 | # partition type is either 1 (hash) or 2 (range)
7 | if [ $# -ne 3 ]; then
8 |     echo "usage: $0 input-graph machines partition-type"
9 |     echo ""
10 |     echo "partition-type: 1 for hash partitioning"
11 |     echo "                2 for range partitioning"
12 |     exit -1
13 | fi
14 | 
15 | source ../common/get-dirs.sh
16 | source ../common/get-configs.sh
17 | 
18 | # absolute path to this script's location
19 | scriptdir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
20 | 
21 | 
22 | # place input into /user/${USER}/input/ (this is where preMizan looks)
23 | # output of preMizan is in /user/${USER}/m_output/mizan_${inputgraph}_mhash_${workers}/
24 | # (or _mrange_${workers} if using range partitioning)
25 | inputgraph=$(basename $1)
26 | 
27 | # we can have multiple workers per machine
28 | machines=$2
29 | workers=$(($machines * $MIZAN_WPM))
30 | 
31 | ## log names
32 | logname=premizan_${inputgraph}_${machines}_${3}_"$(date +%Y%m%d-%H%M%S)"
33 | logfile=${logname}_time.txt
34 | 
35 | ## start logging memory + network usage
36 | ../common/bench-init.sh ${logname}
37 | 
38 | cd "$MIZAN_DIR"/preMizan/hadoopScripts/
39 | 
40 | ## start premizan conversion
41 | tstart="$(date +%s%N)"
42 | 
43 | # taken from preMizan/preMizan.sh
44 | case $3 in
45 |     [1]*) ./hadoop_run_modhash.sh $inputgraph ${workers} true 2>&1 | tee -a "$scriptdir"/logs/${logfile};;
46 |     [2]*) ./hadoop_run_range.sh $inputgraph ${workers} true 2>&1 | tee -a "$scriptdir"/logs/${logfile};;
47 |     *) echo "Error: invalid partition type!"; exit -1;;
48 | esac
49 | 
50 | tdone="$(date +%s%N)"
51 | 
52 | cd "$scriptdir"
53 | 
54 | echo "" | tee -a ./logs/${logfile}
55 | echo "TOTAL TIME (ns): $(($tdone - $tstart))" | tee -a ./logs/${logfile}
56 | echo "TOTAL TIME (sec): $(perl -e "print $(($tdone - $tstart))/1000000000")" | tee -a ./logs/${logfile}
57 | 
58 | ## finish logging memory + network usage
59 | ../common/bench-finish.sh ${logname}
-------------------------------------------------------------------------------- /benchmark/mizan/recompile-mizan.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | commondir=$(dirname "${BASH_SOURCE[0]}")/../common
4 | source "$commondir"/get-hosts.sh
5 | source "$commondir"/get-dirs.sh
6 | 
7 | # recompile Mizan
8 | touch "$MIZAN_DIR"/src/main.cpp
9 | cd "$MIZAN_DIR/Release"
10 | make all
11 | 
12 | for ((i = 1; i <= ${NUM_MACHINES}; i++)); do
13 |     scp ./Mizan-0.1b ${CLUSTER_NAME}${i}:"$MIZAN_DIR"/Release/ &
14 | done
15 | wait
16 | 
17 | echo "OK."
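(Editorial sketch, not a file from the repository.) The per-algorithm Mizan wrappers (dimest.sh, mst.sh, pagerank.sh, sssp.sh, wcc.sh) all share the same skeleton: compute workers = machines x MIZAN_WPM, map the benchmark's migration-mode argument (0/1/2) to Mizan's -m flag (1/2/3), and bracket the mpirun with bench-init.sh/bench-finish.sh. A hypothetical consolidated runner capturing that shared logic could look like the following; the -a algorithm codes (1 = PageRank, 3 = diameter estimation, 5 = SSSP, 6 = WCC, 7 = MST) are taken from the scripts themselves, and everything else mirrors them.

    #!/bin/bash -e
    # run-mizan.sh (hypothetical consolidation of the wrappers above)
    # usage: ./run-mizan.sh alg-code input-graph machines migration-mode [extra-args...]

    source ../common/get-dirs.sh
    source ../common/get-configs.sh

    alg=$1
    inputgraph=$(basename "$2")
    machines=$3
    workers=$(($machines * $MIZAN_WPM))    # multiple workers per machine

    mode=$4
    case ${mode} in                        # benchmark mode -> Mizan -m flag
        0) modeflag="1";;                  # static (no dynamic migration)
        1) modeflag="2";;                  # delayed migration
        2) modeflag="3";;                  # mixed migration
        *) echo "Invalid migration-mode"; exit -1;;
    esac
    shift 4                                # pass remaining args (e.g., -s 30) through

    logname=alg${alg}_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)"
    logfile=${logname}_time.txt

    ../common/bench-init.sh ${logname}
    mpirun -f slaves -np ${workers} "$MIZAN_DIR"/Release/Mizan-0.1b \
        -a ${alg} \
        -u ${USER} \
        -g ${inputgraph} \
        -w ${workers} \
        -m ${modeflag} \
        "$@" 2>&1 | tee -a ./logs/${logfile}
    ../common/bench-finish.sh ${logname}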
-------------------------------------------------------------------------------- /benchmark/mizan/sssp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 4 ]; then 4 | echo "usage: $0 input-graph machines migration-mode source-vertex" 5 | echo "" 6 | echo "migration-mode: 0 for static (no dynamic migration)" 7 | echo " 1 for delayed migration" 8 | echo " 2 for mixed migration" 9 | exit -1 10 | fi 11 | 12 | source ../common/get-dirs.sh 13 | source ../common/get-configs.sh 14 | 15 | # place input into /user/${USER}/input/ (this is where preMizan looks) 16 | # output of preMizan is in /user/${USER}/m_output/mizan_${inputgraph}_mhash_${workers}/ 17 | # (or _mrange_${workers} if using range partitioning) 18 | # output of algorithm is in /user/${USER}/mizan-output/ 19 | inputgraph=$(basename $1) 20 | 21 | # we can have multiple workers per machine 22 | machines=$2 23 | workers=$(($machines * $MIZAN_WPM)) 24 | 25 | mode=$3 26 | case ${mode} in 27 | 0) modeflag="1";; 28 | 1) modeflag="2";; 29 | 2) modeflag="3";; 30 | *) echo "Invalid migration-mode"; exit -1;; 31 | esac 32 | 33 | src=$4 34 | 35 | ## log names 36 | logname=sssp_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)" 37 | logfile=${logname}_time.txt # Mizan stats (incl. running time) 38 | 39 | 40 | ## start logging memory + network usage 41 | ../common/bench-init.sh ${logname} 42 | 43 | ## start algorithm run 44 | mpirun -f slaves -np ${workers} "$MIZAN_DIR"/Release/Mizan-0.1b \ 45 | -a 5 \ 46 | --src ${src} \ 47 | -u ${USER} \ 48 | -g ${inputgraph} \ 49 | -w ${workers} \ 50 | -m ${modeflag} 2>&1 | tee -a ./logs/${logfile} 51 | 52 | ## finish logging memory + network usage 53 | ../common/bench-finish.sh ${logname} -------------------------------------------------------------------------------- /benchmark/mizan/wcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "usage: $0 input-graph machines migration-mode" 5 | echo "" 6 | echo "migration-mode: 0 for static (no dynamic migration)" 7 | echo " 1 for delayed migration" 8 | echo " 2 for mixed migration" 9 | exit -1 10 | fi 11 | 12 | source ../common/get-dirs.sh 13 | source ../common/get-configs.sh 14 | 15 | # place input into /user/${USER}/input/ (this is where preMizan looks) 16 | # output of preMizan is in /user/${USER}/m_output/mizan_${inputgraph}_mhash_${workers}/ 17 | # (or _mrange_${workers} if using range partitioning) 18 | # output of algorithm is in /user/${USER}/mizan-output/ 19 | inputgraph=$(basename $1) 20 | 21 | # we can have multiple workers per machine 22 | machines=$2 23 | workers=$(($machines * $MIZAN_WPM)) 24 | 25 | mode=$3 26 | case ${mode} in 27 | 0) modeflag="1";; 28 | 1) modeflag="2";; 29 | 2) modeflag="3";; 30 | *) echo "Invalid migration-mode"; exit -1;; 31 | esac 32 | 33 | ## log names 34 | logname=wcc_${inputgraph}_${machines}_${mode}_"$(date +%Y%m%d-%H%M%S)" 35 | logfile=${logname}_time.txt # Mizan stats (incl. 
running time)
36 | 
37 | 
38 | ## start logging memory + network usage
39 | ../common/bench-init.sh ${logname}
40 | 
41 | ## start algorithm run
42 | mpirun -f slaves -np ${workers} "$MIZAN_DIR"/Release/Mizan-0.1b \
43 |     -a 6 \
44 |     -u ${USER} \
45 |     -g ${inputgraph} \
46 |     -w ${workers} \
47 |     -m ${modeflag} 2>&1 | tee -a ./logs/${logfile}
48 | 
49 | ## finish logging memory + network usage
50 | ../common/bench-finish.sh ${logname}
-------------------------------------------------------------------------------- /benchmark/parsers/log-checker.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | # Checks to ensure all log files are present.
4 | #
5 | # A simple way to use this is "./log-checker.sh *time.txt".
6 | #
7 | # Note that the *_0_mem.txt matching is useful for spotting failed
8 | # runs too, as bench-init runs before the time log is created.
9 | 
10 | if [ $# -lt 1 ]; then
11 |     echo "usage: $0 time/mem-log [time/mem-log ...]"
12 |     echo ""
13 |     echo "time/mem-log: experiment's time log file OR master's mem file"
14 |     echo "              (e.g. pagerank_orkut-adj.txt_16_20140101-123050_time.txt)"
15 |     echo "              ( OR  pagerank_orkut-adj.txt_16_20140101-123050_0_mem.txt)"
16 |     exit -1
17 | fi
18 | 
19 | dir=$PWD
20 | 
21 | # read args into array of files
22 | read -a FILES <<< $(echo "$@")
23 | 
24 | for file in "${FILES[@]}"; do
25 |     logname=$(echo $(basename "$file") | sed -e 's/_time.txt$//g' -e 's/_0_mem.txt$//g')
26 | 
27 |     # move to where the logs are
28 |     cd "$dir/$(dirname "$file")"
29 | 
30 |     err="$logname\n"
31 |     iserr=0
32 | 
33 |     # check if all files are present
34 |     if [[ ! -f "${logname}_time.txt" ]]; then
35 |         err="$err ERROR: ${logname}_time.txt missing!\n"
36 |         iserr=1
37 |     fi
38 | 
39 |     machines=$(echo "$logname" | sed 's/_/ /g' | awk '{print $3}')
40 | 
41 |     # use independent ifs (not elif) so that every missing file is reported,
42 |     # not just the first one per machine
43 |     for (( i = 0; i <= ${machines}; i++ )); do
44 |         if [[ ! -f "${logname}_${i}_mem.txt" ]]; then
45 |             err="$err ERROR: ${logname}_${i}_mem.txt missing!\n"
46 |             iserr=1
47 |         fi
48 |         if [[ ! -f "${logname}_${i}_nbt.txt" ]]; then
49 |             err="$err ERROR: ${logname}_${i}_nbt.txt missing!\n"
50 |             iserr=1
51 |         fi
52 |         if [[ ! -f "${logname}_${i}_cpu.txt" ]]; then
53 |             err="$err WARNING: ${logname}_${i}_cpu.txt missing!\n"
54 |             iserr=1
55 |         fi
56 |         if [[ ! -f "${logname}_${i}_net.txt" ]]; then
57 |             err="$err WARNING: ${logname}_${i}_net.txt missing!\n"
58 |             iserr=1
59 |         fi
60 |     done
61 | 
62 |     # only print something when there's an error
63 |     if [[ $iserr -eq 1 ]]; then
64 |         echo -e "$err"
65 |     fi
66 | done
-------------------------------------------------------------------------------- /benchmark/readme.txt: --------------------------------------------------------------------------------
1 | =====================================================================
2 | Please see the wiki at http://github.com/xvz/graph-processing/wiki/
3 | =====================================================================
4 | 
5 | Scripts specific to each system and/or Hadoop are located in their respective folders. Scripts common across multiple systems (e.g., pre- and post-benchmarking setup/cleanup scripts) are in "common".
6 | 
7 | All results are stored in ./<system>/logs/, where <system> is giraph, gps, graphlab, or mizan.
8 | 
9 | WARNING: Everything has only been tested in bash! Things may or may not break if you use a different shell.
10 | 
11 | NOTE: Benching scripts MUST be run from their folders (i.e., $PWD = location of script)---otherwise they won't work. Other scripts can be run from anywhere.
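Example session (illustrative only; use whichever datasets you actually loaded into HDFS):

    cd benchmark && ./init-all.sh      # format HDFS, start Hadoop, init all systems
    ./datasets/load-files.sh           # put the input graphs into /user/$USER/input/
    cd mizan && ./benchall.sh 16 5     # 5 runs of each Mizan experiment on 16 machines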
12 | 
-------------------------------------------------------------------------------- /giraph-1.0.0/findbugs-exclude.xml: --------------------------------------------------------------------------------
1 | [XML contents lost in export: an Apache license header and a list of FindBugs exclusion rules were stripped along with all other tags.]
-------------------------------------------------------------------------------- /giraph-1.0.0/giraph-core/src/main/java/org/apache/giraph/io/formats/JsonLongLongLongLongVertexInputFormat.java: --------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.apache.giraph.io.formats;
19 | 
20 | import com.google.common.collect.Lists;
21 | import org.apache.giraph.edge.Edge;
22 | import org.apache.giraph.edge.EdgeFactory;
23 | import org.apache.giraph.graph.Vertex;
24 | import org.apache.hadoop.io.LongWritable;
25 | import org.apache.hadoop.io.Text;
26 | import org.apache.hadoop.mapreduce.InputSplit;
27 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
28 | import org.json.JSONArray;
29 | import org.json.JSONException;
30 | 
31 | import java.io.IOException;
32 | import java.util.List;
33 | 
34 | /**
35 |  * VertexInputFormat that features <code>long</code> vertex ID's,
36 |  * <code>long</code> vertex values and <code>long</code>
37 |  * out-edge weights, and <code>long</code> message types,
38 |  * specified in JSON format.
39 |  */
40 | public class JsonLongLongLongLongVertexInputFormat extends
41 |   TextVertexInputFormat<LongWritable, LongWritable, LongWritable> {
42 | 
43 |   @Override
44 |   public TextVertexReader createVertexReader(InputSplit split,
45 |       TaskAttemptContext context) {
46 |     return new JsonLongLongLongLongVertexReader();
47 |   }
48 | 
49 |   /**
50 |    * VertexReader that features <code>long</code> vertex
51 |    * values and <code>long</code> out-edge weights. The
52 |    * files should be in the following JSON format:
53 |    * JSONArray(<vertex id>, <vertex value>,
54 |    *   JSONArray(JSONArray(<dest vertex id>, <edge value>), ...))
55 |    * Here is an example with vertex id 1, vertex value 4, and two edges.
56 |    * First edge has a destination vertex 2, edge value 2.
57 |    * Second edge has a destination vertex 3, edge value 1.
58 |    * [1,4,[[2,2],[3,1]]]
59 |    */
60 |   class JsonLongLongLongLongVertexReader extends
61 |     TextVertexReaderFromEachLineProcessedHandlingExceptions<JSONArray,
62 |     JSONException> {
63 | 
64 |     @Override
65 |     protected JSONArray preprocessLine(Text line) throws JSONException {
66 |       return new JSONArray(line.toString());
67 |     }
68 | 
69 |     @Override
70 |     protected LongWritable getId(JSONArray jsonVertex) throws JSONException,
71 |               IOException {
72 |       return new LongWritable(jsonVertex.getLong(0));
73 |     }
74 | 
75 |     @Override
76 |     protected LongWritable getValue(JSONArray jsonVertex) throws
77 |               JSONException, IOException {
78 |       return new LongWritable(jsonVertex.getLong(1));
79 |     }
80 | 
81 |     @Override
82 |     protected Iterable<Edge<LongWritable, LongWritable>> getEdges(
83 |         JSONArray jsonVertex) throws JSONException, IOException {
84 |       JSONArray jsonEdgeArray = jsonVertex.getJSONArray(2);
85 |       List<Edge<LongWritable, LongWritable>> edges =
86 |           Lists.newArrayListWithCapacity(jsonEdgeArray.length());
87 |       for (int i = 0; i < jsonEdgeArray.length(); ++i) {
88 |         JSONArray jsonEdge = jsonEdgeArray.getJSONArray(i);
89 |         edges.add(EdgeFactory.create(new LongWritable(jsonEdge.getLong(0)),
90 |             new LongWritable(jsonEdge.getLong(1))));
91 |       }
92 |       return edges;
93 |     }
94 | 
95 |     @Override
96 |     protected Vertex<LongWritable, LongWritable, LongWritable,
97 |         LongWritable> handleException(Text line, JSONArray jsonVertex,
98 |         JSONException e) {
99 |       throw new IllegalArgumentException(
100 |           "Couldn't get vertex from line " + line, e);
101 |     }
102 | 
103 |   }
104 | }
105 | 
-------------------------------------------------------------------------------- /giraph-1.0.0/giraph-core/src/main/java/org/apache/giraph/io/formats/JsonLongLongNullLongVertexInputFormat.java: --------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package org.apache.giraph.io.formats;
19 | 
20 | import com.google.common.collect.Lists;
21 | import org.apache.giraph.edge.Edge;
22 | import org.apache.giraph.edge.EdgeFactory;
23 | import org.apache.giraph.graph.Vertex;
24 | import org.apache.hadoop.io.LongWritable;
25 | import org.apache.hadoop.io.NullWritable;
26 | import org.apache.hadoop.io.Text;
27 | import org.apache.hadoop.mapreduce.InputSplit;
28 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
29 | import org.json.JSONArray;
30 | import org.json.JSONException;
31 | 
32 | import java.io.IOException;
33 | import java.util.List;
34 | 
35 | /**
36 |  * VertexInputFormat that features <code>long</code> vertex ID's,
37 |  * <code>long</code> vertex values and <code>null</code>
38 |  * out-edge weights, and <code>long</code> message types,
39 |  * specified in JSON format.
40 |  */
41 | public class JsonLongLongNullLongVertexInputFormat extends
42 |   TextVertexInputFormat<LongWritable, LongWritable, NullWritable> {
43 | 
44 |   @Override
45 |   public TextVertexReader createVertexReader(InputSplit split,
46 |       TaskAttemptContext context) {
47 |     return new JsonLongLongNullLongVertexReader();
48 |   }
49 | 
50 |   /**
51 |    * VertexReader that features <code>long</code> vertex
52 |    * values and <code>null</code> out-edge weights. The
53 |    * files should be in the following JSON format:
54 |    * JSONArray(<vertex id>, <vertex value>,
55 |    *   JSONArray(JSONArray(<dest vertex id>, <edge value>), ...))
56 |    * Here is an example with vertex id 1, vertex value 4, and two edges.
57 |    * First edge has a destination vertex 2, edge value 2.
58 |    * Second edge has a destination vertex 3, edge value 1.
59 |    * [1,4,[[2,2],[3,1]]]
60 |    */
61 |   class JsonLongLongNullLongVertexReader extends
62 |     TextVertexReaderFromEachLineProcessedHandlingExceptions<JSONArray,
63 |     JSONException> {
64 | 
65 |     @Override
66 |     protected JSONArray preprocessLine(Text line) throws JSONException {
67 |       return new JSONArray(line.toString());
68 |     }
69 | 
70 |     @Override
71 |     protected LongWritable getId(JSONArray jsonVertex) throws JSONException,
72 |               IOException {
73 |       return new LongWritable(jsonVertex.getLong(0));
74 |     }
75 | 
76 |     @Override
77 |     protected LongWritable getValue(JSONArray jsonVertex) throws
78 |               JSONException, IOException {
79 |       return new LongWritable(jsonVertex.getLong(1));
80 |     }
81 | 
82 |     @Override
83 |     protected Iterable<Edge<LongWritable, NullWritable>> getEdges(
84 |         JSONArray jsonVertex) throws JSONException, IOException {
85 |       JSONArray jsonEdgeArray = jsonVertex.getJSONArray(2);
86 |       List<Edge<LongWritable, NullWritable>> edges =
87 |           Lists.newArrayListWithCapacity(jsonEdgeArray.length());
88 |       for (int i = 0; i < jsonEdgeArray.length(); ++i) {
89 |         JSONArray jsonEdge = jsonEdgeArray.getJSONArray(i);
90 |         edges.add(EdgeFactory.create(new LongWritable(jsonEdge.getLong(0)),
91 |             NullWritable.get()));
92 |       }
93 |       return edges;
94 |     }
95 | 
96 |     @Override
97 |     protected Vertex<LongWritable, LongWritable, NullWritable,
98 |         LongWritable> handleException(Text line, JSONArray jsonVertex,
99 |         JSONException e) {
100 |       throw new IllegalArgumentException(
101 |           "Couldn't get vertex from line " + line, e);
102 |     }
103 | 
104 |   }
105 | }
106 | 
-------------------------------------------------------------------------------- /giraph-1.0.0/giraph-examples/src/main/java/org/apache/giraph/examples/ConnectedComponentsInputFormat.java: --------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package org.apache.giraph.examples;
20 | 
21 | import com.google.common.collect.Lists;
22 | import org.apache.giraph.edge.Edge;
23 | import org.apache.giraph.edge.EdgeFactory;
24 | import org.apache.hadoop.io.LongWritable;
25 | import org.apache.hadoop.io.NullWritable;
26 | import org.apache.hadoop.io.Text;
27 | import org.apache.giraph.io.formats.TextVertexInputFormat;
28 | import org.apache.hadoop.mapreduce.InputSplit;
29 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
30 | 
31 | import java.io.IOException;
32 | import java.util.List;
33 | import java.util.regex.Pattern;
34 | 
35 | /**
36 |  * Simple text-based {@link org.apache.giraph.io.VertexInputFormat} for
37 |  * {@link org.apache.giraph.examples.ConnectedComponentsVertex}.
38 |  *
39 |  * Inputs have long ids, no edge weights, and no vertex values.
40 |  * (Vertex values are set to a long of 0.)
41 |  *
42 |  * Each line consists of:
43 |  *   vertex neighbor1 neighbor2 ...
44 |  *
45 |  * Values can be separated by spaces or tabs.
46 |  */
47 | public class ConnectedComponentsInputFormat extends
48 |   TextVertexInputFormat<LongWritable, LongWritable, NullWritable> {
49 |   /** Separator of the vertex and neighbors */
50 |   private static final Pattern SEPARATOR = Pattern.compile("[\t ]");
51 | 
52 |   @Override
53 |   public TextVertexReader createVertexReader(InputSplit split,
54 |       TaskAttemptContext context)
55 |     throws IOException {
56 |     return new ConnectedComponentsVertexReader();
57 |   }
58 | 
59 |   /**
60 |    * Vertex reader associated with {@link ConnectedComponentsInputFormat}.
61 |    */
62 |   public class ConnectedComponentsVertexReader extends
63 |     TextVertexReaderFromEachLineProcessed<String[]> {
64 |     /**
65 |      * Cached vertex id for the current line
66 |      */
67 |     private LongWritable id;
68 | 
69 |     @Override
70 |     protected String[] preprocessLine(Text line) throws IOException {
71 |       String[] tokens = SEPARATOR.split(line.toString());
72 |       id = new LongWritable(Long.parseLong(tokens[0]));
73 |       return tokens;
74 |     }
75 | 
76 |     @Override
77 |     protected LongWritable getId(String[] tokens) throws IOException {
78 |       return id;
79 |     }
80 | 
81 |     @Override
82 |     protected LongWritable getValue(String[] tokens) throws IOException {
83 |       return new LongWritable(0);
84 |     }
85 | 
86 |     @Override
87 |     protected Iterable<Edge<LongWritable, NullWritable>> getEdges(
88 |         String[] tokens) throws IOException {
89 |       List<Edge<LongWritable, NullWritable>> edges =
90 |           Lists.newArrayListWithCapacity(tokens.length - 1);
91 |       for (int i = 1; i < tokens.length; i++) {
92 |         edges.add(EdgeFactory.create(
93 |             new LongWritable(Long.parseLong(tokens[i]))));
94 |       }
95 |       return edges;
96 |     }
97 |   }
98 | }
99 | 
-------------------------------------------------------------------------------- /giraph-1.0.0/giraph-examples/src/main/java/org/apache/giraph/examples/ConnectedComponentsVertex.java: --------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.
You may obtain a copy of the License at
9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package org.apache.giraph.examples;
20 | 
21 | // NOTE: original code used this.. have to comment it out
22 | // b/c of checkstyle errors
23 | //import org.apache.giraph.edge.Edge;
24 | import org.apache.giraph.graph.Vertex;
25 | import org.apache.hadoop.io.LongWritable;
26 | import org.apache.hadoop.io.NullWritable;
27 | 
28 | import java.io.IOException;
29 | 
30 | /**
31 |  * Implementation of the HCC algorithm that identifies connected components and
32 |  * assigns each vertex its "component identifier" (the smallest vertex id
33 |  * in the component)
34 |  *
35 |  * The idea behind the algorithm is very simple: propagate the smallest
36 |  * vertex id along the edges to all vertices of a connected component. The
37 |  * number of supersteps necessary is equal to the length of the maximum
38 |  * diameter of all components + 1
39 |  *
40 |  * The original Hadoop-based variant of this algorithm was proposed by Kang,
41 |  * Charalampos, Tsourakakis and Faloutsos in
42 |  * "PEGASUS: Mining Peta-Scale Graphs", 2010
43 |  *
44 |  * http://www.cs.cmu.edu/~ukang/papers/PegasusKAIS.pdf
45 |  */
46 | @Algorithm(
47 |     name = "Connected components",
48 |     description = "Finds connected components of the graph"
49 | )
50 | public class ConnectedComponentsVertex extends Vertex<LongWritable,
51 |     LongWritable, NullWritable, LongWritable> {
52 |   /**
53 |    * Propagates the smallest vertex id to all neighbors. Will always choose to
54 |    * halt and only reactivate if a smaller id has been sent to it.
55 |    *
56 |    * @param messages Iterator of messages from the previous superstep.
57 |    * @throws IOException
58 |    */
59 |   @Override
60 |   public void compute(Iterable<LongWritable> messages) throws IOException {
61 |     long currentComponent = getValue().get();
62 | 
63 |     // in first superstep, load proper vertex values and then broadcast
64 |     if (getSuperstep() == 0) {
65 |       currentComponent = getId().get();
66 |       setValue(new LongWritable(currentComponent));
67 | 
68 |       // indiscriminately send messages to all neighbours,
69 |       // as this mirrors GPS and Mizan implementations
70 |       sendMessageToAllEdges(getValue());
71 | 
72 |       voteToHalt();
73 |       return;
74 |     }
75 | 
76 |     boolean changed = false;
77 |     // did we get a smaller id ?
78 |     for (LongWritable message : messages) {
79 |       long candidateComponent = message.get();
80 |       if (candidateComponent < currentComponent) {
81 |         currentComponent = candidateComponent;
82 |         changed = true;
83 |       }
84 |     }
85 | 
86 |     // propagate new component id to the neighbors
87 |     if (changed) {
88 |       setValue(new LongWritable(currentComponent));
89 |       sendMessageToAllEdges(getValue());
90 |     }
91 | 
92 |     voteToHalt();
93 |   }
94 | }
95 | 
-------------------------------------------------------------------------------- /giraph-1.0.0/giraph-examples/src/main/java/org/apache/giraph/examples/DiameterEstimationInputFormat.java: --------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package org.apache.giraph.examples; 20 | 21 | import com.google.common.collect.Lists; 22 | import org.apache.giraph.edge.Edge; 23 | import org.apache.giraph.edge.EdgeFactory; 24 | import org.apache.hadoop.io.LongWritable; 25 | import org.apache.giraph.examples.DiameterEstimationVertex.LongArrayWritable; 26 | import org.apache.hadoop.io.NullWritable; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.giraph.io.formats.TextVertexInputFormat; 29 | import org.apache.hadoop.mapreduce.InputSplit; 30 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 31 | 32 | import java.io.IOException; 33 | import java.util.List; 34 | import java.util.regex.Pattern; 35 | 36 | /** 37 | * Simple text-based {@link org.apache.giraph.io.VertexInputFormat} for 38 | * {@link org.apache.giraph.examples.DiameterEstimationVertex}. 39 | * 40 | * Inputs have long ids, no edge weights, and no vertex values. 41 | * 42 | * Each line consists of: 43 | * vertex neighbor1 neighbor2 ... 44 | * 45 | * Values can be separated by spaces or tabs. 46 | */ 47 | public class DiameterEstimationInputFormat extends 48 | TextVertexInputFormat { 49 | /** Separator of the vertex and neighbors */ 50 | private static final Pattern SEPARATOR = Pattern.compile("[\t ]"); 51 | 52 | @Override 53 | public TextVertexReader createVertexReader(InputSplit split, 54 | TaskAttemptContext context) 55 | throws IOException { 56 | return new DiameterEstimationVertexReader(); 57 | } 58 | 59 | /** 60 | * Vertex reader associated with {@link DiameterEstimationInputFormat}. 
61 | */ 62 | public class DiameterEstimationVertexReader extends 63 | TextVertexReaderFromEachLineProcessed { 64 | /** 65 | * Cached vertex id for the current line 66 | */ 67 | private LongWritable id; 68 | 69 | @Override 70 | protected String[] preprocessLine(Text line) throws IOException { 71 | String[] tokens = SEPARATOR.split(line.toString()); 72 | id = new LongWritable(Long.parseLong(tokens[0])); 73 | return tokens; 74 | } 75 | 76 | @Override 77 | protected LongWritable getId(String[] tokens) throws IOException { 78 | return id; 79 | } 80 | 81 | @Override 82 | protected LongArrayWritable getValue(String[] tokens) throws IOException { 83 | // ignore tokens and return dummy LongArrayWritable 84 | // (this will be replaced during computation) 85 | return new LongArrayWritable(); 86 | } 87 | 88 | @Override 89 | protected Iterable> getEdges( 90 | String[] tokens) throws IOException { 91 | List> edges = 92 | Lists.newArrayListWithCapacity(tokens.length - 1); 93 | for (int i = 1; i < tokens.length; i++) { 94 | edges.add(EdgeFactory.create( 95 | new LongWritable(Long.parseLong(tokens[i])))); 96 | } 97 | return edges; 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /giraph-1.0.0/giraph-examples/src/main/java/org/apache/giraph/examples/JsonLongLongArrayInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.giraph.examples; 19 | 20 | import com.google.common.collect.Lists; 21 | import org.apache.giraph.edge.Edge; 22 | import org.apache.giraph.edge.EdgeFactory; 23 | import org.apache.giraph.graph.Vertex; 24 | import org.apache.giraph.io.formats.TextVertexInputFormat; 25 | import org.apache.hadoop.io.LongWritable; 26 | import org.apache.hadoop.io.NullWritable; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.giraph.examples.DiameterEstimationVertex.LongArrayWritable; 29 | import org.apache.hadoop.mapreduce.InputSplit; 30 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 31 | import org.json.JSONArray; 32 | import org.json.JSONException; 33 | 34 | import java.io.IOException; 35 | import java.util.List; 36 | 37 | /** 38 | * ***DEPRECATED*** 39 | * We no longer use Json format for input. Instead, we use simple 40 | * text input format. See the new DiameterEstimationInputFormat. 41 | * ***DEPRECATED*** 42 | * 43 | * VertexInputFormat that reads in long vertex IDs, 44 | * double vertex values and float 45 | * out-edge weights, and double message types, 46 | * specified in JSON format. Output graph has long 47 | * vertex IDs, but dimest-specific vertex value, out-edge weight 48 | * and message types. 
49 | */ 50 | public class JsonLongLongArrayInputFormat extends 51 | TextVertexInputFormat { 52 | 53 | @Override 54 | public TextVertexReader createVertexReader(InputSplit split, 55 | TaskAttemptContext context) { 56 | return new JsonLongLongArrayReader(); 57 | } 58 | 59 | /** 60 | * VertexReader that features LongArrayWritable vertex 61 | * values and NullWritable out-edge weights. The 62 | * files should be in the following JSON format: 63 | * JSONArray(, , 64 | * JSONArray(JSONArray(, ), ...)) 65 | * Here is an example with vertex id 1, vertex value 4.3, and two edges. 66 | * First edge has a destination vertex 2, edge value 2.1. 67 | * Second edge has a destination vertex 3, edge value 0.7. 68 | * [1,4.3,[[2,2.1],[3,0.7]]] 69 | * 70 | * Vertex value and edge weights must be present but are ignored. 71 | */ 72 | class JsonLongLongArrayReader extends 73 | TextVertexReaderFromEachLineProcessedHandlingExceptions { 75 | 76 | @Override 77 | protected JSONArray preprocessLine(Text line) throws JSONException { 78 | return new JSONArray(line.toString()); 79 | } 80 | 81 | @Override 82 | protected LongWritable getId(JSONArray jsonVertex) throws JSONException, 83 | IOException { 84 | return new LongWritable(jsonVertex.getLong(0)); 85 | } 86 | 87 | @Override 88 | protected LongArrayWritable getValue(JSONArray jsonVertex) throws 89 | JSONException, IOException { 90 | // ignore whatever is in jsonVertex, and return dummy LongArrayWritable 91 | // instead (this will be replaced during computation) 92 | return new LongArrayWritable(); 93 | } 94 | 95 | @Override 96 | protected Iterable> getEdges( 97 | JSONArray jsonVertex) throws JSONException, IOException { 98 | 99 | JSONArray jsonEdgeArray = jsonVertex.getJSONArray(2); 100 | List> edges = 101 | Lists.newArrayListWithCapacity(jsonEdgeArray.length()); 102 | 103 | long dst; 104 | 105 | for (int i = 0; i < jsonEdgeArray.length(); ++i) { 106 | JSONArray jsonEdge = jsonEdgeArray.getJSONArray(i); 107 | dst = jsonEdge.getLong(0); 108 | edges.add(EdgeFactory.create(new LongWritable(dst), 109 | NullWritable.get())); 110 | } 111 | return edges; 112 | } 113 | 114 | @Override 115 | protected Vertex 117 | handleException(Text line, JSONArray jsonVertex, JSONException e) { 118 | throw new IllegalArgumentException( 119 | "Couldn't get vertex from line " + line, e); 120 | } 121 | 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /giraph-1.0.0/giraph-examples/src/main/java/org/apache/giraph/examples/JsonLongMSTVertexInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.apache.giraph.examples; 19 | 20 | import com.google.common.collect.Lists; 21 | import org.apache.giraph.edge.Edge; 22 | import org.apache.giraph.edge.EdgeFactory; 23 | import org.apache.giraph.graph.Vertex; 24 | import org.apache.giraph.io.formats.TextVertexInputFormat; 25 | import org.apache.hadoop.io.LongWritable; 26 | import org.apache.hadoop.io.Text; 27 | import org.apache.giraph.examples.MinimumSpanningTreeVertex.MSTVertexValue; 28 | import org.apache.giraph.examples.MinimumSpanningTreeVertex.MSTEdgeValue; 29 | import org.apache.giraph.examples.MinimumSpanningTreeVertex.MSTMessage; 30 | import org.apache.hadoop.mapreduce.InputSplit; 31 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 32 | import org.json.JSONArray; 33 | import org.json.JSONException; 34 | 35 | import java.io.IOException; 36 | import java.util.List; 37 | 38 | /** 39 | * ***DEPRECATED*** 40 | * We no longer use Json format for input. Instead, we use simple 41 | * text input format. See the new MinimumSpanningTreeInputFormat. 42 | * ***DEPRECATED*** 43 | * 44 | * VertexInputFormat that reads in long vertex IDs, 45 | * double vertex values and float 46 | * out-edge weights, and double message types, 47 | * specified in JSON format. Output graph has long 48 | * vertex IDs, but MST-specific vertex value, out-edge weight 49 | * and message types. 50 | */ 51 | public class JsonLongMSTVertexInputFormat extends 52 | TextVertexInputFormat { 53 | 54 | @Override 55 | public TextVertexReader createVertexReader(InputSplit split, 56 | TaskAttemptContext context) { 57 | return new JsonLongMSTVertexReader(); 58 | } 59 | 60 | /** 61 | * VertexReader that features MSTVertexValue vertex 62 | * values and MSTEdgeValue out-edge weights. The 63 | * files should be in the following JSON format: 64 | * JSONArray(, , 65 | * JSONArray(JSONArray(, ), ...)) 66 | * Here is an example with vertex id 1, vertex value 4.3, and two edges. 67 | * First edge has a destination vertex 2, edge value 2.1. 68 | * Second edge has a destination vertex 3, edge value 0.7. 
69 | * [1,4.3,[[2,2.1],[3,0.7]]] 70 | */ 71 | class JsonLongMSTVertexReader extends 72 | TextVertexReaderFromEachLineProcessedHandlingExceptions { 74 | 75 | @Override 76 | protected JSONArray preprocessLine(Text line) throws JSONException { 77 | return new JSONArray(line.toString()); 78 | } 79 | 80 | @Override 81 | protected LongWritable getId(JSONArray jsonVertex) throws JSONException, 82 | IOException { 83 | return new LongWritable(jsonVertex.getLong(0)); 84 | } 85 | 86 | @Override 87 | protected MSTVertexValue getValue(JSONArray jsonVertex) throws 88 | JSONException, IOException { 89 | // ignore whatever is in jsonVertex, and return dummy MSTVertexValue 90 | // instead (this will be replaced during computation) 91 | return new MSTVertexValue(); 92 | } 93 | 94 | @Override 95 | protected Iterable> getEdges( 96 | JSONArray jsonVertex) throws JSONException, IOException { 97 | 98 | long src = jsonVertex.getLong(0); 99 | 100 | JSONArray jsonEdgeArray = jsonVertex.getJSONArray(2); 101 | List> edges = 102 | Lists.newArrayListWithCapacity(jsonEdgeArray.length()); 103 | 104 | long dst; 105 | double weight; 106 | 107 | for (int i = 0; i < jsonEdgeArray.length(); ++i) { 108 | JSONArray jsonEdge = jsonEdgeArray.getJSONArray(i); 109 | dst = jsonEdge.getLong(0); 110 | weight = jsonEdge.getDouble(1); 111 | 112 | edges.add(EdgeFactory.create(new LongWritable(dst), 113 | new MSTEdgeValue(weight, src, dst))); 114 | } 115 | return edges; 116 | } 117 | 118 | @Override 119 | protected Vertex 120 | handleException(Text line, JSONArray jsonVertex, JSONException e) { 121 | throw new IllegalArgumentException( 122 | "Couldn't get vertex from line " + line, e); 123 | } 124 | 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /giraph-1.0.0/giraph-examples/src/main/java/org/apache/giraph/examples/MinimumSpanningTreeInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package org.apache.giraph.examples; 20 | 21 | import com.google.common.collect.Lists; 22 | import org.apache.giraph.edge.Edge; 23 | import org.apache.giraph.edge.EdgeFactory; 24 | import org.apache.hadoop.io.LongWritable; 25 | import org.apache.giraph.examples.MinimumSpanningTreeVertex.MSTVertexValue; 26 | import org.apache.giraph.examples.MinimumSpanningTreeVertex.MSTEdgeValue; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.giraph.io.formats.TextVertexInputFormat; 29 | import org.apache.hadoop.mapreduce.InputSplit; 30 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 31 | 32 | import java.io.IOException; 33 | import java.util.List; 34 | import java.util.regex.Pattern; 35 | 36 | /** 37 | * Simple text-based {@link org.apache.giraph.io.VertexInputFormat} for 38 | * {@link org.apache.giraph.examples.MinimumSpanningTreeVertex}. 39 | * 40 | * Inputs have long ids, double edge weights, and no vertex values. 41 | * 42 | * Each line consists of: 43 | * vertex neighbor1 neighbor1-weight neighbor2 neighbor2-weight ... 44 | * 45 | * Values can be separated by spaces or tabs. 46 | */ 47 | public class MinimumSpanningTreeInputFormat extends 48 | TextVertexInputFormat<LongWritable, MSTVertexValue, MSTEdgeValue> { 49 | /** Separator of the vertex and neighbors */ 50 | private static final Pattern SEPARATOR = Pattern.compile("[\t ]"); 51 | 52 | @Override 53 | public TextVertexReader createVertexReader(InputSplit split, 54 | TaskAttemptContext context) 55 | throws IOException { 56 | return new MinimumSpanningTreeVertexReader(); 57 | } 58 | 59 | /** 60 | * Vertex reader associated with {@link MinimumSpanningTreeInputFormat}. 61 | */ 62 | public class MinimumSpanningTreeVertexReader extends 63 | TextVertexReaderFromEachLineProcessed<String[]> { 64 | /** 65 | * Cached vertex id for the current line 66 | */ 67 | private LongWritable id; 68 | 69 | @Override 70 | protected String[] preprocessLine(Text line) throws IOException { 71 | String[] tokens = SEPARATOR.split(line.toString()); 72 | id = new LongWritable(Long.parseLong(tokens[0])); 73 | return tokens; 74 | } 75 | 76 | @Override 77 | protected LongWritable getId(String[] tokens) throws IOException { 78 | return id; 79 | } 80 | 81 | @Override 82 | protected MSTVertexValue getValue(String[] tokens) throws IOException { 83 | // ignore tokens and return dummy MSTVertexValue 84 | // (this will be replaced during computation) 85 | return new MSTVertexValue(); 86 | } 87 | 88 | @Override 89 | protected Iterable<Edge<LongWritable, MSTEdgeValue>> getEdges( 90 | String[] tokens) throws IOException { 91 | 92 | // divide by 2, to account for edge weights 93 | List<Edge<LongWritable, MSTEdgeValue>> edges = 94 | Lists.newArrayListWithCapacity((tokens.length - 1) / 2); 95 | 96 | long src = id.get(); 97 | long dst; 98 | double weight; 99 | 100 | for (int i = 1; i < tokens.length - 1; i += 2) { 101 | dst = Long.parseLong(tokens[i]); 102 | weight = Double.parseDouble(tokens[i + 1]); 103 | 104 | edges.add(EdgeFactory.create(new LongWritable(dst), 105 | new MSTEdgeValue(weight, src, dst))); 106 | } 107 | 108 | return edges; 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /giraph-1.0.0/giraph-examples/src/main/java/org/apache/giraph/examples/PageRankTolFinderVertex.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership.
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package org.apache.giraph.examples; 20 | 21 | import java.io.IOException; 22 | import org.apache.giraph.conf.IntConfOption; 23 | import org.apache.giraph.aggregators.DoubleMaxAggregator; 24 | import org.apache.giraph.graph.Vertex; 25 | import org.apache.giraph.io.formats.TextVertexOutputFormat; 26 | import org.apache.giraph.master.DefaultMasterCompute; 27 | import org.apache.hadoop.io.DoubleWritable; 28 | import org.apache.hadoop.io.NullWritable; 29 | import org.apache.hadoop.io.LongWritable; 30 | //import org.apache.hadoop.io.Text; 31 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 32 | import org.apache.log4j.Logger; 33 | 34 | /** 35 | * PageRank implementation that finds when the maximum error deltas 36 | * (between two supersteps) "plateau". 37 | * 38 | * In other words, think of a plot of error-delta vs. superstep-number. 39 | * The goal is to determine when the function flattens out---this is 40 | * roughly where we should stop, as additional supersteps won't 41 | * improve the convergence much. 42 | * 43 | * As this "break-even" point is different for different graphs, this 44 | * function helps determine what tolerance value should be used. 45 | */ 46 | @Algorithm( 47 | name = "PageRank Tolerance Finder" 48 | ) 49 | public class PageRankTolFinderVertex extends Vertex<LongWritable, 50 | DoubleWritable, NullWritable, DoubleWritable> { 51 | /** Max number of supersteps */ 52 | public static final IntConfOption MAX_SUPERSTEPS = 53 | new IntConfOption("PageRankTolFinderVertex.maxSS", 100); 54 | 55 | /** Logger */ 56 | private static final Logger LOG = 57 | Logger.getLogger(PageRankTolFinderVertex.class); 58 | 59 | /** Max aggregator name */ 60 | private static String MAX_AGG = "max"; 61 | 62 | @Override 63 | public void compute(Iterable<DoubleWritable> messages) { 64 | // NOTE: We follow GraphLab's alternative way of computing PageRank, 65 | // which is to not divide by |V|. To get the probability value at 66 | // each vertex, take its PageRank value and divide by |V|.
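    // [Editor's aside, not in the original source: a hedged sketch of the
    // update rule this NOTE describes. At superstep s > 0 the unnormalized
    // update is
    //   value(v) = 0.15 + 0.85 * sum(messages received by v),
    // and each vertex then sends value(v) / outdegree(v) along its out-edges.
    // To recover a probability-style PageRank afterwards, divide by the
    // vertex count, using the same API as the commented-out code below:
    //   double probability = getValue().get() / getTotalNumVertices();
    // ]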
67 | 68 | double oldVal = getValue().get(); 69 | 70 | if (getSuperstep() == 0) { 71 | // FIX: initial value is 1/|V| (or 1), not 0.15/|V| (or 0.15) 72 | DoubleWritable vertexValue = new DoubleWritable(1.0); 73 | //new DoubleWritable(0.15f / getTotalNumVertices()); 74 | setValue(vertexValue); 75 | 76 | } else { 77 | double sum = 0; 78 | for (DoubleWritable message : messages) { 79 | sum += message.get(); 80 | } 81 | DoubleWritable vertexValue = new DoubleWritable(0.15f + 0.85f * sum); 82 | //new DoubleWritable((0.15f / getTotalNumVertices()) + 0.85f * sum); 83 | setValue(vertexValue); 84 | } 85 | 86 | aggregate(MAX_AGG, 87 | new DoubleWritable(Math.abs(oldVal - getValue().get()))); 88 | 89 | // Termination condition based on max supersteps 90 | if (getSuperstep() < MAX_SUPERSTEPS.get(getConf())) { 91 | long edges = getNumEdges(); 92 | sendMessageToAllEdges(new DoubleWritable(getValue().get() / edges)); 93 | } else { 94 | voteToHalt(); 95 | } 96 | } 97 | 98 | /** 99 | * Master compute associated with {@link PageRankTolFinderVertex}. 100 | * It registers required aggregators. 101 | */ 102 | public static class PageRankTolFinderVertexMasterCompute extends 103 | DefaultMasterCompute { 104 | @Override 105 | public void initialize() throws InstantiationException, 106 | IllegalAccessException { 107 | registerAggregator(MAX_AGG, DoubleMaxAggregator.class); 108 | } 109 | 110 | @Override 111 | public void compute() { 112 | // this is the result of aggregators from the *previous* superstep 113 | if (getSuperstep() > 0) { 114 | LOG.info("SS " + (getSuperstep() - 1) + " max change: " + 115 | ((DoubleWritable) getAggregatedValue(MAX_AGG)).get()); 116 | } 117 | } 118 | } 119 | 120 | /** 121 | * Simple VertexOutputFormat that supports {@link PageRankTolFinderVertex} 122 | */ 123 | public static class PageRankTolFinderVertexOutputFormat extends 124 | TextVertexOutputFormat<LongWritable, DoubleWritable, NullWritable> { 125 | @Override 126 | public TextVertexWriter createVertexWriter(TaskAttemptContext context) 127 | throws IOException, InterruptedException { 128 | return new PageRankTolFinderVertexWriter(); 129 | } 130 | 131 | /** 132 | * Simple VertexWriter that supports {@link PageRankTolFinderVertex} 133 | */ 134 | public class PageRankTolFinderVertexWriter extends TextVertexWriter { 135 | @Override 136 | public void writeVertex( 137 | Vertex<LongWritable, DoubleWritable, NullWritable, ?> vertex) 138 | throws IOException, InterruptedException { 139 | // don't need to output anything---we don't care about results 140 | //getRecordWriter().write( 141 | // new Text(vertex.getId().toString()), 142 | // new Text(vertex.getValue().toString())); 143 | } 144 | } 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /giraph-1.0.0/giraph-examples/src/main/java/org/apache/giraph/examples/SimplePageRankInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package org.apache.giraph.examples; 20 | 21 | import com.google.common.collect.Lists; 22 | import org.apache.giraph.edge.Edge; 23 | import org.apache.giraph.edge.EdgeFactory; 24 | import org.apache.hadoop.io.LongWritable; 25 | import org.apache.hadoop.io.DoubleWritable; 26 | import org.apache.hadoop.io.NullWritable; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.giraph.io.formats.TextVertexInputFormat; 29 | import org.apache.hadoop.mapreduce.InputSplit; 30 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 31 | 32 | import java.io.IOException; 33 | import java.util.List; 34 | import java.util.regex.Pattern; 35 | 36 | /** 37 | * Simple text-based {@link org.apache.giraph.io.VertexInputFormat} for 38 | * {@link org.apache.giraph.examples.SimplePageRankVertex}. 39 | * 40 | * Inputs have long ids, no edge weights, and no vertex values. 41 | * (Vertex values are set to a double of 0.0.) 42 | * 43 | * Each line consists of: 44 | * vertex neighbor1 neighbor2 ... 45 | * 46 | * Values can be separated by spaces or tabs. 47 | */ 48 | public class SimplePageRankInputFormat extends 49 | TextVertexInputFormat<LongWritable, DoubleWritable, NullWritable> { 50 | /** Separator of the vertex and neighbors */ 51 | private static final Pattern SEPARATOR = Pattern.compile("[\t ]"); 52 | 53 | @Override 54 | public TextVertexReader createVertexReader(InputSplit split, 55 | TaskAttemptContext context) 56 | throws IOException { 57 | return new SimplePageRankVertexReader(); 58 | } 59 | 60 | /** 61 | * Vertex reader associated with {@link SimplePageRankInputFormat}. 62 | */ 63 | public class SimplePageRankVertexReader extends 64 | TextVertexReaderFromEachLineProcessed<String[]> { 65 | /** 66 | * Cached vertex id for the current line 67 | */ 68 | private LongWritable id; 69 | 70 | @Override 71 | protected String[] preprocessLine(Text line) throws IOException { 72 | String[] tokens = SEPARATOR.split(line.toString()); 73 | id = new LongWritable(Long.parseLong(tokens[0])); 74 | return tokens; 75 | } 76 | 77 | @Override 78 | protected LongWritable getId(String[] tokens) throws IOException { 79 | return id; 80 | } 81 | 82 | @Override 83 | protected DoubleWritable getValue(String[] tokens) throws IOException { 84 | return new DoubleWritable(0.0); 85 | } 86 | 87 | @Override 88 | protected Iterable<Edge<LongWritable, NullWritable>> getEdges( 89 | String[] tokens) throws IOException { 90 | List<Edge<LongWritable, NullWritable>> edges = 91 | Lists.newArrayListWithCapacity(tokens.length - 1); 92 | for (int i = 1; i < tokens.length; i++) { 93 | edges.add(EdgeFactory.create( 94 | new LongWritable(Long.parseLong(tokens[i])))); 95 | } 96 | return edges; 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /giraph-1.0.0/giraph-examples/src/main/java/org/apache/giraph/examples/SimpleShortestPathsInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership.
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package org.apache.giraph.examples; 20 | 21 | import com.google.common.collect.Lists; 22 | import org.apache.giraph.edge.Edge; 23 | import org.apache.giraph.edge.EdgeFactory; 24 | import org.apache.hadoop.io.LongWritable; 25 | import org.apache.hadoop.io.DoubleWritable; 26 | import org.apache.hadoop.io.FloatWritable; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.giraph.io.formats.TextVertexInputFormat; 29 | import org.apache.hadoop.mapreduce.InputSplit; 30 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 31 | 32 | import java.io.IOException; 33 | import java.util.List; 34 | import java.util.regex.Pattern; 35 | 36 | /** 37 | * Simple text-based {@link org.apache.giraph.io.VertexInputFormat} for 38 | * {@link org.apache.giraph.examples.SimpleShortestPathsVertex}. 39 | * 40 | * Inputs have long ids, no edge weights, and no vertex values. 41 | * (Edge weights are set to a float of 1.0, vertex values are 42 | * set to a double of 0.0.) 43 | * 44 | * Each line consists of: 45 | * vertex neighbor1 neighbor2 ... 46 | * 47 | * Values can be separated by spaces or tabs. 48 | */ 49 | public class SimpleShortestPathsInputFormat extends 50 | TextVertexInputFormat<LongWritable, DoubleWritable, FloatWritable> { 51 | /** Separator of the vertex and neighbors */ 52 | private static final Pattern SEPARATOR = Pattern.compile("[\t ]"); 53 | 54 | @Override 55 | public TextVertexReader createVertexReader(InputSplit split, 56 | TaskAttemptContext context) 57 | throws IOException { 58 | return new SimpleShortestPathsVertexReader(); 59 | } 60 | 61 | /** 62 | * Vertex reader associated with {@link SimpleShortestPathsInputFormat}.
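   *
   * Illustrative example (editor's addition, hedged): an input line
   * "1 2 3" yields vertex id 1 with value 0.0 and unit-weight (1.0f)
   * out-edges to vertices 2 and 3, per getValue() and getEdges() below.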
63 | */ 64 | public class SimpleShortestPathsVertexReader extends 65 | TextVertexReaderFromEachLineProcessed<String[]> { 66 | /** 67 | * Cached vertex id for the current line 68 | */ 69 | private LongWritable id; 70 | 71 | @Override 72 | protected String[] preprocessLine(Text line) throws IOException { 73 | String[] tokens = SEPARATOR.split(line.toString()); 74 | id = new LongWritable(Long.parseLong(tokens[0])); 75 | return tokens; 76 | } 77 | 78 | @Override 79 | protected LongWritable getId(String[] tokens) throws IOException { 80 | return id; 81 | } 82 | 83 | @Override 84 | protected DoubleWritable getValue(String[] tokens) throws IOException { 85 | return new DoubleWritable(0.0); 86 | } 87 | 88 | @Override 89 | protected Iterable<Edge<LongWritable, FloatWritable>> getEdges( 90 | String[] tokens) throws IOException { 91 | List<Edge<LongWritable, FloatWritable>> edges = 92 | Lists.newArrayListWithCapacity(tokens.length - 1); 93 | for (int i = 1; i < tokens.length; i++) { 94 | edges.add(EdgeFactory.create( 95 | new LongWritable(Long.parseLong(tokens[i])), 96 | new FloatWritable((float) 1.0))); 97 | } 98 | return edges; 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /gps-rev-110/local-master-scripts/make_gps_node_runner_jar.sh: -------------------------------------------------------------------------------- 1 | cd .. 2 | GPS_DIR="`pwd`" 3 | GPS_SRC_DIR=${GPS_DIR}/src 4 | GPS_CLASSES_DIR=${GPS_DIR}/classes 5 | LIBS_DIR=${GPS_DIR}/libs 6 | 7 | echo "removing ${GPS_DIR}/gps_node_runner.jar" 8 | rm ${GPS_DIR}/gps_node_runner.jar 9 | 10 | echo "removing ${GPS_CLASSES_DIR}" 11 | rm -rf ${GPS_CLASSES_DIR} 12 | 13 | echo "making ${GPS_CLASSES_DIR}" 14 | mkdir ${GPS_CLASSES_DIR} 15 | 16 | echo "cding into ${GPS_SRC_DIR}" 17 | cd ${GPS_SRC_DIR} 18 | 19 | find java/gps/examples -name \*.java -print > file.list 20 | # TODO: bug fix? works without "echo" so this line might be old code 21 | #$GPS_SRC_DIR/java/gps/node/GPSNodeRunner.java >> file.list 22 | echo "compiling GPSNodeRunner to classes directory" 23 | #javac -verbose \ 24 | javac \ 25 | -cp $LIBS_DIR/asm-3.3.1.jar:$LIBS_DIR/guava-r08.jar:$LIBS_DIR/objenesis-1.2.jar:$LIBS_DIR/cglib-2.2.jar:$LIBS_DIR/commons-cli-1.2.jar:$LIBS_DIR/jline-0.9.94.jar:$LIBS_DIR/log4j-1.2.15.jar:$LIBS_DIR/commons-logging-1.1.1.jar:$LIBS_DIR/hadoop-core-1.0.4.jar:$LIBS_DIR/commons-collections-3.2.1.jar:$LIBS_DIR/commons-lang-2.4.jar:$LIBS_DIR/commons-configuration-1.6.jar:$LIBS_DIR/tools.jar:$LIBS_DIR/mina-core-2.0.3.jar:$LIBS_DIR/mina-example-2.0.3.jar:$LIBS_DIR/slf4j-api-1.6.1.jar:$LIBS_DIR/colt.jar:$LIBS_DIR/concurrent.jar:$GPS_SRC_DIR/java \ 26 | -d ${GPS_CLASSES_DIR} \ 27 | @file.list 28 | 29 | echo "cding into ${GPS_CLASSES_DIR}" 30 | cd ${GPS_CLASSES_DIR} 31 | pwd 32 | echo "making gps_node_runner.jar..."
33 | #jar -cmvf $GPS_DIR/local-master-scripts/manifest.txt ../gps_node_runner.jar gps/ 34 | jar -cmf $GPS_DIR/local-master-scripts/manifest.txt ../gps_node_runner.jar gps/ -------------------------------------------------------------------------------- /gps-rev-110/src/java/gps/examples/dimest/DiameterEstimationVertex.java: -------------------------------------------------------------------------------- 1 | package gps.examples.dimest; 2 | 3 | import org.apache.commons.cli.CommandLine; 4 | 5 | import gps.graph.NullEdgeVertex; 6 | import gps.graph.NullEdgeVertexFactory; 7 | import gps.node.GPSJobConfiguration; 8 | import gps.node.GPSNodeRunner; 9 | import gps.writable.LongArrayWritable; 10 | 11 | 12 | import java.util.Arrays; 13 | 14 | 15 | /** 16 | * GPS implementation of Flajolet-Martin diameter estimation. 17 | * 18 | * @author Young 19 | */ 20 | public class DiameterEstimationVertex extends NullEdgeVertex<LongArrayWritable, LongArrayWritable> { 21 | 22 | public static int DEFAULT_NUM_MAX_ITERATIONS = 30; 23 | public static int numMaxIterations; 24 | 25 | /** K is number of bitstrings to use, 26 | larger K = more concentrated estimate **/ 27 | public static final int K = 8; 28 | 29 | /** Bit shift constant **/ 30 | private static final int V62 = 62; 31 | /** Bit constant; a long, so that shifts of up to 62 bits are not truncated to int's 5-bit shift range **/ 32 | private static final long V1 = 1L; 33 | 34 | public DiameterEstimationVertex(CommandLine line) { 35 | String otherOptsStr = line.getOptionValue(GPSNodeRunner.OTHER_OPTS_OPT_NAME); 36 | System.out.println("otherOptsStr: " + otherOptsStr); 37 | numMaxIterations = DEFAULT_NUM_MAX_ITERATIONS; 38 | if (otherOptsStr != null) { 39 | String[] split = otherOptsStr.split("###"); 40 | for (int index = 0; index < split.length; ) { 41 | String flag = split[index++]; 42 | String value = split[index++]; 43 | if ("-max".equals(flag)) { 44 | numMaxIterations = Integer.parseInt(value); 45 | System.out.println("numMaxIterations: " + numMaxIterations); 46 | } 47 | } 48 | } 49 | } 50 | 51 | @Override 52 | public void compute(Iterable<LongArrayWritable> incomingMessages, int superstepNo) { 53 | if (superstepNo == 1) { 54 | long[] value = new long[K]; 55 | int finalBitCount = 63; 56 | long rndVal = 0; 57 | 58 | for (int j = 0; j < value.length; j++) { 59 | rndVal = createRandomBM(finalBitCount); 60 | value[j] = V1 << (V62 - rndVal); 61 | } 62 | 63 | LongArrayWritable arr = new LongArrayWritable(value); 64 | sendMessages(getNeighborIds(), arr); 65 | setValue(arr); 66 | 67 | //System.out.println(getId() + ": done superstep 1... " + getValue()); 68 | return; 69 | } 70 | 71 | //System.out.println(getId() + ": normal superstep... " + getValue()); 72 | 73 | // get direct reference to vertex value's array 74 | long[] newBitmask = getValue().get(); 75 | 76 | // Some vertices have in-edges but no out-edges, so they're NOT 77 | // listed in the input graphs (from SNAP). This causes a new 78 | // vertex to be added during the 2nd superstep, and its value 79 | // to be non-initialized (i.e., empty array []). Since such 80 | // vertices have no out-edges, we can just halt.
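        // [Editor's aside, not in the original source: a hedged sketch of the
        // Flajolet-Martin initialization in superstep 1 above. createRandomBM
        // (defined below) returns a bit index j with geometric probability
        // P(j) = 2^-(j+1); for example, j = 0 occurs with probability 1/2 and
        // stores
        //   value[k] = 1L << 62;   // i.e., V1 << (V62 - 0)
        // The OR-merging further below unions these bitmasks across
        // neighbours to estimate neighbourhood sizes. The guard that follows
        // handles the empty-bitmask case described in the comment above. ]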
81 | if (newBitmask.length == 0) { 82 | voteToHalt(); 83 | return; 84 | } 85 | 86 | boolean isChanged = false; 87 | long[] tmpBitmask; 88 | long tmp; 89 | 90 | for (LongArrayWritable message : incomingMessages) { 91 | tmpBitmask = message.get(); 92 | 93 | // if (tmpBitmask.length == 0) { 94 | // System.out.println(getId() + ": got empty message??"); 95 | // } else { 96 | // System.out.println(getId() + ": got " + message); 97 | // } 98 | 99 | // both arrays are of length K 100 | for (int i = 0; i < K; i++) { 101 | tmp = newBitmask[i]; // store old value 102 | 103 | // NOTE: this modifies vertex value directly 104 | newBitmask[i] = newBitmask[i] | tmpBitmask[i]; 105 | 106 | // check if there's a change 107 | // NOTE: unused for now---to terminate when all vertices converge, 108 | // use an aggregator to track # of vertices that have finished 109 | //isChanged = isChanged || (tmp != newBitmask[i]); 110 | } 111 | } 112 | 113 | //System.out.println(getId() + ": final array is " + getValue()); 114 | 115 | // WARNING: we cannot terminate based on LOCAL steady state, 116 | // we need all vertices computing until the very end 117 | if (superstepNo >= numMaxIterations) { 118 | //System.out.println(getId() + ": voting to halt"); 119 | voteToHalt(); 120 | 121 | } else { 122 | //System.out.println(getId() + ": not halting... sending message"); 123 | 124 | // otherwise, send our neighbours our bitstrings 125 | sendMessages(getNeighborIds(), getValue()); 126 | } 127 | } 128 | 129 | // Source: Mizan, which took this from Pegasus 130 | /** 131 | * Creates random bitstring. 132 | * 133 | * @param sizeBitmask Number of bits. 134 | * @return Random bit index. 135 | */ 136 | private int createRandomBM(int sizeBitmask) { 137 | int j; 138 | 139 | // random() gives double in [0,1)---just like in Mizan 140 | // NOTE: we use the default seed set by java.util.Random() 141 | double curRandom = Math.random(); 142 | double threshold = 0; 143 | 144 | for (j = 0; j < sizeBitmask - 1; j++) { 145 | threshold += Math.pow(2.0, -1.0 * j - 1.0); 146 | 147 | if (curRandom < threshold) { 148 | break; 149 | } 150 | } 151 | 152 | return j; 153 | } 154 | 155 | @Override 156 | public LongArrayWritable getInitialValue(int id) { 157 | return new LongArrayWritable(); 158 | } 159 | 160 | /** 161 | * Factory class for {@link DiameterEstimationVertex}. 
* 163 | * @author Young 164 | */ 165 | public static class DiameterEstimationVertexFactory extends NullEdgeVertexFactory<LongArrayWritable, LongArrayWritable> { 166 | 167 | @Override 168 | public NullEdgeVertex<LongArrayWritable, LongArrayWritable> newInstance(CommandLine commandLine) { 169 | return new DiameterEstimationVertex(commandLine); 170 | } 171 | } 172 | 173 | public static class JobConfiguration extends GPSJobConfiguration { 174 | 175 | @Override 176 | public Class<?> getVertexFactoryClass() { 177 | return DiameterEstimationVertexFactory.class; 178 | } 179 | 180 | @Override 181 | public Class<?> getVertexClass() { 182 | return DiameterEstimationVertex.class; 183 | } 184 | 185 | @Override 186 | public Class<?> getVertexValueClass() { 187 | return LongArrayWritable.class; 188 | } 189 | 190 | @Override 191 | public Class<?> getMessageValueClass() { 192 | return LongArrayWritable.class; 193 | } 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /gps-rev-110/src/java/gps/examples/pagerank/PageRankVertex.java: -------------------------------------------------------------------------------- 1 | package gps.examples.pagerank; 2 | 3 | import org.apache.commons.cli.CommandLine; 4 | 5 | import gps.globalobjects.BooleanANDGlobalObject; 6 | import gps.globalobjects.DoubleMaxGlobalObject; 7 | import gps.globalobjects.DoubleSumGlobalObject; 8 | import gps.globalobjects.FloatSumGlobalObject; 9 | import gps.globalobjects.GlobalObjectsMap; 10 | import gps.globalobjects.IntMaxGlobalObject; 11 | import gps.globalobjects.IntSumGlobalObject; 12 | import gps.globalobjects.LongSumGlobalObject; 13 | import gps.graph.NullEdgeVertex; 14 | import gps.graph.NullEdgeVertexFactory; 15 | import gps.node.GPSJobConfiguration; 16 | import gps.node.GPSNodeRunner; 17 | import gps.writable.DoubleWritable; 18 | 19 | /** 20 | * GPS implementation of the PageRank algorithm. 21 | * 22 | * @author semihsalihoglu 23 | */ 24 | public class PageRankVertex extends NullEdgeVertex<DoubleWritable, DoubleWritable> { 25 | 26 | public static int DEFAULT_NUM_MAX_ITERATIONS = 30; 27 | public static int numMaxIterations; 28 | 29 | public PageRankVertex(CommandLine line) { 30 | String otherOptsStr = line.getOptionValue(GPSNodeRunner.OTHER_OPTS_OPT_NAME); 31 | System.out.println("otherOptsStr: " + otherOptsStr); 32 | 33 | numMaxIterations = DEFAULT_NUM_MAX_ITERATIONS; 34 | 35 | if (otherOptsStr != null) { 36 | String[] split = otherOptsStr.split("###"); 37 | 38 | for (int index = 0; index < split.length; ) { 39 | String flag = split[index++]; 40 | String value = split[index++]; 41 | 42 | if ("-max".equals(flag)) { 43 | numMaxIterations = Integer.parseInt(value); 44 | System.out.println("numMaxIterations: " + numMaxIterations); 45 | } 46 | } 47 | } 48 | } 49 | 50 | @Override 51 | public void compute(Iterable<DoubleWritable> incomingMessages, int superstepNo) { 52 | // NOTE: We follow GraphLab's alternative way of computing PageRank, 53 | // which is to not divide by |V|. To get the probability value at 54 | // each vertex, take its PageRank value and divide by |V|.
55 | 56 | //int numVertices = ((IntSumGlobalObject) getGlobalObjectsMap().getGlobalObject( 57 | // GlobalObjectsMap.NUM_TOTAL_VERTICES)).getValue().getValue(); 58 | 59 | if (superstepNo == 1) { 60 | setValue(new DoubleWritable(1.0)); 61 | //setValue(new DoubleWritable((double) 1 / (double) numVertices)); 62 | sendMessages(getNeighborIds(), getValue()); 63 | return; 64 | } 65 | 66 | double oldVal = getValue().getValue(); 67 | double sum = 0.0; 68 | for (DoubleWritable messageValue : incomingMessages) { 69 | sum += messageValue.getValue(); 70 | } 71 | 72 | double currentState = 0.85 * sum + 0.15; 73 | //double currentState = 0.85 * sum/getNeighborIds().length + 0.15 / (double) numVertices; 74 | 75 | setValue(new DoubleWritable(currentState)); 76 | 77 | int[] neighborIds = getNeighborIds(); 78 | // FIX: divide by sender's out-degree rather than receiver's out-degree 79 | // (i.e., don't do "currentState = 0.85 * sum / neighborIds.length + ...") 80 | DoubleWritable messageValue = new DoubleWritable(currentState / neighborIds.length); 81 | sendMessages(neighborIds, messageValue); 82 | 83 | // Termination condition based on max supersteps 84 | if (superstepNo == numMaxIterations) { 85 | voteToHalt(); 86 | } 87 | } 88 | 89 | @Override 90 | public DoubleWritable getInitialValue(int id) { 91 | return new DoubleWritable(0.1); 92 | } 93 | 94 | /** 95 | * Factory class for {@link PageRankVertex}. 96 | * 97 | * @author semihsalihoglu 98 | */ 99 | public static class PageRankVertexFactory extends NullEdgeVertexFactory<DoubleWritable, DoubleWritable> { 100 | 101 | @Override 102 | public NullEdgeVertex<DoubleWritable, DoubleWritable> newInstance(CommandLine commandLine) { 103 | return new PageRankVertex(commandLine); 104 | } 105 | } 106 | 107 | public static class JobConfiguration extends GPSJobConfiguration { 108 | 109 | @Override 110 | public Class<?> getVertexFactoryClass() { 111 | return PageRankVertexFactory.class; 112 | } 113 | 114 | @Override 115 | public Class<?> getVertexClass() { 116 | return PageRankVertex.class; 117 | } 118 | 119 | @Override 120 | public Class<?> getVertexValueClass() { 121 | return DoubleWritable.class; 122 | } 123 | 124 | @Override 125 | public Class<?> getMessageValueClass() { 126 | return DoubleWritable.class; 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /gps-rev-110/src/java/gps/examples/sssp/SSSPVertex.java: -------------------------------------------------------------------------------- 1 | package gps.examples.sssp; 2 | 3 | import org.apache.commons.cli.CommandLine; 4 | 5 | import gps.graph.NullEdgeVertex; 6 | import gps.graph.NullEdgeVertexFactory; 7 | import gps.node.GPSJobConfiguration; 8 | import gps.node.GPSNodeRunner; 9 | import gps.writable.IntWritable; 10 | 11 | // NOTE: this is different from SingleSourceAllVerticesShortestPathVertex, 12 | // in that we don't use the boolean shortcut method. 13 | // 14 | // Instead, this is a modification of gps.examples.edgevaluesssp.EdgeValueSSSPVertex, 15 | // where edge values are all 1. This matches the implementations in Giraph and GPS.
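// [Editor's aside, not in the original source: a hedged sketch of the GPS
// "other opts" convention parsed by the constructor below. The values here
// are illustrative only:
//   String otherOptsStr = "-root###42";           // flag###value pairs
//   String[] split = otherOptsStr.split("###");   // ["-root", "42"]
//   int sourceId = Integer.parseInt(split[1]);    // source vertex id = 42
// ]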
16 | public class SSSPVertex extends NullEdgeVertex<IntWritable, IntWritable> { 17 | 18 | private static int DEFAULT_SOURCE_ID = 0; 19 | private int sourceId; 20 | public SSSPVertex() { 21 | } 22 | 23 | public SSSPVertex(CommandLine line) { 24 | String otherOptsStr = line.getOptionValue(GPSNodeRunner.OTHER_OPTS_OPT_NAME); 25 | System.out.println("otherOptsStr: " + otherOptsStr); 26 | sourceId = DEFAULT_SOURCE_ID; 27 | if (otherOptsStr != null) { 28 | String[] split = otherOptsStr.split("###"); 29 | for (int index = 0; index < split.length; ) { 30 | String flag = split[index++]; 31 | String value = split[index++]; 32 | if ("-root".equals(flag)) { 33 | sourceId = Integer.parseInt(value); 34 | System.out.println("sourceId: " + sourceId); 35 | } 36 | } 37 | } 38 | } 39 | 40 | @Override 41 | public void compute(Iterable<IntWritable> messageValues, int superstepNo) { 42 | int previousDistance = getValue().getValue(); 43 | if (superstepNo == 1) { 44 | if (previousDistance == Integer.MAX_VALUE) { 45 | voteToHalt(); 46 | } else { 47 | sendMessages(getNeighborIds(), 48 | new IntWritable(getValue().getValue() + 1)); 49 | } 50 | } else { 51 | int minValue = previousDistance; 52 | int messageValueInt; 53 | for (IntWritable messageValue : messageValues) { 54 | messageValueInt = messageValue.getValue(); 55 | if (messageValueInt < minValue) { 56 | minValue = messageValueInt; 57 | } 58 | } 59 | int currentDistance = minValue; 60 | if (currentDistance < previousDistance) { 61 | IntWritable newState = new IntWritable(currentDistance); 62 | setValue(newState); 63 | sendMessages(getNeighborIds(), 64 | new IntWritable(getValue().getValue() + 1)); 65 | } else { 66 | voteToHalt(); 67 | } 68 | } 69 | } 70 | 71 | @Override 72 | public IntWritable getInitialValue(int id) { 73 | return id == sourceId ? new IntWritable(0) : new IntWritable(Integer.MAX_VALUE); 74 | } 75 | 76 | /** 77 | * Factory class for {@link SSSPVertex}.
78 | * 79 | * @author semihsalihoglu 80 | */ 81 | public static class SSSPVertexFactory 82 | extends NullEdgeVertexFactory { 83 | 84 | @Override 85 | public NullEdgeVertex newInstance(CommandLine commandLine) { 86 | return new SSSPVertex(commandLine); 87 | } 88 | } 89 | 90 | public static class JobConfiguration extends GPSJobConfiguration { 91 | 92 | @Override 93 | public Class getVertexFactoryClass() { 94 | return SSSPVertexFactory.class; 95 | } 96 | 97 | @Override 98 | public Class getVertexClass() { 99 | return SSSPVertex.class; 100 | } 101 | 102 | @Override 103 | public Class getVertexValueClass() { 104 | return IntWritable.class; 105 | } 106 | 107 | @Override 108 | public Class getMessageValueClass() { 109 | return IntWritable.class; 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /gps-rev-110/src/java/gps/examples/sssp/SingleSourceAllVerticesShortestPathVertex.java: -------------------------------------------------------------------------------- 1 | package gps.examples.sssp; 2 | 3 | import org.apache.commons.cli.CommandLine; 4 | 5 | import gps.graph.NullEdgeVertex; 6 | import gps.graph.NullEdgeVertexFactory; 7 | import gps.node.GPSJobConfiguration; 8 | import gps.node.GPSNodeRunner; 9 | import gps.writable.BooleanWritable; 10 | import gps.writable.IntWritable; 11 | 12 | public class SingleSourceAllVerticesShortestPathVertex extends NullEdgeVertex { 13 | 14 | private static int DEFAULT_ROOT_ID = 0; 15 | private int root; 16 | protected boolean isFLPS = false; 17 | protected IntWritable numRecentlyUpdatedVertices; 18 | 19 | public SingleSourceAllVerticesShortestPathVertex(CommandLine line) { 20 | String otherOptsStr = line.getOptionValue(GPSNodeRunner.OTHER_OPTS_OPT_NAME); 21 | System.out.println("otherOptsStr: " + otherOptsStr); 22 | root = DEFAULT_ROOT_ID; 23 | if (otherOptsStr != null) { 24 | String[] split = otherOptsStr.split("###"); 25 | for (int index = 0; index < split.length; ) { 26 | String flag = split[index++]; 27 | String value = split[index++]; 28 | if ("-root".equals(flag)) { 29 | root = Integer.parseInt(value); 30 | System.out.println("sourceId: " + root); 31 | } 32 | } 33 | } 34 | } 35 | 36 | @Override 37 | public void compute(Iterable messageValues, int superstepNo) { 38 | performRegularLabelPropagation(messageValues, superstepNo); 39 | } 40 | 41 | protected void performRegularLabelPropagation(Iterable messageValues, int superstepNo) { 42 | int previousDistance = getValue().getValue(); 43 | if (superstepNo == 1) { 44 | if (previousDistance == Integer.MAX_VALUE) { 45 | if (!isFLPS) { 46 | voteToHalt(); 47 | } 48 | } else { 49 | sendMessages(getNeighborIds(), new BooleanWritable()); 50 | if (isFLPS) { 51 | numRecentlyUpdatedVertices.value++; 52 | voteToHalt(); 53 | } 54 | } 55 | } else { 56 | if (previousDistance != Integer.MAX_VALUE) { 57 | if (!isFLPS) { 58 | voteToHalt(); 59 | } 60 | } else if (messageValues.iterator().hasNext()) { 61 | // BUGFIX: distance 1 will occur at superstep 2, so *subtract* 1 62 | setValue(new IntWritable(superstepNo - 1)); 63 | sendMessages(getNeighborIds(), new BooleanWritable()); 64 | if (isFLPS) { 65 | numRecentlyUpdatedVertices.value++; 66 | voteToHalt(); 67 | } 68 | } 69 | } 70 | } 71 | 72 | @Override 73 | public IntWritable getInitialValue(int id) { 74 | return id == root ? new IntWritable(0) : new IntWritable(Integer.MAX_VALUE); 75 | } 76 | 77 | /** 78 | * Factory class for {@link SingleSourceAllVerticesShortestPathVertex}. 
79 | * 80 | * @author semihsalihoglu 81 | */ 82 | public static class SingleSourceAllVerticesShortestPathVertexFactory extends NullEdgeVertexFactory<IntWritable, BooleanWritable> { 83 | 84 | @Override 85 | public NullEdgeVertex<IntWritable, BooleanWritable> newInstance(CommandLine commandLine) { 86 | return new SingleSourceAllVerticesShortestPathVertex(commandLine); 87 | } 88 | } 89 | 90 | public static class JobConfiguration extends GPSJobConfiguration { 91 | 92 | @Override 93 | public Class<?> getVertexFactoryClass() { 94 | return SingleSourceAllVerticesShortestPathVertexFactory.class; 95 | } 96 | 97 | @Override 98 | public Class<?> getVertexClass() { 99 | return SingleSourceAllVerticesShortestPathVertex.class; 100 | } 101 | 102 | @Override 103 | public Class<?> getVertexValueClass() { 104 | return IntWritable.class; 105 | } 106 | 107 | @Override 108 | public Class<?> getMessageValueClass() { 109 | return BooleanWritable.class; 110 | } 111 | } 112 | } -------------------------------------------------------------------------------- /gps-rev-110/src/java/gps/examples/wcc/WeaklyConnectedComponentsVertex.java: -------------------------------------------------------------------------------- 1 | package gps.examples.wcc; 2 | 3 | import org.apache.commons.cli.CommandLine; 4 | 5 | import gps.graph.NullEdgeVertex; 6 | import gps.graph.NullEdgeVertexFactory; 7 | import gps.node.GPSJobConfiguration; 8 | import gps.node.GPSNodeRunner; 9 | import gps.writable.IntWritable; 10 | 11 | public class WeaklyConnectedComponentsVertex extends NullEdgeVertex<IntWritable, IntWritable> { 12 | 13 | private int minValue; 14 | //public static int DEFAULT_NUM_MAX_ITERATIONS = 999; 15 | public static int numMaxIterations; 16 | public WeaklyConnectedComponentsVertex(CommandLine line) { 17 | //String otherOptsStr = line.getOptionValue(GPSNodeRunner.OTHER_OPTS_OPT_NAME); 18 | //System.out.println("otherOptsStr: " + otherOptsStr); 19 | //numMaxIterations = DEFAULT_NUM_MAX_ITERATIONS; 20 | //if (otherOptsStr != null) { 21 | // String[] split = otherOptsStr.split("###"); 22 | // for (int index = 0; index < split.length; ) { 23 | // String flag = split[index++]; 24 | // String value = split[index++]; 25 | // if ("-nmi".equals(flag)) { 26 | // numMaxIterations = Integer.parseInt(value); 27 | // System.out.println("numMaxIterations: " + numMaxIterations); 28 | // } 29 | // } 30 | //} 31 | } 32 | @Override 33 | public void compute(Iterable<IntWritable> messageValues, int superstepNo) { 34 | if (superstepNo == 1) { 35 | setValue(new IntWritable(getId())); 36 | sendMessages(getNeighborIds(), getValue()); 37 | } else { 38 | minValue = getValue().getValue(); 39 | for (IntWritable message : messageValues) { 40 | if (message.getValue() < minValue) { 41 | minValue = message.getValue(); 42 | } 43 | } 44 | if (minValue < getValue().getValue()) { 45 | setValue(new IntWritable(minValue)); 46 | sendMessages(getNeighborIds(), getValue()); 47 | } else { 48 | voteToHalt(); 49 | } 50 | 51 | // No superstep termination conditions---run to completion instead 52 | //if (superstepNo == numMaxIterations) { 53 | // voteToHalt(); 54 | //} 55 | } 56 | } 57 | 58 | @Override 59 | public IntWritable getInitialValue(int id) { 60 | return new IntWritable(getId()); 61 | } 62 | 63 | public static class WeaklyConnectedComponentsVertexFactory extends 64 | NullEdgeVertexFactory<IntWritable, IntWritable> { 65 | 66 | @Override 67 | public NullEdgeVertex<IntWritable, IntWritable> newInstance(CommandLine commandline) { 68 | return new WeaklyConnectedComponentsVertex(commandline); 69 | } 70 | } 71 | 72 | public static class JobConfiguration extends GPSJobConfiguration { 73 | 74 | @Override 75 | public Class<?>
getVertexFactoryClass() { 76 | return WeaklyConnectedComponentsVertexFactory.class; 77 | } 78 | 79 | @Override 80 | public Class<?> getVertexClass() { 81 | return WeaklyConnectedComponentsVertex.class; 82 | } 83 | 84 | @Override 85 | public Class<?> getVertexValueClass() { 86 | return IntWritable.class; 87 | } 88 | 89 | @Override 90 | public Class<?> getMessageValueClass() { 91 | return IntWritable.class; 92 | } 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /gps-rev-110/src/java/gps/node/worker/dynamic/VertexWrapper.java: -------------------------------------------------------------------------------- 1 | package gps.node.worker.dynamic; 2 | 3 | import gps.writable.MinaWritable; 4 | 5 | public class VertexWrapper<V extends MinaWritable> { 6 | public int originalId; 7 | public int[] neighborIds; 8 | public V state; 9 | public boolean isActive; 10 | public int toOrFromMachineId; 11 | } 12 | -------------------------------------------------------------------------------- /gps-rev-110/src/java/gps/node/worker/dynamic/greedy/BaseGreedyDynamicGPSWorkerImpl.java: -------------------------------------------------------------------------------- 1 | package gps.node.worker.dynamic.greedy; 2 | 3 | import static gps.node.worker.GPSWorkerExposedGlobalVariables.*; 4 | import gps.communication.MessageSenderAndReceiverFactory; 5 | import gps.graph.Graph; 6 | import gps.graph.VertexFactory; 7 | import gps.messages.storage.ArrayBackedIncomingMessageStorage; 8 | import gps.node.GPSJobConfiguration; 9 | import gps.node.MachineConfig; 10 | import gps.node.worker.AbstractGPSWorker; 11 | import gps.writable.MinaWritable; 12 | import gps.writable.NullWritable; 13 | 14 | import org.apache.commons.cli.CommandLine; 15 | import org.apache.hadoop.fs.FileSystem; 16 | 17 | public abstract class BaseGreedyDynamicGPSWorkerImpl<V extends MinaWritable, E extends MinaWritable, 18 | M extends MinaWritable> extends AbstractGPSWorker<V, E, M> { 19 | 20 | public static int[] machineCommunicationHistogram; 21 | protected boolean[] fasterMachines; 22 | protected final int edgeThreshold; 23 | protected int benefitThreshold; 24 | protected int superstepNoToStopDynamism; 25 | 26 | public BaseGreedyDynamicGPSWorkerImpl(int localMachineId, CommandLine commandLine, 27 | FileSystem fileSystem, MachineConfig machineConfig, Graph graphPartition, 28 | VertexFactory vertexFactory, int graphSize, int outgoingBufferSizes, 29 | String outputFileName, MessageSenderAndReceiverFactory messageSenderAndReceiverFactory, 30 | ArrayBackedIncomingMessageStorage incomingMessageStorage, int benefitThreshold, 31 | int edgeThreshold, long pollingTime, int maxMessagesToTransmitConcurrently, 32 | int numVerticesFrequencyToCheckOutgoingBuffers, 33 | int sleepTimeWhenOutgoingBuffersExceedThreshold, 34 | int largeVertexPartitioningOutdegreeThreshold, boolean runPartitioningSuperstep, 35 | boolean combine, Class messageRepresentativeInstance, 36 | Class representativeEdgeInstance, GPSJobConfiguration jobConfiguration, 37 | int numProcessorsForHandlingIO, int superstepNoToStopDynamism) { 38 | super(localMachineId, commandLine, fileSystem, machineConfig, graphPartition, vertexFactory, 39 | graphSize, outgoingBufferSizes, outputFileName, messageSenderAndReceiverFactory, 40 | incomingMessageStorage, pollingTime, maxMessagesToTransmitConcurrently, 41 | numVerticesFrequencyToCheckOutgoingBuffers, 42 | sleepTimeWhenOutgoingBuffersExceedThreshold, largeVertexPartitioningOutdegreeThreshold, 43 | runPartitioningSuperstep, combine, messageRepresentativeInstance, 44 | representativeEdgeInstance, jobConfiguration,
numProcessorsForHandlingIO); 45 | this.benefitThreshold = benefitThreshold; 46 | this.edgeThreshold = edgeThreshold; 47 | machineCommunicationHistogram = new int[getNumWorkers()]; 48 | // incomingMessageStorage.setMachineCommunicationHistogram(machineCommunicationHistogram); 49 | fasterMachines = new boolean[getNumWorkers()]; 50 | this.superstepNoToStopDynamism = superstepNoToStopDynamism; 51 | } 52 | 53 | @Override 54 | protected void doExtraWorkBeforeVertexComputation() { 55 | if (currentSuperstepNo > superstepNoToStopDynamism) { 56 | return; 57 | } 58 | machineCommunicationHistogram = new int[getNumWorkers()]; 59 | // System.out.println("Starting to dump machineCommunicationHistogram..."); 60 | // for (int i = 0; i < getNumWorkers(); ++i) { 61 | // getLogger().info("" + machineCommunicationHistogram[i]); 62 | // } 63 | // System.out.println("End of dumping machineCommunicationHistogram..."); 64 | // for (int i = 0; i < getNumWorkers(); ++i) { 65 | // machineCommunicationHistogram[i] = 0; 66 | // } 67 | } 68 | // 69 | // protected Integer putVertexIntoVerticesToMoveIfMaxCommunicationMachineIsNotLocalMachine( 70 | // int nodeId, Map<Integer, Integer> vertexIdMachineIdMap) { 71 | // int maxCommunicationMachineId = findIdOfMaxCommunicatedMachine(); 72 | // if (maxCommunicationMachineId != getLocalMachineId() 73 | // && machineCommunicationHistogram[maxCommunicationMachineId] 74 | // >= (machineCommunicationHistogram[getLocalMachineId()] + benefitThreshold)) { 75 | // vertexIdMachineIdMap.put(nodeId, maxCommunicationMachineId); 76 | // return maxCommunicationMachineId; 77 | // } else { 78 | // return null; 79 | // } 80 | // } 81 | 82 | protected int findIdOfMaxCommunicatedMachine() { 83 | // System.out.println("Finding maxCommunicationMachine..."); 84 | // System.out.println("0: " + machineCommunicationHistogram[0]); 85 | int maxIndex = 0; 86 | int maxValue = machineCommunicationHistogram[0]; 87 | int numEqualMachines = 1; 88 | for (int i = 1; i < machineCommunicationHistogram.length; ++i) { 89 | int valueOfCurrentMachine = machineCommunicationHistogram[i]; 90 | // System.out.println(i + ": " + machineCommunicationHistogram[i]); 91 | if (valueOfCurrentMachine > maxValue) { 92 | maxValue = valueOfCurrentMachine; 93 | maxIndex = i; 94 | numEqualMachines = 1; 95 | } else if (valueOfCurrentMachine == maxValue) { 96 | numEqualMachines++; 97 | if (Math.random() <= ((double) 1.0 / (double) numEqualMachines)) { 98 | maxIndex = i; 99 | } 100 | } 101 | } 102 | // System.out.println("End of finding maxCommunicationMachine..."); 103 | return maxIndex; 104 | } 105 | } -------------------------------------------------------------------------------- /gps-rev-110/src/java/gps/writable/LongArrayWritable.java: -------------------------------------------------------------------------------- 1 | package gps.writable; 2 | 3 | import java.util.Arrays; 4 | import org.apache.mina.core.buffer.IoBuffer; 5 | 6 | public class LongArrayWritable extends MinaWritable { 7 | 8 | public long[] value; 9 | 10 | public LongArrayWritable() { 11 | this.value = new long[0]; 12 | } 13 | 14 | public LongArrayWritable(long[] value) { 15 | this.value = value; 16 | } 17 | 18 | /** 19 | * Setter that does not deep copy. 20 | * 21 | * @param value Array. 22 | */ 23 | public void set(long[] value) { this.value = value; } 24 | 25 | /** 26 | * Getter. 27 | * 28 | * @return Array.
29 | */ 30 | public long[] get() { return value; } 31 | 32 | 33 | @Override 34 | public int numBytes() { 35 | return 4 + 8*value.length; 36 | } 37 | 38 | @Override 39 | public void write(IoBuffer ioBuffer) { 40 | ioBuffer.putInt(value.length); 41 | 42 | for (long longValue : value) { 43 | ioBuffer.putLong(longValue); 44 | } 45 | } 46 | 47 | @Override 48 | public void read(IoBuffer ioBuffer) { 49 | int length = ioBuffer.getInt(); 50 | this.value = new long[length]; 51 | 52 | for (int i = 0; i < length; ++i) { 53 | this.value[i] = ioBuffer.getLong(); 54 | } 55 | } 56 | 57 | @Override 58 | public int read(byte[] byteArray, int index) { 59 | int length = readIntegerFromByteArray(byteArray, index); 60 | 61 | this.value = new long[length]; 62 | index += 4; 63 | 64 | for (int i = 0; i < length; ++i) { 65 | this.value[i] = readLongFromByteArray(byteArray, index); 66 | index += 8; 67 | } 68 | 69 | return 4 + (8*length); 70 | } 71 | 72 | @Override 73 | public int read(IoBuffer ioBuffer, byte[] byteArray, int index) { 74 | int length = ioBuffer.getInt(); 75 | writeIntegerToByteArray(byteArray, length, index); 76 | index += 4; 77 | 78 | for (int i = 0; i < length; ++i) { 79 | ioBuffer.get(byteArray, index, 8); 80 | index += 8; 81 | } 82 | 83 | return 4 + (8*length); 84 | } 85 | 86 | @Override 87 | public void combine(byte[] messageQueue, byte[] tmpArray) { 88 | // Nothing to do. This writable is not combinable. 89 | } 90 | 91 | @Override 92 | public String toString() { 93 | return Arrays.toString(value); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /results/plots/constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ############### 4 | # Constants 5 | ############### 6 | BYTE_PER_GB = 1024*1024*1024.0 7 | KB_PER_GB = 1024*1024.0 8 | MB_PER_GB = 1024.0 9 | 10 | MS_PER_SEC = 1000.0 11 | SEC_PER_MIN = 60.0 12 | 13 | ALGS = ('pagerank', 'sssp', 'wcc', 'mst') 14 | ALG_PR, ALG_SSSP, ALG_WCC, ALG_MST = ALGS 15 | ALG_PREMIZAN = 'premizan' 16 | 17 | GRAPHS = ('livejournal', 'orkut', 'arabic', 'twitter', 'uk0705') 18 | GRAPH_LJ, GRAPH_OR, GRAPH_AR, GRAPH_TW, GRAPH_UK = GRAPHS 19 | 20 | MACHINES = ('16', '32', '64', '128') 21 | 22 | SYSTEMS = ('giraph', 'gps', 'mizan', 'graphlab') 23 | SYS_GIRAPH, SYS_GPS, SYS_MIZAN, SYS_GRAPHLAB = SYSTEMS 24 | 25 | SYS_MODES = (('0','1'), # Giraph: byte array, hash map 26 | ('0','1','2'), # GPS: none, LALP, dynamic 27 | ('0',), # Mizan: static 28 | ('0','1')) # GraphLab: sync, async 29 | SYSMODE_HASH = '1' # premizan hash partitioning 30 | 31 | # combination of all systems and their sys modes 32 | ALL_SYS = [(system, sysmode) 33 | for system, sysmodes in zip(SYSTEMS, SYS_MODES) 34 | for sysmode in sysmodes] 35 | 36 | 37 | # conversion modes 38 | MODES = (0, 1, 2) 39 | MODE_TIME, MODE_MEM, MODE_NET = MODES 40 | 41 | # names for relevant statistics (indexed by "mode") 42 | STATS = (('run', 'io', 'tot'), # time 43 | ('mem_min', 'mem_max', 'mem_avg'), # memory 44 | ('recv_min', 'recv_max', 'recv_avg', # net 45 | 'sent_min', 'sent_max', 'sent_avg')) 46 | -------------------------------------------------------------------------------- /results/plots/gen-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | ./gen-data.py 0 > data_time.py 4 | ./gen-data.py 1 > data_mem.py 5 | ./gen-data.py 2 > data_net.py 6 | 7 | ./gen-data.py 1 --master > data_mem_master.py 8 | ./gen-data.py 2 --master > 
data_net_master.py -------------------------------------------------------------------------------- /results/plots/plot-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | ./plot.py 0 --save-eps 4 | ./plot.py 0 --save-eps --total-time 5 | ./plot.py 1 --save-eps 6 | ./plot.py 1 --save-eps --plot-sum 7 | ./plot.py 2 --save-eps --plot-sum 8 | ./plot.py 2 --save-eps 9 | 10 | ./plot-with-cuts.py 0 --save-eps 11 | ./plot-with-cuts.py 2 --save-eps 12 | 13 | ./plot.py 1 --master --save-eps 14 | ./plot.py 2 --master --save-eps 15 | 16 | ./plot.py 1 --premizan --save-eps 17 | ./plot.py 2 --premizan --save-eps --plot-sum 18 | ./plot.py 2 --premizan --save-eps 19 | ./plot.py 1 --premizan --master --save-eps 20 | ./plot.py 2 --premizan --master --save-eps -------------------------------------------------------------------------------- /results/plots/plot-paper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | ./plot.py 0 --save-paper 4 | ./plot.py 1 --save-paper --plot-max 5 | ./plot.py 2 --save-paper --plot-sum 6 | 7 | ./plot-with-cuts.py 0 --save-paper 8 | ./plot-with-cuts.py 2 --save-paper --------------------------------------------------------------------------------