├── .gitattributes ├── .gitignore ├── .gitmodules ├── .travis.yml ├── CMakeLists.txt ├── Readme.md ├── capnph ├── data.capnp ├── data.capnp.c++ └── data.capnp.h ├── examples ├── basics.cpp ├── large_word_count.cpp ├── morte_carlo_pi.cpp ├── morte_carlo_pi_omp.cpp ├── mpi_ping_pong_failed.cpp ├── nest_flat_map.cpp └── word_count.cpp ├── include ├── aggregator.hpp ├── cache.hpp ├── cache_tracker.hpp ├── common.hpp ├── dependency.hpp ├── executor.hpp ├── map_output_tracker.hpp ├── partitioner.hpp ├── rdd │ ├── mapped_rdd.hpp │ ├── pair_rdd.hpp │ ├── parallel_collection.hpp │ └── rdd.hpp ├── scheduler │ ├── dag_scheduler.hpp │ ├── scheduler.hpp │ ├── stage.hpp │ └── task.hpp ├── serialize_capnp.hpp ├── serialize_wrapper.hpp ├── shuffle_fetcher.hpp ├── shuffle_manager.hpp ├── spark_context.hpp ├── spark_env.hpp ├── split.hpp └── utils │ ├── event_loop.hpp │ ├── function_signature.hpp │ ├── macros.hpp │ ├── match.hpp │ ├── pair_hash.hpp │ ├── ptr_cast.hpp │ ├── serde.hpp │ ├── span.hpp │ ├── tcp.hpp │ ├── thread_pool.hpp │ ├── traits.hpp │ └── utils.hpp ├── miscs ├── discussions.md ├── morte_carlo_pi.scala ├── report.pdf ├── split_text.py └── word_count.scala └── src ├── cache.cpp ├── dependency.cpp ├── main.cpp ├── rdd └── rdd.cpp ├── scheduler └── dag_scheduler.cpp ├── serialize_capnp.cpp ├── serialize_wrapper.cpp └── spark_env.cpp /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ############################# 2 | # local config files 3 | 4 | 5 | ############################# 6 | # C++ gitignore 7 | # Prerequisites 8 | 9 | *.d 10 | 11 | 12 | 13 | # Compiled Object files 14 | 15 | *.slo 16 | 17 | *.lo 18 | 19 | *.o 20 | 21 | *.obj 22 | 23 | 24 | 25 | # Precompiled Headers 26 | 27 | *.gch 28 | 29 | *.pch 30 | 31 | 32 | 33 | # Compiled Dynamic libraries 34 | 35 | *.so 36 | 37 | *.dylib 38 | 39 | *.dll 40 | 41 | 42 | 43 | # Fortran module files 44 | 45 | *.mod 46 | 47 | *.smod 48 | 49 | 50 | 51 | # Compiled Static libraries 52 | 53 | *.lai 54 | 55 | *.la 56 | 57 | *.a 58 | 59 | *.lib 60 | 61 | 62 | 63 | # Executables 64 | 65 | *.exe 66 | 67 | *.out 68 | 69 | *.app 70 | 71 | ############################# 72 | # Python gitignore 73 | # Byte-compiled / optimized / DLL files 74 | 75 | __pycache__/ 76 | 77 | *.py[cod] 78 | 79 | *$py.class 80 | 81 | 82 | 83 | # C extensions 84 | 85 | *.so 86 | 87 | 88 | 89 | # Distribution / packaging 90 | 91 | .Python 92 | 93 | env/ 94 | 95 | build/ 96 | 97 | develop-eggs/ 98 | 99 | dist/ 100 | 101 | downloads/ 102 | 103 | eggs/ 104 | 105 | .eggs/ 106 | 107 | lib/ 108 | 109 | lib64/ 110 | 111 | parts/ 112 | 113 | sdist/ 114 | 115 | var/ 116 | 117 | wheels/ 118 | 119 | *.egg-info/ 120 | 121 | .installed.cfg 122 | 123 | *.egg 124 | 125 | package/ 126 | 127 | # PyInstaller 128 | 129 | # Usually these files are written by a python script from a template 130 | 131 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
132 | 133 | *.manifest 134 | 135 | *.spec 136 | 137 | 138 | 139 | # Installer logs 140 | 141 | pip-log.txt 142 | 143 | pip-delete-this-directory.txt 144 | 145 | 146 | 147 | # Unit test / coverage reports 148 | 149 | htmlcov/ 150 | 151 | .tox/ 152 | 153 | .coverage 154 | 155 | .coverage.* 156 | 157 | .cache 158 | 159 | nosetests.xml 160 | 161 | coverage.xml 162 | 163 | *,cover 164 | 165 | .hypothesis/ 166 | 167 | 168 | 169 | # Translations 170 | 171 | *.mo 172 | 173 | *.pot 174 | 175 | 176 | 177 | # Django stuff: 178 | 179 | *.log 180 | 181 | local_settings.py 182 | 183 | 184 | 185 | # Flask stuff: 186 | 187 | instance/ 188 | 189 | .webassets-cache 190 | 191 | 192 | 193 | # Scrapy stuff: 194 | 195 | .scrapy 196 | 197 | 198 | 199 | # Sphinx documentation 200 | 201 | docs/_build/ 202 | 203 | 204 | 205 | # PyBuilder 206 | 207 | target/ 208 | 209 | 210 | 211 | # Jupyter Notebook 212 | 213 | .ipynb_checkpoints 214 | 215 | 216 | 217 | # pyenv 218 | 219 | .python-version 220 | 221 | 222 | 223 | # celery beat schedule file 224 | 225 | celerybeat-schedule 226 | 227 | 228 | 229 | # dotenv 230 | 231 | .env 232 | 233 | 234 | 235 | # virtualenv 236 | 237 | .venv 238 | 239 | venv/ 240 | 241 | ENV/ 242 | 243 | 244 | 245 | # Spyder project settings 246 | 247 | .spyderproject 248 | 249 | 250 | 251 | # Rope project settings 252 | 253 | .ropeproject 254 | 255 | ################################### 256 | # Visual Studio gitignore 257 | ## Ignore Visual Studio temporary files, build results, and 258 | 259 | ## files generated by popular Visual Studio add-ons. 260 | 261 | ## 262 | 263 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 264 | 265 | 266 | 267 | # User-specific files 268 | 269 | *.suo 270 | 271 | *.user 272 | 273 | *.userosscache 274 | 275 | *.sln.docstates 276 | 277 | 278 | 279 | # User-specific files (MonoDevelop/Xamarin Studio) 280 | 281 | *.userprefs 282 | 283 | 284 | 285 | # Build results 286 | 287 | [Dd]ebug/ 288 | 289 | [Dd]ebugPublic/ 290 | 291 | [Rr]elease/ 292 | 293 | [Rr]eleases/ 294 | 295 | x64/ 296 | 297 | x86/ 298 | 299 | bld/ 300 | 301 | [Bb]in/ 302 | 303 | [Oo]bj/ 304 | 305 | [Ll]og/ 306 | 307 | 308 | 309 | # Visual Studio 2015 cache/options directory 310 | 311 | .vs/ 312 | .vscode/ 313 | 314 | # Uncomment if you have tasks that create the project's static files in wwwroot 315 | 316 | #wwwroot/ 317 | 318 | 319 | 320 | # MSTest test Results 321 | 322 | [Tt]est[Rr]esult*/ 323 | 324 | [Bb]uild[Ll]og.* 325 | 326 | 327 | 328 | # NUNIT 329 | 330 | *.VisualState.xml 331 | 332 | TestResult.xml 333 | 334 | 335 | 336 | # Build Results of an ATL Project 337 | 338 | [Dd]ebugPS/ 339 | 340 | [Rr]eleasePS/ 341 | 342 | dlldata.c 343 | 344 | 345 | 346 | # .NET Core 347 | 348 | project.lock.json 349 | 350 | project.fragment.lock.json 351 | 352 | artifacts/ 353 | 354 | **/Properties/launchSettings.json 355 | 356 | 357 | 358 | *_i.c 359 | 360 | *_p.c 361 | 362 | *_i.h 363 | 364 | *.ilk 365 | 366 | *.meta 367 | 368 | *.obj 369 | 370 | *.pch 371 | 372 | *.pdb 373 | 374 | *.pgc 375 | 376 | *.pgd 377 | 378 | *.rsp 379 | 380 | *.sbr 381 | 382 | *.tlb 383 | 384 | *.tli 385 | 386 | *.tlh 387 | 388 | *.tmp 389 | 390 | *.tmp_proj 391 | 392 | *.log 393 | 394 | *.vspscc 395 | 396 | *.vssscc 397 | 398 | .builds 399 | 400 | *.pidb 401 | 402 | *.svclog 403 | 404 | *.scc 405 | 406 | 407 | 408 | # Chutzpah Test files 409 | 410 | _Chutzpah* 411 | 412 | 413 | 414 | # Visual C++ cache files 415 | 416 | ipch/ 417 | 418 | *.aps 419 | 420 | *.ncb 421 | 422 | *.opendb 423 | 424 | 
*.opensdf 425 | 426 | *.sdf 427 | 428 | *.cachefile 429 | 430 | *.VC.db 431 | 432 | *.VC.VC.opendb 433 | 434 | 435 | 436 | # Visual Studio profiler 437 | 438 | *.psess 439 | 440 | *.vsp 441 | 442 | *.vspx 443 | 444 | *.sap 445 | 446 | 447 | 448 | # TFS 2012 Local Workspace 449 | 450 | $tf/ 451 | 452 | 453 | 454 | # Guidance Automation Toolkit 455 | 456 | *.gpState 457 | 458 | 459 | 460 | # ReSharper is a .NET coding add-in 461 | 462 | _ReSharper*/ 463 | 464 | *.[Rr]e[Ss]harper 465 | 466 | *.DotSettings.user 467 | 468 | 469 | 470 | # JustCode is a .NET coding add-in 471 | 472 | .JustCode 473 | 474 | 475 | 476 | # TeamCity is a build add-in 477 | 478 | _TeamCity* 479 | 480 | 481 | 482 | # DotCover is a Code Coverage Tool 483 | 484 | *.dotCover 485 | 486 | 487 | 488 | # Visual Studio code coverage results 489 | 490 | *.coverage 491 | 492 | *.coveragexml 493 | 494 | 495 | 496 | # NCrunch 497 | 498 | _NCrunch_* 499 | 500 | .*crunch*.local.xml 501 | 502 | nCrunchTemp_* 503 | 504 | 505 | 506 | # MightyMoose 507 | 508 | *.mm.* 509 | 510 | AutoTest.Net/ 511 | 512 | 513 | 514 | # Web workbench (sass) 515 | 516 | .sass-cache/ 517 | 518 | 519 | 520 | # Installshield output folder 521 | 522 | [Ee]xpress/ 523 | 524 | 525 | 526 | # DocProject is a documentation generator add-in 527 | 528 | DocProject/buildhelp/ 529 | 530 | DocProject/Help/*.HxT 531 | 532 | DocProject/Help/*.HxC 533 | 534 | DocProject/Help/*.hhc 535 | 536 | DocProject/Help/*.hhk 537 | 538 | DocProject/Help/*.hhp 539 | 540 | DocProject/Help/Html2 541 | 542 | DocProject/Help/html 543 | 544 | 545 | 546 | # Click-Once directory 547 | 548 | publish/ 549 | 550 | 551 | 552 | # Publish Web Output 553 | 554 | *.[Pp]ublish.xml 555 | 556 | *.azurePubxml 557 | 558 | # TODO: Comment the next line if you want to checkin your web deploy settings 559 | 560 | # but database connection strings (with potential passwords) will be unencrypted 561 | 562 | *.pubxml 563 | 564 | *.publishproj 565 | 566 | 567 | 568 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 569 | 570 | # checkin your Azure Web App publish settings, but sensitive information contained 571 | 572 | # in these scripts will be unencrypted 573 | 574 | PublishScripts/ 575 | 576 | 577 | 578 | # NuGet Packages 579 | 580 | *.nupkg 581 | 582 | # The packages folder can be ignored because of Package Restore 583 | 584 | **/packages/* 585 | 586 | # except build/, which is used as an MSBuild target. 
587 | 588 | !**/packages/build/ 589 | 590 | # Uncomment if necessary however generally it will be regenerated when needed 591 | 592 | #!**/packages/repositories.config 593 | 594 | # NuGet v3's project.json files produces more ignoreable files 595 | 596 | *.nuget.props 597 | 598 | *.nuget.targets 599 | 600 | 601 | 602 | # Microsoft Azure Build Output 603 | 604 | csx/ 605 | 606 | *.build.csdef 607 | 608 | 609 | 610 | # Microsoft Azure Emulator 611 | 612 | ecf/ 613 | 614 | rcf/ 615 | 616 | 617 | 618 | # Windows Store app package directories and files 619 | 620 | AppPackages/ 621 | 622 | BundleArtifacts/ 623 | 624 | Package.StoreAssociation.xml 625 | 626 | _pkginfo.txt 627 | 628 | 629 | 630 | # Visual Studio cache files 631 | 632 | # files ending in .cache can be ignored 633 | 634 | *.[Cc]ache 635 | 636 | # but keep track of directories ending in .cache 637 | 638 | !*.[Cc]ache/ 639 | 640 | 641 | 642 | # Others 643 | 644 | ClientBin/ 645 | 646 | ~$* 647 | 648 | *~ 649 | 650 | *.dbmdl 651 | 652 | *.dbproj.schemaview 653 | 654 | *.jfm 655 | 656 | *.pfx 657 | 658 | *.publishsettings 659 | 660 | orleans.codegen.cs 661 | 662 | 663 | 664 | # Since there are multiple workflows, uncomment next line to ignore bower_components 665 | 666 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 667 | 668 | #bower_components/ 669 | 670 | 671 | 672 | # RIA/Silverlight projects 673 | 674 | Generated_Code/ 675 | 676 | 677 | 678 | # Backup & report files from converting an old project file 679 | 680 | # to a newer Visual Studio version. Backup files are not needed, 681 | 682 | # because we have git ;-) 683 | 684 | _UpgradeReport_Files/ 685 | 686 | Backup*/ 687 | 688 | UpgradeLog*.XML 689 | 690 | UpgradeLog*.htm 691 | 692 | 693 | 694 | # SQL Server files 695 | 696 | *.mdf 697 | 698 | *.ldf 699 | 700 | 701 | 702 | # Business Intelligence projects 703 | 704 | *.rdl.data 705 | 706 | *.bim.layout 707 | 708 | *.bim_*.settings 709 | 710 | 711 | 712 | # Microsoft Fakes 713 | 714 | FakesAssemblies/ 715 | 716 | 717 | 718 | # GhostDoc plugin setting file 719 | 720 | *.GhostDoc.xml 721 | 722 | 723 | 724 | # Node.js Tools for Visual Studio 725 | 726 | .ntvs_analysis.dat 727 | 728 | node_modules/ 729 | 730 | 731 | 732 | # Typescript v1 declaration files 733 | 734 | typings/ 735 | 736 | 737 | 738 | # Visual Studio 6 build log 739 | 740 | *.plg 741 | 742 | 743 | 744 | # Visual Studio 6 workspace options file 745 | 746 | *.opt 747 | 748 | 749 | 750 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
*.vbw

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# JetBrains Rider
.idea/
*.sln.iml

# CodeRush
.cr/

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# User defined
## 3rdParty

## Debug Files
test/Debug/*
Debug/
test/*.vcxproj
test/*.vcxproj.filters
packages.config
*.sln
*.vcxproj
*.vcxproj.filters

## Misc
cmake-build-debug/
cmake-build-release/
cmake-build*/
CMakeCache.txt
CMakeFiles/
.idea/
.vs/
.vscode/
examples/input
Makefile
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "3rdparty/ranges-v3"]
	path = 3rdparty/ranges-v3
	url = https://github.com/ericniebler/range-v3.git
[submodule "3rdparty/abseil"]
	path = 3rdparty/abseil
	url = https://github.com/abseil/abseil-cpp
[submodule "3rdparty/concurrentqueue"]
	path = 3rdparty/concurrentqueue
	url = https://github.com/cameron314/concurrentqueue.git
[submodule "3rdparty/parallel-hashmap"]
	path = 3rdparty/parallel-hashmap
	url = https://github.com/greg7mdp/parallel-hashmap.git
[submodule "3rdparty/fmt"]
	path = 3rdparty/fmt
	url = https://github.com/fmtlib/fmt.git
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: cpp

compiler:
  - gcc
  - clang
os:
  - linux

env:
  - CXX=g++

addons:
  apt:
    sources:
      - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main'
        key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
      - sourceline: 'ppa:ubuntu-toolchain-r/test'
    packages:
      - clang-9
      - cmake
      - gcc-9
      - g++-9
      - ninja-build

install:
  - if [ "$CXX" = "g++" ]; then export CXX="g++-9" CC="gcc-9"; fi

before_script:
  - mkdir -p build && cd build
  - cmake ..
  - cmake --build .
  - cd ..

script:
  - cd build && cmake --build .
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.0)
project(Sparkpp)

set(CMAKE_CXX_STANDARD 17)
# note: -lboost_serialization is a linker input, not a compile flag; Boost is
# linked below via target_link_libraries, so the warning flags go through
# add_compile_options instead of add_definitions.
add_compile_options(
        -Wall -Wextra
        -fconcepts
        -Wno-missing-field-initializers
        -Wno-comment)
set(CMAKE_CXX_FLAGS_RELEASE
        "${CMAKE_CXX_FLAGS_RELEASE} -march=native -mtune=native -fwhole-program -flto")

# for profiler, since gcc use -Wl,--as-needed by default
# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-no-as-needed")

if (CMAKE_BUILD_TYPE STREQUAL "Release")
    add_compile_options(-march=native -mtune=native -fwhole-program -flto)
endif()

# configure_file()

# execute_process(COMMAND capnp compile -oc++
#     "${CMAKE_CURRENT_LIST_DIR}/capnp/data.capnp"
#     "--src-prefix=")
# set(CAPNP_INCLUDE_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/3rdparty/capnp)
# CAPNP_GENERATE_CPP(CapnpSources CapnpHeaders capnp/data.capnp)
# message("${CapnpSources}")
set(CapnpSources "${PROJECT_SOURCE_DIR}/capnph/data.capnp.c++")
find_package(CapnProto CONFIG REQUIRED)

set(Boost_USE_MULTITHREADED ON)
find_package(Boost REQUIRED COMPONENTS system thread serialization)
add_library(boost::serialization INTERFACE IMPORTED)
include_directories(${Boost_INCLUDE_DIRS})

find_package(fmt REQUIRED)

# for openmp
# find_package(OpenMP REQUIRED)

# for mpich
# find_package(MPI REQUIRED)
# include_directories(SYSTEM ${MPI_INCLUDE_PATH})

include_directories(include)
include_directories(capnph)
include_directories(3rdparty)
include_directories(3rdparty/concurrentqueue)
include_directories(3rdparty/parallel-hashmap)

file(GLOB_RECURSE HEADERS
        "${PROJECT_SOURCE_DIR}/include/*.hpp"
)
file(GLOB_RECURSE SRCS
        "${PROJECT_SOURCE_DIR}/src/*.cpp"
)
add_executable(Sparkpp ${HEADERS} ${SRCS} ${CapnpSources})

target_include_directories(Sparkpp PRIVATE include)
target_include_directories(Sparkpp PRIVATE capnph)
target_include_directories(Sparkpp PRIVATE 3rdparty)
target_include_directories(Sparkpp PRIVATE 3rdparty/concurrentqueue)
target_include_directories(Sparkpp PRIVATE 3rdparty/parallel-hashmap)

target_link_libraries(Sparkpp stdc++fs)
target_link_libraries(Sparkpp boost::serialization)
target_link_libraries(Sparkpp ${Boost_LIBRARIES})
target_link_libraries(Sparkpp CapnProto::capnp)
target_link_libraries(Sparkpp fmt::fmt)

# for tcmalloc
target_link_libraries(Sparkpp tcmalloc)

# for profiler
# target_link_libraries(Sparkpp profiler)

# for openmp
# target_link_libraries(Sparkpp OpenMP::OpenMP_CXX)

# for mpi
# target_link_libraries(Sparkpp MPI::MPI_C)
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
# Sparkpp

A na(t)ive proof-of-concept implementation of Apache Spark in C++.

Compiled & tested under gcc-9.2.1, boost-1.71, cmake-3.15.
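For a quick feel of the API, here is a condensed sketch adapted from [examples/basics.cpp](./examples/basics.cpp) in this repository (the addresses are placeholders for your own master/slave hosts):

```c++
#include <iostream>
#include "spark_env.hpp"
#include "spark_context.hpp"

SparkEnv env;

int main(int argc, char** argv) {
    addr_t masterAddr = make_pair("127.0.0.1", 25544);   // placeholder master
    vector<addr_t> slaveAddrs = {{"127.0.0.1", 24457}};  // placeholder slave(s)
    env.init(argc, argv, masterAddr);
    auto sc = SparkContext{argc, argv, masterAddr, slaveAddrs};
    vector<int> values = {1, 2, 3, 4};
    auto rdd = sc.parallelize(values, 2);                // 2 partitions
    auto doubled = rdd.map([](int x) { return x * 2; });
    for (int x : doubled.collect())                      // pull back to driver
        std::cout << x << ' ';
    return 0;
}
```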
6 | 7 | Inspired by rust Spark implementation [native_spark](https://github.com/rajasekarv/native_spark) and based on [Spark-0.5](https://github.com/apache/spark/tree/branch-0.5). 8 | 9 | ## Example 10 | 11 | Check [examples](./examples) 12 | 13 | ## Prerequisites 14 | 15 | Check [bin/prepare.sh](./bin/prepare.sh) 16 | 17 | * Boost 18 | * - [Serialization](https://github.com/boostorg/serialization) 19 | * - [Asio](https://github.com/boostorg/asio) 20 | * - [Beast](https://github.com/boostorg/beast) 21 | * [Cap'n Proto](https://github.com/capnproto/capnproto) 22 | * [fmt](https://github.com/fmtlib/fmt) 23 | * [gperftools](https://github.com/gperftools/gperftools) 24 | * - tcmalloc 25 | * - google-gprof 26 | * [concurrentqueue](https://github.com/cameron314/concurrentqueue) 27 | * [phmap](https://github.com/greg7mdp/parallel-hashmap) 28 | 29 | ## Installation 30 | 31 | ```shell script 32 | # install 33 | ./bin/prepare.sh # root 34 | ./bin/check.sh # check installation version 35 | # env 36 | export SPARK_LOCAL_IP= 37 | export CPUPROFILE= # if google-gprof enabled 38 | export CPUPROFILESIGNAL= # if google-gprof enabled 39 | # master 40 | ./bin/start_master.sh 41 | # slave 42 | ./bin/start_slave.sh 43 | ``` 44 | 45 | ## TODOs 46 | 47 | - [ ] More precise concept control 48 | - [ ] Async network support (-fcoroutines, boost::asio::io_service::async_accept), replacing raw socket + thread_pool 49 | - [ ] Compare single boost::serialization without Cap'n Proto (& with boost flags, like no_headers) 50 | - [ ] Add config (master/slave addr/port) file support 51 | - [ ] new version of Spark optimizations: ShuffleWriter 52 | - [ ] See other TODOs in files 53 | 54 | ## Random Thoughts 55 | 56 | Check [miscs/discussion.md](./miscs/discussions.md) and [miscs/report.pdf](./miscs/report.pdf) 57 | 58 | ## Reference 59 | 60 | * [Spark: Cluster Computing with Working Sets](https://www.usenix.org/legacy/event/hotcloud10/tech/full_papers/Zaharia.pdf) 61 | * [Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf) 62 | * [Spark-0.5](https://github.com/apache/spark/tree/branch-0.5) 63 | * [native_spark](https://github.com/rajasekarv/native_spark) -------------------------------------------------------------------------------- /capnph/data.capnp: -------------------------------------------------------------------------------- 1 | @0xad095e939d603738; 2 | 3 | struct Execution { 4 | isShuffle @0: Bool; 5 | partitionId @1: UInt32; 6 | rdd @2: Data; 7 | funcOrDep @3: Data; 8 | } 9 | 10 | struct Result { 11 | msg @0: Data; 12 | } 13 | 14 | struct Message { 15 | msg @0: Data; 16 | } -------------------------------------------------------------------------------- /capnph/data.capnp.c++: -------------------------------------------------------------------------------- 1 | // Generated by Cap'n Proto compiler, DO NOT EDIT 2 | // source: data.capnp 3 | 4 | #include "data.capnp.h" 5 | 6 | namespace capnp { 7 | namespace schemas { 8 | static const ::capnp::_::AlignedData<81> b_86ebb1259307df55 = { 9 | { 0, 0, 0, 0, 5, 0, 6, 0, 10 | 85, 223, 7, 147, 37, 177, 235, 134, 11 | 18, 0, 0, 0, 1, 0, 1, 0, 12 | 56, 55, 96, 157, 147, 94, 9, 173, 13 | 2, 0, 7, 0, 0, 0, 0, 0, 14 | 0, 0, 0, 0, 0, 0, 0, 0, 15 | 21, 0, 0, 0, 226, 0, 0, 0, 16 | 33, 0, 0, 0, 7, 0, 0, 0, 17 | 0, 0, 0, 0, 0, 0, 0, 0, 18 | 29, 0, 0, 0, 231, 0, 0, 0, 19 | 0, 0, 0, 0, 0, 0, 0, 0, 20 | 0, 0, 0, 0, 0, 0, 0, 0, 21 | 99, 97, 112, 110, 112, 104, 47, 100, 22 | 
97, 116, 97, 46, 99, 97, 112, 110, 23 | 112, 58, 69, 120, 101, 99, 117, 116, 24 | 105, 111, 110, 0, 0, 0, 0, 0, 25 | 0, 0, 0, 0, 1, 0, 1, 0, 26 | 16, 0, 0, 0, 3, 0, 4, 0, 27 | 0, 0, 0, 0, 0, 0, 0, 0, 28 | 0, 0, 1, 0, 0, 0, 0, 0, 29 | 0, 0, 0, 0, 0, 0, 0, 0, 30 | 97, 0, 0, 0, 82, 0, 0, 0, 31 | 0, 0, 0, 0, 0, 0, 0, 0, 32 | 96, 0, 0, 0, 3, 0, 1, 0, 33 | 108, 0, 0, 0, 2, 0, 1, 0, 34 | 1, 0, 0, 0, 1, 0, 0, 0, 35 | 0, 0, 1, 0, 1, 0, 0, 0, 36 | 0, 0, 0, 0, 0, 0, 0, 0, 37 | 105, 0, 0, 0, 98, 0, 0, 0, 38 | 0, 0, 0, 0, 0, 0, 0, 0, 39 | 104, 0, 0, 0, 3, 0, 1, 0, 40 | 116, 0, 0, 0, 2, 0, 1, 0, 41 | 2, 0, 0, 0, 0, 0, 0, 0, 42 | 0, 0, 1, 0, 2, 0, 0, 0, 43 | 0, 0, 0, 0, 0, 0, 0, 0, 44 | 113, 0, 0, 0, 34, 0, 0, 0, 45 | 0, 0, 0, 0, 0, 0, 0, 0, 46 | 108, 0, 0, 0, 3, 0, 1, 0, 47 | 120, 0, 0, 0, 2, 0, 1, 0, 48 | 3, 0, 0, 0, 1, 0, 0, 0, 49 | 0, 0, 1, 0, 3, 0, 0, 0, 50 | 0, 0, 0, 0, 0, 0, 0, 0, 51 | 117, 0, 0, 0, 82, 0, 0, 0, 52 | 0, 0, 0, 0, 0, 0, 0, 0, 53 | 116, 0, 0, 0, 3, 0, 1, 0, 54 | 128, 0, 0, 0, 2, 0, 1, 0, 55 | 105, 115, 83, 104, 117, 102, 102, 108, 56 | 101, 0, 0, 0, 0, 0, 0, 0, 57 | 1, 0, 0, 0, 0, 0, 0, 0, 58 | 0, 0, 0, 0, 0, 0, 0, 0, 59 | 0, 0, 0, 0, 0, 0, 0, 0, 60 | 0, 0, 0, 0, 0, 0, 0, 0, 61 | 1, 0, 0, 0, 0, 0, 0, 0, 62 | 0, 0, 0, 0, 0, 0, 0, 0, 63 | 0, 0, 0, 0, 0, 0, 0, 0, 64 | 112, 97, 114, 116, 105, 116, 105, 111, 65 | 110, 73, 100, 0, 0, 0, 0, 0, 66 | 8, 0, 0, 0, 0, 0, 0, 0, 67 | 0, 0, 0, 0, 0, 0, 0, 0, 68 | 0, 0, 0, 0, 0, 0, 0, 0, 69 | 0, 0, 0, 0, 0, 0, 0, 0, 70 | 8, 0, 0, 0, 0, 0, 0, 0, 71 | 0, 0, 0, 0, 0, 0, 0, 0, 72 | 0, 0, 0, 0, 0, 0, 0, 0, 73 | 114, 100, 100, 0, 0, 0, 0, 0, 74 | 13, 0, 0, 0, 0, 0, 0, 0, 75 | 0, 0, 0, 0, 0, 0, 0, 0, 76 | 0, 0, 0, 0, 0, 0, 0, 0, 77 | 0, 0, 0, 0, 0, 0, 0, 0, 78 | 13, 0, 0, 0, 0, 0, 0, 0, 79 | 0, 0, 0, 0, 0, 0, 0, 0, 80 | 0, 0, 0, 0, 0, 0, 0, 0, 81 | 102, 117, 110, 99, 79, 114, 68, 101, 82 | 112, 0, 0, 0, 0, 0, 0, 0, 83 | 13, 0, 0, 0, 0, 0, 0, 0, 84 | 0, 0, 0, 0, 0, 0, 0, 0, 85 | 0, 0, 0, 0, 0, 0, 0, 0, 86 | 0, 0, 0, 0, 0, 0, 0, 0, 87 | 13, 0, 0, 0, 0, 0, 0, 0, 88 | 0, 0, 0, 0, 0, 0, 0, 0, 89 | 0, 0, 0, 0, 0, 0, 0, 0, } 90 | }; 91 | ::capnp::word const* const bp_86ebb1259307df55 = b_86ebb1259307df55.words; 92 | #if !CAPNP_LITE 93 | static const uint16_t m_86ebb1259307df55[] = {3, 0, 1, 2}; 94 | static const uint16_t i_86ebb1259307df55[] = {0, 1, 2, 3}; 95 | const ::capnp::_::RawSchema s_86ebb1259307df55 = { 96 | 0x86ebb1259307df55, b_86ebb1259307df55.words, 81, nullptr, m_86ebb1259307df55, 97 | 0, 4, i_86ebb1259307df55, nullptr, nullptr, { &s_86ebb1259307df55, nullptr, nullptr, 0, 0, nullptr } 98 | }; 99 | #endif // !CAPNP_LITE 100 | static const ::capnp::_::AlignedData<33> b_ea9f732119b10178 = { 101 | { 0, 0, 0, 0, 5, 0, 6, 0, 102 | 120, 1, 177, 25, 33, 115, 159, 234, 103 | 18, 0, 0, 0, 1, 0, 0, 0, 104 | 56, 55, 96, 157, 147, 94, 9, 173, 105 | 1, 0, 7, 0, 0, 0, 0, 0, 106 | 0, 0, 0, 0, 0, 0, 0, 0, 107 | 21, 0, 0, 0, 202, 0, 0, 0, 108 | 33, 0, 0, 0, 7, 0, 0, 0, 109 | 0, 0, 0, 0, 0, 0, 0, 0, 110 | 29, 0, 0, 0, 63, 0, 0, 0, 111 | 0, 0, 0, 0, 0, 0, 0, 0, 112 | 0, 0, 0, 0, 0, 0, 0, 0, 113 | 99, 97, 112, 110, 112, 104, 47, 100, 114 | 97, 116, 97, 46, 99, 97, 112, 110, 115 | 112, 58, 82, 101, 115, 117, 108, 116, 116 | 0, 0, 0, 0, 0, 0, 0, 0, 117 | 0, 0, 0, 0, 1, 0, 1, 0, 118 | 4, 0, 0, 0, 3, 0, 4, 0, 119 | 0, 0, 0, 0, 0, 0, 0, 0, 120 | 0, 0, 1, 0, 0, 0, 0, 0, 121 | 0, 0, 0, 0, 0, 0, 0, 0, 122 | 13, 0, 0, 0, 34, 0, 0, 0, 123 | 0, 0, 0, 0, 0, 0, 0, 0, 124 | 8, 0, 0, 0, 3, 0, 1, 0, 125 | 20, 0, 0, 0, 2, 0, 1, 0, 126 | 109, 115, 103, 0, 0, 0, 0, 0, 127 | 13, 0, 0, 0, 
0, 0, 0, 0, 128 | 0, 0, 0, 0, 0, 0, 0, 0, 129 | 0, 0, 0, 0, 0, 0, 0, 0, 130 | 0, 0, 0, 0, 0, 0, 0, 0, 131 | 13, 0, 0, 0, 0, 0, 0, 0, 132 | 0, 0, 0, 0, 0, 0, 0, 0, 133 | 0, 0, 0, 0, 0, 0, 0, 0, } 134 | }; 135 | ::capnp::word const* const bp_ea9f732119b10178 = b_ea9f732119b10178.words; 136 | #if !CAPNP_LITE 137 | static const uint16_t m_ea9f732119b10178[] = {0}; 138 | static const uint16_t i_ea9f732119b10178[] = {0}; 139 | const ::capnp::_::RawSchema s_ea9f732119b10178 = { 140 | 0xea9f732119b10178, b_ea9f732119b10178.words, 33, nullptr, m_ea9f732119b10178, 141 | 0, 1, i_ea9f732119b10178, nullptr, nullptr, { &s_ea9f732119b10178, nullptr, nullptr, 0, 0, nullptr } 142 | }; 143 | #endif // !CAPNP_LITE 144 | static const ::capnp::_::AlignedData<33> b_ab8efc637371947e = { 145 | { 0, 0, 0, 0, 5, 0, 6, 0, 146 | 126, 148, 113, 115, 99, 252, 142, 171, 147 | 18, 0, 0, 0, 1, 0, 0, 0, 148 | 56, 55, 96, 157, 147, 94, 9, 173, 149 | 1, 0, 7, 0, 0, 0, 0, 0, 150 | 0, 0, 0, 0, 0, 0, 0, 0, 151 | 21, 0, 0, 0, 210, 0, 0, 0, 152 | 33, 0, 0, 0, 7, 0, 0, 0, 153 | 0, 0, 0, 0, 0, 0, 0, 0, 154 | 29, 0, 0, 0, 63, 0, 0, 0, 155 | 0, 0, 0, 0, 0, 0, 0, 0, 156 | 0, 0, 0, 0, 0, 0, 0, 0, 157 | 99, 97, 112, 110, 112, 104, 47, 100, 158 | 97, 116, 97, 46, 99, 97, 112, 110, 159 | 112, 58, 77, 101, 115, 115, 97, 103, 160 | 101, 0, 0, 0, 0, 0, 0, 0, 161 | 0, 0, 0, 0, 1, 0, 1, 0, 162 | 4, 0, 0, 0, 3, 0, 4, 0, 163 | 0, 0, 0, 0, 0, 0, 0, 0, 164 | 0, 0, 1, 0, 0, 0, 0, 0, 165 | 0, 0, 0, 0, 0, 0, 0, 0, 166 | 13, 0, 0, 0, 34, 0, 0, 0, 167 | 0, 0, 0, 0, 0, 0, 0, 0, 168 | 8, 0, 0, 0, 3, 0, 1, 0, 169 | 20, 0, 0, 0, 2, 0, 1, 0, 170 | 109, 115, 103, 0, 0, 0, 0, 0, 171 | 13, 0, 0, 0, 0, 0, 0, 0, 172 | 0, 0, 0, 0, 0, 0, 0, 0, 173 | 0, 0, 0, 0, 0, 0, 0, 0, 174 | 0, 0, 0, 0, 0, 0, 0, 0, 175 | 13, 0, 0, 0, 0, 0, 0, 0, 176 | 0, 0, 0, 0, 0, 0, 0, 0, 177 | 0, 0, 0, 0, 0, 0, 0, 0, } 178 | }; 179 | ::capnp::word const* const bp_ab8efc637371947e = b_ab8efc637371947e.words; 180 | #if !CAPNP_LITE 181 | static const uint16_t m_ab8efc637371947e[] = {0}; 182 | static const uint16_t i_ab8efc637371947e[] = {0}; 183 | const ::capnp::_::RawSchema s_ab8efc637371947e = { 184 | 0xab8efc637371947e, b_ab8efc637371947e.words, 33, nullptr, m_ab8efc637371947e, 185 | 0, 1, i_ab8efc637371947e, nullptr, nullptr, { &s_ab8efc637371947e, nullptr, nullptr, 0, 0, nullptr } 186 | }; 187 | #endif // !CAPNP_LITE 188 | } // namespace schemas 189 | } // namespace capnp 190 | 191 | // ======================================================================================= 192 | 193 | 194 | // Execution 195 | constexpr uint16_t Execution::_capnpPrivate::dataWordSize; 196 | constexpr uint16_t Execution::_capnpPrivate::pointerCount; 197 | #if !CAPNP_LITE 198 | constexpr ::capnp::Kind Execution::_capnpPrivate::kind; 199 | constexpr ::capnp::_::RawSchema const* Execution::_capnpPrivate::schema; 200 | #endif // !CAPNP_LITE 201 | 202 | // Result 203 | constexpr uint16_t Result::_capnpPrivate::dataWordSize; 204 | constexpr uint16_t Result::_capnpPrivate::pointerCount; 205 | #if !CAPNP_LITE 206 | constexpr ::capnp::Kind Result::_capnpPrivate::kind; 207 | constexpr ::capnp::_::RawSchema const* Result::_capnpPrivate::schema; 208 | #endif // !CAPNP_LITE 209 | 210 | // Message 211 | constexpr uint16_t Message::_capnpPrivate::dataWordSize; 212 | constexpr uint16_t Message::_capnpPrivate::pointerCount; 213 | #if !CAPNP_LITE 214 | constexpr ::capnp::Kind Message::_capnpPrivate::kind; 215 | constexpr ::capnp::_::RawSchema const* Message::_capnpPrivate::schema; 216 | #endif // !CAPNP_LITE 217 | 218 | 219 
| 220 | -------------------------------------------------------------------------------- /capnph/data.capnp.h: -------------------------------------------------------------------------------- 1 | // Generated by Cap'n Proto compiler, DO NOT EDIT 2 | // source: data.capnp 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #if CAPNP_VERSION != 8000 10 | #error "Version mismatch between generated code and library headers. You must use the same version of the Cap'n Proto compiler and library." 11 | #endif 12 | 13 | 14 | namespace capnp { 15 | namespace schemas { 16 | 17 | CAPNP_DECLARE_SCHEMA(86ebb1259307df55); 18 | CAPNP_DECLARE_SCHEMA(ea9f732119b10178); 19 | CAPNP_DECLARE_SCHEMA(ab8efc637371947e); 20 | 21 | } // namespace schemas 22 | } // namespace capnp 23 | 24 | 25 | struct Execution { 26 | Execution() = delete; 27 | 28 | class Reader; 29 | class Builder; 30 | class Pipeline; 31 | 32 | struct _capnpPrivate { 33 | CAPNP_DECLARE_STRUCT_HEADER(86ebb1259307df55, 1, 2) 34 | #if !CAPNP_LITE 35 | static constexpr ::capnp::_::RawBrandedSchema const* brand() { return &schema->defaultBrand; } 36 | #endif // !CAPNP_LITE 37 | }; 38 | }; 39 | 40 | struct Result { 41 | Result() = delete; 42 | 43 | class Reader; 44 | class Builder; 45 | class Pipeline; 46 | 47 | struct _capnpPrivate { 48 | CAPNP_DECLARE_STRUCT_HEADER(ea9f732119b10178, 0, 1) 49 | #if !CAPNP_LITE 50 | static constexpr ::capnp::_::RawBrandedSchema const* brand() { return &schema->defaultBrand; } 51 | #endif // !CAPNP_LITE 52 | }; 53 | }; 54 | 55 | struct Message { 56 | Message() = delete; 57 | 58 | class Reader; 59 | class Builder; 60 | class Pipeline; 61 | 62 | struct _capnpPrivate { 63 | CAPNP_DECLARE_STRUCT_HEADER(ab8efc637371947e, 0, 1) 64 | #if !CAPNP_LITE 65 | static constexpr ::capnp::_::RawBrandedSchema const* brand() { return &schema->defaultBrand; } 66 | #endif // !CAPNP_LITE 67 | }; 68 | }; 69 | 70 | // ======================================================================================= 71 | 72 | class Execution::Reader { 73 | public: 74 | typedef Execution Reads; 75 | 76 | Reader() = default; 77 | inline explicit Reader(::capnp::_::StructReader base): _reader(base) {} 78 | 79 | inline ::capnp::MessageSize totalSize() const { 80 | return _reader.totalSize().asPublic(); 81 | } 82 | 83 | #if !CAPNP_LITE 84 | inline ::kj::StringTree toString() const { 85 | return ::capnp::_::structString(_reader, *_capnpPrivate::brand()); 86 | } 87 | #endif // !CAPNP_LITE 88 | 89 | inline bool getIsShuffle() const; 90 | 91 | inline ::uint32_t getPartitionId() const; 92 | 93 | inline bool hasRdd() const; 94 | inline ::capnp::Data::Reader getRdd() const; 95 | 96 | inline bool hasFuncOrDep() const; 97 | inline ::capnp::Data::Reader getFuncOrDep() const; 98 | 99 | private: 100 | ::capnp::_::StructReader _reader; 101 | template 102 | friend struct ::capnp::ToDynamic_; 103 | template 104 | friend struct ::capnp::_::PointerHelpers; 105 | template 106 | friend struct ::capnp::List; 107 | friend class ::capnp::MessageBuilder; 108 | friend class ::capnp::Orphanage; 109 | }; 110 | 111 | class Execution::Builder { 112 | public: 113 | typedef Execution Builds; 114 | 115 | Builder() = delete; // Deleted to discourage incorrect usage. 116 | // You can explicitly initialize to nullptr instead. 
117 | inline Builder(decltype(nullptr)) {} 118 | inline explicit Builder(::capnp::_::StructBuilder base): _builder(base) {} 119 | inline operator Reader() const { return Reader(_builder.asReader()); } 120 | inline Reader asReader() const { return *this; } 121 | 122 | inline ::capnp::MessageSize totalSize() const { return asReader().totalSize(); } 123 | #if !CAPNP_LITE 124 | inline ::kj::StringTree toString() const { return asReader().toString(); } 125 | #endif // !CAPNP_LITE 126 | 127 | inline bool getIsShuffle(); 128 | inline void setIsShuffle(bool value); 129 | 130 | inline ::uint32_t getPartitionId(); 131 | inline void setPartitionId( ::uint32_t value); 132 | 133 | inline bool hasRdd(); 134 | inline ::capnp::Data::Builder getRdd(); 135 | inline void setRdd( ::capnp::Data::Reader value); 136 | inline ::capnp::Data::Builder initRdd(unsigned int size); 137 | inline void adoptRdd(::capnp::Orphan< ::capnp::Data>&& value); 138 | inline ::capnp::Orphan< ::capnp::Data> disownRdd(); 139 | 140 | inline bool hasFuncOrDep(); 141 | inline ::capnp::Data::Builder getFuncOrDep(); 142 | inline void setFuncOrDep( ::capnp::Data::Reader value); 143 | inline ::capnp::Data::Builder initFuncOrDep(unsigned int size); 144 | inline void adoptFuncOrDep(::capnp::Orphan< ::capnp::Data>&& value); 145 | inline ::capnp::Orphan< ::capnp::Data> disownFuncOrDep(); 146 | 147 | private: 148 | ::capnp::_::StructBuilder _builder; 149 | template 150 | friend struct ::capnp::ToDynamic_; 151 | friend class ::capnp::Orphanage; 152 | template 153 | friend struct ::capnp::_::PointerHelpers; 154 | }; 155 | 156 | #if !CAPNP_LITE 157 | class Execution::Pipeline { 158 | public: 159 | typedef Execution Pipelines; 160 | 161 | inline Pipeline(decltype(nullptr)): _typeless(nullptr) {} 162 | inline explicit Pipeline(::capnp::AnyPointer::Pipeline&& typeless) 163 | : _typeless(kj::mv(typeless)) {} 164 | 165 | private: 166 | ::capnp::AnyPointer::Pipeline _typeless; 167 | friend class ::capnp::PipelineHook; 168 | template 169 | friend struct ::capnp::ToDynamic_; 170 | }; 171 | #endif // !CAPNP_LITE 172 | 173 | class Result::Reader { 174 | public: 175 | typedef Result Reads; 176 | 177 | Reader() = default; 178 | inline explicit Reader(::capnp::_::StructReader base): _reader(base) {} 179 | 180 | inline ::capnp::MessageSize totalSize() const { 181 | return _reader.totalSize().asPublic(); 182 | } 183 | 184 | #if !CAPNP_LITE 185 | inline ::kj::StringTree toString() const { 186 | return ::capnp::_::structString(_reader, *_capnpPrivate::brand()); 187 | } 188 | #endif // !CAPNP_LITE 189 | 190 | inline bool hasMsg() const; 191 | inline ::capnp::Data::Reader getMsg() const; 192 | 193 | private: 194 | ::capnp::_::StructReader _reader; 195 | template 196 | friend struct ::capnp::ToDynamic_; 197 | template 198 | friend struct ::capnp::_::PointerHelpers; 199 | template 200 | friend struct ::capnp::List; 201 | friend class ::capnp::MessageBuilder; 202 | friend class ::capnp::Orphanage; 203 | }; 204 | 205 | class Result::Builder { 206 | public: 207 | typedef Result Builds; 208 | 209 | Builder() = delete; // Deleted to discourage incorrect usage. 210 | // You can explicitly initialize to nullptr instead. 
211 | inline Builder(decltype(nullptr)) {} 212 | inline explicit Builder(::capnp::_::StructBuilder base): _builder(base) {} 213 | inline operator Reader() const { return Reader(_builder.asReader()); } 214 | inline Reader asReader() const { return *this; } 215 | 216 | inline ::capnp::MessageSize totalSize() const { return asReader().totalSize(); } 217 | #if !CAPNP_LITE 218 | inline ::kj::StringTree toString() const { return asReader().toString(); } 219 | #endif // !CAPNP_LITE 220 | 221 | inline bool hasMsg(); 222 | inline ::capnp::Data::Builder getMsg(); 223 | inline void setMsg( ::capnp::Data::Reader value); 224 | inline ::capnp::Data::Builder initMsg(unsigned int size); 225 | inline void adoptMsg(::capnp::Orphan< ::capnp::Data>&& value); 226 | inline ::capnp::Orphan< ::capnp::Data> disownMsg(); 227 | 228 | private: 229 | ::capnp::_::StructBuilder _builder; 230 | template 231 | friend struct ::capnp::ToDynamic_; 232 | friend class ::capnp::Orphanage; 233 | template 234 | friend struct ::capnp::_::PointerHelpers; 235 | }; 236 | 237 | #if !CAPNP_LITE 238 | class Result::Pipeline { 239 | public: 240 | typedef Result Pipelines; 241 | 242 | inline Pipeline(decltype(nullptr)): _typeless(nullptr) {} 243 | inline explicit Pipeline(::capnp::AnyPointer::Pipeline&& typeless) 244 | : _typeless(kj::mv(typeless)) {} 245 | 246 | private: 247 | ::capnp::AnyPointer::Pipeline _typeless; 248 | friend class ::capnp::PipelineHook; 249 | template 250 | friend struct ::capnp::ToDynamic_; 251 | }; 252 | #endif // !CAPNP_LITE 253 | 254 | class Message::Reader { 255 | public: 256 | typedef Message Reads; 257 | 258 | Reader() = default; 259 | inline explicit Reader(::capnp::_::StructReader base): _reader(base) {} 260 | 261 | inline ::capnp::MessageSize totalSize() const { 262 | return _reader.totalSize().asPublic(); 263 | } 264 | 265 | #if !CAPNP_LITE 266 | inline ::kj::StringTree toString() const { 267 | return ::capnp::_::structString(_reader, *_capnpPrivate::brand()); 268 | } 269 | #endif // !CAPNP_LITE 270 | 271 | inline bool hasMsg() const; 272 | inline ::capnp::Data::Reader getMsg() const; 273 | 274 | private: 275 | ::capnp::_::StructReader _reader; 276 | template 277 | friend struct ::capnp::ToDynamic_; 278 | template 279 | friend struct ::capnp::_::PointerHelpers; 280 | template 281 | friend struct ::capnp::List; 282 | friend class ::capnp::MessageBuilder; 283 | friend class ::capnp::Orphanage; 284 | }; 285 | 286 | class Message::Builder { 287 | public: 288 | typedef Message Builds; 289 | 290 | Builder() = delete; // Deleted to discourage incorrect usage. 291 | // You can explicitly initialize to nullptr instead. 
292 | inline Builder(decltype(nullptr)) {} 293 | inline explicit Builder(::capnp::_::StructBuilder base): _builder(base) {} 294 | inline operator Reader() const { return Reader(_builder.asReader()); } 295 | inline Reader asReader() const { return *this; } 296 | 297 | inline ::capnp::MessageSize totalSize() const { return asReader().totalSize(); } 298 | #if !CAPNP_LITE 299 | inline ::kj::StringTree toString() const { return asReader().toString(); } 300 | #endif // !CAPNP_LITE 301 | 302 | inline bool hasMsg(); 303 | inline ::capnp::Data::Builder getMsg(); 304 | inline void setMsg( ::capnp::Data::Reader value); 305 | inline ::capnp::Data::Builder initMsg(unsigned int size); 306 | inline void adoptMsg(::capnp::Orphan< ::capnp::Data>&& value); 307 | inline ::capnp::Orphan< ::capnp::Data> disownMsg(); 308 | 309 | private: 310 | ::capnp::_::StructBuilder _builder; 311 | template 312 | friend struct ::capnp::ToDynamic_; 313 | friend class ::capnp::Orphanage; 314 | template 315 | friend struct ::capnp::_::PointerHelpers; 316 | }; 317 | 318 | #if !CAPNP_LITE 319 | class Message::Pipeline { 320 | public: 321 | typedef Message Pipelines; 322 | 323 | inline Pipeline(decltype(nullptr)): _typeless(nullptr) {} 324 | inline explicit Pipeline(::capnp::AnyPointer::Pipeline&& typeless) 325 | : _typeless(kj::mv(typeless)) {} 326 | 327 | private: 328 | ::capnp::AnyPointer::Pipeline _typeless; 329 | friend class ::capnp::PipelineHook; 330 | template 331 | friend struct ::capnp::ToDynamic_; 332 | }; 333 | #endif // !CAPNP_LITE 334 | 335 | // ======================================================================================= 336 | 337 | inline bool Execution::Reader::getIsShuffle() const { 338 | return _reader.getDataField( 339 | ::capnp::bounded<0>() * ::capnp::ELEMENTS); 340 | } 341 | 342 | inline bool Execution::Builder::getIsShuffle() { 343 | return _builder.getDataField( 344 | ::capnp::bounded<0>() * ::capnp::ELEMENTS); 345 | } 346 | inline void Execution::Builder::setIsShuffle(bool value) { 347 | _builder.setDataField( 348 | ::capnp::bounded<0>() * ::capnp::ELEMENTS, value); 349 | } 350 | 351 | inline ::uint32_t Execution::Reader::getPartitionId() const { 352 | return _reader.getDataField< ::uint32_t>( 353 | ::capnp::bounded<1>() * ::capnp::ELEMENTS); 354 | } 355 | 356 | inline ::uint32_t Execution::Builder::getPartitionId() { 357 | return _builder.getDataField< ::uint32_t>( 358 | ::capnp::bounded<1>() * ::capnp::ELEMENTS); 359 | } 360 | inline void Execution::Builder::setPartitionId( ::uint32_t value) { 361 | _builder.setDataField< ::uint32_t>( 362 | ::capnp::bounded<1>() * ::capnp::ELEMENTS, value); 363 | } 364 | 365 | inline bool Execution::Reader::hasRdd() const { 366 | return !_reader.getPointerField( 367 | ::capnp::bounded<0>() * ::capnp::POINTERS).isNull(); 368 | } 369 | inline bool Execution::Builder::hasRdd() { 370 | return !_builder.getPointerField( 371 | ::capnp::bounded<0>() * ::capnp::POINTERS).isNull(); 372 | } 373 | inline ::capnp::Data::Reader Execution::Reader::getRdd() const { 374 | return ::capnp::_::PointerHelpers< ::capnp::Data>::get(_reader.getPointerField( 375 | ::capnp::bounded<0>() * ::capnp::POINTERS)); 376 | } 377 | inline ::capnp::Data::Builder Execution::Builder::getRdd() { 378 | return ::capnp::_::PointerHelpers< ::capnp::Data>::get(_builder.getPointerField( 379 | ::capnp::bounded<0>() * ::capnp::POINTERS)); 380 | } 381 | inline void Execution::Builder::setRdd( ::capnp::Data::Reader value) { 382 | ::capnp::_::PointerHelpers< ::capnp::Data>::set(_builder.getPointerField( 383 
| ::capnp::bounded<0>() * ::capnp::POINTERS), value); 384 | } 385 | inline ::capnp::Data::Builder Execution::Builder::initRdd(unsigned int size) { 386 | return ::capnp::_::PointerHelpers< ::capnp::Data>::init(_builder.getPointerField( 387 | ::capnp::bounded<0>() * ::capnp::POINTERS), size); 388 | } 389 | inline void Execution::Builder::adoptRdd( 390 | ::capnp::Orphan< ::capnp::Data>&& value) { 391 | ::capnp::_::PointerHelpers< ::capnp::Data>::adopt(_builder.getPointerField( 392 | ::capnp::bounded<0>() * ::capnp::POINTERS), kj::mv(value)); 393 | } 394 | inline ::capnp::Orphan< ::capnp::Data> Execution::Builder::disownRdd() { 395 | return ::capnp::_::PointerHelpers< ::capnp::Data>::disown(_builder.getPointerField( 396 | ::capnp::bounded<0>() * ::capnp::POINTERS)); 397 | } 398 | 399 | inline bool Execution::Reader::hasFuncOrDep() const { 400 | return !_reader.getPointerField( 401 | ::capnp::bounded<1>() * ::capnp::POINTERS).isNull(); 402 | } 403 | inline bool Execution::Builder::hasFuncOrDep() { 404 | return !_builder.getPointerField( 405 | ::capnp::bounded<1>() * ::capnp::POINTERS).isNull(); 406 | } 407 | inline ::capnp::Data::Reader Execution::Reader::getFuncOrDep() const { 408 | return ::capnp::_::PointerHelpers< ::capnp::Data>::get(_reader.getPointerField( 409 | ::capnp::bounded<1>() * ::capnp::POINTERS)); 410 | } 411 | inline ::capnp::Data::Builder Execution::Builder::getFuncOrDep() { 412 | return ::capnp::_::PointerHelpers< ::capnp::Data>::get(_builder.getPointerField( 413 | ::capnp::bounded<1>() * ::capnp::POINTERS)); 414 | } 415 | inline void Execution::Builder::setFuncOrDep( ::capnp::Data::Reader value) { 416 | ::capnp::_::PointerHelpers< ::capnp::Data>::set(_builder.getPointerField( 417 | ::capnp::bounded<1>() * ::capnp::POINTERS), value); 418 | } 419 | inline ::capnp::Data::Builder Execution::Builder::initFuncOrDep(unsigned int size) { 420 | return ::capnp::_::PointerHelpers< ::capnp::Data>::init(_builder.getPointerField( 421 | ::capnp::bounded<1>() * ::capnp::POINTERS), size); 422 | } 423 | inline void Execution::Builder::adoptFuncOrDep( 424 | ::capnp::Orphan< ::capnp::Data>&& value) { 425 | ::capnp::_::PointerHelpers< ::capnp::Data>::adopt(_builder.getPointerField( 426 | ::capnp::bounded<1>() * ::capnp::POINTERS), kj::mv(value)); 427 | } 428 | inline ::capnp::Orphan< ::capnp::Data> Execution::Builder::disownFuncOrDep() { 429 | return ::capnp::_::PointerHelpers< ::capnp::Data>::disown(_builder.getPointerField( 430 | ::capnp::bounded<1>() * ::capnp::POINTERS)); 431 | } 432 | 433 | inline bool Result::Reader::hasMsg() const { 434 | return !_reader.getPointerField( 435 | ::capnp::bounded<0>() * ::capnp::POINTERS).isNull(); 436 | } 437 | inline bool Result::Builder::hasMsg() { 438 | return !_builder.getPointerField( 439 | ::capnp::bounded<0>() * ::capnp::POINTERS).isNull(); 440 | } 441 | inline ::capnp::Data::Reader Result::Reader::getMsg() const { 442 | return ::capnp::_::PointerHelpers< ::capnp::Data>::get(_reader.getPointerField( 443 | ::capnp::bounded<0>() * ::capnp::POINTERS)); 444 | } 445 | inline ::capnp::Data::Builder Result::Builder::getMsg() { 446 | return ::capnp::_::PointerHelpers< ::capnp::Data>::get(_builder.getPointerField( 447 | ::capnp::bounded<0>() * ::capnp::POINTERS)); 448 | } 449 | inline void Result::Builder::setMsg( ::capnp::Data::Reader value) { 450 | ::capnp::_::PointerHelpers< ::capnp::Data>::set(_builder.getPointerField( 451 | ::capnp::bounded<0>() * ::capnp::POINTERS), value); 452 | } 453 | inline ::capnp::Data::Builder Result::Builder::initMsg(unsigned int 
size) {
  return ::capnp::_::PointerHelpers< ::capnp::Data>::init(_builder.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS), size);
}
inline void Result::Builder::adoptMsg(
    ::capnp::Orphan< ::capnp::Data>&& value) {
  ::capnp::_::PointerHelpers< ::capnp::Data>::adopt(_builder.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS), kj::mv(value));
}
inline ::capnp::Orphan< ::capnp::Data> Result::Builder::disownMsg() {
  return ::capnp::_::PointerHelpers< ::capnp::Data>::disown(_builder.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS));
}

inline bool Message::Reader::hasMsg() const {
  return !_reader.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS).isNull();
}
inline bool Message::Builder::hasMsg() {
  return !_builder.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS).isNull();
}
inline ::capnp::Data::Reader Message::Reader::getMsg() const {
  return ::capnp::_::PointerHelpers< ::capnp::Data>::get(_reader.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS));
}
inline ::capnp::Data::Builder Message::Builder::getMsg() {
  return ::capnp::_::PointerHelpers< ::capnp::Data>::get(_builder.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS));
}
inline void Message::Builder::setMsg( ::capnp::Data::Reader value) {
  ::capnp::_::PointerHelpers< ::capnp::Data>::set(_builder.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS), value);
}
inline ::capnp::Data::Builder Message::Builder::initMsg(unsigned int size) {
  return ::capnp::_::PointerHelpers< ::capnp::Data>::init(_builder.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS), size);
}
inline void Message::Builder::adoptMsg(
    ::capnp::Orphan< ::capnp::Data>&& value) {
  ::capnp::_::PointerHelpers< ::capnp::Data>::adopt(_builder.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS), kj::mv(value));
}
inline ::capnp::Orphan< ::capnp::Data> Message::Builder::disownMsg() {
  return ::capnp::_::PointerHelpers< ::capnp::Data>::disown(_builder.getPointerField(
      ::capnp::bounded<0>() * ::capnp::POINTERS));
}
--------------------------------------------------------------------------------
/examples/basics.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include "spark_env.hpp"
#include "spark_context.hpp"

SparkEnv env;

int main(int argc, char** argv) {
    addr_t masterAddr = make_pair("18.188.215.139", 25544);
    vector<addr_t> slaveAddrs = {
        {"18.218.54.64", 24457},
        {"3.17.81.214", 24457}
    };
    env.init(argc, argv, masterAddr);
    auto sc = SparkContext{argc, argv, masterAddr, slaveAddrs};
    vector<int> values = {1, 2, 3, 4, 5, 6, 7};
    auto rdd = sc.parallelize(values, 3);
    auto rdd2 = rdd.map([](int x) {
        return x + 1;
    });
    auto rdd3 = rdd2.map([](int x) {
        return x - 1;
    });
    auto rdd4 = rdd3.mapPair([](int x) {
        return make_pair(x % 2, x);
    });
    // (1, 3, 5, 7) | (2, 4, 6)
    auto rdd5 = rdd4.groupByKey(2);
    // 16 | 12
    auto rdd7 = rdd5.map([](pair<int, vector<int>> x) -> int {
        int acc = 0;
        for (auto i : x.second) {
            acc += i;
        }
        return acc;
    });
    auto v = rdd7.collect();
    // 12, 16 or 16, 12
    std::cout << v[0] << v[1] << '\n';
    return 0;
}
--------------------------------------------------------------------------------
/examples/large_word_count.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <fstream>
#include <numeric>
#include <chrono>
#include "spark_env.hpp"
#include "spark_context.hpp"
#include <fmt/format.h>
#include <boost/algorithm/string.hpp>

using namespace std::chrono;

SparkEnv env;

int main(int argc, char** argv) {
    addr_t masterAddr = make_pair("18.188.215.139", 25544);
    vector<addr_t> slaveAddrs = {
        {"18.218.54.64", 24457},
        {"3.17.81.214", 24457}
    };
    env.init(argc, argv, masterAddr);
    auto sc = SparkContext{argc, argv, masterAddr, slaveAddrs};
    vector<size_t> files(40);
    std::iota(files.begin(), files.end(), 0);

    auto t_begin = steady_clock::now();

    // refer miscs/word_count.scala
    auto rdd = sc.parallelize(files, files.size());
    auto rdd2 = rdd.flatMap([](size_t v) noexcept {
        auto path = fmt::format("/home/ubuntu/Sparkpp/examples/input/input_{}", v);
        std::ifstream ifs{path};
        vector<vector<string>> words;
        for (string line; getline(ifs, line);) {
            vector<string> w;
            boost::algorithm::split(w, move(line), [](const char c) {
                return c == ' ';
            });
            words.push_back(move(w));
        }
        return flatten(words);
    });
    auto rdd3 = rdd2.mapPair([](string&& w) noexcept -> pair<string, int> {
        return make_pair(move(w), 1);
    });
    auto rdd4 = rdd3.reduceByKey([](int a, int b) noexcept {
        return a + b;
    }, 8);
    auto result = rdd4.collect();

    auto t_end = steady_clock::now();

    std::cout << result.size() << '\n';
    std::cout << "Elapsed time in milliseconds: "
              << duration_cast<milliseconds>(t_end - t_begin).count() << " ms\n";
    return 0;
}
--------------------------------------------------------------------------------
/examples/morte_carlo_pi.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <numeric>
#include <chrono>
#include "spark_env.hpp"
#include "spark_context.hpp"

using namespace std::chrono;

SparkEnv env;

int main(int argc, char** argv) {
    addr_t masterAddr = make_pair("18.188.215.139", 25544);
    vector<addr_t> slaveAddrs = {
        {"18.218.54.64", 24457},
        {"3.17.81.214", 24457}
    };
    env.init(argc, argv, masterAddr);
    auto sc = SparkContext{argc, argv, masterAddr, slaveAddrs};

    constexpr long long chunks = 1e6;
    constexpr long long chunkSize = 1e4;
    vector<long long> values(chunks);
    std::iota(values.begin(), values.end(), 0ll);

    auto t_begin = steady_clock::now();

    // refer miscs/morte_carlo_pi.scala
    auto rdd = sc.parallelize(values, 4);
    auto random = rdd.map([](long long n) noexcept {
        unsigned long long count = 0;
        for (auto i = 0; i < chunkSize; ++i) {
            n = (n * 998244353ll + 19260817ll) % 134456;
            double x = n / 67228.0 - 1;
            n = (n * 998244353ll + 19260817ll) % 134456;
            double y = n / 67228.0 - 1;
            if (x * x + y * y < 1) {
                ++count;
            }
        }
        return count;
    });

    auto cnt = random.reduce([](unsigned long long n, unsigned long long m) {
        return n + m;
    });

    auto t_end = steady_clock::now();

    std::cout << "Pi = " << (4.0 * cnt / chunks / chunkSize) << '\n';
    std::cout << "Elapsed time in milliseconds: "
              << duration_cast<milliseconds>(t_end - t_begin).count() << " ms\n";
    return 0;
}
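// Why the final estimate is 4.0 * cnt / chunks / chunkSize: each sample is a
// point drawn (pseudo-)uniformly from the square [-1, 1)^2, and the unit disc
// covers pi/4 of that square's area, so pi ~= 4 * hits / samples with
// samples = chunks * chunkSize. The hand-rolled LCG keeps the map closure a
// plain deterministic function, at the cost of statistical quality.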
--------------------------------------------------------------------------------
/examples/morte_carlo_pi_omp.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <numeric>
#include <chrono>
#include <random>
#include "spark_env.hpp"
#include "spark_context.hpp"

using namespace std::chrono;

SparkEnv env;

int main(int argc, char** argv) {
    addr_t masterAddr = make_pair("18.188.215.139", 25544);
    vector<addr_t> slaveAddrs = {
        {"18.218.54.64", 24457},
        {"3.17.81.214", 24457}
    };
    env.init(argc, argv, masterAddr);
    auto sc = SparkContext{argc, argv, masterAddr, slaveAddrs};

    constexpr long long chunks = 8;
    constexpr long long chunkSize = 1e9;
    vector<long long> values(chunks);
    std::iota(values.begin(), values.end(), 0ll);

    auto t_begin = steady_clock::now();

    // insufficient use of slaves (2 cores, 4 threads)
    auto rdd = sc.parallelize(values, 2);
    auto random = rdd.map([](long long) noexcept {
        unsigned long long count = 0;
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution dis{-1.0, 1.0};
        // but we can start OpenMP tasks per slave (by FIFO dispatching)
        #pragma omp parallel for reduction(+:count) default(none) private(dis, gen)
        for (auto i = 0; i < chunkSize; ++i) {
            double x = dis(gen);
            double y = dis(gen);
            if (x * x + y * y < 1) {
                ++count;
            }
        }
        return count;
    });

    auto cnt = random.reduce([](unsigned long long n, unsigned long long m) {
        return n + m;
    });

    auto t_end = steady_clock::now();

    std::cout << "Pi = " << (4.0 * cnt / chunks / chunkSize) << '\n';
    std::cout << "Elapsed time in milliseconds: "
              << duration_cast<milliseconds>(t_end - t_begin).count() << " ms\n";
    return 0;
}
--------------------------------------------------------------------------------
/examples/mpi_ping_pong_failed.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <chrono>
#include "spark_env.hpp"
#include "spark_context.hpp"
#include <mpi.h>

using namespace std::chrono;

SparkEnv env;

int main(int argc, char** argv) {
    addr_t masterAddr = make_pair("18.188.215.139", 25544);
    vector<addr_t> slaveAddrs = {
        {"18.218.54.64", 24457},
        {"3.17.81.214", 24457}
    };
    env.init(argc, argv, masterAddr);
    auto sc = SparkContext{argc, argv, masterAddr, slaveAddrs};

    auto t_begin = steady_clock::now();

    auto rdd = sc.parallelize(slaveAddrs, 2);

    // failed because there is no globally shared MPI_COMM_WORLD
    // need a process-managing wrapper, like Torque?
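    // (Each mapped task calls MPI_Init on its own, so every process comes up
    // as a singleton with world_size == 1; rank 0 then tries to ping a
    // nonexistent rank 1. Ranks only see each other when a single launcher,
    // e.g. mpirun or a resource manager, starts all processes under one
    // shared MPI_COMM_WORLD.)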
    // refer MPI tutorials ping-pong
    auto mpi = rdd.map([](addr_t) {
        const int limit = 10;
        MPI_Init(NULL, NULL);
        int world_rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
        int world_size;
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
        std::cout << world_rank << ' ' << world_size << '\n';
        int count = 0;
        int partner_rank = (world_rank + 1) % 2;
        while (count < limit) {
            if (world_rank == count % 2) {
                ++count;
                MPI_Send(&count, 1, MPI_INT, partner_rank, 0, MPI_COMM_WORLD);
                std::cout << world_rank << " sent & incremented count "
                          << count << " -> " << partner_rank << '\n';
            } else {
                MPI_Recv(&count, 1, MPI_INT, partner_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                std::cout << world_rank << " received count "
                          << count << " <- " << partner_rank << '\n';
            }
        }
        MPI_Finalize();
        return count;
    });
    auto start = mpi.collect();

    auto t_end = steady_clock::now();

    std::cout << "Collect: " << start[0] << ' ' << start[1] << '\n';
    std::cout << "Elapsed time in milliseconds: "
              << duration_cast<milliseconds>(t_end - t_begin).count() << " ms\n";
    return 0;
}
--------------------------------------------------------------------------------
/examples/nest_flat_map.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <numeric>
#include <chrono>
#include "spark_env.hpp"
#include "spark_context.hpp"

using namespace std::chrono;

SparkEnv env;

int main(int argc, char** argv) {
    addr_t masterAddr = make_pair("18.188.215.139", 25544);
    vector<addr_t> slaveAddrs = {
        {"18.218.54.64", 24457},
        {"3.17.81.214", 24457}
    };
    env.init(argc, argv, masterAddr);
    auto sc = SparkContext{argc, argv, masterAddr, slaveAddrs};
    vector<int> values(4);
    std::iota(values.begin(), values.end(), 0);

    auto t_begin = steady_clock::now();
    auto rdd = sc.parallelize(values, 8);
    // 0, 1, 1, 2, 2, 3, 3, 4
    auto rdd2 = rdd.flatMap([](int v) {
        return vector<int>{v, v + 1};
    });
    // 0, 2, 2, 4, 4, 6, 6, 8, 0, 1, 1, 2, 2, 3, 3, 4
    auto rdd3 = rdd2.flatMap([](int v) {
        return vector<int>{v, v * 2};
    });
    auto rdd4 = rdd3.mapPair([](int v) {
        return make_pair(v % 2, v);
    });
    auto rdd5 = rdd4.reduceByKey([](int a, int b) {
        return a + b;
    }, 2);
    // 8, 40
    auto result = rdd5.collect();
    auto t_end = steady_clock::now();

    for (auto i : result) {
        std::cout << i.first << ' ' << i.second << '\n';
    }
    std::cout << result.size() << '\n';
    std::cout << "Elapsed time in milliseconds: "
              << duration_cast<milliseconds>(t_end - t_begin).count() << " ms\n";
    return 0;
}
--------------------------------------------------------------------------------
/examples/word_count.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <chrono>
#include "spark_env.hpp"
#include "spark_context.hpp"
#include <string>
#include <boost/algorithm/string.hpp>

using namespace std::chrono;

SparkEnv env;

int main(int argc, char** argv) {
    addr_t masterAddr = make_pair("18.188.215.139", 25544);
    vector<addr_t> slaveAddrs = {
        {"18.218.54.64", 24457},
        {"3.17.81.214", 24457}
    };
    env.init(argc, argv, masterAddr);
    auto sc = SparkContext{argc, argv, masterAddr, slaveAddrs};
    // https://cts.instructure.com/courses/172185/pages/word-count-examples
    vector<string> lines = {
"and of I friend great spark master follower", 23 | "of I friend great spark master follower and", 24 | "I friend great spark master follower and of", 25 | "friend great spark master follower and of I", 26 | "great spark master follower and of I friend", 27 | "spark master follower and of I friend great", 28 | "master follower and of I friend great spark", 29 | "follower and of I friend great spark master", 30 | "and of I friend great spark master follower", 31 | "spark master follower and of I friend great", 32 | "master follower and of I friend great spark", 33 | "follower and of I friend great spark master", 34 | "and of I friend great spark master follower", 35 | "spark master follower and of I friend great", 36 | "master follower and of I friend great spark", 37 | "follower and of I friend great spark master", 38 | "and of I friend great spark master follower", 39 | "spark master follower and of I friend great", 40 | "master follower and of I friend great spark", 41 | "follower and of I friend great spark master", 42 | "and of I friend great spark master follower", 43 | "spark master follower and of I friend great", 44 | "master follower and of I friend great spark", 45 | "follower and of I friend great spark master", 46 | "and of I friend great spark master follower", 47 | "spark master follower and of I friend great", 48 | "master follower and of I friend great spark", 49 | "follower and of I friend great spark master", 50 | "and of I friend great spark master follower", 51 | "spark master follower and of I friend great", 52 | "master follower and of I friend great spark", 53 | "follower and of I friend great spark master", 54 | "and of I friend great spark master follower", 55 | "spark master follower and of I friend great", 56 | "master follower and of I friend great spark", 57 | "follower and of I friend great spark master", 58 | "and of I friend great spark master follower", 59 | "spark master follower and of I friend great", 60 | "master follower and of I friend great spark", 61 | "follower and of I friend great spark master", 62 | "and of I friend great spark master follower", 63 | "spark master follower and of I friend great", 64 | "master follower and of I friend great spark", 65 | "follower and of I friend great spark master", 66 | "and of I friend great spark master follower" 67 | }; 68 | auto t_begin = steady_clock::now(); 69 | auto rdd = sc.parallelize(lines, lines.size()); 70 | auto rdd2 = rdd.flatMap([](string&& s) { 71 | vector v; 72 | boost::algorithm::split(v, move(s), boost::algorithm::is_space()); 73 | return v; 74 | }); 75 | rdd2.cache(); 76 | auto rdd3 = rdd2.mapPair([](string&& w) -> pair { 77 | return make_pair(move(w), 1); 78 | }); 79 | auto rdd4 = rdd3.reduceByKey([](int a, int b) { 80 | return a + b; 81 | }, 2); 82 | auto result = rdd4.collect(); 83 | auto t_end = steady_clock::now(); 84 | for (auto& [k, v] : result) { 85 | std::cout << k << ": " << v << '\n'; 86 | } 87 | std::cout << "Elapsed time in milliseconds: " 88 | << duration_cast(t_end - t_begin).count() << " ms\n"; 89 | return 0; 90 | } -------------------------------------------------------------------------------- /include/aggregator.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/7/2019. 
 3 | //
 4 | 
 5 | #ifndef SPARKPP_AGGREGATOR_HPP
 6 | #define SPARKPP_AGGREGATOR_HPP
 7 | 
 8 | #include "common.hpp"
 9 | #include "serialize_wrapper.hpp"
10 | 
11 | struct AggregatorBase {
12 |     virtual void* createCombiner() = 0;
13 |     virtual void* mergeValue() = 0;
14 |     virtual void* mergeCombiners() = 0;
15 |     virtual void serialize_dyn(vector<char>& bytes) const = 0;
16 |     virtual void deserialize_dyn(const char*&, size_t&) = 0;
17 | };
18 | 
19 | 
20 | template <typename K, typename V, typename C>
21 | struct Aggregator : AggregatorBase {
22 |     /// force function pointer
23 |     using createCombiner_t = C(*)(V);
24 |     using mergeValue_t = C(*)(C, V);
25 |     using mergeCombiners_t = C(*)(C, C);
26 |     /// V -> C
27 |     createCombiner_t f_createCombiner;
28 |     /// C, V -> C
29 |     mergeValue_t f_mergeValue;
30 |     /// C, C -> C
31 |     mergeCombiners_t f_mergeCombiners;
32 | 
33 |     Aggregator(createCombiner_t cc, mergeValue_t mv, mergeCombiners_t mc)
34 |         : f_createCombiner{cc}, f_mergeValue{mv}, f_mergeCombiners{mc} {}
35 | 
36 |     void* createCombiner() override {
37 |         return (void*)f_createCombiner;
38 |     }
39 |     void* mergeValue() override {
40 |         return (void*)f_mergeValue;
41 |     }
42 |     void* mergeCombiners() override {
43 |         return (void*)f_mergeCombiners;
44 |     }
45 | 
46 |     void serialize_dyn(vector<char>& bytes) const override {
47 |         size_t oldSize = bytes.size();
48 |         bytes.resize(oldSize + sizeof(Aggregator));
49 |         memcpy(bytes.data() + oldSize, reinterpret_cast<const char*>(this), sizeof(Aggregator));
50 |     }
51 |     void deserialize_dyn(const char*& bytes, size_t& size) override {
52 |         // plain copy: the object bytes are already in place, just advance the cursor
53 |         bytes += sizeof(Aggregator);
54 |         size -= sizeof(Aggregator);
55 |     }
56 | };
57 | 
58 | #endif //SPARKPP_AGGREGATOR_HPP
59 | 
--------------------------------------------------------------------------------
/include/cache.hpp:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by xiaol on 11/13/2019.
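// Cache entries are keyed by ((keySpaceId, rddId), partitionId), so several
// subsystems can share one BoundedMemoryCache without key collisions.
// Usage sketch (names illustrative):
//   BoundedMemoryCache cache;
//   KeySpace ks = cache.newKeySpace();
//   ks.put(/*rddId*/ 0, /*partition*/ 1, Storage{move(bytes)});
//   auto hit = ks.get(0, 1);   // boost::optional<Storage>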
3 | // 4 | 5 | #ifndef SPARKPP_CACHE_HPP 6 | #define SPARKPP_CACHE_HPP 7 | 8 | #include "common.hpp" 9 | #include "serialize_wrapper.hpp" 10 | 11 | struct KeySpace; 12 | 13 | // HACK: Currently unbounded 14 | struct BoundedMemoryCache { 15 | atomic nextKeySpaceId = 0; 16 | size_t maxBytes = 2000; 17 | mutex lock; 18 | size_t currentBytes = 0; 19 | // keyspaceId, rddId 20 | using datasetId_t = pair; 21 | // datasetId, partitionId 22 | using key_t = pair; 23 | using value_t = Storage; 24 | unordered_map map; 25 | 26 | KeySpace newKeySpace(); 27 | optional get(key_t key) { 28 | lock_guard lk{lock}; 29 | if (map.find(key) != map.end()) { 30 | return {map[key]}; 31 | } else { 32 | return {}; 33 | } 34 | } 35 | optional put(key_t key, Storage&& value) { 36 | // size in MBs 37 | size_t size = (value.v.size() + 2) / 128 / 1024; 38 | if (size > maxBytes) { 39 | return {}; 40 | } 41 | lock_guard lk{lock}; 42 | map.emplace(move(key), move(value)); 43 | return {size}; 44 | } 45 | }; 46 | 47 | struct KeySpace { 48 | BoundedMemoryCache& cache; 49 | size_t keySpaceId; 50 | optional get(size_t datasetId, size_t partition) { 51 | return cache.get(make_pair(make_pair(keySpaceId, datasetId), partition)); 52 | } 53 | optional put(size_t datasetId, size_t partition, Storage&& value) { 54 | return cache.put(make_pair(make_pair(keySpaceId, datasetId), partition), move(value)); 55 | } 56 | size_t getCapacity() { 57 | return cache.maxBytes; 58 | } 59 | }; 60 | 61 | 62 | #endif //SPARKPP_CACHE_HPP 63 | -------------------------------------------------------------------------------- /include/cache_tracker.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/13/2019. 3 | // 4 | 5 | #ifndef SPARKPP_CACHE_TRACKER_HPP 6 | #define SPARKPP_CACHE_TRACKER_HPP 7 | 8 | #include "common.hpp" 9 | #include "cache.hpp" 10 | #include "rdd/rdd.hpp" 11 | #include "serialize_wrapper.hpp" 12 | #include "serialize_capnp.hpp" 13 | #include 14 | 15 | using namespace boost::asio; 16 | 17 | struct CacheTrackerMessage { 18 | struct AddedToCache { 19 | int rddId; 20 | int partition; 21 | host_t host; 22 | size_t size; 23 | SN_BOOST_SERIALIZE_MEMBERS_IN(rddId, partition, host, size); 24 | }; 25 | struct DroppedFromCache { 26 | int rddId; 27 | int partition; 28 | host_t host; 29 | size_t size; 30 | SN_BOOST_SERIALIZE_MEMBERS_IN(rddId, partition, host, size); 31 | }; 32 | struct MemoryCacheLost { 33 | host_t host; 34 | SN_BOOST_SERIALIZE_MEMBERS_IN(host); 35 | }; 36 | struct RegisterRDD { 37 | int rddId; 38 | int numPartitions; 39 | SN_BOOST_SERIALIZE_MEMBERS_IN(rddId, numPartitions); 40 | }; 41 | struct SlaveCacheStarted { 42 | host_t host; 43 | size_t size; 44 | SN_BOOST_SERIALIZE_MEMBERS_IN(host, size); 45 | }; 46 | struct GetCacheStatus { 47 | SN_BOOST_SERIALIZE_EMPTY(); 48 | }; 49 | struct GetCacheLocations { 50 | SN_BOOST_SERIALIZE_EMPTY(); 51 | }; 52 | struct StopCacheTracker { 53 | SN_BOOST_SERIALIZE_EMPTY(); 54 | }; 55 | variant vmember; 63 | auto& get() { 64 | return vmember; 65 | } 66 | const auto& get() const { 67 | return vmember; 68 | } 69 | }; 70 | BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerMessage::AddedToCache); 71 | BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerMessage::DroppedFromCache); 72 | BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerMessage::MemoryCacheLost); 73 | BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerMessage::RegisterRDD); 74 | BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerMessage::SlaveCacheStarted); 75 | 
BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerMessage::GetCacheStatus); 76 | BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerMessage::GetCacheLocations); 77 | BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerMessage::StopCacheTracker); 78 | 79 | 80 | struct CacheTrackerReply { 81 | struct CacheLocations { 82 | unordered_map>> locs; 83 | SN_BOOST_SERIALIZE_MEMBERS_IN(locs); 84 | }; 85 | struct CacheStatus { 86 | // host, capacity, usage 87 | vector> status; 88 | SN_BOOST_SERIALIZE_MEMBERS_IN(status); 89 | }; 90 | struct Ok { 91 | SN_BOOST_SERIALIZE_EMPTY(); 92 | }; 93 | variant vmember; 94 | variant& get() { 95 | return vmember; 96 | } 97 | const auto& get() const { 98 | return vmember; 99 | } 100 | }; 101 | BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerReply::CacheLocations); 102 | BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerReply::CacheStatus); 103 | BOOST_IS_BITWISE_SERIALIZABLE(CacheTrackerReply::Ok); 104 | 105 | 106 | struct CacheTracker { 107 | bool isMaster; 108 | // I hope there is something acts like Arc... 109 | shared_mutex locs_lck; 110 | unordered_map>> locs; 111 | shared_mutex slaveCapacity_lck; 112 | unordered_map slaveCapacity; 113 | shared_mutex slaveUsage_lck; 114 | unordered_map slaveUsage; 115 | shared_mutex registeredRddIds_lck; 116 | unordered_set registeredRddIds; 117 | std::condition_variable loading_cv; 118 | mutex loading_lck; 119 | unordered_set, pair_hash> loading; 120 | // per worker 121 | addr_t masterAddr; 122 | KeySpace cache; 123 | CacheTracker(bool isMaster_, addr_t masterAddr_, BoundedMemoryCache& cache_) 124 | : isMaster{isMaster_}, masterAddr{move(masterAddr_)}, 125 | cache{cache_.newKeySpace()} { 126 | masterAddr.second += 1; 127 | if (isMaster) { 128 | server(); 129 | } 130 | client(CacheTrackerMessage{ 131 | .vmember = { 132 | CacheTrackerMessage::SlaveCacheStarted{ 133 | .host = std::getenv("SPARK_LOCAL_IP"), 134 | .size = cache.getCapacity() 135 | } 136 | } 137 | }); 138 | } 139 | 140 | void server() { 141 | thread thd{ 142 | [this]() { 143 | io_service ioc; 144 | ip::tcp::endpoint endpoint{ip::tcp::v4(), masterAddr.second}; 145 | ip::tcp::acceptor acceptor{ioc, endpoint}; 146 | while (true) { 147 | ip::tcp::socket socket{ioc}; 148 | acceptor.accept(socket); 149 | thread per_conn{ 150 | [this, socket = move(socket)]() mutable { 151 | int fd = socket.native_handle(); 152 | ::capnp::PackedFdMessageReader xmessage{fd}; 153 | auto result = recvData(xmessage); 154 | auto message = CacheTrackerMessage{}; 155 | deserialize( 156 | message.get(), 157 | result 158 | ); 159 | auto reply = match( 160 | message.get(), 161 | [this](const CacheTrackerMessage::SlaveCacheStarted& msg) { 162 | unique_lock lk1{slaveCapacity_lck}; 163 | unique_lock lk2{slaveUsage_lck}; 164 | slaveCapacity[msg.host] = msg.size; 165 | return CacheTrackerReply{ 166 | .vmember = CacheTrackerReply::Ok{} 167 | }; 168 | }, 169 | [this](const CacheTrackerMessage::RegisterRDD& msg) { 170 | unique_lock lk{locs_lck}; 171 | locs[msg.rddId].resize(msg.numPartitions); 172 | return CacheTrackerReply{ 173 | .vmember = CacheTrackerReply::Ok{} 174 | }; 175 | }, 176 | [this](const CacheTrackerMessage::AddedToCache& msg) { 177 | if (msg.size > 0) { 178 | unique_lock lk{slaveUsage_lck}; 179 | slaveUsage[msg.host] += msg.size; 180 | } 181 | unique_lock lk{locs_lck}; 182 | if (locs.find(msg.rddId) != locs.end()) { 183 | auto& v = locs[msg.rddId]; 184 | if ((int)v.size() > msg.partition) { 185 | v[msg.partition].push_front(msg.host); 186 | } 187 | } 188 | return CacheTrackerReply{ 189 | .vmember = CacheTrackerReply::Ok{} 190 | }; 
191 | }, 192 | [this](const CacheTrackerMessage::DroppedFromCache& msg) { 193 | if (msg.size > 0) { 194 | unique_lock lk{slaveUsage_lck}; 195 | slaveUsage[msg.host] -= msg.size; 196 | } 197 | unique_lock lk{locs_lck}; 198 | if (locs.find(msg.rddId) != locs.end()) { 199 | auto& v = locs[msg.rddId]; 200 | if ((int)v.size() > msg.partition) { 201 | auto& l = v[msg.partition]; 202 | l.erase(remove_if(l.begin(), l.end(), [&](const host_t& h){ 203 | return h == msg.host; 204 | })); 205 | } 206 | } 207 | return CacheTrackerReply{ 208 | .vmember = CacheTrackerReply::Ok{} 209 | }; 210 | }, 211 | [this](const CacheTrackerMessage::GetCacheLocations&) { 212 | shared_lock lk{locs_lck}; 213 | return CacheTrackerReply{ 214 | .vmember = CacheTrackerReply::CacheLocations{locs} 215 | }; 216 | }, 217 | [this](const CacheTrackerMessage::GetCacheStatus&) { 218 | shared_lock lk1{slaveCapacity_lck}; 219 | shared_lock lk2{slaveUsage_lck}; 220 | vector> status; 221 | status.reserve(slaveCapacity.size()); 222 | for (auto& v : slaveCapacity) { 223 | status.push_back( 224 | make_tuple(v.first, v.second, slaveUsage[v.first]) 225 | ); 226 | } 227 | return CacheTrackerReply{ 228 | .vmember = CacheTrackerReply::CacheStatus{move(status)} 229 | }; 230 | } 231 | ); 232 | vector rbytes; 233 | serialize(reply.get(), rbytes); 234 | sendData(fd, rbytes); 235 | } 236 | }; 237 | per_conn.detach(); 238 | } 239 | } 240 | }; 241 | thd.detach(); 242 | } 243 | 244 | CacheTrackerReply client(const CacheTrackerMessage& message) { 245 | io_service ioc; 246 | ip::tcp::resolver resolver{ioc}; 247 | ip::tcp::resolver::query query{masterAddr.first, std::to_string(masterAddr.second), 248 | boost::asio::ip::resolver_query_base::numeric_service}; 249 | auto iter = resolver.resolve(query); 250 | ip::tcp::resolver::iterator end; 251 | ip::tcp::endpoint endpoint = *iter; 252 | ip::tcp::socket socket{ioc}; 253 | boost::system::error_code ec; 254 | do { 255 | auto start_iter = iter; 256 | ec.clear(); 257 | socket.close(); 258 | std::this_thread::sleep_for(5ms); 259 | while (start_iter != end) { 260 | socket.connect(endpoint, ec); 261 | if (!ec) break; 262 | ++start_iter; 263 | } 264 | } while (ec); 265 | int fd = socket.native_handle(); 266 | vector bytes; 267 | serialize(message.get(), bytes); 268 | sendData(fd, bytes); 269 | ::capnp::PackedFdMessageReader xmessage{fd}; 270 | auto reader = recvData(xmessage); 271 | CacheTrackerReply reply; 272 | deserialize(reply.get(), reinterpret_cast(reader.asBytes().begin()), reader.size()); 273 | return reply; 274 | } 275 | 276 | unordered_map>> getLocationsSnapshot() { 277 | auto reply = client(CacheTrackerMessage{ 278 | .vmember = CacheTrackerMessage::GetCacheLocations{} 279 | }); 280 | auto xlocs = match(reply.get(), [](const CacheTrackerReply::CacheLocations& msg){ 281 | return msg.locs; 282 | }); 283 | unordered_map>> res; 284 | for (auto& p : xlocs) { 285 | res[p.first].resize(p.second.size()); 286 | for (size_t i = 0; i < p.second.size(); ++i) { 287 | res[p.first][i] = vector{ 288 | std::make_move_iterator(p.second[i].begin()), 289 | std::make_move_iterator(p.second[i].end())}; 290 | } 291 | } 292 | return res; 293 | } 294 | 295 | auto getCacheStatus() { 296 | auto reply = client(CacheTrackerMessage{ 297 | .vmember = CacheTrackerMessage::GetCacheStatus{} 298 | }); 299 | return match(reply.get(), [](const CacheTrackerReply::CacheStatus& msg){ 300 | return msg.status; 301 | }); 302 | } 303 | 304 | void registerRDD(int rddId, int numPartitions) { 305 | { 306 | shared_lock lk{registeredRddIds_lck}; 307 | if 
(registeredRddIds.count(rddId)) { 308 | return; 309 | } 310 | } 311 | unique_lock lk{registeredRddIds_lck}; 312 | registeredRddIds.insert(rddId); 313 | client(CacheTrackerMessage{ 314 | .vmember = CacheTrackerMessage::RegisterRDD { 315 | .rddId = static_cast(rddId), 316 | .numPartitions = static_cast(numPartitions) 317 | } 318 | }); 319 | } 320 | 321 | // the rdd should be referenced, not holding owner 322 | template 323 | auto getOrCompute(RDD* rdd, unique_ptr split) -> unique_ptr> { 324 | auto key = make_pair(rdd->id(), split->index()); 325 | auto val = cache.get(key.first, key.second); 326 | if (val.is_initialized()) { 327 | // need a owning iterator 328 | vector vec; 329 | deserialize(vec, val->v.data(), val->v.size()); 330 | return make_unique>(move(vec)); 331 | } 332 | unique_lock lk{loading_lck}; 333 | loading_cv.wait(lk, [this, key = key](){ 334 | return !loading.count(key); 335 | }); 336 | loading.insert(key); 337 | auto after_val = cache.get(key.first, key.second); 338 | if (after_val.is_initialized()) { 339 | vector vec; 340 | deserialize(vec, after_val->v.data(), after_val->v.size()); 341 | return make_unique>(move(vec)); 342 | } 343 | auto iter = rdd->compute(move(split)); 344 | auto v = iter->collect(); 345 | vector bytes; 346 | serialize(v, bytes); 347 | cache.put(key.first, key.second, move(bytes)); 348 | loading.erase(key); 349 | loading_cv.notify_all(); 350 | return make_unique>(move(v)); 351 | } 352 | }; 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | #endif //SPARKPP_CACHE_TRACKER_HPP 363 | -------------------------------------------------------------------------------- /include/common.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/7/2019. 3 | // 4 | 5 | #ifndef SPARKPP_COMMON_HPP 6 | #define SPARKPP_COMMON_HPP 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | using std::atomic; 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | /// Everything sequence becomes a vector now... 
28 | template 29 | using Seq = std::vector; 30 | 31 | using std::array; 32 | using std::list; 33 | using std::pair; 34 | using std::vector; 35 | using std::string; 36 | using std::map; 37 | using std::unordered_map; 38 | using std::set; 39 | using std::unordered_set; 40 | using std::tuple; 41 | 42 | using std::make_pair; 43 | using std::make_tuple; 44 | 45 | #include 46 | #include 47 | #include 48 | 49 | using std::mutex; 50 | using std::shared_mutex; 51 | using std::lock_guard; 52 | using std::shared_lock; 53 | using std::unique_lock; 54 | using std::thread; 55 | 56 | #include 57 | 58 | using std::byte; 59 | using std::forward; 60 | using std::move; 61 | using std::unique_ptr; 62 | using std::shared_ptr; 63 | using std::make_unique; 64 | using std::make_shared; 65 | 66 | #include 67 | 68 | using std::invoke; 69 | 70 | #include 71 | #include 72 | 73 | namespace fs = std::experimental::filesystem; 74 | 75 | // #include 76 | // #include 77 | 78 | // using std::optional; 79 | // using std::variant; 80 | 81 | #include 82 | using std::any; 83 | 84 | #include 85 | 86 | using std::cout; 87 | using std::cerr; 88 | using std::clog; 89 | 90 | #include 91 | #include 92 | 93 | using std::copy_n; 94 | using std::remove_if; 95 | 96 | #include 97 | 98 | using std::decay_t; 99 | 100 | #include 101 | 102 | using namespace std::chrono_literals; 103 | 104 | 105 | #include "utils/utils.hpp" 106 | 107 | 108 | #include "concurrentqueue/blockingconcurrentqueue.h" 109 | 110 | using moodycamel::BlockingConcurrentQueue; 111 | 112 | #include "parallel-hashmap/parallel_hashmap/phmap.h" 113 | 114 | using phmap::parallel_flat_hash_map; 115 | using phmap::parallel_flat_hash_set; 116 | 117 | using addr_t = pair; 118 | using host_t = string; 119 | 120 | #include 121 | #include 122 | 123 | using boost::optional; 124 | using boost::variant; 125 | 126 | #include 127 | 128 | 129 | 130 | #endif //SPARKPP_COMMON_HPP 131 | -------------------------------------------------------------------------------- /include/dependency.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/7/2019. 
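// A NarrowDependency maps each child partition to a fixed set of parent
// partitions (no shuffle); a ShuffleDependency repartitions by key. For
// example, RangeDependency{rdd, /*inStart*/4, /*outStart*/10, /*length*/3}
// maps child partition 11 to parent partition 11 - 10 + 4 = 5 and returns
// {} for anything outside [10, 13).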
3 | // 4 | 5 | #ifndef SPARKPP_DEPENDENCY_HPP 6 | #define SPARKPP_DEPENDENCY_HPP 7 | 8 | #include "common.hpp" 9 | #include "serialize_wrapper.hpp" 10 | #include "aggregator.hpp" 11 | #include "partitioner.hpp" 12 | #include "split.hpp" 13 | 14 | struct RDDBase; 15 | template 16 | struct RDD; 17 | 18 | struct Dependency { 19 | virtual RDDBase* rdd() const = 0; 20 | virtual ~Dependency() = default; 21 | }; 22 | 23 | struct NarrowDependency : Dependency { 24 | RDDBase* m_rdd; 25 | NarrowDependency(RDDBase* rdd_) : m_rdd(rdd_) {} 26 | RDDBase* rdd() const override { 27 | return m_rdd; 28 | } 29 | virtual vector getParents(size_t partitionId) = 0; 30 | }; 31 | 32 | struct ShuffleDependencyBase : Dependency { 33 | virtual string runShuffle(RDDBase* rdd, unique_ptr split, size_t partition) = 0; 34 | virtual size_t shuffle_id() const = 0; 35 | virtual void serialize_dyn(vector&) const = 0; 36 | virtual void deserialize_dyn(const char*&, size_t&) = 0; 37 | }; 38 | 39 | template 40 | struct ShuffleDependency : ShuffleDependencyBase { 41 | size_t shuffleId; 42 | RDD>* m_rdd; 43 | Partitioner* partitioner; 44 | AggregatorBase* aggregator; 45 | ShuffleDependency(size_t shuffleId_, RDD>* rdd_, 46 | Partitioner* partitioner_, AggregatorBase* aggregator_) 47 | : shuffleId{shuffleId_}, m_rdd{rdd_}, partitioner{partitioner_}, aggregator{aggregator_} {} 48 | RDDBase* rdd() const override { 49 | return m_rdd; 50 | } 51 | string runShuffle(RDDBase* rdd, unique_ptr split, size_t partition); 52 | size_t shuffle_id() const override { 53 | return shuffleId; 54 | } 55 | virtual void serialize_dyn(vector& bytes) const; 56 | virtual void deserialize_dyn(const char*&, size_t&); 57 | }; 58 | 59 | ShuffleDependencyBase* dep_from_reader(::capnp::Data::Reader reader); 60 | 61 | 62 | struct OneToOneDependency : NarrowDependency { 63 | using NarrowDependency::NarrowDependency; 64 | vector getParents(size_t partitionId) override { 65 | return { partitionId }; 66 | } 67 | }; 68 | 69 | struct RangeDependency : NarrowDependency { 70 | size_t inStart, outStart, length; 71 | RangeDependency(RDDBase* rdd_, size_t inStart_, size_t outStart_, size_t length_) 72 | : NarrowDependency{rdd_}, inStart{inStart_}, outStart{outStart_}, length{length_} {} 73 | Seq getParents(size_t partitionId) override { 74 | if (partitionId >= outStart && partitionId < outStart + length) { 75 | return { partitionId - outStart + inStart }; 76 | } else { 77 | return {}; 78 | } 79 | } 80 | }; 81 | 82 | 83 | 84 | 85 | #endif //SPARKPP_DEPENDENCY_HPP 86 | -------------------------------------------------------------------------------- /include/executor.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/8/2019. 
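// One connection per task: the scheduler serializes a Task and its RDD
// lineage, the executor revives it with recvExecution(), runs it on the
// shared thread pool, and streams the resulting Storage bytes straight
// back on the same socket.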
3 | // 4 | 5 | #ifndef SPARKPP_EXECUTOR_HPP 6 | #define SPARKPP_EXECUTOR_HPP 7 | 8 | #include 9 | 10 | #include "common.hpp" 11 | #include "serialize_wrapper.hpp" 12 | #include "serialize_capnp.hpp" 13 | #include "rdd/rdd.hpp" 14 | #include 15 | #include 16 | #include 17 | #include "data.capnp.h" 18 | 19 | using namespace boost::asio; 20 | 21 | struct Executor { 22 | pair masterAddress; 23 | uint16_t port; 24 | thread_pool pool{std::thread::hardware_concurrency()}; 25 | Executor(pair masterAddress_, uint16_t port_) 26 | : masterAddress{move(masterAddress_)}, port{port_} {} 27 | void run() { 28 | io_service ioc; 29 | ip::tcp::endpoint endpoint{ip::tcp::v4(), port}; 30 | ip::tcp::acceptor acceptor{ioc, endpoint}; 31 | while (true) { 32 | ip::tcp::socket socket{ioc}; 33 | acceptor.accept(socket); 34 | post(pool, [socket = move(socket)]() mutable { 35 | // read & serialize 36 | int fd = socket.native_handle(); 37 | ::capnp::PackedFdMessageReader message{fd}; 38 | auto task = recvExecution(message); 39 | auto s = task->run(0); 40 | // write result back 41 | sendData(fd, s.v); 42 | }); 43 | } 44 | } 45 | }; 46 | 47 | #endif //SPARKPP_EXECUTOR_HPP 48 | -------------------------------------------------------------------------------- /include/map_output_tracker.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/14/2019. 3 | // 4 | 5 | #ifndef SPARKPP_MAP_OUTPUT_TRACKER_HPP 6 | #define SPARKPP_MAP_OUTPUT_TRACKER_HPP 7 | 8 | #include "common.hpp" 9 | #include "serialize_capnp.hpp" 10 | #include 11 | 12 | using namespace boost::asio; 13 | 14 | struct MapOutputTracker { 15 | bool isMaster; 16 | parallel_flat_hash_map> server_urls; 17 | unordered_set fetching; 18 | mutex fetching_lck; 19 | std::condition_variable cv; 20 | addr_t masterAddr; 21 | mutex generation_lck; 22 | int64_t generation; 23 | 24 | MapOutputTracker(bool isMaster_, addr_t masterAddr_) 25 | : isMaster{isMaster_}, masterAddr{move(masterAddr_)}, generation{0} { 26 | if (isMaster) 27 | server(); 28 | } 29 | 30 | void server() { 31 | thread thd{ 32 | [this]() { 33 | io_service ioc; 34 | ip::tcp::endpoint endpoint{ip::tcp::v4(), masterAddr.second}; 35 | ip::tcp::acceptor acceptor{ioc, endpoint}; 36 | while (true) { 37 | ip::tcp::socket socket{ioc}; 38 | acceptor.accept(socket); 39 | thread per_conn{ 40 | [this, socket = move(socket)]() mutable { 41 | int fd = socket.native_handle(); 42 | ::capnp::PackedFdMessageReader message{fd}; 43 | auto result = recvData(message); 44 | size_t shuffleId; 45 | deserialize(shuffleId, result); 46 | if (server_urls.find(shuffleId) != server_urls.end()) { 47 | auto v = server_urls[shuffleId]; 48 | vector bytes; 49 | serialize(v, bytes); 50 | sendData(fd, bytes); 51 | } 52 | } 53 | }; 54 | per_conn.detach(); 55 | } 56 | } 57 | }; 58 | thd.detach(); 59 | } 60 | 61 | vector client(size_t shuffleId) { 62 | io_service ioc; 63 | ip::tcp::resolver resolver{ioc}; 64 | ip::tcp::resolver::query query{masterAddr.first, std::to_string(masterAddr.second), 65 | boost::asio::ip::resolver_query_base::numeric_service}; 66 | auto iter = resolver.resolve(query); 67 | ip::tcp::resolver::iterator end; 68 | ip::tcp::endpoint endpoint = *iter; 69 | ip::tcp::socket socket{ioc}; 70 | boost::system::error_code ec; 71 | do { 72 | auto start_iter = iter; 73 | ec.clear(); 74 | socket.close(); 75 | std::this_thread::sleep_for(5ms); 76 | while (start_iter != end) { 77 | socket.connect(endpoint, ec); 78 | if (!ec) break; 79 | ++start_iter; 80 | } 81 | } while (ec); 82 | 
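// connected; now send the shuffleId and read back that shuffle's server URIs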
int fd = socket.native_handle(); 83 | vector bytes; 84 | serialize(shuffleId, bytes); 85 | sendData(fd, bytes); 86 | ::capnp::PackedFdMessageReader message{fd}; 87 | auto reader = recvData(message); 88 | vector res; 89 | deserialize(res, reader); 90 | return res; 91 | } 92 | 93 | void increaseGeneration() { 94 | unique_lock lk{generation_lck}; 95 | generation += 1; 96 | } 97 | 98 | void registerShuffle(size_t shuffleId, size_t numMaps) { 99 | if (server_urls.find(shuffleId) == server_urls.end()) { 100 | server_urls[shuffleId].resize(numMaps); 101 | } 102 | } 103 | 104 | void registerMapOutput(size_t shuffleId, size_t mapId, string uri) { 105 | auto& v = server_urls[shuffleId]; 106 | // FIXME: lock each vector? 107 | v[mapId] = move(uri); 108 | } 109 | 110 | void registerMapOutputs(size_t shuffleId, vector locs) { 111 | server_urls[shuffleId] = move(locs); 112 | } 113 | 114 | void unregisterMapOutput(size_t shuffleId, int mapId, [[maybe_unused]] string uri) { 115 | // assert(server_urls.find(shuffleId) == server_urls.end()) 116 | auto& v = server_urls[shuffleId]; 117 | v[mapId].clear(); 118 | increaseGeneration(); 119 | } 120 | 121 | vector getServerUris(size_t shuffleId) { 122 | if (server_urls.count(shuffleId)) { 123 | return server_urls[shuffleId]; 124 | } 125 | unique_lock lk{fetching_lck}; 126 | cv.wait(lk, [this, shuffleId]() { 127 | return !fetching.count(shuffleId); 128 | }); 129 | if (server_urls.count(shuffleId)) { 130 | return server_urls[shuffleId]; 131 | } 132 | fetching.insert(shuffleId); 133 | auto v = client(shuffleId); 134 | server_urls[shuffleId] = v; 135 | fetching.erase(shuffleId); 136 | cv.notify_all(); 137 | return v; 138 | } 139 | 140 | int64_t getGeneration() { 141 | unique_lock lk{generation_lck}; 142 | return generation; 143 | } 144 | 145 | void updateGeneration(int64_t v) { 146 | unique_lock lk{generation_lck}; 147 | if (v > generation) { 148 | server_urls.clear(); 149 | generation = v; 150 | } 151 | } 152 | }; 153 | 154 | #endif //SPARKPP_MAP_OUTPUT_TRACKER_HPP 155 | -------------------------------------------------------------------------------- /include/partitioner.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/7/2019. 3 | // 4 | 5 | #ifndef SPARKPP_PARTITIONER_HPP 6 | #define SPARKPP_PARTITIONER_HPP 7 | 8 | #include "common.hpp" 9 | 10 | struct Partitioner { 11 | virtual size_t numPartitions() const = 0; 12 | virtual size_t getPartition(const any&) const = 0; 13 | virtual void serialize_dyn(vector&) const = 0; 14 | virtual void deserialize_dyn(const char*&, size_t&) = 0; 15 | }; 16 | 17 | template 18 | struct HashPartitioner : Partitioner { 19 | size_t partitions; 20 | HashPartitioner(size_t p_) : partitions{p_} {} 21 | size_t numPartitions() const { 22 | return partitions; 23 | } 24 | // FIXME: is there a way to fix this any? using pointer? 
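    // Note: the key must be stored as exactly type K; any_cast<K> throws
    // std::bad_any_cast otherwise. Illustrative use (assuming a string key):
    //   HashPartitioner<string> p{4};
    //   size_t slot = p.getPartition(any{string{"spark"}});  // in [0, 4)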
25 | size_t getPartition(const any& key) const { 26 | auto v = std::any_cast(key); 27 | return std::hash{}(v) % partitions; 28 | } 29 | void serialize_dyn(vector& bytes) const override { 30 | size_t oldSize = bytes.size(); 31 | bytes.resize(oldSize + sizeof(HashPartitioner)); 32 | memcpy(bytes.data() + oldSize, reinterpret_cast(this), sizeof(HashPartitioner)); 33 | } 34 | void deserialize_dyn(const char*& bytes, size_t& size) override { 35 | // plain_copy 36 | bytes += sizeof(HashPartitioner); 37 | size -= sizeof(HashPartitioner); 38 | } 39 | }; 40 | 41 | 42 | 43 | #endif //SPARKPP_PARTITIONER_HPP 44 | -------------------------------------------------------------------------------- /include/rdd/mapped_rdd.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/17/2019. 3 | // 4 | 5 | #ifndef SPARKPP_MAPPED_RDD_HPP 6 | #define SPARKPP_MAPPED_RDD_HPP 7 | 8 | #include "common.hpp" 9 | #include "rdd/rdd.hpp" 10 | #include "serialize_wrapper.hpp" 11 | #include "serialize_capnp.hpp" 12 | 13 | template 14 | struct MappedRDD : RDD { 15 | RDD* prev; 16 | F func; 17 | OneToOneDependency dep; 18 | Dependency* depP; 19 | MappedRDD(RDD* p_, F f_) : RDD{p_->sc}, prev{p_}, func{move(f_)}, dep{p_}, depP{&dep} {} 20 | size_t numOfSplits() override { 21 | return prev->numOfSplits(); 22 | } 23 | unique_ptr split(size_t partitionId) override { 24 | return prev->split(partitionId); 25 | } 26 | span dependencies() override { 27 | return make_span(&depP, 1); 28 | }; 29 | 30 | unique_ptr> compute(unique_ptr split) { 31 | return make_unique>( 32 | move(prev->iterator(move(split))), 33 | func 34 | ); 35 | }; 36 | 37 | void serialize_dyn(vector& bytes) const { 38 | size_t oldSize = bytes.size(); 39 | bytes.resize(oldSize + sizeof(MappedRDD)); 40 | memcpy(bytes.data() + oldSize, reinterpret_cast(this), sizeof(MappedRDD)); 41 | prev->serialize_dyn(bytes); 42 | }; 43 | 44 | void deserialize_dyn(const char*& bytes, size_t& size) { 45 | bytes += sizeof(MappedRDD); 46 | size -= sizeof(MappedRDD); 47 | prev = reinterpret_cast*>(const_cast(bytes)); 48 | prev->deserialize_dyn(bytes, size); 49 | }; 50 | }; 51 | 52 | template 53 | struct FlatMappedRDD : RDD { 54 | RDD* prev; 55 | F func; 56 | OneToOneDependency dep; 57 | Dependency* depP; 58 | FlatMappedRDD(RDD* p_, F f_) : RDD{p_->sc}, prev{p_}, func{move(f_)}, dep{p_}, depP{&dep} {} 59 | size_t numOfSplits() override { 60 | return prev->numOfSplits(); 61 | } 62 | unique_ptr split(size_t partitionId) override { 63 | return prev->split(partitionId); 64 | } 65 | span dependencies() override { 66 | return make_span(&depP, 1); 67 | }; 68 | 69 | unique_ptr> compute(unique_ptr split) { 70 | return make_unique>( 71 | move(prev->iterator(move(split))), 72 | func 73 | ); 74 | }; 75 | 76 | void serialize_dyn(vector& bytes) const { 77 | size_t oldSize = bytes.size(); 78 | bytes.resize(oldSize + sizeof(FlatMappedRDD)); 79 | memcpy(bytes.data() + oldSize, reinterpret_cast(this), sizeof(FlatMappedRDD)); 80 | prev->serialize_dyn(bytes); 81 | }; 82 | 83 | void deserialize_dyn(const char*& bytes, size_t& size) { 84 | bytes += sizeof(FlatMappedRDD); 85 | size -= sizeof(FlatMappedRDD); 86 | prev = reinterpret_cast*>(const_cast(bytes)); 87 | prev->deserialize_dyn(bytes, size); 88 | }; 89 | }; 90 | 91 | template 92 | struct FilterRDD : RDD { 93 | RDD* prev; 94 | F func; 95 | OneToOneDependency dep; 96 | Dependency* depP; 97 | FilterRDD(RDD* p_, F f_) : RDD{p_->sc}, prev{p_}, func{move(f_)}, dep{p_}, depP{&dep} {} 98 | size_t 
numOfSplits() override { 99 | return prev->numOfSplits(); 100 | } 101 | unique_ptr split(size_t partitionId) override { 102 | return prev->split(partitionId); 103 | } 104 | span dependencies() override { 105 | return make_span(&depP, 1); 106 | }; 107 | 108 | unique_ptr> compute(unique_ptr split) { 109 | return make_unique>( 110 | move(prev->iterator(move(split))), 111 | func 112 | ); 113 | }; 114 | 115 | void serialize_dyn(vector& bytes) const { 116 | size_t oldSize = bytes.size(); 117 | bytes.resize(oldSize + sizeof(FilterRDD)); 118 | memcpy(bytes.data() + oldSize, reinterpret_cast(this), sizeof(FilterRDD)); 119 | prev->serialize_dyn(bytes); 120 | }; 121 | 122 | void deserialize_dyn(const char*& bytes, size_t& size) { 123 | bytes += sizeof(FilterRDD); 124 | size -= sizeof(FilterRDD); 125 | prev = reinterpret_cast*>(const_cast(bytes)); 126 | prev->deserialize_dyn(bytes, size); 127 | }; 128 | }; 129 | 130 | 131 | 132 | #endif //SPARKPP_MAPPED_RDD_HPP 133 | -------------------------------------------------------------------------------- /include/rdd/pair_rdd.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/19/2019. 3 | // 4 | 5 | #ifndef SPARKPP_PAIR_RDD_HPP 6 | #define SPARKPP_PAIR_RDD_HPP 7 | 8 | #include "common.hpp" 9 | #include "rdd/rdd.hpp" 10 | 11 | 12 | template 13 | struct MappedValueRDD; 14 | 15 | template 16 | struct ShuffleRDD; 17 | 18 | 19 | template 20 | struct PairRDD : RDD> { 21 | PairRDD(SparkContext& sc) : RDD>{sc} {} 22 | 23 | template ::result_type> 24 | auto mapValues(G f) -> MappedValueRDD { 25 | return MappedValueRDD{this, move(f)}; 26 | } 27 | 28 | template 29 | ShuffleRDD combineByKey(unique_ptr aggregator, unique_ptr partitioner) { 30 | return ShuffleRDD{this, move(aggregator), move(partitioner)}; 31 | } 32 | 33 | // Invocable F 34 | template 35 | auto reduceByKey(F func, size_t numSplits) { 36 | auto part = make_unique>(numSplits); 37 | auto createCombiner = +[](V v) { return v; }; 38 | auto aggr = make_unique>(createCombiner, +func, +func); 39 | return combineByKey(move(aggr), move(part)); 40 | } 41 | 42 | auto groupByKey(size_t numSplits) { 43 | auto part = make_unique>(numSplits); 44 | // NOTE: no bitwise copy performed on these results (Shuffle) 45 | auto createCombiner = +[](V v) { return vector{move(v)}; }; 46 | auto mergeValue = +[](vector vv, V v) { 47 | vv.push_back(move(v)); 48 | return vv; 49 | }; 50 | auto mergeCombiners = +[](vector v1, vector v2) { 51 | v1.insert(v1.end(), 52 | std::make_move_iterator(v2.begin()), 53 | std::make_move_iterator(v2.end())); 54 | return v1; 55 | }; 56 | auto aggr = make_unique>>( 57 | createCombiner, mergeValue, mergeCombiners); 58 | return combineByKey>(move(aggr), move(part)); 59 | } 60 | }; 61 | 62 | 63 | 64 | 65 | template 66 | struct MapPairRDD : PairRDD { 67 | using pair_t = pair; 68 | RDD* prev; 69 | F func; 70 | OneToOneDependency dep; 71 | Dependency* depP; 72 | MapPairRDD(RDD* p_, F f_) 73 | : PairRDD{p_->sc}, prev{p_}, func{move(f_)}, dep{p_}, depP{&dep} {} 74 | size_t numOfSplits() override { 75 | return prev->numOfSplits(); 76 | } 77 | unique_ptr split(size_t partitionId) override { 78 | return prev->split(partitionId); 79 | } 80 | span dependencies() override { 81 | return make_span(&depP, 1); 82 | }; 83 | 84 | unique_ptr>> compute(unique_ptr split) { 85 | return make_unique>( 86 | move(prev->iterator(move(split))), 87 | func 88 | ); 89 | }; 90 | 91 | void serialize_dyn(vector& bytes) const { 92 | size_t oldSize = bytes.size(); 93 | 
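// raw byte-copy of this node, followed by the parent's bytes recursively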
bytes.resize(oldSize + sizeof(MapPairRDD)); 94 | memcpy(bytes.data() + oldSize, reinterpret_cast(this), sizeof(MapPairRDD)); 95 | prev->serialize_dyn(bytes); 96 | }; 97 | 98 | void deserialize_dyn(const char*& bytes, size_t& size) { 99 | bytes += sizeof(MapPairRDD); 100 | size -= sizeof(MapPairRDD); 101 | prev = reinterpret_cast*>(const_cast(bytes)); 102 | prev->deserialize_dyn(bytes, size); 103 | }; 104 | 105 | }; 106 | 107 | 108 | 109 | 110 | template 111 | struct MappedValuesRDD : PairRDD { 112 | using pair_t = pair; 113 | RDD>* prev; 114 | F func; 115 | OneToOneDependency dep; 116 | Dependency* depP; 117 | MappedValuesRDD(RDD>* p, F f) 118 | : PairRDD{p->sc}, prev{p}, func{move(f)}, dep{p}, depP{dep} {} 119 | size_t numOfSplits() override { 120 | return prev->numOfSplits(); 121 | } 122 | unique_ptr split(size_t partitionId) override { 123 | return prev->split(partitionId); 124 | } 125 | span dependencies() override { 126 | return make_span(&depP, 1); 127 | }; 128 | unique_ptr>> compute(unique_ptr split) { 129 | return make_unique>( 130 | move(prev->iterator(move(split))), 131 | func 132 | ); 133 | }; 134 | 135 | void serialize_dyn(vector& bytes) const { 136 | size_t oldSize = bytes.size(); 137 | bytes.resize(oldSize + sizeof(MappedValuesRDD)); 138 | memcpy(bytes.data() + oldSize, reinterpret_cast(this), sizeof(MappedValuesRDD)); 139 | prev->serialize_dyn(bytes); 140 | }; 141 | 142 | void deserialize_dyn(const char*& bytes, size_t& size) { 143 | bytes += sizeof(MappedValuesRDD); 144 | size -= sizeof(MappedValuesRDD); 145 | prev = reinterpret_cast>*>(const_cast(bytes)); 146 | prev->deserialize_dyn(bytes, size); 147 | }; 148 | }; 149 | 150 | template 151 | struct ShuffleRDD : PairRDD { 152 | RDD>* parent; 153 | size_t shuffleId; 154 | unique_ptr aggregator; 155 | unique_ptr partitioner; 156 | ShuffleDependency dep; 157 | Dependency* depP; 158 | ShuffleRDD(RDD>* p, unique_ptr a, unique_ptr pt) 159 | : PairRDD{p->sc}, parent{p}, shuffleId{p->sc.newShuffleId()}, 160 | aggregator{move(a)}, partitioner{move(pt)}, 161 | dep{shuffleId, p, partitioner.get(), aggregator.get()}, depP{&dep} {} 162 | size_t numOfSplits() override { 163 | return partitioner->numPartitions(); 164 | } 165 | unique_ptr split(size_t partitionId) override { 166 | return make_unique(partitionId); 167 | } 168 | span dependencies() override { 169 | return make_span(&depP, 1); 170 | }; 171 | unique_ptr>> compute(unique_ptr split); 172 | 173 | void serialize_dyn(vector& bytes) const { 174 | size_t oldSize = bytes.size(); 175 | bytes.resize(oldSize + sizeof(ShuffleRDD)); 176 | memcpy(bytes.data() + oldSize, reinterpret_cast(this), sizeof(ShuffleRDD)); 177 | parent->serialize_dyn(bytes); 178 | aggregator->serialize_dyn(bytes); 179 | // covered in Dependency 180 | // partitioner->serialize_dyn(bytes); 181 | // dep.serialize_dyn(bytes); 182 | }; 183 | 184 | void deserialize_dyn(const char*& bytes, size_t& size) { 185 | bytes += sizeof(ShuffleRDD); 186 | size -= sizeof(ShuffleRDD); 187 | parent = reinterpret_cast>*>(const_cast(bytes)); 188 | parent->deserialize_dyn(bytes, size); 189 | // avoid UB destructor 190 | partitioner.release(); 191 | aggregator.release(); 192 | // plain type 193 | auto aggr = reinterpret_cast(const_cast(bytes)); 194 | aggr->deserialize_dyn(bytes, size); 195 | aggregator.reset(aggr); 196 | }; 197 | }; 198 | 199 | 200 | 201 | #endif //SPARKPP_PAIR_RDD_HPP 202 | -------------------------------------------------------------------------------- /include/rdd/parallel_collection.hpp: 
-------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/17/2019. 3 | // 4 | 5 | #ifndef SPARKPP_PARALLEL_COLLECTION_HPP 6 | #define SPARKPP_PARALLEL_COLLECTION_HPP 7 | 8 | #include "common.hpp" 9 | #include "rdd/rdd.hpp" 10 | #include "split.hpp" 11 | #include "serialize_capnp.hpp" 12 | #include "serialize_wrapper.hpp" 13 | 14 | template 15 | struct ParallelCollectionSplit : Split { 16 | size_t slice; 17 | size_t rddId; 18 | vector values; 19 | ParallelCollectionSplit(size_t s, size_t rid, vector v) : Split{s}, slice{s}, rddId{rid}, values{move(v)} {} 20 | }; 21 | 22 | template 23 | struct ParallelCollection : RDD { 24 | // Use pimpl to avoid extra undefined destructor 25 | struct PCVal { 26 | vector data; 27 | vector> splits; 28 | PCVal() = default; 29 | PCVal(vector d_, vector> s_) : data{move(d_)}, splits{move(s_)} {} 30 | SN_BOOST_SERIALIZE_MEMBERS_IN(data, splits); 31 | }; 32 | unique_ptr pimpl; 33 | size_t numSlices; 34 | mutable size_t serialSize = 0; 35 | ParallelCollection(SparkContext& sc, vector data, size_t n) : RDD{sc}, numSlices{n} { 36 | auto splits = slice(data, n); 37 | pimpl = make_unique(move(data), move(splits)); 38 | } 39 | vector> slice(const vector& seq, size_t n) { 40 | vector> slice(n); 41 | size_t jmp = seq.size() / n; 42 | for (size_t i = 0; i < n; ++i) { 43 | slice[i].assign(seq.begin() + i * jmp, seq.begin() + (i + 1) * jmp); 44 | } 45 | size_t start = n * jmp; 46 | for (size_t i = start; i < seq.size(); ++i) { 47 | slice[i - start].push_back(seq[i]); 48 | } 49 | return slice; 50 | } 51 | size_t numOfSplits() override { 52 | return numSlices; 53 | } 54 | unique_ptr split(size_t partitionId) override { 55 | return make_unique>( 56 | partitionId, this->m_id, pimpl->splits[partitionId]); 57 | } 58 | unique_ptr> compute(unique_ptr split) { 59 | auto pcSplit = dynamic_unique_ptr_cast>(move(split)); 60 | return make_unique>(move(pcSplit->values)); 61 | }; 62 | 63 | // TODO: only serialize partitionId part 64 | void serialize_dyn(vector& bytes) const { 65 | vector pbytes; 66 | { 67 | SerialGuard gd{pbytes}; 68 | gd << *pimpl; 69 | } 70 | // NOTE: record size here to dynamically adjust byte offset 71 | serialSize = pbytes.size(); 72 | size_t oldSize = bytes.size(); 73 | bytes.resize(oldSize + sizeof(ParallelCollection)); 74 | memcpy(bytes.data() + oldSize, reinterpret_cast(this), sizeof(ParallelCollection)); 75 | bytes.insert(bytes.end(), 76 | std::make_move_iterator(pbytes.begin()), 77 | std::make_move_iterator(pbytes.end())); 78 | }; 79 | 80 | void deserialize_dyn(const char*& bytes, size_t& size) { 81 | bytes += sizeof(ParallelCollection); 82 | size -= sizeof(ParallelCollection); 83 | pimpl.release(); 84 | pimpl = make_unique(); 85 | DeserialGuard gd{bytes, size}; 86 | gd >> *pimpl; 87 | bytes += serialSize; 88 | size -= serialSize; 89 | }; 90 | 91 | void finalize() override { 92 | pimpl.reset(nullptr); 93 | } 94 | }; 95 | 96 | 97 | #endif //SPARKPP_PARALLEL_COLLECTION_HPP 98 | -------------------------------------------------------------------------------- /include/rdd/rdd.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/7/2019. 
3 | // 4 | 5 | #ifndef SPARKPP_RDD_HPP 6 | #define SPARKPP_RDD_HPP 7 | 8 | #include "common.hpp" 9 | #include "split.hpp" 10 | #include "dependency.hpp" 11 | #include "partitioner.hpp" 12 | #include "serialize_wrapper.hpp" 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | struct SparkContext; 22 | 23 | // FIXME: currently all data in a RDD is passed (like Spark-0.5 does) 24 | // FIXME: do we really need boost::serializaton + capnproto? (replace everything with boost only) 25 | /// For non-trivial objects, we store them into pimpl mode + boost::serialization. 26 | /// Since C++ has no support for `mem::forget`, 27 | /// we need to place objects like `std::vector` to pimpl to avoid undefined destructors. 28 | /// Fortunately, the size of pimpl objects are usually small, since they don't store value inside them. 29 | /// For trivial objects, just directly copy based on class bytes. 30 | /// Only derived classes virtual methods know how to change offsets; (CRTP won't work) 31 | struct RDDBase { 32 | virtual size_t id() = 0; 33 | unique_ptr iterator(unique_ptr split) { 34 | return unique_ptr{ iterator_impl(move(split)) }; 35 | } 36 | virtual IterBase* iterator_impl(unique_ptr split) = 0; 37 | // virtual vector getPartitions() = 0; 38 | virtual void serialize_dyn(vector&) const = 0; 39 | virtual void deserialize_dyn(const char*&, size_t&) = 0; 40 | virtual size_t numOfSplits() = 0; 41 | // virtual vector> splits() = 0; 42 | virtual unique_ptr split(size_t partitionId) = 0; 43 | virtual span dependencies() = 0; 44 | // after reviving from buffer & execution, release resources 45 | virtual void finalize() {} 46 | }; 47 | 48 | RDDBase* rdd_from_reader(::capnp::Data::Reader reader); 49 | 50 | // Transformations 51 | 52 | template 53 | struct MappedRDD; 54 | 55 | template 56 | struct FlatMappedRDD; 57 | 58 | template 59 | struct MapPairRDD; 60 | 61 | template 62 | struct FilterRDD; 63 | 64 | template 65 | struct RDD : RDDBase { 66 | SparkContext& sc; 67 | vector deps; 68 | size_t m_id; 69 | bool shouldCache = false; 70 | 71 | RDD(SparkContext& sc_); 72 | size_t id() override { 73 | return m_id; 74 | } 75 | RDD& cache() { 76 | shouldCache = true; 77 | return *this; 78 | } 79 | 80 | virtual unique_ptr> compute(unique_ptr split) = 0; 81 | unique_ptr> iterator(unique_ptr split) { 82 | return unique_ptr>{ iterator_impl(move(split)) }; 83 | } 84 | Iterator* iterator_impl(unique_ptr split) override; 85 | 86 | span dependencies() override { 87 | return make_span(deps); 88 | } 89 | 90 | // TODO: persist, unpersist, storageLevel 91 | 92 | // Transformations, Lazy 93 | // HACK: this requires lifetime to continue. better use `enable_shared_from_this` + `shared_from_this` 94 | // But this will cause extra overhead in type serialization & type system 95 | // Currently every RDD should live long through the program. 
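    // A chaining sketch (lambdas must stay capture-free: FnWrapper and the
    // RDD nodes travel as raw bytes, so captured state would not survive
    // the trip to an executor):
    //   auto doubled = rdd.map([](int x) { return x * 2; });
    //   auto evens   = doubled.filter([](int x) { return x % 2 == 0; });
    //   auto n       = evens.count();   // action: actually runs the job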
96 | 97 | // Invocable F -> U 98 | template ::result_type> 99 | auto map(F f) -> MappedRDD { 100 | return MappedRDD{this, move(f)}; 101 | } 102 | 103 | // Invocable F -> Vec 104 | template ::result_type, 105 | typename U = typename V::value_type> 106 | auto flatMap(F f) -> FlatMappedRDD { 107 | return FlatMappedRDD{this, move(f)}; 108 | } 109 | 110 | // Invocable F -> pair 111 | template ::result_type, 113 | typename K = typename R::first_type, typename V = typename R::second_type> 114 | auto mapPair(F f) -> MapPairRDD { 115 | return MapPairRDD{this, move(f)}; 116 | } 117 | 118 | // Invocable F -> bool 119 | template 120 | auto filter(F f) -> FilterRDD { 121 | return FilterRDD{this, move(f)}; 122 | } 123 | 124 | // Actions, Eager 125 | 126 | // Invocable 127 | template 128 | T reduce(F&& f); 129 | 130 | vector collect(); 131 | size_t count(); 132 | }; 133 | 134 | #include "rdd/mapped_rdd.hpp" 135 | #include "rdd/pair_rdd.hpp" 136 | 137 | 138 | #endif //SPARKPP_RDD_HPP 139 | -------------------------------------------------------------------------------- /include/scheduler/dag_scheduler.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/11/2019. 3 | // 4 | 5 | #ifndef SPARKPP_DAG_SCHEDULER_HPP 6 | #define SPARKPP_DAG_SCHEDULER_HPP 7 | 8 | #include "common.hpp" 9 | #include "rdd/rdd.hpp" 10 | #include "serialize_wrapper.hpp" 11 | #include "serialize_capnp.hpp" 12 | #include "scheduler/scheduler.hpp" 13 | #include "scheduler/task.hpp" 14 | #include "scheduler/stage.hpp" 15 | #include 16 | #include 17 | 18 | struct TaskEndReason { 19 | struct Success { 20 | SN_BOOST_SERIALIZE_EMPTY(); 21 | }; 22 | struct FetchFailed { 23 | string addr; 24 | size_t shuffleId; 25 | size_t mapId; 26 | size_t reduceId; 27 | SN_BOOST_SERIALIZE_MEMBERS_IN(addr, shuffleId, mapId, reduceId); 28 | }; 29 | struct Error { 30 | string reason; 31 | SN_BOOST_SERIALIZE_MEMBERS_IN(reason); 32 | }; 33 | variant vmember; 34 | auto& get() { 35 | return vmember; 36 | } 37 | const auto& get() const { 38 | return vmember; 39 | } 40 | }; 41 | 42 | struct CompletionEvent { 43 | unique_ptr task; 44 | TaskEndReason reason; 45 | Storage result; 46 | CompletionEvent() noexcept {} 47 | CompletionEvent(unique_ptr&& task_, TaskEndReason reason, Storage result) noexcept 48 | : task{move(task_)}, reason{move(reason)}, result{move(result)} {} 49 | CompletionEvent(CompletionEvent&& rhs) noexcept 50 | : task{move(rhs.task)}, reason{move(rhs.reason)}, result{move(rhs.result)} {} 51 | CompletionEvent& operator=(CompletionEvent&& rhs) noexcept { 52 | task = move(rhs.task); 53 | reason = move(rhs.reason); 54 | result = move(rhs.result); 55 | return *this; 56 | } 57 | // accumUpdates 58 | }; 59 | 60 | /* 61 | struct DAGEventLoop : EventLoop { 62 | DAGScheduler& dagScheduler; 63 | DAGEventLoop(DAGScheduler& schd) : dagScheduler{schd} {} 64 | void onReceive(DAGSchedulerEvent event); 65 | }; 66 | */ 67 | 68 | // Spark master-version Job path: 69 | // runJob -> submitJob -> JobWaiter -> 70 | // handleJobSubmitted -> ActiveJob -> submitStage -> 71 | // submitTasks -> launchTasks -> handleSuccessfulTask -> 72 | // CompletionEvent -> handleTaskCompletion -> 73 | // job.taskSucceeded -> submitJob -> runJob 74 | // Spark branch-0.5 Job path 75 | // runJob -> submitStage -> submitMissingTasks -> 76 | // submitTasks -> taskEnded -> CompletionEvent -> 77 | // waitForEvent -> submitMissingTasks 78 | /// Basically DAGScheduler + TaskScheduler 79 | struct DAGScheduler : Scheduler { 80 | atomic 
nextJobId; 81 | atomic nextStageId; 82 | atomic nextRunId; 83 | atomic nextTaskId; 84 | // NOTE: only accessible from main thread 85 | unordered_map> idToStage; 86 | unordered_map> shuffleToMapStage; 87 | unordered_map>> cacheLocs; 88 | unordered_map> eventQueues; 89 | vector address; 90 | boost::asio::thread_pool pool{4}; 91 | 92 | DAGScheduler(vector addr) : address{move(addr)} {} 93 | 94 | shared_ptr newStage(RDDBase* rdd, optional shuffleDep); 95 | shared_ptr getShuffleMapStage(ShuffleDependencyBase* shuffleDep); 96 | 97 | vector getPreferredLocs(RDDBase* rdd, size_t partitionId); 98 | vector getMissingParentStages(const Stage& stage); 99 | vector getParentStages(RDDBase* rdd); 100 | 101 | 102 | auto& getCacheLocs(RDDBase* rdd) { 103 | return cacheLocs[rdd->id()]; 104 | } 105 | 106 | void updateCacheLocs(); 107 | 108 | template ::result_type> 109 | vector runJob(F&& func, RDD* finalRdd, const vector& partitions); 110 | 111 | 112 | void visitMissingParent(unordered_set &missing, unordered_set &visited, RDDBase *r); 113 | void visitParent(unordered_set& parents, unordered_set& visited, RDDBase* r); 114 | 115 | /// when a stage's parents are available, do the task 116 | void submitMissingTasks(size_t runId, RDDBase *finalRdd, FnBase *func, 117 | unordered_map>& pendingTasks, 118 | const vector& partitions, 119 | vector& finished, Stage *stage, Stage *finalStage); 120 | 121 | void submitTasks(unique_ptr task); 122 | 123 | void 124 | submitStage(size_t runId, RDDBase *finalRdd, FnBase *func, 125 | unordered_map> &pendingTasks, 126 | const vector &partitions, vector &finished, Stage *finalStage, 127 | unordered_set& waiting, unordered_set& running, Stage *stage); 128 | 129 | void taskEnded(unique_ptr task, TaskEndReason reason, Storage result); 130 | }; 131 | 132 | 133 | 134 | 135 | 136 | 137 | #endif //SPARKPP_DAG_SCHEDULER_HPP 138 | -------------------------------------------------------------------------------- /include/scheduler/scheduler.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/11/2019. 3 | // 4 | 5 | #ifndef SPARKPP_SCHEDULER_HPP 6 | #define SPARKPP_SCHEDULER_HPP 7 | 8 | #include "common.hpp" 9 | 10 | struct Scheduler {}; 11 | 12 | #endif //SPARKPP_SCHEDULER_HPP 13 | -------------------------------------------------------------------------------- /include/scheduler/stage.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/12/2019. 
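// A Stage is a maximal pipelined chunk of the DAG: either a shuffle-map
// stage (shuffleDep is set and each partition must register a map-output
// location) or the final result stage. isAvailable() flips once every
// partition has an output location, which unblocks dependent stages.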
3 | // 4 | 5 | #ifndef SPARKPP_STAGE_HPP 6 | #define SPARKPP_STAGE_HPP 7 | 8 | #include 9 | 10 | #include "common.hpp" 11 | #include "rdd/rdd.hpp" 12 | #include "dependency.hpp" 13 | 14 | struct Stage { 15 | size_t id; 16 | RDDBase* rdd; 17 | optional shuffleDep; 18 | vector parents; 19 | vector> outputLocs; 20 | size_t numAvailableOutputs = 0; 21 | size_t numPartitions; 22 | 23 | Stage(size_t id_, RDDBase* rdd_, optional shuffleDep_, vector stage_) 24 | : id{id_}, rdd{rdd_}, shuffleDep{shuffleDep_}, parents{std::move(stage_)} { 25 | outputLocs.resize(rdd->numOfSplits()); 26 | numPartitions = rdd->numOfSplits(); 27 | } 28 | 29 | bool isShuffleMap() { 30 | return shuffleDep.has_value(); 31 | } 32 | bool isAvailable() { 33 | if (parents.empty() && !isShuffleMap()) { 34 | return true; 35 | } 36 | return numAvailableOutputs == rdd->numOfSplits(); 37 | } 38 | 39 | void addOutputLoc(size_t partition, host_t host) { 40 | if (outputLocs[partition].empty()) { 41 | numAvailableOutputs += 1; 42 | } 43 | outputLocs[partition].push_back(move(host)); 44 | } 45 | void removeOutputLoc(size_t partition, host_t host) { 46 | size_t old = outputLocs[partition].size(); 47 | outputLocs[partition].erase( 48 | remove_if(outputLocs[partition].begin(), outputLocs[partition].end(), [&](const host_t& h) { 49 | return h == host; 50 | }), outputLocs[partition].end()); 51 | if (old != 0 && outputLocs[partition].empty()) { 52 | numAvailableOutputs -= 1; 53 | } 54 | } 55 | }; 56 | 57 | 58 | #endif //SPARKPP_STAGE_HPP 59 | -------------------------------------------------------------------------------- /include/scheduler/task.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/12/2019. 3 | // 4 | 5 | #ifndef SPARKPP_TASK_HPP 6 | #define SPARKPP_TASK_HPP 7 | 8 | #include "common.hpp" 9 | #include "rdd/rdd.hpp" 10 | #include "serialize_wrapper.hpp" 11 | #include "dependency.hpp" 12 | 13 | struct TaskContext { 14 | size_t runId; 15 | size_t stageId; 16 | // int attemptId; 17 | }; 18 | 19 | struct Task { 20 | virtual Storage run(size_t id) = 0; 21 | virtual vector preferredLocations() { 22 | return {}; 23 | } 24 | virtual optional generation() { 25 | return {}; 26 | } 27 | virtual size_t task_id() const = 0; 28 | virtual size_t run_id() const = 0; 29 | virtual size_t stage_id() const = 0; 30 | }; 31 | 32 | struct ResultTask : Task { 33 | size_t taskId = 0; 34 | size_t runId = 0; 35 | size_t stageId = 0; 36 | RDDBase* rdd; 37 | FnBase* func; 38 | size_t partition; 39 | vector locs; 40 | size_t outputId = 0; 41 | 42 | ResultTask(size_t pid, RDDBase* r, FnBase* f) 43 | : rdd{r}, func{f}, partition{pid} {} 44 | 45 | ResultTask(size_t tid, size_t rid, size_t sid, RDDBase* r, FnBase* f, size_t pid, vector l, size_t oid) 46 | : taskId{tid}, runId{rid}, stageId{sid}, rdd{r}, func{f}, partition{pid}, locs{move(l)}, outputId{oid} {} 47 | Storage run([[maybe_unused]] size_t attemptId) { 48 | unique_ptr split = rdd->split(partition); 49 | auto s = func->call(rdd->iterator(move(split))); 50 | rdd->finalize(); 51 | return s; 52 | } 53 | vector preferredLocations() override { 54 | return locs; 55 | } 56 | size_t task_id() const override { 57 | return taskId; 58 | } 59 | size_t run_id() const override { 60 | return runId; 61 | } 62 | size_t stage_id() const override { 63 | return stageId; 64 | } 65 | }; 66 | 67 | struct ShuffleMapTask : Task { 68 | size_t taskId = 0; 69 | size_t runId = 0; 70 | size_t stageId = 0; 71 | RDDBase* rdd; 72 | ShuffleDependencyBase* dep; 73 | 
size_t partition; 74 | vector locs; 75 | 76 | ShuffleMapTask(size_t pid, RDDBase* r, ShuffleDependencyBase* d_) 77 | : rdd{r}, dep{d_}, partition{pid} { 78 | 79 | } 80 | 81 | ShuffleMapTask(size_t tid, size_t rid, size_t sid, RDDBase* r, ShuffleDependencyBase* d_, size_t pid, vector l) 82 | : taskId{tid}, runId{rid}, stageId{sid}, rdd{r}, dep{d_}, partition{pid}, locs{move(l)} {} 83 | vector preferredLocations() override { 84 | return locs; 85 | } 86 | Storage run([[maybe_unused]] size_t attemptId) { 87 | unique_ptr split = rdd->split(partition); 88 | auto s = dep->runShuffle(rdd, move(split), partition); 89 | Storage st{ 90 | {s.begin(), s.end()} 91 | }; 92 | rdd->finalize(); 93 | return st; 94 | } 95 | size_t task_id() const override { 96 | return taskId; 97 | } 98 | size_t run_id() const override { 99 | return runId; 100 | } 101 | size_t stage_id() const override { 102 | return stageId; 103 | } 104 | }; 105 | 106 | #endif //SPARKPP_TASK_HPP 107 | -------------------------------------------------------------------------------- /include/serialize_capnp.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/14/2019. 3 | // 4 | 5 | #ifndef SPARKPP_SERIALIZE_CAPNP_HPP 6 | #define SPARKPP_SERIALIZE_CAPNP_HPP 7 | 8 | #include "common.hpp" 9 | #include "scheduler/task.hpp" 10 | #include "serialize_wrapper.hpp" 11 | #include "data.capnp.h" 12 | #include 13 | #include 14 | 15 | ::capnp::Data::Reader vec_to_reader(vector&); 16 | vector reader_to_vec(::capnp::Data::Reader); 17 | 18 | template 19 | void sendData(int fd, ::capnp::Data::Reader reader); 20 | 21 | template 22 | void sendData(int fd, vector& bytes); 23 | 24 | void sendExecution(int fd, Task* task); 25 | 26 | unique_ptr recvExecution(::capnp::PackedFdMessageReader& message); 27 | 28 | template 29 | void sendData(int fd, ::capnp::Data::Reader reader) { 30 | ::capnp::MallocMessageBuilder builder; 31 | typename T::Builder data = builder.initRoot(); 32 | data.setMsg(reader); 33 | ::capnp::writePackedMessageToFd(fd, builder); 34 | } 35 | 36 | template 37 | void sendData(int fd, vector& bytes) { 38 | ::capnp::Data::Reader reader{ 39 | reinterpret_cast(bytes.data()), 40 | bytes.size() 41 | }; 42 | sendData(fd, reader); 43 | } 44 | 45 | template 46 | typename ::capnp::Data::Reader recvData(::capnp::PackedFdMessageReader& message) { 47 | typename T::Reader result = message.getRoot(); 48 | return result.getMsg(); 49 | } 50 | 51 | #endif //SPARKPP_SERIALIZE_CAPNP_HPP 52 | -------------------------------------------------------------------------------- /include/serialize_wrapper.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/10/2019. 
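// Storage is the universal wire format: a flat vector<char> filled either
// by boost::serialization (via SerialGuard) or by reinterpreting a
// trivially copyable value's bytes. Iterator<T> is the pull stream every
// RDD::compute() returns; collect() drains it into a vector<T>.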
3 | // 4 | 5 | #ifndef SPARKPP_SERIALIZE_WRAPPER_HPP 6 | #define SPARKPP_SERIALIZE_WRAPPER_HPP 7 | 8 | #include "common.hpp" 9 | #include 10 | 11 | struct Storage { 12 | vector v; 13 | Storage() = default; 14 | Storage(vector v_) : v{move(v_)} {}; 15 | Storage(const Storage& rhs) = default; 16 | Storage(Storage&& rhs) = default; 17 | template 18 | Storage(const T& data) : v{ 19 | reinterpret_cast(&data), 20 | reinterpret_cast(&data) + sizeof(T)} {} 21 | Storage& operator=(const Storage&) = default; 22 | Storage& operator=(Storage&&) = default; 23 | // value-semantic T 24 | template 25 | explicit operator T() { 26 | return *reinterpret_cast(v.data()); 27 | } 28 | ::capnp::Data::Reader to_reader() { 29 | return {reinterpret_cast(v.data()), v.size()}; 30 | } 31 | }; 32 | 33 | struct IterBase { 34 | virtual ~IterBase() = default; 35 | }; 36 | 37 | template 38 | struct Iterator : IterBase { 39 | // FIXME: lots of virtual calling to this method to get data... 40 | virtual optional next() = 0; 41 | virtual bool hasNext() = 0; 42 | vector collect() { 43 | vector res; 44 | while (hasNext()) { 45 | res.push_back(move(next().value())); 46 | } 47 | return res; 48 | } 49 | size_t count() { 50 | size_t cnt = 0; 51 | while (hasNext()) { 52 | next(); 53 | cnt += 1; 54 | } 55 | return cnt; 56 | } 57 | }; 58 | 59 | 60 | /// `slice::Iter` 61 | template 62 | struct SliceIter : Iterator { 63 | const T* ptr; 64 | const T* end; 65 | SliceIter(const T* p, const T* e) : ptr{p}, end{e} {}; 66 | optional next() override { 67 | if (ptr != end) { 68 | T v = *ptr; 69 | ++ptr; 70 | return v; 71 | } 72 | return {}; 73 | } 74 | bool hasNext() override { 75 | return ptr != end; 76 | } 77 | }; 78 | 79 | /// T must be a standard container type 80 | template 81 | struct OwnIterator : Iterator { 82 | vector data; 83 | typename vector::iterator iter; 84 | typename vector::iterator end; 85 | OwnIterator(vector data_) : data{move(data_)}, iter{data.begin()}, end{data.end()} {} 86 | optional next() override { 87 | if (iter != end) { 88 | T v = *iter; 89 | ++iter; 90 | return v; 91 | } 92 | return {}; 93 | } 94 | bool hasNext() override { 95 | return iter != end; 96 | } 97 | }; 98 | 99 | template 100 | struct MapIterator : Iterator { 101 | unique_ptr> prev; 102 | F func; 103 | MapIterator(unique_ptr> prev, F func) 104 | : prev{move(prev)}, func{move(func)} {} 105 | optional next() override { 106 | auto s = prev->next(); 107 | return s.map([func = func](T t) mutable { 108 | return invoke(move(func), move(t)); 109 | }); 110 | } 111 | bool hasNext() override { 112 | return prev->hasNext(); 113 | } 114 | }; 115 | 116 | // F: T -> Vec 117 | template 118 | struct FlatMapIterator : Iterator { 119 | unique_ptr> prev; 120 | F func; 121 | optional> current; 122 | typename vector::iterator iter; 123 | typename vector::iterator end; 124 | FlatMapIterator(unique_ptr> prev, F func) 125 | : prev{move(prev)}, func{move(func)} {} 126 | optional next() override { 127 | while (!current.is_initialized() || iter == end) { 128 | if (!prev->hasNext()) { 129 | return {}; 130 | } 131 | current = invoke(func, std::move(prev->next().value())); 132 | iter = current->begin(); 133 | end = current->end(); 134 | } 135 | auto u = *iter; 136 | ++iter; 137 | return u; 138 | } 139 | bool hasNext() override { 140 | return prev->hasNext() || (iter != end); 141 | } 142 | }; 143 | 144 | template 145 | struct MapValueIterator : Iterator> { 146 | unique_ptr>> prev; 147 | F func; 148 | MapValueIterator(unique_ptr>> p, F f) 149 | : prev{move(p)}, func{move(f)} {} 150 | 
optional> next() override { 151 | auto s = prev->next(); 152 | if (!s.is_initialized()) { 153 | return {}; 154 | } 155 | auto p = move(s.value()); 156 | return make_pair(move(p.first), func(move(p.second))); 157 | } 158 | bool hasNext() override { 159 | return prev->hasNext(); 160 | } 161 | }; 162 | 163 | template 164 | struct HashIterator : Iterator> { 165 | unordered_map combiners; 166 | using iter_t = typename unordered_map::iterator; 167 | iter_t iter; 168 | iter_t end; 169 | HashIterator(unordered_map m) 170 | : combiners{move(m)}, 171 | iter{combiners.begin()}, end{combiners.end()} {} 172 | bool hasNext() override { 173 | return iter != end; 174 | } 175 | optional> next() override { 176 | if (iter == end) { 177 | return {}; 178 | } 179 | auto v = *iter; 180 | pair p = make_pair(move(v.first), move(v.second)); 181 | ++iter; 182 | return p; 183 | } 184 | }; 185 | 186 | template 187 | struct FilterIterator : Iterator { 188 | unique_ptr> prev; 189 | F func; 190 | optional temp; 191 | FilterIterator(unique_ptr> prev, F func) 192 | : prev{move(prev)}, func{move(func)} {} 193 | optional next() override { 194 | if (temp.is_initialized()) { 195 | optional u{std::move(temp)}; 196 | temp = boost::none; 197 | return u; 198 | } 199 | while (true) { 200 | auto s = prev->next(); 201 | if (!s.is_initialized()) { 202 | return {}; 203 | } 204 | if (invoke(func, s.value())) { 205 | return s; 206 | } 207 | } 208 | } 209 | bool hasNext() override { 210 | if (temp.is_initialized()) { 211 | return true; 212 | } 213 | while (prev->hasNext()) { 214 | auto s = prev->next(); 215 | if (invoke(func, s.value())) { 216 | temp = move(s.value()); 217 | return true; 218 | } 219 | } 220 | return false; 221 | } 222 | }; 223 | 224 | 225 | 226 | 227 | struct FnBase { 228 | virtual Storage call(unique_ptr ib) = 0; 229 | virtual ::capnp::Data::Reader to_reader() = 0; 230 | virtual ~FnBase() = default; 231 | }; 232 | 233 | 234 | template 235 | void serialize(const T& v, vector& bytes); 236 | 237 | /// The wrapped function must have 238 | /// no reference / 239 | /// environment dependency / 240 | /// non-trivial constructible states 241 | /// (value semantics) 242 | template 243 | struct FnWrapper : FnBase { 244 | F f; 245 | using FuncSig = function_traits; 246 | // NOTE: R should be serializable and call serial() function 247 | using R = typename FuncSig::result_type; 248 | // single input, unique_ptr> 249 | using T = typename decay_t::type>::element_type; 250 | FnWrapper(F f_) : f{move(f_)} {}; 251 | 252 | // input type could be any, but output type should be serializable 253 | Storage call(unique_ptr ib) override { 254 | unique_ptr iter = dynamic_unique_ptr_cast(move(ib)); 255 | auto result = f(move(iter)); 256 | vector bytes; 257 | serialize(result, bytes); 258 | return Storage{move(bytes)}; 259 | } 260 | ::capnp::Data::Reader to_reader() override { 261 | return {reinterpret_cast(this), sizeof(FnWrapper)}; 262 | } 263 | }; 264 | 265 | template 266 | FnWrapper(F&&) -> FnWrapper; 267 | 268 | FnBase* fn_from_reader(::capnp::Data::Reader reader); 269 | 270 | template 271 | void serialize(const T& v, vector& bytes) { 272 | SerialGuard gd{bytes}; 273 | gd << v; 274 | } 275 | 276 | template 277 | void deserialize(T& v, const char* bytes, size_t size) { 278 | DeserialGuard gd{bytes, size}; 279 | gd >> v; 280 | } 281 | 282 | template 283 | void deserialize(T& v, ::capnp::Data::Reader reader) { 284 | DeserialGuard gd{reinterpret_cast(reader.asBytes().begin()), reader.size()}; 285 | gd >> v; 286 | } 287 | 288 | 289 | 290 | 291 | 
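Putting the pieces of this header together: an `OwnIterator` feeds a callable wrapped in `FnWrapper`, which serializes the callable's result into `Storage`, the same path `ResultTask::run` takes. A minimal sketch, assuming `Storage::v` is `vector<char>` and the stripped deduction guide deduces `FnWrapper` over the decayed callable type:

```cpp
#include <iostream>
#include "serialize_wrapper.hpp"

int main() {
    auto it = std::make_unique<OwnIterator<int>>(std::vector<int>{1, 2, 3, 4});
    FnWrapper fn{[](std::unique_ptr<Iterator<int>> iter) {
        int acc = 0;
        while (iter->hasNext()) acc += iter->next().value();
        return acc;                            // the action: a sum
    }};
    Storage s = fn.call(std::move(it));        // boost-serialized `10`
    int out = 0;
    deserialize(out, s.v.data(), s.v.size());  // round-trip the result
    std::cout << out << '\n';                  // prints 10
}
```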
292 | 293 | 294 | 295 | 296 | 297 | #endif //SPARKPP_SERIALIZE_WRAPPER_HPP 298 | -------------------------------------------------------------------------------- /include/shuffle_fetcher.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/14/2019. 3 | // 4 | 5 | #ifndef SPARKPP_SHUFFLE_FETCHER_HPP 6 | #define SPARKPP_SHUFFLE_FETCHER_HPP 7 | 8 | #include "common.hpp" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | namespace beast = boost::beast; 20 | namespace http = beast::http; 21 | namespace net = boost::asio; 22 | using tcp = net::ip::tcp; 23 | 24 | template 25 | struct GetType { 26 | using trait = function_traits; 27 | using P = typename trait::template args<0>::type; 28 | using K = typename P::first_type; 29 | using V = typename P::second_type; 30 | }; 31 | 32 | 33 | struct ParallelShuffleFetcher { 34 | template 35 | void fetch(size_t shuffleId, size_t reduceId, F&& func); 36 | }; 37 | 38 | 39 | 40 | #endif //SPARKPP_SHUFFLE_FETCHER_HPP 41 | -------------------------------------------------------------------------------- /include/shuffle_manager.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/14/2019. 3 | // 4 | 5 | #ifndef SPARKPP_SHUFFLE_MANAGER_HPP 6 | #define SPARKPP_SHUFFLE_MANAGER_HPP 7 | 8 | #include "common.hpp" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | 19 | namespace beast = boost::beast; 20 | namespace http = beast::http; 21 | namespace net = boost::asio; 22 | using tcp = net::ip::tcp; 23 | 24 | struct ShuffleManager { 25 | string localDir; 26 | string shuffleDir; 27 | string serverUri; 28 | ShuffleManager() : localDir{"/tmp/sparkpp"}, shuffleDir{"/tmp/sparkpp/shuffle"} { 29 | fs::create_directories(localDir); 30 | fs::create_directories(shuffleDir); 31 | char* localIp = std::getenv("SPARK_LOCAL_IP"); 32 | serverUri = fmt::format("{}", localIp); 33 | thread thd{[localDir = localDir]() { 34 | uint16_t port = 28001; 35 | net::io_context ioc; 36 | tcp::acceptor acceptor{ioc, {ip::tcp::v4(), port}}; 37 | while (true) { 38 | tcp::socket socket{ioc}; 39 | acceptor.accept(socket); 40 | thread per_conn([localDir = localDir, socket = move(socket)]() mutable { 41 | beast::flat_buffer buffer; 42 | beast::error_code ec; 43 | http::request req; 44 | http::read(socket, buffer, req, ec); 45 | // TODO: replace this to in-memory shuffle cache? 46 | string path = localDir + req.target().to_string(); 47 | http::file_body::value_type body; 48 | body.open(path.c_str(), beast::file_mode::scan, ec); 49 | http::response res{ 50 | std::piecewise_construct, 51 | make_tuple(move(body)), 52 | make_tuple(http::status::ok, req.version()) 53 | }; 54 | res.set(http::field::server, BOOST_BEAST_VERSION_STRING); 55 | http::write(socket, res); 56 | socket.shutdown(tcp::socket::shutdown_both, ec); 57 | }); 58 | per_conn.detach(); 59 | } 60 | }}; 61 | thd.detach(); 62 | } 63 | 64 | 65 | }; 66 | 67 | 68 | #endif //SPARKPP_SHUFFLE_MANAGER_HPP 69 | -------------------------------------------------------------------------------- /include/spark_context.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/5/2019. 
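One operational detail worth calling out about `ShuffleManager` above: it reads `SPARK_LOCAL_IP` at construction (the variable must be set, or `localIp` is null) and serves files under `/tmp/sparkpp` on the hard-coded port 28001, which is exactly the layout `ShuffleDependency::runShuffle` writes and `ParallelShuffleFetcher` requests. A setup sketch; the IP is a placeholder:

```cpp
#include <cstdlib>
#include <unistd.h>
#include "shuffle_manager.hpp"

int main() {
    // Must be exported before the manager is constructed.
    setenv("SPARK_LOCAL_IP", "10.0.0.5", /*overwrite=*/1);
    ShuffleManager sm;  // spawns the detached HTTP accept loop on port 28001
    // Files written to /tmp/sparkpp/shuffle/<shuffle>.<map>.<reduce> are now
    // reachable as GET http://10.0.0.5:28001/shuffle/<shuffle>.<map>.<reduce>
    for (;;) pause();   // keep the process alive for the detached server
}
```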
3 | // 4 | 5 | #ifndef SPARKPP_SPARK_CONTEXT_HPP 6 | #define SPARKPP_SPARK_CONTEXT_HPP 7 | 8 | #include 9 | #include 10 | 11 | 12 | #include "common.hpp" 13 | #include "executor.hpp" 14 | #include "spark_env.hpp" 15 | #include "scheduler/dag_scheduler.hpp" 16 | #include "rdd/parallel_collection.hpp" 17 | 18 | enum class SparkContextType { 19 | Local, 20 | Distributed, 21 | NumOfSparkContextType 22 | }; 23 | 24 | enum class SparkDistributeType { 25 | Master, 26 | Slave, 27 | NumOfSparkDistributeType 28 | }; 29 | 30 | struct SparkConfig { 31 | SparkContextType mode; 32 | SparkDistributeType type; 33 | uint16_t port; 34 | vector> addr; 35 | SparkConfig( 36 | SparkContextType mode_, 37 | SparkDistributeType type_, 38 | uint16_t port_, 39 | vector> addr_ 40 | ) : mode{mode_}, type{type_}, port{port_}, addr{move(addr_)} {} 41 | }; 42 | 43 | struct SparkContext { 44 | std::atomic nextRddId{0}; 45 | std::atomic nextShuffleId{0}; 46 | SparkConfig config; 47 | vector> address; 48 | DAGScheduler scheduler; 49 | // scheduler 50 | 51 | size_t newRddId() { 52 | return nextRddId.fetch_add(1); 53 | } 54 | size_t newShuffleId() { 55 | return nextShuffleId.fetch_add(1); 56 | } 57 | 58 | // TODO: make this external configuration 59 | SparkConfig getConfig(char** argv, addr_t masterAddr, vector slaveAddrs) { 60 | auto ty = string{argv[1]}; 61 | if (ty == "master") { 62 | return SparkConfig{ 63 | SparkContextType::Distributed, 64 | SparkDistributeType::Master, 65 | masterAddr.second, 66 | move(slaveAddrs) 67 | }; 68 | } else { 69 | return SparkConfig{ 70 | SparkContextType::Distributed, 71 | SparkDistributeType::Slave, 72 | slaveAddrs[0].second, 73 | {move(masterAddr)} 74 | }; 75 | } 76 | } 77 | 78 | SparkContext([[maybe_unused]] int argc, char** argv, addr_t masterAddr, vector slaveAddrs) 79 | : config{getConfig(argv, move(masterAddr), move(slaveAddrs))}, scheduler{config.addr} { 80 | switch (config.mode) { 81 | case SparkContextType::Distributed: { 82 | switch (config.type) { 83 | case SparkDistributeType::Master: { 84 | address = move(config.addr); 85 | break; 86 | } 87 | case SparkDistributeType::Slave: { 88 | auto executor = Executor{config.addr[0], config.port}; 89 | executor.run(); 90 | exit(0); 91 | } 92 | default: __builtin_unreachable(); 93 | } 94 | break; 95 | } 96 | case SparkContextType::Local: { 97 | break; 98 | } 99 | default: __builtin_unreachable(); 100 | } 101 | } 102 | 103 | template 104 | ParallelCollection parallelize(vector data, size_t numSlices = 8) { 105 | return ParallelCollection{*this, move(data), numSlices}; 106 | } 107 | 108 | template ::result_type> 109 | vector runJob(RDD* rdd, F&& f, const vector& partitions) { 110 | return scheduler.runJob(forward(f), rdd, partitions); 111 | } 112 | 113 | template ::result_type> 114 | vector runJob(RDD* rdd, F&& f) { 115 | vector partitions(rdd->numOfSplits()); 116 | std::iota(partitions.begin(), partitions.end(), 0); 117 | return scheduler.runJob(forward(f), rdd, partitions); 118 | } 119 | }; 120 | 121 | 122 | // HACK: weird solution for circular import dependency 123 | // FIXME: to avoid huge amount of dynamic dispatchs (1 task, 1 rdd, 1 action, N iterator) 124 | // each actions should be builtin-function of iterators 125 | // where `next()` & `hasNext()` should be non-virtual 126 | // but `collect()`, `reduce()`, `count()` should be virtual 127 | template 128 | template 129 | T RDD::reduce(F&& f) { 130 | auto rf = [f = f](unique_ptr> iter) mutable { 131 | T acc{}; 132 | while (iter->hasNext()) { 133 | acc = std::invoke(forward(f), acc, 
move(iter->next().value())); 134 | } 135 | return acc; 136 | }; 137 | auto result = sc.runJob(this, move(rf)); 138 | T acc{}; 139 | for (auto&& r : result) { 140 | acc = std::invoke(forward(f), acc, move(r)); 141 | } 142 | return acc; 143 | } 144 | 145 | 146 | template 147 | vector DAGScheduler::runJob(F &&func, RDD *finalRdd, const vector& partitions) { 148 | // using U = typename function_traits::result_type; 149 | size_t numFinished = 0; 150 | unordered_set waiting; 151 | unordered_set running; 152 | unordered_set failed; 153 | unordered_map> pendingTasks; 154 | [[maybe_unused]] size_t lastFetchFailureTime = 0; 155 | size_t runId = nextRunId.fetch_add(1); 156 | size_t numOutputParts = partitions.size(); 157 | auto finalStage = newStage(finalRdd, {}); 158 | vector results(numOutputParts); 159 | vector finished(numOutputParts); 160 | FnWrapper funcWrapper{forward(func)}; 161 | 162 | updateCacheLocs(); 163 | eventQueues[runId]; 164 | 165 | submitStage(runId, finalRdd, &funcWrapper, 166 | pendingTasks, partitions, finished, finalStage.get(), waiting, running, finalStage.get()); 167 | while (numFinished != numOutputParts) { 168 | CompletionEvent event; 169 | bool v = eventQueues[runId].wait_dequeue_timed(event, 500ms); 170 | 171 | if (v) { 172 | auto stage = idToStage[event.task->stage_id()]; 173 | pendingTasks[stage.get()].erase(event.task->task_id()); 174 | match(event.reason.get(), 175 | [&](const TaskEndReason::Success&) { 176 | if (auto rt = dynamic_cast(event.task.get())) { 177 | U result; 178 | deserialize(result, event.result.to_reader()); 179 | results[rt->outputId] = move(result); 180 | finished[rt->outputId] = true; 181 | numFinished += 1; 182 | } else { 183 | auto smt = dynamic_cast(event.task.get()); 184 | auto stage = idToStage[smt->stageId]; 185 | string result{event.result.v.begin(), event.result.v.end()}; 186 | stage->addOutputLoc(smt->partition, result); 187 | if (running.count(stage.get()) && pendingTasks[stage.get()].empty()) { 188 | running.erase(stage.get()); 189 | if (stage->shuffleDep.is_initialized()) { 190 | vector locs; 191 | for (auto& v: stage->outputLocs) { 192 | locs.push_back(v[0]); 193 | } 194 | env.mapOutputTracker->registerMapOutputs( 195 | stage->shuffleDep.value()->shuffle_id(), locs); 196 | } 197 | updateCacheLocs(); 198 | vector newlyRunnable; 199 | for (auto& s : waiting) { 200 | if (getMissingParentStages(*s).empty()) { 201 | newlyRunnable.push_back(s); 202 | } 203 | } 204 | for (auto& s : newlyRunnable) { 205 | waiting.erase(s); 206 | running.insert(s); 207 | } 208 | for (auto& s : newlyRunnable) { 209 | submitMissingTasks( 210 | runId, finalRdd, &funcWrapper, 211 | pendingTasks, partitions, finished, 212 | s, finalStage.get()); 213 | } 214 | } 215 | } 216 | }, 217 | [&]([[maybe_unused]] const TaskEndReason::FetchFailed& f) { 218 | // TODO: handle failure 219 | ; 220 | } 221 | ); 222 | } 223 | // TODO: handle resubmit timeout 224 | if (!failed.empty()) { 225 | updateCacheLocs(); 226 | for (auto ps : failed) { 227 | submitStage(runId, finalRdd, &funcWrapper, 228 | pendingTasks, partitions, finished, finalStage.get(), waiting, running, ps); 229 | } 230 | failed.clear(); 231 | } 232 | } 233 | eventQueues.erase(runId); 234 | return results; 235 | } 236 | 237 | 238 | template 239 | RDD::RDD(SparkContext &sc_) : sc{sc_} { 240 | m_id = sc.newRddId(); 241 | } 242 | 243 | template 244 | Iterator* RDD::iterator_impl(unique_ptr split) { 245 | if (shouldCache) { 246 | auto p = env.cacheTracker->getOrCompute(this, move(split)); 247 | return p.release(); 248 | } 
else { 249 | auto p = compute(move(split)); 250 | return p.release(); 251 | } 252 | } 253 | 254 | template 255 | vector RDD::collect() { 256 | auto cf = [](unique_ptr> iter) { 257 | return iter->collect(); 258 | }; 259 | auto v = sc.runJob(this, move(cf)); 260 | return flatten(move(v)); 261 | } 262 | 263 | template 264 | size_t RDD::count() { 265 | auto cf = [](unique_ptr> iter) { 266 | return iter->count(); 267 | }; 268 | auto counts = sc.runJob(this, move(cf)); 269 | size_t cnt = 0; 270 | for (auto i : counts) { 271 | cnt += i; 272 | } 273 | return cnt; 274 | } 275 | 276 | template 277 | unique_ptr>> ShuffleRDD::compute(unique_ptr split) { 278 | unordered_map combiners; 279 | auto mergePair = [&](pair c) { 280 | if (!combiners.count(c.first)) { 281 | combiners.insert(move(c)); 282 | } else { 283 | using mc_t = C(*)(C, C); 284 | auto f_mc = reinterpret_cast(aggregator->mergeCombiners()); 285 | combiners[c.first] = f_mc(move(combiners[c.first]), move(c.second)); 286 | } 287 | }; 288 | env.shuffleFetcher->fetch(dep.shuffleId, split->m_index, move(mergePair)); 289 | return make_unique>(move(combiners)); 290 | } 291 | 292 | 293 | #include 294 | #include 295 | 296 | 297 | template 298 | string ShuffleDependency::runShuffle(RDDBase* rdd, unique_ptr split, size_t partition) { 299 | size_t numOutputSplits = partitioner->numPartitions(); 300 | vector> buckets{numOutputSplits}; 301 | auto* rddkv = dynamic_cast>*>(rdd); 302 | auto iter = rddkv->compute(move(split)); 303 | while (true) { 304 | auto s = iter->next(); 305 | if (!s.is_initialized()) { 306 | break; 307 | } 308 | auto p = move(s.value()); 309 | auto bucketId = partitioner->getPartition(p.first); 310 | auto& bucket = buckets[bucketId]; 311 | auto b_iter = bucket.find(p.first); 312 | if (b_iter == bucket.end()) { 313 | using fcc_t = C(*)(V); 314 | auto f_cc = reinterpret_cast(aggregator->createCombiner()); 315 | bucket[move(p.first)] = f_cc(move(p.second)); 316 | } else { 317 | using mv_t = C(*)(C, V); 318 | auto f_mv = reinterpret_cast(aggregator->mergeValue()); 319 | b_iter->second = f_mv(move(b_iter->second), move(p.second)); 320 | } 321 | } 322 | for (size_t i = 0; i < numOutputSplits; ++i) { 323 | string file_path = fmt::format("/tmp/sparkpp/shuffle/{}.{}.{}", shuffleId, partition, i); 324 | std::ofstream ofs{file_path, std::ofstream::binary}; 325 | boost::archive::binary_oarchive ar{ofs, boost::archive::no_header | boost::archive::no_tracking}; 326 | vector> v{ 327 | std::make_move_iterator(buckets[i].begin()), 328 | std::make_move_iterator(buckets[i].end()) 329 | }; 330 | ar << v; 331 | } 332 | return env.shuffleManager->serverUri; 333 | } 334 | 335 | template 336 | void ShuffleDependency::serialize_dyn(vector &bytes) const { 337 | size_t oldSize = bytes.size(); 338 | bytes.resize(oldSize + sizeof(ShuffleDependency)); 339 | memcpy(bytes.data() + oldSize, reinterpret_cast(this), sizeof(ShuffleDependency)); 340 | partitioner->serialize_dyn(bytes); 341 | aggregator->serialize_dyn(bytes); 342 | } 343 | 344 | 345 | template 346 | void ShuffleDependency::deserialize_dyn(const char*& bytes, size_t& size) { 347 | bytes += sizeof(ShuffleDependency); 348 | size -= sizeof(ShuffleDependency); 349 | partitioner = reinterpret_cast(const_cast(bytes)); 350 | partitioner->deserialize_dyn(bytes, size); 351 | aggregator = reinterpret_cast(const_cast(bytes)); 352 | aggregator->deserialize_dyn(bytes, size); 353 | } 354 | 355 | 356 | 357 | 358 | 359 | #endif //SPARKPP_SPARK_CONTEXT_HPP 360 | 
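With spark_context.hpp complete, a whole driver program reduces to a few lines. A sketch in the spirit of src/main.cpp, assuming the same launch convention (argv[1] is "master" or "slave") and placeholder addresses:

```cpp
#include <iostream>
#include "spark_env.hpp"
#include "spark_context.hpp"

SparkEnv env;  // the single global environment, as in src/main.cpp

int main(int argc, char** argv) {
    addr_t master = make_pair("192.168.0.10", 25544);   // placeholder IPs
    vector<addr_t> slaves = {{"192.168.0.11", 24457}};
    env.init(argc, argv, master);
    SparkContext sc{argc, argv, master, slaves};        // slaves run an Executor and exit here
    auto rdd = sc.parallelize(vector<int>{1, 2, 3, 4, 5}, 2);
    std::cout << rdd.reduce([](int a, int b) { return a + b; })  // 15
              << ' ' << rdd.count() << '\n';                     // 5
    return 0;
}
```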
-------------------------------------------------------------------------------- /include/spark_env.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/13/2019. 3 | // 4 | 5 | #ifndef SPARKPP_SPARK_ENV_HPP 6 | #define SPARKPP_SPARK_ENV_HPP 7 | 8 | #include "common.hpp" 9 | #include "cache.hpp" 10 | #include "cache_tracker.hpp" 11 | #include "map_output_tracker.hpp" 12 | #include "shuffle_fetcher.hpp" 13 | #include "shuffle_manager.hpp" 14 | 15 | struct SparkEnv { 16 | BoundedMemoryCache cache; 17 | unique_ptr mapOutputTracker; 18 | unique_ptr cacheTracker; 19 | unique_ptr shuffleFetcher; 20 | unique_ptr shuffleManager; 21 | void init(int argc, char** argv, const addr_t& masterAddr); 22 | }; 23 | 24 | extern SparkEnv env; 25 | 26 | 27 | template 28 | void ParallelShuffleFetcher::fetch(size_t shuffleId, size_t reduceId, F &&func) { 29 | using K = typename GetType::K; 30 | using V = typename GetType::V; 31 | auto uris = env.mapOutputTracker->getServerUris(shuffleId); 32 | unordered_map> inputsByUri; 33 | size_t totalResults = 0; 34 | for (size_t i = 0; i < uris.size(); ++i) { 35 | inputsByUri[uris[i]].push_back(i); 36 | } 37 | // # of threads fetching blocks 38 | size_t parallelFetches = 4; 39 | BlockingConcurrentQueue>> serverQueue; 40 | for (const auto& [k, v] : inputsByUri) { 41 | serverQueue.enqueue(make_pair(k, v)); 42 | totalResults += v.size(); 43 | } 44 | BlockingConcurrentQueue>> resultQueue; 45 | net::thread_pool pool{parallelFetches}; 46 | for (size_t i = 0; i < parallelFetches; ++i) { 47 | post(pool, [&]() { 48 | while (true) { 49 | pair> p; 50 | auto found = serverQueue.try_dequeue(p); 51 | if (!found) { 52 | break; 53 | } 54 | auto& host = p.first; 55 | auto& ids = p.second; 56 | for (int inputId : ids) { 57 | string target = fmt::format("/shuffle/{}.{}.{}", shuffleId, inputId, reduceId); 58 | 59 | net::io_context ioc; 60 | tcp::resolver resolver{ioc}; 61 | beast::tcp_stream stream{ioc}; 62 | tcp::resolver::query query{host, "28001", 63 | boost::asio::ip::resolver_query_base::numeric_service}; 64 | const auto results = resolver.resolve(query); 65 | stream.connect(results); 66 | 67 | http::request req{ 68 | http::verb::get, target, 11 69 | }; 70 | req.set(http::field::host, p.first); 71 | req.set(http::field::user_agent, BOOST_BEAST_VERSION_STRING); 72 | http::write(stream, req); 73 | 74 | beast::flat_buffer buffer; 75 | http::response res; 76 | http::read(stream, buffer, res); 77 | beast::error_code ec; 78 | stream.socket().shutdown(tcp::socket::shutdown_both, ec); 79 | vector> data; 80 | // FIXME: lots of copies here 81 | string body{ 82 | boost::asio::buffers_begin(res.body().data()), 83 | boost::asio::buffers_end(res.body().data()) 84 | }; 85 | std::stringstream ss{move(body)}; 86 | boost::archive::binary_iarchive ia{ss, boost::archive::no_header | boost::archive::no_tracking}; 87 | ia >> data; 88 | resultQueue.enqueue(move(data)); 89 | } 90 | } 91 | }); 92 | } 93 | size_t resultDone = 0; 94 | while (resultDone < totalResults) { 95 | vector> result; 96 | resultQueue.wait_dequeue(result); 97 | for (auto&& p : result) { 98 | invoke(forward(func), move(p)); 99 | } 100 | resultDone += 1; 101 | } 102 | } 103 | 104 | 105 | #endif //SPARKPP_SPARK_EMV_HPP 106 | -------------------------------------------------------------------------------- /include/split.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/13/2019. 
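`ParallelShuffleFetcher::fetch` above is consumed through a per-pair callback, which is how `ShuffleRDD::compute` (spark_context.hpp) drives the reduce side. A sketch with placeholder key/combiner types:

```cpp
#include <string>
#include <unordered_map>
#include "spark_env.hpp"

// Merge every fetched (key, combiner) pair for one reduce partition.
void reduceSide(size_t shuffleId, size_t reduceId) {
    std::unordered_map<std::string, int> combiners;
    env.shuffleFetcher->fetch(shuffleId, reduceId,
        [&](std::pair<std::string, int> kv) {
            combiners[kv.first] += kv.second;  // stand-in for mergeCombiners
        });
}
```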
3 | // 4 | 5 | #ifndef SPARKPP_SPLIT_HPP 6 | #define SPARKPP_SPLIT_HPP 7 | 8 | #include "common.hpp" 9 | 10 | struct Split { 11 | size_t m_index; 12 | Split(size_t idx) : m_index{idx} {} 13 | virtual size_t index() { 14 | return m_index; 15 | } 16 | }; 17 | 18 | 19 | 20 | #endif //SPARKPP_SPLIT_HPP 21 | -------------------------------------------------------------------------------- /include/utils/event_loop.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/12/2019. 3 | // 4 | 5 | #ifndef SPARKPP_EVENT_LOOP_HPP 6 | #define SPARKPP_EVENT_LOOP_HPP 7 | 8 | #include 9 | #include 10 | #include 11 | #include "concurrentqueue/blockingconcurrentqueue.h" 12 | using moodycamel::BlockingConcurrentQueue; 13 | 14 | // DefaultConstructible 15 | template 16 | struct EventLoop { 17 | BlockingConcurrentQueue eventQueue; 18 | std::atomic stopped; 19 | std::thread thd; 20 | void onStart() {} 21 | void onStop() {} 22 | 23 | void onReceive(E event) {} 24 | void onError(std::exception_ptr) {} 25 | 26 | void start() { 27 | onStart(); 28 | thd = std::thread{[this]() { 29 | while (!stopped.load()) { 30 | try { 31 | E e; 32 | eventQueue.wait_dequeue(e); 33 | onReceive(e); 34 | } catch(...) { 35 | onError(std::current_exception()); 36 | } 37 | } 38 | }}; 39 | thd.detach(); 40 | } 41 | 42 | void stop() { 43 | bool v = false; 44 | if (stopped.compare_exchange_strong(v, true)) { 45 | thd.join(); 46 | } 47 | } 48 | }; 49 | 50 | #endif //SPARKPP_EVENT_LOOP_HPP 51 | -------------------------------------------------------------------------------- /include/utils/function_signature.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/10/2019. 3 | // 4 | 5 | #ifndef SPARKPP_FUNCTION_SIGNATURE_HPP 6 | #define SPARKPP_FUNCTION_SIGNATURE_HPP 7 | 8 | template 9 | struct function_traits; 10 | 11 | //simple function 12 | template 13 | struct function_traits { 14 | enum { 15 | args_size = sizeof...(Args) 16 | }; 17 | typedef R type(Args...); 18 | using result_type = R; 19 | using pointer = R(*)(Args...); 20 | using function_type = R(Args...); 21 | using stl_function_type = std::function; 22 | 23 | template 24 | struct args { 25 | using type = typename std::tuple_element>::type; 26 | }; 27 | 28 | }; 29 | 30 | // function pointer 31 | template 32 | struct function_traits : public function_traits { 33 | using type = R(*)(Args...); 34 | }; 35 | 36 | //const, volatile specialization member function 37 | template 38 | struct function_traits : public function_traits { 39 | using class_type = C; 40 | using type = R(C::*)(Args...); 41 | }; 42 | 43 | template 44 | struct function_traits : public function_traits { 45 | using class_type = C; 46 | using type = R(C::*)(Args...); 47 | }; 48 | 49 | // function pointer 50 | template 51 | struct function_traits : public function_traits { 52 | using type = R(*)(Args...); 53 | }; 54 | 55 | //const, volatile specialization member function 56 | template 57 | struct function_traits : public function_traits { 58 | using class_type = C; 59 | using type = R(C::*)(Args...); 60 | }; 61 | 62 | template 63 | struct function_traits : public function_traits { 64 | using class_type = C; 65 | using type = R(C::*)(Args...); 66 | }; 67 | 68 | //std::function 69 | template 70 | struct function_traits> : public function_traits { 71 | using type = std::function; 72 | }; 73 | 74 | //function object / functor / lambda 75 | template 76 | struct function_traits : public 
function_traits::operator())> { 77 | using type = decltype(&std::decay_t::operator()); 78 | }; 79 | 80 | #endif //SPARKPP_FUNCTION_SIGNATURE_HPP 81 | -------------------------------------------------------------------------------- /include/utils/macros.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/15/2019. 3 | // 4 | 5 | #ifndef SPARKPP_MACROS_HPP 6 | #define SPARKPP_MACROS_HPP 7 | 8 | #define MACRO_EXPAND(...) __VA_ARGS__ 9 | 10 | #define MACRO_CONCAT_IMPL(A, B) A##_##B 11 | #define MACRO_CONCAT(A, B) MACRO_CONCAT_IMPL(A, B) 12 | 13 | #define SN_REVERSE_SEQ_N() \ 14 | 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10 \ 15 | 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 16 | 17 | #define SN_SEQ_N( \ 18 | _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, \ 19 | _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, N, ...) N 20 | 21 | #define SN_GET_ARG_IMPL(...) MACRO_EXPAND(SN_SEQ_N(__VA_ARGS__)) 22 | #define SN_GET_ARG_N(...) SN_GET_ARG_IMPL(__VA_ARGS__, SN_REVERSE_SEQ_N()) 23 | 24 | 25 | #define SN_REGISTER_ARG_LIST_1(op, arg, ...) op(arg) 26 | #define SN_REGISTER_ARG_LIST_2(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_1(op, __VA_ARGS__)) 27 | #define SN_REGISTER_ARG_LIST_3(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_2(op, __VA_ARGS__)) 28 | #define SN_REGISTER_ARG_LIST_4(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_3(op, __VA_ARGS__)) 29 | #define SN_REGISTER_ARG_LIST_5(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_4(op, __VA_ARGS__)) 30 | #define SN_REGISTER_ARG_LIST_6(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_5(op, __VA_ARGS__)) 31 | #define SN_REGISTER_ARG_LIST_7(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_6(op, __VA_ARGS__)) 32 | #define SN_REGISTER_ARG_LIST_8(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_7(op, __VA_ARGS__)) 33 | #define SN_REGISTER_ARG_LIST_9(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_8(op, __VA_ARGS__)) 34 | #define SN_REGISTER_ARG_LIST_10(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_9(op, __VA_ARGS__)) 35 | #define SN_REGISTER_ARG_LIST_11(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_10(op, __VA_ARGS__)) 36 | #define SN_REGISTER_ARG_LIST_12(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_11(op, __VA_ARGS__)) 37 | #define SN_REGISTER_ARG_LIST_13(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_12(op, __VA_ARGS__)) 38 | #define SN_REGISTER_ARG_LIST_14(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_13(op, __VA_ARGS__)) 39 | #define SN_REGISTER_ARG_LIST_15(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_14(op, __VA_ARGS__)) 40 | #define SN_REGISTER_ARG_LIST_16(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_15(op, __VA_ARGS__)) 41 | #define SN_REGISTER_ARG_LIST_17(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_16(op, __VA_ARGS__)) 42 | #define SN_REGISTER_ARG_LIST_18(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_17(op, __VA_ARGS__)) 43 | #define SN_REGISTER_ARG_LIST_19(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_18(op, __VA_ARGS__)) 44 | #define SN_REGISTER_ARG_LIST_20(op, arg, ...) op(arg), MACRO_EXPAND(SN_REGISTER_ARG_LIST_19(op, __VA_ARGS__)) 45 | 46 | #define SN_REGISTER_ID(ID) ID 47 | 48 | #define SN_REGISTER_ARG_LIST(N, op, arg, ...) \ 49 | MACRO_CONCAT(SN_REGISTER_ARG_LIST, N)(op, arg, __VA_ARGS__) 50 | 51 | #define SN_SEPERATOR ; 52 | #define SN_CONCAT_INIT_LIST_1(element, ...) 
ar & element 53 | #define SN_CONCAT_INIT_LIST_2(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_1(__VA_ARGS__)) 54 | #define SN_CONCAT_INIT_LIST_3(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_2(__VA_ARGS__)) 55 | #define SN_CONCAT_INIT_LIST_4(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_3(__VA_ARGS__)) 56 | #define SN_CONCAT_INIT_LIST_5(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_4(__VA_ARGS__)) 57 | #define SN_CONCAT_INIT_LIST_6(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_5(__VA_ARGS__)) 58 | #define SN_CONCAT_INIT_LIST_7(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_6(__VA_ARGS__)) 59 | #define SN_CONCAT_INIT_LIST_8(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_7(__VA_ARGS__)) 60 | #define SN_CONCAT_INIT_LIST_9(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_8(__VA_ARGS__)) 61 | #define SN_CONCAT_INIT_LIST_10(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_9(__VA_ARGS__)) 62 | #define SN_CONCAT_INIT_LIST_11(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_10(__VA_ARGS__)) 63 | #define SN_CONCAT_INIT_LIST_12(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_11(__VA_ARGS__)) 64 | #define SN_CONCAT_INIT_LIST_13(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_12(__VA_ARGS__)) 65 | #define SN_CONCAT_INIT_LIST_14(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_13(__VA_ARGS__)) 66 | #define SN_CONCAT_INIT_LIST_15(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_14(__VA_ARGS__)) 67 | #define SN_CONCAT_INIT_LIST_16(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_15(__VA_ARGS__)) 68 | #define SN_CONCAT_INIT_LIST_17(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_16(__VA_ARGS__)) 69 | #define SN_CONCAT_INIT_LIST_18(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_17(__VA_ARGS__)) 70 | #define SN_CONCAT_INIT_LIST_19(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_18(__VA_ARGS__)) 71 | #define SN_CONCAT_INIT_LIST_20(element, ...) ar & element SN_SEPERATOR MACRO_EXPAND(SN_CONCAT_INIT_LIST_19(__VA_ARGS__)) 72 | 73 | #define SN_BOOST_SERIALIZE_MEMBERS_IMPL(N, ...) MACRO_EXPAND(MACRO_CONCAT(SN_CONCAT_INIT_LIST, N)(__VA_ARGS__)) 74 | 75 | #define SN_BOOST_SERIALIZE_MEMBERS_IN(...) \ 76 | friend class boost::serialization::access; \ 77 | template \ 78 | void serialize(Archive& ar, const unsigned int) { \ 79 | SN_BOOST_SERIALIZE_MEMBERS_IMPL(SN_GET_ARG_N(__VA_ARGS__), __VA_ARGS__); \ 80 | } \ 81 | 82 | #define SN_BOOST_SERIALIZE_EMPTY() \ 83 | friend class boost::serialization::access; \ 84 | template \ 85 | void serialize(Archive&, const unsigned int) {} \ 86 | 87 | 88 | 89 | 90 | 91 | #endif //SPARKPP_MACROS_HPP 92 | -------------------------------------------------------------------------------- /include/utils/match.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/12/2019. 
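The macro ladder in macros.hpp above exists to stamp out a boost::serialization member function from a plain member list. One thing to double-check in the original: `SN_REVERSE_SEQ_N` appears to repeat `10` across its line break with a missing comma, which would break the argument count for more than ten members. A sketch of the intended use on a hypothetical struct:

```cpp
#include <cstdint>
#include <string>
#include <boost/serialization/access.hpp>
#include "utils/macros.hpp"

struct HostPort {
    std::string host;
    uint16_t port;
    // Expands to:
    //   friend class boost::serialization::access;
    //   template <class Archive>
    //   void serialize(Archive& ar, const unsigned int) { ar & host; ar & port; }
    SN_BOOST_SERIALIZE_MEMBERS_IN(host, port)
};
```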
3 | // 4 | 5 | #ifndef SPARKPP_MATCH_BASE_HPP 6 | #define SPARKPP_MATCH_BASE_HPP 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "utils/serde.hpp" 14 | #include "utils/function_signature.hpp" 15 | #include 16 | #include 17 | 18 | template 19 | struct TypeList {}; 20 | 21 | template 22 | struct TypeAt; 23 | 24 | template 25 | struct TypeAt, N> { 26 | using type = void; 27 | }; 28 | 29 | template 30 | struct TypeAt, N> { 31 | using type = std::conditional_t, N - 1>::type>; 32 | }; 33 | 34 | template 35 | using TypeAt_t = typename TypeAt::type; 36 | 37 | template 38 | struct TypeIndex { 39 | constexpr static const int value = -1; 40 | }; 41 | 42 | template 43 | struct TypeIndex, ST> { 44 | constexpr static const int value = 0; 45 | }; 46 | 47 | template 48 | struct TypeIndex, ST> { 49 | constexpr static const int value = (TypeIndex, ST>::value == -1) ? -1 : 1 + (TypeIndex, ST>::value); 50 | }; 51 | 52 | template 53 | constexpr std::size_t TypeIndex_v = TypeIndex::value; 54 | 55 | template struct overloaded : Ts... { 56 | using Ts::operator()...; 57 | }; 58 | template overloaded(Ts...) -> overloaded; 59 | 60 | template 61 | struct GetIndex; 62 | 63 | template 64 | struct GetIndex, F> { 65 | using arg_t = std::decay_t::template args<0>::type>; 66 | constexpr static const int value = TypeIndex_v, arg_t>; 67 | }; 68 | 69 | /// Fallback option 70 | template ::result_type> 71 | R match(VT&& v, F&& f) { 72 | using GI = GetIndex, std::decay_t>; 73 | using T = typename GI::arg_t; 74 | return std::invoke(std::forward(f), std::forward(boost::get(std::forward(v)))); 75 | } 76 | 77 | /// We need to dynamically match serialized variant, not statically call visitors 78 | template ::result_type> 79 | R match(VT&& v, F&& f, Fs&&... fs) { 80 | using GI = GetIndex, std::decay_t>; 81 | using T = typename GI::arg_t; 82 | int index = GI::value; 83 | if (v.which() == index) { 84 | return std::invoke(std::forward(f), std::forward(boost::get(std::forward(v)))); 85 | } 86 | return match(std::forward(v), std::forward(fs)...); 87 | } 88 | 89 | #endif //SPARKPP_MATCH_BASE_HPP 90 | -------------------------------------------------------------------------------- /include/utils/pair_hash.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/13/2019. 3 | // 4 | 5 | #ifndef SPARKPP_PAIR_HASH_HPP 6 | #define SPARKPP_PAIR_HASH_HPP 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | struct pair_hash { 14 | template 15 | std::size_t operator()(const std::pair& x) const { 16 | return boost::hash_value(x); 17 | } 18 | }; 19 | 20 | #endif //SPARKPP_PAIR_HASH_HPP 21 | -------------------------------------------------------------------------------- /include/utils/ptr_cast.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/17/2019. 
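`match` in match.hpp above is the repo's runtime pattern match over a variant: it compares `which()` against the TypeList index of each lambda's argument type and falls through handler by handler, with the single-handler overload as the fallback. The stripped template parameters make a standalone compilable example uncertain, so this shows only the call shape, mirroring `DAGScheduler::runJob` in spark_context.hpp:

```cpp
// Call shape as used by the scheduler (TaskEndReason alternatives assumed):
//   match(event.reason.get(),
//       [&](const TaskEndReason::Success&)     { /* record map/result output */ },
//       [&](const TaskEndReason::FetchFailed&) { /* mark stage for resubmit */ });
```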
3 | // 4 | 5 | #ifndef SPARKPP_PTR_CAST_HPP 6 | #define SPARKPP_PTR_CAST_HPP 7 | 8 | #include 9 | 10 | template 11 | std::unique_ptr 12 | static_unique_ptr_cast(std::unique_ptr&& p) { 13 | auto d = static_cast(p.release()); 14 | return std::unique_ptr(d); 15 | } 16 | 17 | template 18 | std::unique_ptr dynamic_unique_ptr_cast(std::unique_ptr&& p) { 19 | if (Derived *result = dynamic_cast(p.get())) { 20 | p.release(); 21 | return std::unique_ptr(result); 22 | } 23 | return std::unique_ptr(nullptr); 24 | } 25 | 26 | #endif //SPARKPP_PTR_CAST_HPP 27 | -------------------------------------------------------------------------------- /include/utils/serde.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/14/2019. 3 | // 4 | 5 | #ifndef SPARKPP_SERDE_HPP 6 | #define SPARKPP_SERDE_HPP 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | /// @ref: https://github.com/Sydius/serialize-tuple/blob/master/serialize_tuple.h 26 | namespace boost { 27 | namespace serialization { 28 | 29 | template 30 | struct Serialize 31 | { 32 | template 33 | static void serialize(Archive & ar, std::tuple & t, const unsigned int version) 34 | { 35 | ar & std::get(t); 36 | Serialize::serialize(ar, t, version); 37 | } 38 | }; 39 | 40 | template<> 41 | struct Serialize<0> 42 | { 43 | template 44 | static void serialize(Archive & ar, std::tuple & t, const unsigned int version) 45 | { 46 | (void) ar; 47 | (void) t; 48 | (void) version; 49 | } 50 | }; 51 | 52 | template 53 | void serialize(Archive & ar, std::tuple & t, const unsigned int version) 54 | { 55 | Serialize::serialize(ar, t, version); 56 | } 57 | 58 | } 59 | } 60 | 61 | 62 | struct SerialGuard { 63 | boost::iostreams::back_insert_device> sink; 64 | boost::iostreams::stream>> s; 65 | boost::archive::binary_oarchive oa; 66 | SerialGuard(std::vector& bytes) 67 | : sink{bytes}, s{sink}, oa{s, boost::archive::no_header | boost::archive::no_tracking} {} 68 | template 69 | SerialGuard& operator<<(T&& t) { 70 | oa << std::forward(t); 71 | return *this; 72 | } 73 | template 74 | SerialGuard& operator&(T&& t) { 75 | oa & std::forward(t); 76 | return *this; 77 | } 78 | ~SerialGuard() { 79 | s.flush(); 80 | } 81 | }; 82 | 83 | struct DeserialGuard { 84 | boost::iostreams::basic_array_source device; 85 | boost::iostreams::stream> s; 86 | boost::archive::binary_iarchive ia; 87 | DeserialGuard(const char* bytes, std::size_t size) 88 | : device{bytes, size}, s{device}, ia{s, boost::archive::no_header | boost::archive::no_tracking} {} 89 | template 90 | DeserialGuard& operator>>(T& t) { 91 | ia >> t; 92 | return *this; 93 | } 94 | template 95 | DeserialGuard& operator&(T& t) { 96 | ia & t; 97 | return *this; 98 | } 99 | ~DeserialGuard() = default; 100 | }; 101 | 102 | 103 | 104 | #endif //SPARKPP_SERDE_HPP 105 | -------------------------------------------------------------------------------- /include/utils/span.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/14/2019. 
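serde.hpp above pairs a `SerialGuard` (a boost binary_oarchive writing into a growing byte vector, flushed on destruction) with a `DeserialGuard` reading from a raw buffer. A round-trip sketch, assuming the guard is constructed over `std::vector<char>` and the stripped includes cover std::string/std::tuple support, as the tuple helper in this header suggests:

```cpp
#include <cassert>
#include <string>
#include <tuple>
#include <vector>
#include "utils/serde.hpp"

int main() {
    std::vector<char> bytes;
    {
        SerialGuard out{bytes};  // archive flushes into `bytes` at scope exit
        out << std::make_tuple(1, 2.5, std::string{"hi"});
    }
    std::tuple<int, double, std::string> t;
    DeserialGuard in{bytes.data(), bytes.size()};
    in >> t;
    assert(std::get<2>(t) == "hi");
    return 0;
}
```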
3 | // 4 | 5 | #ifndef SPARKPP_SPAN_HPP 6 | #define SPARKPP_SPAN_HPP 7 | 8 | #include 9 | #include 10 | 11 | template 12 | struct span { 13 | T* ptr; 14 | size_t len; 15 | }; 16 | 17 | template 18 | auto make_span(C& c) { 19 | return span{ 20 | .ptr = c.data(), 21 | .len = c.size() 22 | }; 23 | } 24 | 25 | template 26 | auto make_span(T* ptr, size_t len) { 27 | return span{ptr, len}; 28 | } 29 | 30 | template 31 | std::vector flatten(const std::vector>& v) { 32 | std::size_t total_size = 0; 33 | for (const auto& sub : v) 34 | total_size += sub.size(); 35 | std::vector result; 36 | result.reserve(total_size); 37 | for (const auto& sub : v) 38 | result.insert(result.end(), sub.begin(), sub.end()); 39 | return result; 40 | } 41 | 42 | 43 | #endif //SPARKPP_SPAN_HPP 44 | -------------------------------------------------------------------------------- /include/utils/tcp.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/8/2019. 3 | // 4 | 5 | #ifndef SPARKPP_TCP_HPP 6 | #define SPARKPP_TCP_HPP 7 | 8 | #include 9 | 10 | #ifdef __WIN32__ 11 | #include 12 | #pragma comment(lib, "ws2_32") 13 | 14 | static auto wsa_init = []() { 15 | WSADATA d; 16 | WORD sockVer = MAKEWORD(2, 2); 17 | return WSAStartup(sockVer, &d); 18 | }(); 19 | 20 | #else 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #endif 27 | 28 | #pragma GCC diagnostic push 29 | #pragma GCC diagnostic ignored "-Wpedantic" 30 | 31 | struct TcpStream { 32 | int fd; 33 | static boost::optional connect(const char* addr, uint16_t port) { 34 | int sockfd = ::socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); 35 | sockaddr_in sAddr { 36 | .sin_family = AF_INET, 37 | .sin_port = htons(port), 38 | }; 39 | inet_pton(AF_INET, addr, &sAddr.sin_addr); 40 | [[maybe_unused]] int msgfd = ::connect(sockfd, reinterpret_cast(&sAddr), sizeof(sockaddr_in)); 41 | return { 42 | msgfd == 0, 43 | TcpStream{sockfd} 44 | }; 45 | } 46 | 47 | TcpStream(int fd_) noexcept : fd{fd_} {} 48 | TcpStream(const TcpStream&) = delete; 49 | TcpStream(TcpStream&& rhs) noexcept : fd{rhs.fd} { 50 | rhs.fd = 0; 51 | } 52 | TcpStream& operator=(const TcpStream&) = delete; 53 | TcpStream& operator=(TcpStream&&) = default; 54 | 55 | ~TcpStream() { 56 | #ifdef __WIN32__ 57 | int how = SD_BOTH; 58 | #else 59 | int how = SHUT_RDWR; 60 | #endif 61 | if (fd) 62 | ::shutdown(fd, how); 63 | } 64 | }; 65 | 66 | 67 | 68 | // TODO: handle errors 69 | struct TcpListener { 70 | int sockfd; 71 | uint16_t port; 72 | sockaddr_in addr; 73 | static TcpListener bind(uint16_t port, uint16_t bufferSize = 10) { 74 | int option = 1; 75 | int sockfd = ::socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); 76 | setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (const char*)&option, sizeof(option)); 77 | sockaddr_in addr{ 78 | .sin_family = AF_INET, 79 | .sin_port = htons(port) 80 | }; 81 | addr.sin_addr.s_addr = htonl(INADDR_ANY); 82 | [[maybe_unused]] int success = ::bind(sockfd, reinterpret_cast(&addr), sizeof(sockaddr_in)); 83 | ::listen(sockfd, bufferSize); 84 | 85 | #ifdef __WIN32__ 86 | int len = sizeof(sockaddr_in); 87 | #else 88 | socklen_t len = sizeof(sockaddr_in); 89 | #endif 90 | ::getsockname(sockfd, reinterpret_cast(&addr), &len); 91 | port = ntohs(addr.sin_port); 92 | return TcpListener{ 93 | .sockfd = sockfd, 94 | .port = port, 95 | .addr = addr 96 | }; 97 | } 98 | 99 | TcpStream accept() { 100 | sockaddr_in addr; 101 | #ifdef __WIN32__ 102 | int len = sizeof(sockaddr_in); 103 | #else 104 | socklen_t len = 
sizeof(sockaddr_in); 105 | #endif 106 | int msgfd = ::accept(sockfd, reinterpret_cast(&addr), &len); 107 | return TcpStream{msgfd}; 108 | } 109 | }; 110 | #pragma GCC diagnostic pop 111 | 112 | 113 | 114 | #endif //SPARKPP_TCP_HPP 115 | -------------------------------------------------------------------------------- /include/utils/thread_pool.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/8/2019. 3 | // 4 | 5 | #ifndef SPARKPP_THREAD_POOL_HPP 6 | #define SPARKPP_THREAD_POOL_HPP 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | // ENHANCE: refer to rust [ThreadPool](https://docs.rs/threadpool/1.7.1/src/threadpool/lib.rs.html) 14 | /// A thread pool from sn_Thread 15 | class WorkQueue{ 16 | public: 17 | explicit WorkQueue(int numWorkers = -1) { 18 | if (numWorkers < 1) { 19 | numWorkers = std::thread::hardware_concurrency() - 1; 20 | } 21 | while (numWorkers--) { 22 | m_workers.emplace_back(std::thread(&WorkQueue::do_work, this)); 23 | } 24 | } 25 | ~WorkQueue() { 26 | abort(); 27 | } 28 | 29 | void abort() { 30 | m_exit = true; 31 | m_finish_work = false; 32 | m_signal.notify_all(); 33 | join_all(); 34 | 35 | { 36 | std::lock_guard lg(m_mutex); 37 | m_work.clear(); 38 | } 39 | } 40 | 41 | void stop() { 42 | m_exit = true; 43 | m_finish_work = true; 44 | m_signal.notify_all(); 45 | } 46 | 47 | void wait_for_completion() { 48 | stop(); 49 | join_all(); 50 | } 51 | 52 | template 53 | std::future submit(std::function&& function) { 54 | if (m_exit) { 55 | throw std::runtime_error("Caught work submission to work queue that is desisting."); 56 | } 57 | 58 | // Workaround for lack of lambda move capture 59 | typedef std::pair, std::function> retpair_t; 60 | std::shared_ptr data = std::make_shared(std::promise(), std::move(function)); 61 | 62 | std::future future = data->first.get_future(); 63 | 64 | { 65 | std::lock_guard lg(m_mutex); 66 | m_work.emplace_back([data](){ 67 | try { 68 | data->first.set_value(data->second()); 69 | } 70 | catch (...) { 71 | data->first.set_exception(std::current_exception()); 72 | } 73 | }); 74 | } 75 | m_signal.notify_one(); 76 | return std::move(future); 77 | } 78 | 79 | template 80 | auto submit(F&& func, Args&&... args) { 81 | // maybe use std::packaged_task 82 | using result_t = std::result_of_t; 83 | std::function xfunc = std::bind( 84 | std::forward(func), 85 | std::forward(args)... 
86 | ); 87 | return this->submit(std::move(xfunc)); 88 | } 89 | 90 | private: 91 | // maybe use boost::lockfree::queue 92 | // or some thread-safe queue 93 | std::deque> m_work; 94 | std::mutex m_mutex; 95 | // notice the thread to work 96 | std::condition_variable m_signal; 97 | std::atomic m_exit{false}; 98 | std::atomic m_finish_work{true}; 99 | // threads 100 | std::vector m_workers; 101 | 102 | void do_work(){ 103 | std::unique_lock ul(m_mutex); 104 | while (!m_exit || (m_finish_work && !m_work.empty())) { 105 | if (!m_work.empty()) { 106 | std::function work(std::move(m_work.front())); 107 | m_work.pop_front(); 108 | ul.unlock(); 109 | work(); 110 | ul.lock(); 111 | } 112 | else { 113 | m_signal.wait(ul); 114 | } 115 | } 116 | } 117 | 118 | void join_all(){ 119 | for (auto& thread : m_workers) { 120 | thread.join(); 121 | } 122 | m_workers.clear(); 123 | } 124 | 125 | void operator=(const WorkQueue&) = delete; 126 | WorkQueue(const WorkQueue&) = delete; 127 | }; 128 | 129 | template<> 130 | std::future WorkQueue::submit(std::function&& function) { 131 | if (m_exit) { 132 | throw std::runtime_error("Caught work submission to work queue that is desisting."); 133 | } 134 | // Workaround for lack of lambda move capture 135 | typedef std::pair, std::function> retpair_t; 136 | std::shared_ptr data = std::make_shared(std::promise(), std::move(function)); 137 | 138 | std::future future = data->first.get_future(); 139 | 140 | { 141 | std::lock_guard lg(m_mutex); 142 | m_work.emplace_back([data](){ 143 | try { 144 | data->second(); 145 | data->first.set_value(); 146 | } 147 | catch (...) { 148 | data->first.set_exception(std::current_exception()); 149 | } 150 | }); 151 | } 152 | m_signal.notify_one(); 153 | 154 | return std::move(future); 155 | } 156 | 157 | #endif //SPARKPP_THREAD_POOL_HPP 158 | -------------------------------------------------------------------------------- /include/utils/traits.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/7/2019. 3 | // 4 | 5 | #ifndef SPARKPP_TRAITS_HPP 6 | #define SPARKPP_TRAITS_HPP 7 | 8 | #include 9 | #include 10 | 11 | template 12 | concept AggregatorTrait = requires(T a, K k, V v, C c) { 13 | { a.createCombiner(v) } -> C; 14 | { a.mergeValue(c, v) } -> C; 15 | { a.mergeCombiners(c, c) } -> C 16 | }; 17 | 18 | 19 | template 20 | concept Invocable = requires(F&& f, Args&&... args) { 21 | std::invoke(std::forward(f), std::forward(args)...); 22 | }; 23 | 24 | template 25 | concept Fn = requires(F&& f, Args&&... args) { 26 | { std::invoke(std::forward(f), std::forward(args)...) } -> R; 27 | }; 28 | 29 | #endif //SPARKPP_TRAITS_HPP 30 | -------------------------------------------------------------------------------- /include/utils/utils.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by xiaol on 11/7/2019. 
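The `WorkQueue` above is a conventional mutex/condvar pool: `submit` wraps the callable and a promise into a queued job, and the `void` specialization at the bottom handles the no-result case. A small usage sketch, with signatures as reconstructed from the surrounding code:

```cpp
#include <iostream>
#include "utils/thread_pool.hpp"

int main() {
    WorkQueue pool{4};                                   // four workers
    auto f1 = pool.submit([] { return 6 * 7; });         // std::future<int>
    auto f2 = pool.submit([](int a, int b) { return a + b; }, 40, 2);
    std::cout << f1.get() << ' ' << f2.get() << '\n';    // 42 42
    pool.wait_for_completion();                          // stop, then join
    return 0;
}
```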
3 | // 4 |  5 | #ifndef SPARKPP_UTILS_HPP 6 | #define SPARKPP_UTILS_HPP 7 |  8 | #include "utils/traits.hpp" 9 | // #include "utils/thread_pool.hpp" 10 | #include "utils/tcp.hpp" 11 | #include "utils/function_signature.hpp" 12 | #include "utils/match.hpp" 13 | #include "utils/event_loop.hpp" 14 | #include "utils/pair_hash.hpp" 15 | #include "utils/serde.hpp" 16 | #include "utils/span.hpp" 17 | #include "utils/ptr_cast.hpp" 18 |  19 | #include "utils/macros.hpp" 20 |  21 | #endif //SPARKPP_UTILS_HPP 22 | -------------------------------------------------------------------------------- /miscs/discussions.md: -------------------------------------------------------------------------------- 1 | # Discussion 2 |  3 | Some miscellaneous discussions. 4 |  5 | ## Development (CLion) 6 |  7 | * Toolchains -> Add remote host (set IP) 8 | * CMake -> Add build option (set environment variable, IP) 9 | * Deployment -> Set mapping (/home/ubuntu/Sparkpp) 10 | * Start Sparkpp-slave | D/R-slave1, Sparkpp-slave | D/R-slave2, ... 11 | * Start Sparkpp | D/R 12 |  13 | ## Design choices 14 |  15 | ### Serialization 16 | * Which serialization protocol to choose? A common interface? 17 | * How to serialize the closure? 18 | * - ask the user to write a dynamic library 19 | * - bytecode + interpreter 20 | * - expression template `Add>` 21 | * - convert bytes (need to know the function's byte length at runtime) -> re-parsing object layout... 22 | * - vtable (offset if ASLR enabled / address) for the same binary 23 |  24 | ### Native memory management 25 | * replace malloc -> tcmalloc 26 | * columnar representation? 27 |  28 |  29 | ## Benchmarks 30 |  31 | * 3 (1 master, 2 slaves) Amazon EC2 t3a.large instances 32 | * - 2 cores/4 threads 33 | * - 8G memory 34 | * - 8G disk 35 |  36 | ### WordCount (1.45G, 80 files) / (0.95G, 40 files) 37 |  38 | * Sparkpp: 43115ms 43696ms / 27931ms 28008ms 39 | * - bounded by unordered_map::count/string hash (std::_Hash_bytes by default) 40 | * Spark2.4.4: 85615ms 82586ms / 53336ms 48894ms 41 | * - repeated runs cause "GC overhead limit exceeded" 42 | * Memory: 1.13G vs. 2.87G 43 |  44 | ### Monte Carlo Pi (1e6 chunks * 1e4 chunk size, hand-written LCG) 45 |  46 | * Sparkpp: 29949ms 29584ms 29917ms 47 | * Spark2.4.4: 51571ms 50725ms 49859ms 48 |  49 | ### Monte Carlo Pi for OpenMP partition test (8 chunks * 1e9 chunk size, slow rand function) 50 |  51 | * Sparkpp-8: 86194ms 52 | * Sparkpp-4: 98916ms 53 | * Sparkpp-2: 97822ms 54 | * Sparkpp-2-OpenMP: 85410ms 55 |  56 | ### Monte Carlo Pi for "Scalability" test (24 chunks * 1e8 chunk size, 8 partitions) 57 |  58 | * Sparkpp-1 slave: 52469ms 59 | * Sparkpp-2 slave: 26355ms 60 | * Sparkpp-3 slave: 20685ms 61 |  62 | ## Drawbacks 63 |  64 | * Requires an ABI-compatible platform 65 | * - dynamically linked library 66 | * Requires disabling ASLR 67 | * - can be solved by serializing the vtable offset 68 | * Compile time is a problem... 69 |  70 |  71 | ## FAQ 72 |  73 | ### Q: Is comparing the Spark-0.5 mechanism & the Spark-2.4.4 mechanism a fair comparison? 74 |  75 | A: I think so. I assume that Spark maintains and improves performance as the version number increases. E.g.: 76 |  77 | Spark2.4: [SPARK-21113] Support for read ahead input stream to amortize disk I/O cost in the spill reader 78 |  79 | Spark2.3: [SPARK-22062][SPARK-17788][SPARK-21907] Fix various causes of OOMs 80 |  81 | Spark2.1: [SPARK-16523]: Speeds up group-by aggregate performance by adding a fast aggregation cache that is backed by a row-based hashmap. 
82 |  83 | Spark1.6: SPARK-10000 Unified Memory Management - Shared memory for execution and caching instead of exclusive division of the regions. 84 |  85 | Spark1.2: netty-based Spark Shuffle Manager, default sort-based Shuffle 86 |  87 |  88 | ### Q: Is comparing Sparkpp & Spark-2.4.4 a fair comparison? 89 |  90 | A: I have to admit that Sparkpp still lacks logging, metrics, and event listening, but I suppose those are in the control path. The data path is implemented just like Spark-0.5's. The only difference is the LRU cache; since we don't cache so far, no unfairness is introduced :). 91 |  92 |  93 | ### Q: Why not compare with native Spark? 94 |  95 | A: We tried, but failed to set up the config (SSH key problem). Also, native Spark currently caches shuffle output in memory, so I don't think it would be a fair comparison. We are working on it, and we plan to compare against Spark-0.8 (the oldest Spark release we have found so far). 96 |  97 |  98 | ## Introducing the C++ ecosystem 99 |  100 | * OpenMP, MPI (without MPI, extra process management is needed), CUDA, many ML frameworks... 101 | * Easy to make things columnar 102 | * Strong compilers (GCC, LLVM) -------------------------------------------------------------------------------- /miscs/morte_carlo_pi.scala: -------------------------------------------------------------------------------- 1 | val chunks = 1e6.toLong 2 |  3 | val chunkSize = 1e4.toLong 4 |  5 | val values = 0L until chunks 6 |  7 | def ran(i: Long): Long = { 8 | var count = 0L 9 | var prev = i 10 | for (j <- 0 to 1e4.toInt) { 11 | prev = (prev * 998244353L + 19260817L) % 134456; 12 | val x: Double = prev / 67228.0 - 1; 13 | prev = (prev * 998244353L + 19260817L) % 134456; 14 | val y: Double = prev / 67228.0 - 1; 15 | if (x * x + y * y < 1) { 16 | count += 1; 17 | } 18 | } 19 | count 20 | } 21 |  22 | spark.time(sc.parallelize(values, 4) 23 | .map(i => ran(i)) 24 | .reduce((a, b) => a + b)) -------------------------------------------------------------------------------- /miscs/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Airtnp/Sparkpp/b41827838662b82532b1a29d393e8bf246ed418e/miscs/report.pdf -------------------------------------------------------------------------------- /miscs/split_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import gzip 3 | import json 4 | import os 5 |  6 | # ref: https://nijianmo.github.io/amazon/index.html 7 | def parse(path): 8 | g = gzip.open(path, 'rb') 9 | for l in g: 10 | yield json.loads(l) 11 |  12 | def getText(j): 13 | if 'reviewText' in j.keys(): 14 | return j['reviewText'] 15 | else: 16 | return '' 17 |  18 | l = parse(os.getcwd() + '/Movies_and_TV_5.json.gz') 19 | u = list(l) 20 | v = len(u) // 80 21 |  22 | for i in range(80): 23 | f = open(f"input/input_{i}", "w+") 24 | d = u[i * v:(i + 1) * v] 25 | s = map(getText, d) 26 | f.writelines(s) 27 | f.close() -------------------------------------------------------------------------------- /miscs/word_count.scala: -------------------------------------------------------------------------------- 1 | // spark-shell --executor-memory=6g 2 |  3 | import scala.io.Source 4 |  5 | val files = 0 until 40 6 |  7 | spark.time(sc 8 | .parallelize(files, 40) 9 | .flatMap(idx => Source.fromFile(f"/home/ubuntu/Sparkpp/examples/input/input_$idx") 10 | .getLines) 11 | .flatMap(l => l.split(' ')) 12 | .map(s => (s, 1)) 13 | .reduceByKey(_ + _, 8) 14 | .collect()) 
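For comparison with the Scala snippets above, here is the same hand-written LCG sampler transcribed as a standalone C++ sketch; examples/morte_carlo_pi.cpp in this repo may differ, and the inclusive `0 to 1e4` Scala bound (10001 samples per chunk) is mirrored deliberately:

```cpp
#include <cstdio>

// One chunk of samples, seeded by the chunk index, as in morte_carlo_pi.scala.
long long ran(long long i) {
    long long count = 0, prev = i;
    for (int j = 0; j <= 10000; ++j) {           // inclusive, as in the Scala
        prev = (prev * 998244353LL + 19260817LL) % 134456;
        double x = prev / 67228.0 - 1;
        prev = (prev * 998244353LL + 19260817LL) % 134456;
        double y = prev / 67228.0 - 1;
        if (x * x + y * y < 1) count += 1;
    }
    return count;
}

int main() {
    const long long chunks = 1000;               // scaled down from 1e6
    long long inside = 0;
    for (long long i = 0; i < chunks; ++i) inside += ran(i);
    std::printf("pi ~= %f\n", 4.0 * inside / (chunks * 10001.0));
    return 0;
}
```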
--------------------------------------------------------------------------------
/src/cache.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by xiaol on 11/13/2019.
3 | //
4 | 
5 | #include "cache.hpp"
6 | 
7 | KeySpace BoundedMemoryCache::newKeySpace() {
8 |     size_t keySpaceId = nextKeySpaceId.fetch_add(1);
9 |     return KeySpace{
10 |         .cache = *this,
11 |         .keySpaceId = keySpaceId
12 |     };
13 | }
14 | 
15 | 
--------------------------------------------------------------------------------
/src/dependency.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by xiaol on 11/20/2019.
3 | //
4 | 
5 | #include "dependency.hpp"
6 | 
7 | ShuffleDependencyBase* dep_from_reader(::capnp::Data::Reader reader) {
8 |     const char* bytes = reinterpret_cast<const char*>(reader.asBytes().begin());
9 |     size_t size = reader.size();
10 |     auto base = reinterpret_cast<ShuffleDependencyBase*>(
11 |         const_cast<unsigned char*>(reader.asBytes().begin()));
12 |     base->deserialize_dyn(bytes, size);
13 |     return base;
14 | }
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include "spark_env.hpp"
3 | #include "spark_context.hpp"
4 | 
5 | SparkEnv env;
6 | 
7 | int main(int argc, char** argv) {
8 |     addr_t masterAddr = make_pair("18.188.215.139", 25544);
9 |     vector<addr_t> slaveAddrs = {
10 |         {"18.218.54.64", 24457},
11 |         {"3.17.81.214", 24457}
12 |     };
13 |     env.init(argc, argv, masterAddr);
14 |     auto sc = SparkContext{argc, argv, masterAddr, slaveAddrs};
15 |     vector<int> values = {1, 2, 3, 4, 5, 6, 7};
16 |     auto rdd = sc.parallelize(values, 3);
17 |     auto rdd2 = rdd.map([](int x) {
18 |         return x + 1;
19 |     });
20 |     auto rdd3 = rdd2.map([](int x) {
21 |         return x - 1;
22 |     });
23 |     auto rdd4 = rdd3.mapPair([](int x) {
24 |         return make_pair(x % 2, x);
25 |     });
26 |     // (1, 3, 5, 7) | (2, 4, 6)
27 |     auto rdd5 = rdd4.groupByKey(2);
28 |     // 16 | 12
29 |     auto rdd7 = rdd5.map([](pair<int, vector<int>> x) -> int {
30 |         int acc = 0;
31 |         for (auto i : x.second) {
32 |             acc += i;
33 |         }
34 |         return acc;
35 |     });
36 |     auto v = rdd7.collect();
37 |     // 12, 16 or 16, 12
38 |     std::cout << v[0] << ' ' << v[1] << '\n';
39 |     return 0;
40 | }
--------------------------------------------------------------------------------
/src/rdd/rdd.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by xiaol on 11/10/2019.
3 | //
4 | 
5 | #include "rdd/rdd.hpp"
6 | #include "spark_context.hpp"
7 | #include "spark_env.hpp"
8 | 
9 | RDDBase* rdd_from_reader(::capnp::Data::Reader reader) {
10 |     const char* bytes = reinterpret_cast<const char*>(reader.asBytes().begin());
11 |     size_t size = reader.size();
12 |     auto* rdd = reinterpret_cast<RDDBase*>(const_cast<char*>(bytes));
13 |     rdd->deserialize_dyn(bytes, size);
14 |     return rdd;
15 | }
16 | 
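
Both `rdd_from_reader` above and `dep_from_reader` follow the same two-phase pattern: reinterpret the received buffer as the object itself (valid because the vtable pointer inside the bytes matches this binary), then call the virtual `deserialize_dyn` so the object can rebuild members that do not survive a raw byte copy (strings, vectors, other heap pointers). A toy sketch of that contract, with illustrative names (`Node`, `payload`); the real interface lives in include/serialize_wrapper.hpp, which this dump does not show:

```cpp
#include <cstddef>
#include <new>
#include <string>
#include <vector>

// Toy version of the serialize_dyn / deserialize_dyn contract.
struct Base {
    virtual void serialize_dyn(std::vector<char>& out) const = 0;
    virtual void deserialize_dyn(const char* bytes, std::size_t size) = 0;
    virtual ~Base() = default;
};

struct Node : Base {
    int id = 0;
    std::string payload;   // heap-owning member: does NOT survive a raw byte copy

    void serialize_dyn(std::vector<char>& out) const override {
        // 1) the raw object image: vtable pointer, id, and a (soon meaningless)
        //    std::string header
        auto self = reinterpret_cast<const char*>(this);
        out.insert(out.end(), self, self + sizeof(Node));
        // 2) the dynamic payload, appended after the image
        out.insert(out.end(), payload.begin(), payload.end());
    }

    void deserialize_dyn(const char* bytes, std::size_t size) override {
        // 'this' already aliases bytes[0, sizeof(Node)); the vtable pointer in
        // the image is what made this virtual call possible. Rebuild the string
        // in place from the trailing payload instead of trusting its header.
        new (&payload) std::string(bytes + sizeof(Node), size - sizeof(Node));
    }
};
```

The receiver then does exactly what `rdd_from_reader` does: `reinterpret_cast` the buffer to the base type, call `deserialize_dyn(buffer, size)`, and use the object in place, with zero copies (buffer alignment permitting).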
--------------------------------------------------------------------------------
/src/scheduler/dag_scheduler.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by xiaol on 11/12/2019.
3 | //
4 | 
5 | #include "common.hpp"
6 | #include "serialize_capnp.hpp"
7 | #include "scheduler/dag_scheduler.hpp"
8 | #include "spark_env.hpp"
9 | 
10 | 
11 | 
12 | shared_ptr<Stage> DAGScheduler::newStage(RDDBase *rdd, optional<ShuffleDependencyBase*> shuffleDep) {
13 |     env.cacheTracker->registerRDD(rdd->id(), rdd->numOfSplits());
14 |     if (shuffleDep.is_initialized()) {
15 |         env.mapOutputTracker->registerShuffle(shuffleDep.value()->shuffle_id(), rdd->numOfSplits());
16 |     }
17 |     size_t id = nextStageId.fetch_add(1);
18 |     idToStage[id] = std::make_shared<Stage>(id, rdd, shuffleDep, getParentStages(rdd));
19 |     return idToStage[id];
20 | }
21 | 
22 | void DAGScheduler::visitMissingParent(
23 |         unordered_set<Stage*>& missing, unordered_set<RDDBase*>& visited, RDDBase* r) {
24 |     if (!visited.count(r)) {
25 |         visited.insert(r);
26 |         auto locs = getCacheLocs(r);
27 |         for (size_t p = 0; p < r->numOfSplits(); ++p) {
28 |             if (locs[p].empty()) {
29 |                 auto s = r->dependencies();
30 |                 for (size_t i = 0; i < s.len; ++i) {
31 |                     auto dep = s.ptr[i];
32 |                     if (auto shufDep = dynamic_cast<ShuffleDependencyBase*>(dep)) {
33 |                         auto stage = getShuffleMapStage(shufDep);
34 |                         if (!stage->isAvailable()) {
35 |                             missing.insert(stage.get());
36 |                         }
37 |                     } else if (auto narrowDep = dynamic_cast<NarrowDependencyBase*>(dep)) {  // NarrowDependencyBase: name assumed; angle brackets were stripped in this dump
38 |                         visitMissingParent(missing, visited, narrowDep->rdd());
39 |                     }
40 |                 }
41 |             }
42 |         }
43 |     }
44 | }
45 | 
46 | 
47 | vector<Stage*> DAGScheduler::getMissingParentStages(const Stage& stage) {
48 |     unordered_set<Stage*> missing;
49 |     unordered_set<RDDBase*> visited;
50 |     visitMissingParent(missing, visited, stage.rdd);
51 |     return {std::make_move_iterator(missing.begin()), std::make_move_iterator(missing.end())};
52 | }
53 | 
54 | void DAGScheduler::submitMissingTasks(
55 |         size_t runId,
56 |         RDDBase* finalRdd,
57 |         FnBase* func,
58 |         unordered_map<Stage*, unordered_set<size_t>>& pendingTasks,
59 |         const vector<size_t>& partitions,
60 |         vector<bool>& finished,
61 |         Stage* stage, Stage* finalStage) {
62 |     auto& pending = pendingTasks[stage];
63 |     size_t numOutputParts = partitions.size();
64 |     if (stage == finalStage) {
65 |         for (size_t id = 0; id < numOutputParts; ++id) {
66 |             if (!finished[id]) {
67 |                 size_t partitionId = partitions[id];
68 |                 auto locs = getPreferredLocs(finalRdd, partitionId);
69 |                 size_t taskId = nextTaskId.fetch_add(1);
70 |                 pending.insert(taskId);
71 | 
72 |                 auto task = make_unique<ResultTask>(
73 |                     taskId, runId, finalStage->id, finalRdd,
74 |                     func, partitionId, move(locs), id);
75 |                 submitTasks(move(task));
76 |             }
77 |         }
78 |     } else {
79 |         for (size_t p = 0; p < stage->numPartitions; ++p) {
80 |             if (stage->outputLocs[p].empty()) {
81 |                 auto locs = getPreferredLocs(stage->rdd, p);
82 |                 size_t taskId = nextTaskId.fetch_add(1);
83 |                 pending.insert(taskId);
84 | 
85 |                 auto task = make_unique<ShuffleMapTask>(
86 |                     taskId, runId, stage->id, stage->rdd,
87 |                     stage->shuffleDep.value(), p, move(locs));
88 |                 submitTasks(move(task));
89 |             }
90 |         }
91 |     }
92 | }
93 | 
94 | void DAGScheduler::submitStage(
95 |         size_t runId, RDDBase *finalRdd, FnBase *func,
96 |         unordered_map<Stage*, unordered_set<size_t>>& pendingTasks,
97 |         const vector<size_t>& partitions,
98 |         vector<bool>& finished, Stage *finalStage,
99 |         unordered_set<Stage*>& waiting,
100 |         unordered_set<Stage*>& running,
101 |         Stage* stage
102 | ) {
103 |     if (!waiting.count(stage) && !running.count(stage)) {
104 |         auto missing = getMissingParentStages(*stage);
105 |         if (missing.empty()) {
106 |             submitMissingTasks(runId, finalRdd, func,
107 |                 pendingTasks, partitions, finished, stage, finalStage);
108 |             running.insert(stage);
109 |         } else {
110 |             for (auto& s : missing) {
111 |                 submitStage(runId, finalRdd, func,
112 |                     pendingTasks, partitions, finished, finalStage, waiting, running, s);
113 |             }
114 |             waiting.insert(stage);
115 |         }
116 |     }
117 | }
118 | 
119 | void DAGScheduler::submitTasks(unique_ptr<Task> task) {
120 |     // round-robin over executor addresses
121 |     auto [host, port] = address.back();
122 |     address.pop_back();
123 |     address.insert(address.begin(), make_pair(host, port));
124 |     boost::asio::post(pool, [this, host = host, port = port, task = move(task)]() mutable {
125 |         io_service ioc;
126 |         ip::tcp::resolver resolver{ioc};
127 |         ip::tcp::resolver::query query{host, std::to_string(port),
128 |             boost::asio::ip::resolver_query_base::numeric_service};
129 |         auto iter = resolver.resolve(query);
130 |         ip::tcp::resolver::iterator end;
131 |         ip::tcp::endpoint endpoint = *iter;
132 |         ip::tcp::socket socket{ioc};
133 |         boost::system::error_code ec;
134 |         do {
135 |             auto start_iter = iter;
136 |             ec.clear();
137 |             socket.close();
138 |             std::this_thread::sleep_for(5ms);
139 |             while (start_iter != end) {
140 |                 socket.connect(endpoint, ec);
141 |                 if (!ec) break;
142 |                 ++start_iter;
143 |             }
144 |         } while (ec);  // retry until some resolved endpoint accepts
145 |         int fd = socket.native_handle();
146 |         sendExecution(fd, task.get());
147 |         // TODO: add failure handling
148 |         ::capnp::PackedFdMessageReader message{fd};
149 |         auto reader = recvData(message);
150 |         Storage s{reader_to_vec(reader)};
151 |         size_t runId = task->run_id();
152 |         // FIXME: dispatch to taskEnded
153 |         CompletionEvent event {
154 |             move(task),
155 |             {TaskEndReason::Success{}},
156 |             move(s)
157 |         };
158 |         eventQueues[runId].enqueue(move(event));
159 |     });
160 | }
161 | 
162 | void DAGScheduler::updateCacheLocs() {
163 |     cacheLocs = env.cacheTracker->getLocationsSnapshot();
164 | }
165 | 
166 | void DAGScheduler::taskEnded(unique_ptr<Task> task, TaskEndReason reason, Storage result) {
167 |     size_t id = task->run_id();
168 |     if (eventQueues[id].size_approx() != 0) {  // note: only enqueues while the run's queue is live
169 |         eventQueues[id].enqueue(CompletionEvent{
170 |             move(task),
171 |             move(reason),
172 |             move(result)
173 |         });
174 |     }
175 | }
176 | 
177 | vector<Stage*> DAGScheduler::getParentStages(RDDBase *rdd) {
178 |     unordered_set<Stage*> parents;
179 |     unordered_set<RDDBase*> visited;
180 |     visitParent(parents, visited, rdd);
181 |     return {std::make_move_iterator(parents.begin()), std::make_move_iterator(parents.end())};
182 | }
183 | 
184 | shared_ptr<Stage> DAGScheduler::getShuffleMapStage(ShuffleDependencyBase* shuffleDep) {
185 |     size_t id = shuffleDep->shuffle_id();
186 |     if (shuffleToMapStage.count(id)) {
187 |         return shuffleToMapStage[id];
188 |     }
189 |     shuffleToMapStage[id] = newStage(shuffleDep->rdd(), {shuffleDep});
190 |     return shuffleToMapStage[id];
191 | }
192 | 
193 | void DAGScheduler::visitParent(
194 |         unordered_set<Stage*>& parents, unordered_set<RDDBase*>& visited, RDDBase* r) {
195 |     if (!visited.count(r)) {
196 |         visited.insert(r);
197 |         env.cacheTracker->registerRDD(r->id(), r->numOfSplits());
198 |         auto s = r->dependencies();
199 |         for (size_t i = 0; i < s.len; ++i) {
200 |             auto dep = s.ptr[i];
201 |             if (auto shufDep = dynamic_cast<ShuffleDependencyBase*>(dep)) {
202 |                 parents.insert(getShuffleMapStage(shufDep).get());
203 |             } else {
204 |                 visitParent(parents, visited, dep->rdd());
205 |             }
206 |         }
207 |     }
208 | }
209 | 
210 | vector<string> DAGScheduler::getPreferredLocs(RDDBase* rdd, size_t partitionId) {  // vector<string>: assumed host-list type (element type stripped in this dump)
211 |     auto& cached = getCacheLocs(rdd)[partitionId];
212 |     if (!cached.empty()) {
213 |         return cached;
214 |     }
215 |     // TODO: add RDD placement preferences
216 |     // auto rddPrefs = rdd->preferredLocations();
217 |     auto dep = rdd->dependencies();
218 |     for (size_t i = 0; i < dep.len; ++i) {
219 |         if (auto n = dynamic_cast<NarrowDependencyBase*>(dep.ptr[i])) {
220 |             for (auto& inPart : n->getParents(partitionId)) {
221 |                 auto locs = getPreferredLocs(n->rdd(), inPart);
222 |                 if (!locs.empty())
223 |                     return locs;
224 |             }
225 |         }
226 |     }
227 |     return {};
228 | }
229 | 
230 | 
--------------------------------------------------------------------------------
/src/serialize_capnp.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by xiaol on 11/14/2019.
3 | //
4 | 
5 | #include "serialize_capnp.hpp"
6 | 
7 | ::capnp::Data::Reader vec_to_reader(vector<char>& v) {
8 |     return ::capnp::Data::Reader{
9 |         reinterpret_cast<const unsigned char*>(v.data()),
10 |         v.size()
11 |     };
12 | }
13 | 
14 | vector<char> reader_to_vec(::capnp::Data::Reader reader) {
15 |     char* bytes = reinterpret_cast<char*>(const_cast<unsigned char*>(reader.asBytes().begin()));
16 |     size_t size = reader.size();
17 |     return {
18 |         bytes,
19 |         bytes + size
20 |     };
21 | }
22 | 
23 | 
24 | void sendExecution(int fd, Task* task) {
25 |     ::capnp::MallocMessageBuilder builder;
26 |     Execution::Builder exec = builder.initRoot<Execution>();
27 |     if (auto rt = dynamic_cast<ResultTask*>(task)) {  // ResultTask/ShuffleMapTask: names reconstructed (angle brackets stripped in this dump)
28 |         exec.setIsShuffle(false);
29 |         exec.setPartitionId(rt->partition);
30 |         exec.setFuncOrDep(rt->func->to_reader());
31 |         vector<char> v;
32 |         rt->rdd->serialize_dyn(v);
33 |         exec.setRdd(vec_to_reader(v));
34 |     } else {
35 |         auto smt = dynamic_cast<ShuffleMapTask*>(task);
36 |         exec.setIsShuffle(true);
37 |         exec.setPartitionId(smt->partition);
38 |         vector<char> depV;
39 |         smt->dep->serialize_dyn(depV);
40 |         exec.setFuncOrDep(vec_to_reader(depV));
41 |         vector<char> rddV;
42 |         smt->rdd->serialize_dyn(rddV);
43 |         exec.setRdd(vec_to_reader(rddV));
44 |     }
45 |     ::capnp::writePackedMessageToFd(fd, builder);
46 | }
47 | 
48 | unique_ptr<Task> recvExecution(::capnp::PackedFdMessageReader& message) {  // three-arg executor-side constructors assumed
49 |     Execution::Reader exec = message.getRoot<Execution>();
50 |     if (!exec.getIsShuffle()) {
51 |         RDDBase* rdd = rdd_from_reader(exec.getRdd());
52 |         FnBase* func = fn_from_reader(exec.getFuncOrDep());
53 |         auto task = make_unique<ResultTask>(
54 |             exec.getPartitionId(), rdd, func
55 |         );
56 |         return task;
57 |     } else {
58 |         RDDBase* rdd = rdd_from_reader(exec.getRdd());
59 |         ShuffleDependencyBase* dep = dep_from_reader(exec.getFuncOrDep());
60 |         auto task = make_unique<ShuffleMapTask>(
61 |             exec.getPartitionId(), rdd, dep
62 |         );
63 |         return task;
64 |     }
65 | }
66 | 
--------------------------------------------------------------------------------
/src/serialize_wrapper.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by xiaol on 11/11/2019.
3 | //
4 | 
5 | #include "serialize_wrapper.hpp"
6 | 
7 | FnBase* fn_from_reader(::capnp::Data::Reader reader) {
8 |     return reinterpret_cast<FnBase*>(
9 |         const_cast<unsigned char*>(reader.asBytes().begin()));
10 | }
--------------------------------------------------------------------------------
/src/spark_env.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by xiaol on 11/14/2019.
3 | //
4 | 
5 | #include "common.hpp"
6 | #include "spark_env.hpp"
7 | 
8 | void SparkEnv::init(int argc, char **argv, const addr_t& masterAddr) {
9 |     bool isMaster = false;
10 |     if (argc > 1 && !strcmp(argv[1], "master")) {  // guard argc before reading argv[1]
11 |         isMaster = true;
12 |     }
13 |     mapOutputTracker = make_unique<MapOutputTracker>(isMaster, masterAddr);
14 |     cacheTracker = make_unique<CacheTracker>(isMaster, masterAddr, cache);
15 |     shuffleManager = make_unique<ShuffleManager>();
16 |     shuffleFetcher = make_unique<ShuffleFetcher>();
17 | }
18 | 
--------------------------------------------------------------------------------
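
Taken together with src/main.cpp above, the role split works like this: every node runs the same binary, and `argv[1]` decides whether the process acts as the master (which builds RDDs through SparkContext) or as a slave. A hedged sketch of that dispatch; `Executor` and its `run()` are assumed names based on include/executor.hpp, which this dump does not reproduce:

```cpp
#include <cstring>
#include "spark_env.hpp"        // SparkEnv, addr_t
#include "spark_context.hpp"    // master path; the slave path lives in executor.hpp (not shown)

SparkEnv env;   // the global the trackers hang off, as in src/main.cpp

int main(int argc, char** argv) {
    addr_t masterAddr = make_pair("18.188.215.139", 25544);
    env.init(argc, argv, masterAddr);
    if (argc > 1 && strcmp(argv[1], "master") != 0) {
        // Slave: serve loop -- read an Execution with recvExecution(), run the
        // task, reply with the result bytes. Executor{}.run() would be a
        // hypothetical spelling of that entry point.
        return 0;
    }
    // Master: construct SparkContext and build RDDs exactly as src/main.cpp does.
    return 0;
}
```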