├── .ghci ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── Setup.hs ├── TODO.org ├── changelog.md ├── examples ├── LineCount.hs ├── WordCount.hs ├── WordCountAuto.hs └── WordCountLocal.hs ├── hadron.cabal ├── src ├── Data │ └── List │ │ └── LCS │ │ └── HuntSzymanski.hs └── Hadron │ ├── Basic.hs │ ├── Conduit.hs │ ├── Controller.hs │ ├── Join.hs │ ├── Logger.hs │ ├── OutputFixer.hs │ ├── Protocol.hs │ ├── Run.hs │ ├── Run │ ├── FanOut.hs │ ├── Hadoop.hs │ └── Local.hs │ ├── Streams.hs │ ├── Streams │ └── Bounded.hs │ ├── Types.hs │ └── Utils.hs ├── stack-lts-2.yaml ├── stack-lts-3.yaml ├── stack.yaml └── test └── Tests.hs /.ghci: -------------------------------------------------------------------------------- 1 | :set -isrc 2 | :set -Wall 3 | :set -fno-warn-unused-do-bind 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | TAGS 3 | *.hi 4 | *.o 5 | *DS* 6 | *.csv* 7 | *.swp 8 | .cabal-sandbox 9 | cabal.sandbox.config 10 | tmp 11 | *.log 12 | .stack-work 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Use new container infrastructure to enable caching 2 | sudo: false 3 | 4 | # Choose a lightweight base image; we provide our own build tools. 5 | language: c 6 | 7 | # GHC depends on GMP. You can add other dependencies here as well. 8 | addons: 9 | apt: 10 | packages: 11 | - libgmp-dev 12 | 13 | # The different configurations we want to test. You could also do things like 14 | # change flags or use --stack-yaml to point to a different file. 15 | env: 16 | - ARGS="--stack-yaml=stack-lts-2.yaml" 17 | - ARGS="--stack-yaml=stack-lts-3.yaml" 18 | - ARGS="--resolver=lts-4" 19 | - ARGS="--resolver=lts-5" 20 | - ARGS="--resolver=lts-6" 21 | 22 | 23 | before_install: 24 | # Download and unpack the stack executable 25 | - mkdir -p ~/.local/bin 26 | - export PATH=$HOME/.local/bin:$PATH 27 | - travis_retry curl -L https://www.stackage.org/stack/linux-x86_64 | tar xz --wildcards --strip-components=1 -C ~/.local/bin '*/stack' 28 | 29 | # This line does all of the work: installs GHC if necessary, build the library, 30 | # executables, and test suites, and runs the test suites. --no-terminal works 31 | # around some quirks in Travis's terminal implementation. 32 | script: 33 | - stack $ARGS setup 34 | - stack $ARGS test --no-terminal --haddock --no-haddock-deps 35 | - stack $ARGS bench 36 | - stack $ARGS build 37 | - for f in examples/*.hs; do stack $ARGS ghc -- -Wall -Werror $f -o /dev/null; done 38 | # TODO: re-enable sdist when lcs is released and sets an upper bound on base 39 | # - stack $ARGS sdist 40 | 41 | 42 | # Caching so the next build will be fast too. 43 | cache: 44 | directories: 45 | - $HOME/.stack 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Soostone Inc 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 
10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Ozgun Ataman nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hadron - Hadoop MapReduce in Haskell [![Build Status](https://travis-ci.org/Soostone/hadron.svg?branch=master)](https://travis-ci.org/Soostone/hadron) 2 | 3 | Hadron aims to bring Haskell's type-safety to the complex and delicate 4 | world of Hadoop Streaming MapReduce. 5 | 6 | ## Features 7 | 8 | * Ties into Hadoop via the Streaming interface 9 | 10 | * Orchestrates multi-step Hadoop jobs so you don't have to manually 11 | call Hadoop at all. 12 | 13 | * Provides typed interactions with your input/output data on hdfs, s3 14 | or any other system that Hadoop supports. 15 | 16 | * Every Map-Reduce step is fully typed both on input and output, 17 | making a long, sophisticated sequence of jobs much easier to design 18 | and maintain. 19 | 20 | * Built-in support for multi-way map-side joins. Disparate data 21 | sources are each mapped to a common, monoidal type which then gets 22 | `mconcat`ed during reduce by join key. We support both required 23 | (a-la inner) and optional (a-la outer) joins. Current shortcoming 24 | here is the loss of input typing; only Tap ByteString can be used on 25 | input in order to support multiple datasets. 26 | 27 | * Various convenience combinators in the Controller module, covering 28 | common tasks. 29 | 30 | 31 | ## Shortcomings and Issues 32 | 33 | Hadoop seems to be terrible at constantly changing little details, 34 | program flags and behavior across major releases. While we try to make 35 | this package as sound as possible, you may be forced to do some 36 | debugging due to a difference in the way Hadoop works on the version 37 | you are running. 38 | 39 | This library has been most commonly tested on Amazon's EMR offering 40 | and Cloudera's local demo VM. 41 | 42 | ## Status 43 | 44 | hadron is used extensively by Soostone to process datasets with rows 45 | in the billions. Improvement opportunities exist, but it is very much 46 | functional. 47 | 48 | ## Modules 49 | 50 | ### Hadron.Basic 51 | 52 | This module exposes low level functionality for constructing a single 53 | MapReduce step. Not recommended for direct use in most cases. 
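For orientation, this is roughly what a program written against the Basic interface looks like; it is a condensed sketch of `examples/LineCount.hs` further down in this repository, using the same imports and helpers as that example:

```haskell
{-# LANGUAGE OverloadedStrings #-}
module Main where

import qualified Data.ByteString.Char8 as B
import           Data.Conduit
import qualified Data.Conduit.List     as C
import           Data.Default
import           Hadron.Basic

-- Map: emit the constant key ["cnt"] with a count of 1 for every input line.
mapper' :: Mapper B.ByteString CompositeKey Int
mapper' = linesConduit =$= C.map (const (["cnt"], 1 :: Int))

-- Reduce: sum the per-line counts that Hadoop groups under that key.
reducer' :: Reducer CompositeKey Int B.ByteString
reducer' = do
  n <- C.fold (\acc (_, x) -> acc + x) (0 :: Int)
  yield (B.pack (show n))

-- The resulting binary answers to "map" and "reduce" subcommands, which is
-- what Hadoop Streaming invokes on each node.
main :: IO ()
main = mapReduceMain def pSerialize mapper' reducer'
```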
54 | 55 | 56 | ### Hadron.Controller 57 | 58 | High level module for automated orchestration of multi-stage MapReduce 59 | jobs. 60 | 61 | (More docs and examples to be added) 62 | 63 | ### Hadron.Protocol 64 | 65 | Defines data encode/decode strategies via the Protocol type. 66 | 67 | 68 | 69 | ## TODO 70 | 71 | See TODO.org. 72 | 73 | 74 | # Contributors 75 | * [Ozgun Ataman](http://github.com/ozataman) 76 | * [Doug Beardsley](http://github.com/mightybyte) 77 | * [Michael Xavier](http://github.com/MichaelXavier) 78 | * [Thierry Bourrillon](https://github.com/thierry-b) 79 | 80 | 81 | ## Release Notes 82 | 83 | 84 | ### Version 0.5 85 | 86 | - Hadoop operations are now put behind a Hadron.Run interface. 87 | 88 | - Hadron.Run.Local now implements a basic form of Hadoop mimicry, 89 | allowing a wide variety of Controller-based MapReduce applications 90 | to be run locally on the development machine. 91 | 92 | - Several interface improvements to working with local and HDFS based 93 | files during a Controller app. 94 | 95 | - The runOnce combinator allows running an "IO a" action on the central 96 | node and having the same value appear on the remote (worker) nodes. 97 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /TODO.org: -------------------------------------------------------------------------------- 1 | * TODOs 2 | ** TODO Re-run strategies should be per stage 3 | ** TODO Make hadoop runner logging real-time instead of the current "print after run completes" due to process buffering 4 | ** TODO Allow specifying per-task re-run strategy (instead of global setting) 5 | ** TODO Re-think the Controller design to enable: 6 | *** Parallel execution of non-dependent computation paths, like the cabal parallel build graph 7 | *** Ability to chain-design MapReduce tasks before it's time to supply them with Taps via connect. In other words, allow MapReduce composition *outside* of the Controller monad. 8 | ** TODO Is there an easier way to express strongly typed multi-way joins instead of the current best of "Tap (Either (Either a b) c)"? 9 | *** Current Best Idea (?): Use Fundeps to convert a :+ sum type to (,,,) product type via merge 10 | ** TODO Escape newlines instead of all-out Base64 encoding in internal binary protocol (i.e. emit 0xff 0x0a for newline, and 0xff 0xff for 0xff). (gregorycollins) 11 | ** TODO Hand-roll a parseLine function that breaks on newlines and tab characters using elemIndex. (gregorycollins) 12 | ** TODO Make a better CLI interface that can specify common hadoop settings (e.g. EMR vs. Cloudera) 13 | ** TODO Use MVector buffering with mutable growth (like in palgos) in joins instead of linked lists 14 | ** TODO (?)
Add support for parsing file locations, other settings from a config file 15 | ** DONE Local runner should set filename ENV variable for each file 16 | CLOSED: [2014-11-19 Wed 22:30] 17 | -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Soostone/hadron/d155ecbcdf0bb0dd442d6fc3e142bea75a312241/changelog.md -------------------------------------------------------------------------------- /examples/LineCount.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE NoMonomorphismRestriction #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | {-| 5 | 6 | Use the Basic interface to create a simple mapreduce program. 7 | 8 | -} 9 | 10 | module Main where 11 | 12 | ------------------------------------------------------------------------------- 13 | import Control.Monad.Catch 14 | import qualified Data.ByteString.Char8 as B 15 | import Data.Conduit 16 | import qualified Data.Conduit.List as C 17 | import Data.Default 18 | import Data.String 19 | ------------------------------------------------------------------------------- 20 | import Hadron.Basic 21 | ------------------------------------------------------------------------------- 22 | 23 | 24 | 25 | main :: IO () 26 | main = mapReduceMain def pSerialize mapper' reducer' 27 | 28 | 29 | mapper' 30 | :: (IsString t, MonadThrow m) 31 | => ConduitM B.ByteString ([t], Int) m () 32 | mapper' = linesConduit =$= C.map (\_ -> (["cnt"], (1 :: Int))) 33 | 34 | reducer' 35 | :: (Monad m, Num a, Show a) 36 | => ConduitM (t, a) B.ByteString m () 37 | reducer' = do 38 | i <- C.fold (\ acc (_, x) -> x + acc) 0 39 | yield $ B.pack $ show i 40 | -------------------------------------------------------------------------------- /examples/WordCount.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE NoMonomorphismRestriction #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE TupleSections #-} 4 | 5 | module Main where 6 | 7 | ------------------------------------------------------------------------------- 8 | import qualified Data.ByteString.Char8 as B 9 | import Data.Conduit 10 | import qualified Data.Conduit.List as C 11 | import Data.CSV.Conduit 12 | import Data.Default 13 | ------------------------------------------------------------------------------- 14 | import Hadron.Basic 15 | ------------------------------------------------------------------------------- 16 | 17 | 18 | main :: IO () 19 | main = mapReduceMain def pSerialize mapper' reducer' 20 | 21 | mapper':: Mapper B.ByteString CompositeKey Int 22 | mapper' = linesConduit =$= C.concatMap f 23 | where 24 | f ln = map (\w -> ([w], 1 :: Int)) $ B.words ln 25 | 26 | reducer':: Reducer CompositeKey Int B.ByteString 27 | reducer' = do 28 | (w, cnt) <- C.fold (\ (_, cnt) ([k], x) -> (k, cnt + x)) ("", 0) 29 | yield $ B.concat [rowToStr def [w, B.pack . 
show $ cnt], "\n"] 30 | -------------------------------------------------------------------------------- /examples/WordCountAuto.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE NoMonomorphismRestriction #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE TupleSections #-} 6 | 7 | module Main where 8 | 9 | ------------------------------------------------------------------------------- 10 | import qualified Data.ByteString.Char8 as B 11 | import Data.Conduit ((=$=), yield) 12 | import qualified Data.Conduit.List as C 13 | import Data.CSV.Conduit 14 | import Data.Default 15 | ------------------------------------------------------------------------------- 16 | import Hadron.Controller 17 | ------------------------------------------------------------------------------- 18 | 19 | main :: IO () 20 | main = hadoopMain [("count", app)] (HadoopRun clouderaDemo def) RSReRun 21 | 22 | 23 | source :: Tap B.ByteString 24 | source = tap "hdfs://localhost/user/cloudera/full_meta_4.csv.gz" idProtocol 25 | 26 | target :: CSV B.ByteString a => Tap a 27 | target = tap "hdfs://localhost/user/cloudera/wcOut1" (csvProtocol def) 28 | 29 | 30 | mr1 :: MapReduce B.ByteString (Row B.ByteString) 31 | mr1 = MapReduce def pSerialize mapper' Nothing (Left reducer') 32 | 33 | 34 | mapper' :: Mapper B.ByteString CompositeKey Int 35 | mapper' = intoCSV def =$= C.concatMap f 36 | where 37 | f :: [B.ByteString] -> [([B.ByteString], Int)] 38 | f ln = concatMap (map (\w -> ([w], 1 :: Int)) . B.words) ln 39 | 40 | 41 | reducer' :: Reducer CompositeKey Int (Row B.ByteString) 42 | reducer' = do 43 | (!w, !cnt) <- C.fold (\ (_, !cnt) ([k], !x) -> (k, cnt + x)) ("", 0) 44 | yield $ [w, B.pack . 
show $ cnt] 45 | 46 | 47 | app :: Controller () 48 | app = connect mr1 [source] target (Just "Counting words") 49 | -------------------------------------------------------------------------------- /examples/WordCountLocal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE NoMonomorphismRestriction #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE TupleSections #-} 6 | 7 | module Main where 8 | 9 | ------------------------------------------------------------------------------- 10 | import Control.Category 11 | import Control.Lens 12 | import qualified Data.ByteString.Char8 as B 13 | import qualified Data.Conduit as C 14 | import qualified Data.Conduit.List as C 15 | import Data.CSV.Conduit 16 | import Data.Default 17 | import Prelude hiding (id, (.)) 18 | ------------------------------------------------------------------------------- 19 | import Hadron.Controller 20 | ------------------------------------------------------------------------------- 21 | 22 | 23 | main :: IO () 24 | main = hadoopMain [("app", app)] (LocalRun def) RSReRun 25 | 26 | 27 | -- notice how path is a file 28 | source :: CSV B.ByteString a => Tap a 29 | source = tap "data/sample.csv" (csvProtocol def) 30 | 31 | 32 | -- notice how path is a folder 33 | target :: CSV B.ByteString a => Tap a 34 | target = tap "data/wordFrequency" (csvProtocol def) 35 | 36 | 37 | truncated :: CSV B.ByteString a => Tap a 38 | truncated = tap "data/truncated.csv" (csvProtocol def) 39 | 40 | 41 | -- notice how output is a file 42 | wordCountTarget :: CSV B.ByteString a => Tap a 43 | wordCountTarget = tap "data/wordCount.csv" (csvProtocol def) 44 | 45 | 46 | mr1 :: MapReduce (Row B.ByteString) (Row B.ByteString) 47 | mr1 = MapReduce def pSerialize mapper' Nothing (Left reducer') 48 | 49 | 50 | ------------------------------------------------------------------------------- 51 | mapper':: Mapper (Row B.ByteString) B.ByteString Int 52 | mapper' = C.concatMap (map (\w -> (w, 1 :: Int)) . concatMap B.words) 53 | 54 | 55 | reducer' :: Reducer B.ByteString Int (Row B.ByteString) 56 | reducer' = do 57 | (!w, !cnt) <- C.fold (\ (_, !cnt) (k, !x) -> (k, cnt + x)) ("", 0) 58 | C.yield [w, B.pack . show $ cnt] 59 | 60 | 61 | ------------------------------------------------------------------------------- 62 | -- | Count the number of words in mr1 output 63 | mr2 :: MapReduce (Row B.ByteString) (Row B.ByteString) 64 | mr2 = MapReduce def pSerialize mapper Nothing (Left r) 65 | where 66 | mapper :: Mapper (Row B.ByteString) String Int 67 | mapper = C.map (const $ ("count", 1)) 68 | 69 | r :: Reducer (String) Int (Row B.ByteString) 70 | r = do 71 | cnt <- C.fold (\ !m (_, !i) -> m + i) 0 72 | C.yield ["Total Count", (B.pack . 
show) cnt] 73 | 74 | 75 | mr3 :: MapReduce (Row B.ByteString) (Row B.ByteString) 76 | mr3 = MapReduce opts pSerialize mapper Nothing r 77 | where 78 | opts = def & mroNumReduce .~ Just 0 79 | 80 | mapper = C.map (\ v -> ((), map (B.take 5) v) ) 81 | 82 | r = Right (C.map id) 83 | 84 | 85 | 86 | app :: Controller () 87 | app = do 88 | let src = source 89 | connect mr1 [src] target (Just "Counting word frequency") 90 | connect mr2 [target] wordCountTarget (Just "Counting words") 91 | connect mr3 [target] truncated (Just "Truncating all fields") 92 | -------------------------------------------------------------------------------- /hadron.cabal: -------------------------------------------------------------------------------- 1 | name: hadron 2 | version: 0.6 3 | description: Use Haskell to create Hadoop MapReduce programs 4 | synopsis: Use Haskell to create Hadoop MapReduce programs 5 | license: BSD3 6 | license-file: LICENSE 7 | author: Ozgun Ataman, Doug Beardsley 8 | maintainer: ozgun.ataman@soostone.com 9 | category: Data 10 | build-type: Simple 11 | cabal-version: >=1.10 12 | extra-source-files: 13 | README.md 14 | changelog.md 15 | examples/LineCount.hs 16 | examples/WordCountAuto.hs 17 | examples/WordCount.hs 18 | examples/WordCountLocal.hs 19 | test/Tests.hs 20 | 21 | 22 | flag lib-Werror 23 | default: False 24 | manual: True 25 | 26 | library 27 | hs-source-dirs: src 28 | 29 | exposed-modules: 30 | Hadron.Basic 31 | Hadron.Conduit 32 | Hadron.Controller 33 | Hadron.Join 34 | Hadron.Logger 35 | Hadron.Protocol 36 | Hadron.Run 37 | Hadron.Run.FanOut 38 | Hadron.Run.Hadoop 39 | Hadron.Run.Local 40 | Hadron.Types 41 | Hadron.Utils 42 | 43 | other-modules: 44 | Data.List.LCS.HuntSzymanski 45 | 46 | build-depends: 47 | BoundedChan >= 1.0 && < 1.1, 48 | Glob >= 0.7 && < 0.10, 49 | array >= 0.5 && < 0.6, 50 | async >= 2.0 && < 2.2, 51 | attoparsec >= 0.11 && < 0.14, 52 | base >= 4.5 && < 4.10, 53 | base16-bytestring >= 0.1 && < 0.2, 54 | base64-bytestring >= 1.0 && < 1.1, 55 | blaze-builder >= 0.3 && < 0.5, 56 | bytestring >= 0.10 && < 0.11, 57 | cereal >= 0.4 && < 0.6, 58 | conduit >= 1.0 && < 1.3, 59 | conduit-extra >= 1.1 && < 1.2, 60 | containers >= 0.5 && < 0.6, 61 | cryptohash >= 0.11 && < 0.12, 62 | csv-conduit >= 0.5.1 && < 0.7, 63 | data-default >= 0.4 && < 0.8, 64 | directory >= 1.0 && < 1.4, 65 | errors >= 2.2 && < 2.3, 66 | exceptions >= 0.5 && < 0.9, 67 | filepath >= 1.0 && < 1.5, 68 | hashable >= 1.1 && < 1.3, 69 | hostname >= 1.0 && < 1.1, 70 | hslogger >= 1.2 && < 1.3, 71 | katip >= 0.5 && < 0.6, 72 | lens >= 4.0 && < 4.16, 73 | mmorph >= 1.1.0 && < 1.2, 74 | mtl >= 2.2.1 && < 2.3, 75 | operational >= 0.2.3 && < 0.3, 76 | optparse-applicative >= 0.14.0 && < 0.15, 77 | parsec >= 3.1.5 && < 3.2, 78 | primitive >= 0.6.2 && < 0.7, 79 | process >= 1.4.3 && < 1.5, 80 | random >= 1.0 && < 1.2, 81 | resourcet >= 1.1.9 && < 1.2, 82 | retry >= 0.7 && < 0.8, 83 | safe >= 0.3.15 && < 0.4, 84 | safecopy >= 0.8 && < 0.10, 85 | split >= 0.2.3 && < 0.3, 86 | string-conv >= 0.1 && < 0.2, 87 | stringsearch >= 0.3.6 && < 0.4, 88 | template-haskell >= 2.11.1 && < 2.12, 89 | text >= 1.2.2 && < 1.3, 90 | time >= 1.6.0 && < 1.7, 91 | time-locale-compat >= 0.1.1 && < 0.2, 92 | transformers >= 0.5.2 && < 0.6, 93 | transformers-base >= 0.4.4 && < 0.5, 94 | unix >= 2.7.2 && < 2.8, 95 | unix-time >= 0.3.7 && < 0.4, 96 | unordered-containers >= 0.2.8 && < 0.3, 97 | vector >= 0.12.0 && < 0.13 98 | 99 | if flag(lib-Werror) 100 | ghc-options: -Werror 101 | 102 | ghc-options: -Wall -O2 103 | -fwarn-tabs 104 | 
-fno-warn-unused-do-bind 105 | default-language: Haskell2010 106 | ghc-prof-options: -fprof-auto -fprof-cafs -fprof-auto-calls 107 | 108 | 109 | test-suite test 110 | type: exitcode-stdio-1.0 111 | main-is: Tests.hs 112 | 113 | if flag(lib-Werror) 114 | ghc-options: -Werror 115 | 116 | ghc-options: -Wall -threaded -rtsopts -O0 117 | hs-source-dirs: test 118 | 119 | Build-depends: base 120 | , hadron 121 | , vector 122 | , bytestring 123 | , unordered-containers 124 | , lens 125 | , time 126 | , HUnit 127 | , QuickCheck 128 | , test-framework 129 | , test-framework-hunit 130 | , test-framework-quickcheck2 131 | , derive 132 | 133 | 134 | default-language: Haskell2010 135 | -------------------------------------------------------------------------------- /src/Data/List/LCS/HuntSzymanski.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | 3 | ----------------------------------------------------------------------------- 4 | -- | 5 | -- Module : Data.List.LCS.HuntSzymanski 6 | -- Copyright : (c) Ian Lynagh 2005 7 | -- License : BSD or GPL v2 8 | -- 9 | -- Maintainer : igloo@earth.li 10 | -- Stability : provisional 11 | -- Portability : non-portable (uses STUArray) 12 | -- 13 | -- This is an implementation of the Hunt-Szymanski LCS algorithm. 14 | -- Derived from the description in \"String searching algorithms\" by 15 | -- Graham A Stephen, ISBN 981021829X. 16 | ----------------------------------------------------------------------------- 17 | 18 | module Data.List.LCS.HuntSzymanski ( 19 | -- * Algorithm 20 | -- $algorithm 21 | 22 | -- * LCS 23 | lcs 24 | ) where 25 | 26 | import Data.Array (listArray, (!)) 27 | import Data.Array.MArray (newArray, newArray_) 28 | import Data.Array.Base (unsafeRead, unsafeWrite) 29 | import Data.Array.ST (STArray, STUArray) 30 | import Control.Monad (when) 31 | import Control.Monad.ST (ST, runST) 32 | import Data.List (groupBy, sort) 33 | 34 | {- $algorithm 35 | We take two sequences, @xs@ and @ys@, of length @\#xs@ and @\#ys@. 36 | 37 | First we make an array 38 | 39 | > matchlist[i=0..(#xs-1)] 40 | 41 | such that 42 | 43 | > (matchlist[i] = js) => ((j `elem` js) <=> (xs !! i == ys !! j)) 44 | > && sort js == reverse js 45 | 46 | i.e. @matchlist[i]@ is the indices of elements of @ys@ equal to the 47 | ith element of @xs@, in descending order. 48 | 49 | Let @\#xys@ be the minimum of @\#xs@ and @\#ys@. Trivially this is the maximum 50 | possible length of the LCS of @xs@ and @ys@. Then we can imagine an array 51 | 52 | > k[i=0..#xs][l=0..#xys] 53 | 54 | such that @k[i][l] = j@ where @j@ is the smallest value such that the 55 | LCS of @xs[0..i]@ and @ys[0..j]@ has length @l@. We use @\#ys@ to 56 | mean there is no such @j@. 57 | 58 | We will not need to whole array at once, though. Instead we use an array 59 | 60 | > kk[l=0..#xys] 61 | 62 | representing a row of @kk@ for a particular @i@. Initially it is for 63 | @i = -1@, so @kk[0] = -1@ and @kk[l] = \#ys@ otherwise. As the algorithm 64 | progresses we will increase @i@ by one at the outer level and compute 65 | the replacement values for @k@'s elements. 66 | 67 | But we want more than just the length of the LCS, we also want the LCS 68 | itself. Another array 69 | 70 | > revres[l=0..#xys] 71 | 72 | stores the list of @xs@ indices an LCS of length @l@, if one is known, 73 | at @revres[l]@. 74 | 75 | Now, suppose @kk@ contains @k[i-1]@. We consider each @j@ in @matchlist[i]@ 76 | in turn. We find the @l@ such that @k[l-1] < j <= k[l]@. 
If @j < k[l]@ then 77 | we updated @k[l]@ to be @j@ and set @revres[l]@ to be @i:revres[l-1]@. 78 | 79 | Finding @l@ is basically binary search, but there are some tricks we can 80 | do. First, as the @j@s are decreasing the last @l@ we had for this @i@ is 81 | an upper bound on this @l@. Second, we use another array 82 | 83 | > lastl[j=0..#ys-1] 84 | 85 | to store the @l@ we got last time for this @j@, initially all @1@. As the 86 | values in @kk[j]@ monotonically decrease this is a lower bound for @l@. 87 | We also test to see whether this old @l@ is still @l@ before we start the 88 | binary search. 89 | -} 90 | 91 | -- |The 'lcs' function takes two lists and returns a list with a longest 92 | -- common subsequence of the two. 93 | lcs :: Ord a => [a] -> [a] -> [a] 94 | -- Start off by returning the common prefix 95 | lcs [] _ = [] 96 | lcs _ [] = [] 97 | lcs (c1:c1s) (c2:c2s) 98 | | c1 == c2 = c1 : lcs c1s c2s 99 | -- Then reverse everything, get the backwards LCS and reverse it 100 | lcs s1 s2 = lcs_tail [] (reverse s1) (reverse s2) 101 | 102 | -- To get the backwards LCS, we again start off by returning the common 103 | -- prefix (or suffix, however you want to think of it :-) ) 104 | lcs_tail :: Ord a => [a] -> [a] -> [a] -> [a] 105 | lcs_tail acc (c1:c1s) (c2:c2s) 106 | | c1 == c2 = lcs_tail (c1:acc) c1s c2s 107 | lcs_tail acc [] _ = acc 108 | lcs_tail acc _ [] = acc 109 | -- Then we begin the real algorithm 110 | lcs_tail acc s1 s2 = runST (lcs' acc s1 s2) 111 | 112 | lcs' :: Ord a => [a] -> [a] -> [a] -> ST s [a] 113 | lcs' acc xs ys = 114 | do let max_xs = length xs 115 | max_ys = length ys 116 | minmax = max_xs `min` max_ys 117 | -- Initialise all the arrays 118 | matchlist <- newArray_ (0, max_xs - 1) 119 | mk_matchlist matchlist xs ys 120 | kk <- newArray (0, minmax) max_ys 121 | unsafeWrite kk 0 (-1) 122 | lastl <- newArray (0, max_ys - 1) 1 123 | revres <- newArray_ (0, minmax) 124 | unsafeWrite revres 0 [] 125 | -- Pass the buck to lcs'' to finish the job off 126 | is <- lcs'' matchlist lastl kk revres max_xs max_ys minmax 127 | -- Convert the list of i indices into the result sequence 128 | let axs = listArray (0, max_xs - 1) xs 129 | return $ map (axs !) is ++ acc 130 | 131 | eqFst :: Eq a => (a, b) -> (a, b) -> Bool 132 | eqFst (x, _) (y, _) = x == y 133 | 134 | -- mk_matchlist fills the matchlist array such that if 135 | -- xs !! i == ys !! j then (j+1) `elem` matchlist ! i 136 | -- and matchlist ! i is decreasing for all i 137 | mk_matchlist :: Ord a => STArray s Int [Int] -> [a] -> [a] -> ST s () 138 | mk_matchlist matchlist xs ys = 139 | do let -- xs' is a list of (string, ids with that string in xs) 140 | xs' = map (\sns -> (fst (head sns), map snd sns)) 141 | $ groupBy eqFst $ sort $ zip xs [0..] 142 | -- ys' is similar, only the ids are reversed 143 | ys' = map (\sns -> (fst (head sns), reverse $ map snd sns)) 144 | $ groupBy eqFst $ sort $ zip ys [0..] 
145 | -- add_to_matchlist does all the hardwork 146 | add_to_matchlist all_xs@((sx, idsx):xs'') all_ys@((sy, idsy):ys'') 147 | = case compare sx sy of 148 | -- If we have the same string in xs'' and ys'' then all 149 | -- the indices in xs'' must map to the indices in ys'' 150 | EQ -> do sequence_ [ unsafeWrite matchlist i idsy 151 | | i <- idsx ] 152 | add_to_matchlist xs'' ys'' 153 | -- If the string in xs'' is smaller then there are no 154 | -- corresponding indices in ys so we assign all the xs'' 155 | -- indices the empty list 156 | LT -> do sequence_ [ unsafeWrite matchlist i [] 157 | | i <- idsx ] 158 | add_to_matchlist xs'' all_ys 159 | -- Otherwise the string appears in ys only, so we ignore it 160 | GT -> do add_to_matchlist all_xs ys'' 161 | -- If we run out of ys'' altogether then just go through putting 162 | -- in [] for the list of indices of each index remaining in xs'' 163 | add_to_matchlist ((_, idsx):xs'') [] 164 | = do sequence_ [ unsafeWrite matchlist i [] | i <- idsx ] 165 | add_to_matchlist xs'' [] 166 | -- When we run out of xs'' we are done 167 | add_to_matchlist [] _ = return () 168 | -- Finally, actually call add_to_matchlist to populate matchlist 169 | add_to_matchlist xs' ys' 170 | 171 | lcs'' :: STArray s Int [Int] -- matchlist 172 | -> STUArray s Int Int -- lastl 173 | -> STUArray s Int Int -- kk 174 | -> STArray s Int [Int] -- revres 175 | -> Int -> Int -> Int -> ST s [Int] 176 | lcs'' matchlist lastl kk revres max_xs max_ys minmax = 177 | do let -- Out the outermost level we loop over the indices i of xs 178 | loop_i = sequence_ [ loop_j i | i <- [0..max_xs - 1] ] 179 | -- For each i we loop over the matching indices j of elements of ys 180 | loop_j i = do js <- unsafeRead matchlist i 181 | with_js i js minmax 182 | -- Deal with this i and j 183 | with_js i (j:js) max_bound = 184 | do x0 <- unsafeRead lastl j 185 | l <- find_l j x0 max_bound 186 | unsafeWrite lastl j l 187 | vl <- unsafeRead kk l 188 | when (j < vl) $ do 189 | unsafeWrite kk l j 190 | rs <- unsafeRead revres (l - 1) 191 | unsafeWrite revres l (i:rs) 192 | with_js i js l 193 | with_js _ [] _ = return () 194 | -- find_l returns the l such that kk ! (l-1) < j <= kk ! l 195 | find_l j x0 z0 196 | = let f x z 197 | | x + 1 == z = return z 198 | | otherwise = let y = (x + z) `div` 2 199 | in do vy <- unsafeRead kk y 200 | if vy < j 201 | then f y z 202 | else f x y 203 | in j `seq` do q1 <- unsafeRead kk x0 204 | if j <= q1 205 | then return x0 206 | else f x0 z0 207 | -- Do the hard work 208 | loop_i 209 | -- Find where the result starts 210 | succ_l <- find_l max_ys 1 (minmax + 1) 211 | -- Get the result 212 | unsafeRead revres (succ_l - 1) 213 | 214 | -------------------------------------------------------------------------------- /src/Hadron/Basic.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExistentialQuantification #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE NoMonomorphismRestriction #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | {-# LANGUAGE RankNTypes #-} 8 | {-# LANGUAGE RecordWildCards #-} 9 | 10 | ----------------------------------------------------------------------------- 11 | -- | 12 | -- Module : Hadron 13 | -- Copyright : Soostone Inc 14 | -- License : BSD3 15 | -- 16 | -- Maintainer : Ozgun Ataman 17 | -- Stability : experimental 18 | -- 19 | -- Low level building blocks for working with Hadoop streaming. 
20 | -- 21 | -- We define all the base types for MapReduce and export map/reduce 22 | -- maker functions that know how to deal with ByteString input and 23 | -- output. 24 | ---------------------------------------------------------------------------- 25 | 26 | module Hadron.Basic 27 | ( 28 | -- * Types 29 | Key 30 | , CompositeKey 31 | , Mapper 32 | , Reducer 33 | 34 | 35 | -- * Hadoop Utilities 36 | , emitCounter 37 | , hsEmitCounter 38 | , emitStatus 39 | , getFileName 40 | 41 | -- * MapReduce Construction 42 | , mapReduceMain 43 | , mapReduce 44 | , MROptions (..) 45 | , PartitionStrategy (..) 46 | 47 | -- * Low-level Utilities 48 | , mapper 49 | , mapperWith 50 | , combiner 51 | , reducer 52 | , setLineBuffering 53 | 54 | -- * Data Serialization Utilities 55 | , module Hadron.Protocol 56 | 57 | ) where 58 | 59 | 60 | ------------------------------------------------------------------------------- 61 | import Blaze.ByteString.Builder 62 | import Control.Applicative 63 | import Control.Category 64 | import Control.Lens 65 | import Control.Monad 66 | import Control.Monad.Base 67 | import Control.Monad.Primitive 68 | import Control.Monad.Trans 69 | import Control.Monad.Trans.Resource 70 | import qualified Data.ByteString.Char8 as B 71 | import qualified Data.ByteString.Lazy.Char8 as LB 72 | import Data.Conduit 73 | import Data.Conduit.Binary (sinkHandle, sourceHandle) 74 | import Data.Conduit.Blaze 75 | import qualified Data.Conduit.List as C 76 | import Data.List 77 | import Data.Monoid 78 | import Options.Applicative 79 | import Prelude hiding (id, (.)) 80 | import System.Environment 81 | import System.IO 82 | ------------------------------------------------------------------------------- 83 | import Hadron.Protocol 84 | import Hadron.Run.Hadoop 85 | import Hadron.Types 86 | ------------------------------------------------------------------------------- 87 | 88 | 89 | 90 | showBS :: Show a => a -> B.ByteString 91 | showBS = B.pack . show 92 | 93 | 94 | -- | Emit a counter to be captured, added up and reported by Hadoop. 95 | emitCounter 96 | :: B.ByteString 97 | -- ^ Group name 98 | -> B.ByteString 99 | -- ^ Counter name 100 | -> Integer 101 | -- ^ Increment 102 | -> IO () 103 | emitCounter grp counter inc = LB.hPutStrLn stderr $ toLazyByteString txt 104 | where 105 | txt = mconcat $ map fromByteString 106 | ["reporter:counter:", grp, ",", counter, ",", showBS inc] 107 | 108 | 109 | -- | Emit counter from this library's group 110 | hsEmitCounter :: B.ByteString -> Integer -> IO () 111 | hsEmitCounter = emitCounter "Hadron" 112 | 113 | 114 | -- | Emit a status line. 115 | emitStatus :: B.ByteString -> IO () 116 | emitStatus msg = LB.hPutStrLn stderr $ toLazyByteString txt 117 | where 118 | txt = fromByteString "reporter:status:" <> 119 | fromByteString msg 120 | 121 | 122 | -- | Get the current filename from Hadoop ENV. Useful when writing 123 | -- 'Mapper's and you would like to know what file you're currently 124 | -- dealing with. 125 | getFileName :: MonadIO m => m FilePath 126 | getFileName = liftIO $ getEnv "mapreduce_map_input_file" 127 | 128 | 129 | 130 | ------------------------------------------------------------------------------- 131 | mapper 132 | :: Mapper B.ByteString CompositeKey B.ByteString 133 | -- ^ A key/value producer - don't worry about putting any newline 134 | -- endings yourself, we do that for you. 135 | -> IO () 136 | mapper f = mapperWith id f 137 | 138 | 139 | -- | Construct a mapper program using given serialization Prism. 
140 | mapperWith 141 | :: Prism' B.ByteString t 142 | -> Mapper B.ByteString CompositeKey t 143 | -> IO () 144 | mapperWith p f = runResourceT $ do 145 | setLineBuffering 146 | sourceHandle stdin $= 147 | f $= 148 | encodeMapOutput p $$ 149 | sinkHandle stdout 150 | 151 | 152 | -- ------------------------------------------------------------------------------- 153 | -- -- | Drop the key and simply output the value stream. 154 | -- mapOnly 155 | -- :: (InputStream B.ByteString -> OutputStream B.ByteString -> IO ()) 156 | -- -> IO () 157 | -- mapOnly f = do 158 | -- setLineBuffering 159 | -- f S.stdin S.stdout 160 | 161 | 162 | ------------------------------------------------------------------------------- 163 | combiner 164 | :: MROptions 165 | -> Prism' B.ByteString b 166 | -> Reducer CompositeKey b (CompositeKey, b) 167 | -> IO () 168 | combiner mro mrInPrism f = runResourceT $ do 169 | setLineBuffering 170 | sourceHandle stdin =$= 171 | decodeReducerInput mro mrInPrism =$= 172 | f =$= 173 | encodeMapOutput mrInPrism $$ 174 | sinkHandle stdout 175 | 176 | 177 | 178 | ------------------------------------------------------------------------------- 179 | setLineBuffering :: MonadIO m => m () 180 | setLineBuffering = do 181 | liftIO $ hSetBuffering stderr LineBuffering 182 | liftIO $ hSetBuffering stdout LineBuffering 183 | liftIO $ hSetBuffering stdin LineBuffering 184 | 185 | 186 | ------------------------------------------------------------------------------- 187 | -- | Appropriately produce lines of mapper output in a way compliant 188 | -- with Hadoop and 'decodeReducerInput'. 189 | encodeMapOutput 190 | :: (PrimMonad base, MonadBase base m) 191 | => Prism' B.ByteString b 192 | -> Conduit (CompositeKey, b) m B.ByteString 193 | encodeMapOutput mrInPrism = C.map conv $= builderToByteString 194 | where 195 | 196 | conv (k,v) = mconcat 197 | [ mconcat (intersperse tab (map fromByteString k)) 198 | , tab 199 | , fromByteString (review mrInPrism v) 200 | , nl ] 201 | 202 | tab = fromByteString "\t" 203 | nl = fromByteString "\n" 204 | 205 | 206 | ------------------------------------------------------------------------------- 207 | -- | Chunk 'stdin' into lines and try to decode the value using given 'Prism'. 208 | decodeReducerInput 209 | :: (MonadIO m, MonadThrow m) 210 | => MROptions 211 | -> Prism' B.ByteString b 212 | -> ConduitM a (CompositeKey, b) m () 213 | decodeReducerInput mro mrInPrism = 214 | sourceHandle stdin =$= 215 | lineC (numSegs (_mroPart mro)) =$= 216 | C.mapMaybe (_2 (firstOf mrInPrism)) 217 | 218 | 219 | ------------------------------------------------------------------------------- 220 | reducerMain 221 | :: MROptions 222 | -> Prism' B.ByteString a 223 | -> Reducer CompositeKey a B.ByteString 224 | -> IO () 225 | reducerMain mro p f = do 226 | setLineBuffering 227 | runResourceT $ reducer mro p f $$ sinkHandle stdout 228 | 229 | 230 | -- | Create a reducer program. 231 | reducer 232 | :: MROptions 233 | -> Prism' B.ByteString a 234 | -- ^ Input conversion function 235 | -> Reducer CompositeKey a b 236 | -- ^ A step function for any given key. Will be rerun from scratch 237 | -- for each unique key based on MROptions. 
238 | -> Producer (ResourceT IO) b 239 | reducer mro@MROptions{..} mrInPrism f = do 240 | sourceHandle stdin =$= 241 | decodeReducerInput mro mrInPrism =$= 242 | go2 243 | where 244 | go2 = do 245 | next <- await 246 | case next of 247 | Nothing -> return () 248 | Just x -> do 249 | leftover x 250 | block 251 | go2 252 | 253 | block = sameKey Nothing =$= f 254 | 255 | sameKey cur = do 256 | next <- await 257 | case next of 258 | Nothing -> return () 259 | Just x@(k,_) -> 260 | case cur of 261 | Just curKey -> do 262 | let n = eqSegs _mroPart 263 | case take n curKey == take n k of 264 | True -> yield x >> sameKey cur 265 | False -> leftover x 266 | Nothing -> do 267 | yield x 268 | sameKey (Just k) 269 | 270 | 271 | 272 | 273 | ------------------ 274 | -- Main Program -- 275 | ------------------ 276 | 277 | 278 | 279 | ------------------------------------------------------------------------------- 280 | mapReduce 281 | :: MROptions 282 | -> Prism' B.ByteString a 283 | -- ^ Serialization for data between map and reduce stages 284 | -> Mapper B.ByteString CompositeKey a 285 | -> Reducer CompositeKey a B.ByteString 286 | -> (IO (), IO ()) 287 | mapReduce mro mrInPrism f g = (mp, rd) 288 | where 289 | mp = mapperWith mrInPrism f 290 | rd = reducerMain mro mrInPrism g 291 | 292 | 293 | 294 | -- | A default main that will respond to 'map' and 'reduce' commands 295 | -- to run the right phase appropriately. 296 | -- 297 | -- This is the recommended 'main' entry point to a map-reduce program. 298 | -- The resulting program will respond as: 299 | -- 300 | -- > ./myProgram map 301 | -- > ./myProgram reduce 302 | mapReduceMain 303 | :: MROptions 304 | -> Prism' B.ByteString a 305 | -- ^ Serialization function for the in-between data 'a'. 306 | -> Mapper B.ByteString CompositeKey a 307 | -> Reducer CompositeKey a B.ByteString 308 | -- ^ Reducer for a stream of values belonging to the same key. 309 | -> IO () 310 | mapReduceMain mro mrInPrism f g = liftIO (execParser opts) >>= run 311 | where 312 | (mp,rd) = mapReduce mro mrInPrism f g 313 | 314 | run Map = mp 315 | run Reduce = rd 316 | 317 | 318 | opts = info (helper <*> commandParse) 319 | ( fullDesc 320 | <> progDesc "This is a Hadron Map/Reduce binary. " 321 | <> header "hadron - use Haskell for Hadron." 322 | ) 323 | 324 | 325 | data Command = Map | Reduce 326 | 327 | 328 | ------------------------------------------------------------------------------- 329 | commandParse :: Parser Command 330 | commandParse = subparser 331 | ( command "map" (info (pure Map) 332 | ( progDesc "Run mapper." 
)) 333 | <> command "reduce" (info (pure Reduce) 334 | ( progDesc "Run reducer" )) 335 | ) 336 | 337 | 338 | 339 | 340 | 341 | -------------------------------------------------------------------------------- /src/Hadron/Conduit.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RankNTypes #-} 2 | module Hadron.Conduit where 3 | 4 | ------------------------------------------------------------------------------- 5 | import Control.Concurrent.Chan 6 | import Control.Monad.Trans 7 | import Data.Conduit 8 | ------------------------------------------------------------------------------- 9 | 10 | 11 | ------------------------------------------------------------------------------- 12 | sourceChan :: MonadIO m => Chan (Maybe a) -> Producer m a 13 | sourceChan ch = go 14 | where 15 | go = do 16 | res <- liftIO $ readChan ch 17 | case res of 18 | Nothing -> return () 19 | Just a -> yield a >> go 20 | 21 | 22 | 23 | ------------------------------------------------------------------------------- 24 | peek :: Monad m => Consumer i m (Maybe i) 25 | peek = do 26 | res <- await 27 | case res of 28 | Nothing -> return Nothing 29 | Just a -> leftover a >> return (Just a) 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/Hadron/Controller.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE DeriveDataTypeable #-} 4 | {-# LANGUAGE DeriveFunctor #-} 5 | {-# LANGUAGE EmptyDataDecls #-} 6 | {-# LANGUAGE ExistentialQuantification #-} 7 | {-# LANGUAGE FlexibleContexts #-} 8 | {-# LANGUAGE FlexibleInstances #-} 9 | {-# LANGUAGE GADTs #-} 10 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 11 | {-# LANGUAGE MultiParamTypeClasses #-} 12 | {-# LANGUAGE NoMonomorphismRestriction #-} 13 | {-# LANGUAGE OverloadedStrings #-} 14 | {-# LANGUAGE RankNTypes #-} 15 | {-# LANGUAGE RecordWildCards #-} 16 | {-# LANGUAGE ScopedTypeVariables #-} 17 | {-# LANGUAGE TemplateHaskell #-} 18 | {-# LANGUAGE UndecidableInstances #-} 19 | 20 | ----------------------------------------------------------------------------- 21 | -- | 22 | -- Module : Hadron.Controller 23 | -- Copyright : Soostone Inc 24 | -- License : BSD3 25 | -- 26 | -- Maintainer : Ozgun Ataman 27 | -- Stability : experimental 28 | -- 29 | -- High level flow-control of Hadoop programs with ability to define a 30 | -- sequence of Map-Reduce operations in a Monad, have strongly typed 31 | -- data locations. 32 | ---------------------------------------------------------------------------- 33 | 34 | module Hadron.Controller 35 | ( 36 | 37 | 38 | -- * Hadoop Program Construction 39 | Controller 40 | 41 | , connect 42 | , connect' 43 | , io 44 | , orchIO 45 | , nodeIO 46 | , setVal 47 | , getVal 48 | , runOnce 49 | 50 | , MapReduce (..) 51 | , mrOptions 52 | , Mapper 53 | , Reducer 54 | , (>.>) 55 | , (<.<) 56 | 57 | , MRKey (..) 58 | , CompositeKey 59 | , SingleKey (..) 60 | , WrapSerialize (..) 61 | , WrapSafeCopy (..) 62 | 63 | -- * Data Sources 64 | , Tap (..) 65 | , tapProto, tapLocation 66 | , tap 67 | , taps 68 | , mergeTaps 69 | , concatTaps 70 | , binaryDirTap 71 | , setupBinaryDir 72 | , fileListTap 73 | , fanOutTap 74 | , readTap 75 | , readHdfsFile 76 | 77 | -- * Command Line Entry Point 78 | , hadoopMain 79 | , HadoopEnv (..) 
80 | 81 | -- * Settings for MapReduce Jobs 82 | , MROptions 83 | , mroPart 84 | , mroNumMap 85 | , mroNumReduce 86 | , mroCompress 87 | , mroOutSep 88 | , mroTaskTimeout 89 | , PartitionStrategy (..) 90 | , Comparator (..) 91 | , RerunStrategy (..) 92 | 93 | -- * Hadoop Utilities 94 | , emitCounter 95 | , hsEmitCounter 96 | , emitStatus 97 | , getFileName 98 | 99 | 100 | -- * MapReduce Combinators 101 | 102 | , joinMR 103 | 104 | , joinStep 105 | , JoinType (..) 106 | , JoinKey 107 | 108 | -- * Data Serialization Utilities 109 | , module Hadron.Protocol 110 | , module Hadron.Run 111 | 112 | ) where 113 | 114 | ------------------------------------------------------------------------------- 115 | import Control.Applicative as A 116 | import Control.Arrow (first) 117 | import Control.Concurrent.Async 118 | import Control.Concurrent.Chan 119 | import Control.Concurrent.QSem 120 | import Control.Error 121 | import Control.Exception.Lens 122 | import Control.Lens 123 | import Control.Monad.Catch 124 | import Control.Monad.Operational hiding (view) 125 | import qualified Control.Monad.Operational as O 126 | import Control.Monad.State 127 | import Control.Monad.Trans.Resource 128 | import Control.Retry 129 | import qualified Crypto.Hash.MD5 as Crypto 130 | import qualified Data.ByteString.Base16 as Base16 131 | import qualified Data.ByteString.Char8 as B 132 | import Data.ByteString.Search as B 133 | import Data.Char 134 | import Data.Conduit hiding (connect) 135 | import Data.Conduit.Binary (sinkHandle, sourceHandle) 136 | import qualified Data.Conduit.List as C 137 | import Data.Conduit.Zlib 138 | import Data.CSV.Conduit 139 | import Data.Default 140 | import Data.List 141 | import qualified Data.Map.Strict as M 142 | import Data.Monoid 143 | import Data.SafeCopy 144 | import Data.Serialize 145 | import Data.String 146 | import Data.String.Conv 147 | import Data.Text (Text) 148 | import qualified Data.Text as T 149 | import Data.Text.Encoding 150 | import Data.Time 151 | import qualified Data.Time.Locale.Compat as LC 152 | import Data.Typeable 153 | import Network.HostName 154 | import System.Environment 155 | import System.FilePath.Lens 156 | import System.IO 157 | import Text.Parsec 158 | ------------------------------------------------------------------------------- 159 | import Hadron.Basic hiding (mapReduce) 160 | import Hadron.Conduit 161 | import Hadron.Join 162 | import Hadron.Logger 163 | import Hadron.Protocol 164 | import Hadron.Run 165 | import Hadron.Run.Hadoop (mrsInput, mrsJobName, 166 | mrsNumReduce, mrsOutput) 167 | import Hadron.Types 168 | import Hadron.Utils 169 | ------------------------------------------------------------------------------- 170 | 171 | 172 | ------------------------------------------------------------------------------- 173 | echo :: (A.Applicative m, MonadIO m, LogItem a) => Severity -> a -> LogStr -> m () 174 | echo sev cxt msg = runLog $ logF cxt "Run.Hadoop" sev msg 175 | 176 | 177 | ------------------------------------------------------------------------------- 178 | echoInfo :: (Applicative m, MonadIO m, LogItem a) => a -> LogStr -> m () 179 | echoInfo = echo InfoS 180 | 181 | 182 | newtype SingleKey a = SingleKey { unKey :: a } 183 | deriving (Eq,Show,Read,Ord,Serialize) 184 | 185 | newtype WrapSerialize a = WrapSerialize { _getSerialized :: a } 186 | deriving (Eq,Show,Read,Ord,Serialize) 187 | 188 | newtype WrapSafeCopy a = WrapSafeCopy { _getSafeCopy :: a } 189 | deriving (Eq,Show,Read,Ord) 190 | deriveSafeCopy 1 'base ''WrapSafeCopy 191 | 192 | 
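-- A condensed sketch of how the pieces defined below fit together, taken
-- from examples/WordCountLocal.hs in this repository: a 'Controller' program
-- wires 'MapReduce' steps to 'Tap's with 'connect' and is handed to
-- 'hadoopMain' along with a run mode and a 'RerunStrategy':
--
-- > app :: Controller ()
-- > app = do
-- >   connect mr1 [source] target (Just "Counting word frequency")
-- >   connect mr2 [target] wordCountTarget (Just "Counting words")
-- >
-- > main :: IO ()
-- > main = hadoopMain [("app", app)] (LocalRun def) RSReRun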
193 | type Parser = Parsec [B.ByteString] () 194 | 195 | 196 | keyToken :: Parser B.ByteString 197 | keyToken = tokenPrim B.unpack (\pos _ _ -> incSourceColumn pos 1) Just 198 | 199 | 200 | fromCompKey :: MRKey a => [B.ByteString] -> Either ParseError a 201 | fromCompKey s = runParser keyParser () "Key Input" s 202 | 203 | 204 | class MRKey k where 205 | toCompKey :: k -> CompositeKey 206 | keyParser :: Parser k 207 | numKeys :: k -> Int 208 | 209 | instance MRKey () where 210 | toCompKey _ = [""] 211 | keyParser = keyToken >> return () 212 | numKeys _ = 1 213 | 214 | instance MRKey B.ByteString where 215 | toCompKey k = [k] 216 | keyParser = keyToken 217 | numKeys _ = 1 218 | 219 | instance MRKey CompositeKey where 220 | toCompKey ks = ks 221 | keyParser = many1 keyToken 222 | numKeys ks = length ks 223 | 224 | instance MRKey String where 225 | toCompKey = toCompKey . B.pack 226 | keyParser = B.unpack <$> keyToken 227 | numKeys _ = 1 228 | 229 | instance MRKey T.Text where 230 | toCompKey = toCompKey . encodeUtf8 231 | keyParser = decodeUtf8 <$> keyToken 232 | numKeys _ = 1 233 | 234 | instance MRKey Int where 235 | toCompKey = toCompKey . B.pack . show 236 | keyParser = keyParser >>= 237 | maybe (fail "Can't read Int MRKey") return . readMay 238 | numKeys _ = 1 239 | 240 | instance Serialize a => MRKey (WrapSerialize a) where 241 | toCompKey = toCompKey . (^. re pSerialize) . _getSerialized 242 | keyParser = do 243 | a <- (^? pSerialize) <$> keyParser 244 | maybe (fail "Can't decode WrapSerialize") (return . WrapSerialize) a 245 | numKeys _ = 1 246 | 247 | instance SafeCopy a => MRKey (WrapSafeCopy a) where 248 | toCompKey = toCompKey . (^. re pSafeCopy) . _getSafeCopy 249 | keyParser = do 250 | a <- (^? pSafeCopy) <$> keyParser 251 | maybe (fail "Can't decode WrapSerialize") (return . WrapSafeCopy) a 252 | numKeys _ = 1 253 | 254 | utcFormat :: String 255 | utcFormat = "%Y-%m-%d %H:%M:%S.%q" 256 | 257 | instance MRKey UTCTime where 258 | toCompKey = toCompKey . formatTime LC.defaultTimeLocale utcFormat 259 | keyParser = do 260 | # if MIN_VERSION_time (1, 5, 0) 261 | res <- parseTimeM True LC.defaultTimeLocale utcFormat <$> keyParser 262 | # else 263 | res <- parseTime LC.defaultTimeLocale utcFormat <$> keyParser 264 | #endif 265 | maybe (fail "Can't parse value as UTCTime") return res 266 | numKeys _ = 1 267 | 268 | instance (MRKey a, MRKey b) => MRKey (a,b) where 269 | toCompKey (a,b) = toCompKey a ++ toCompKey b 270 | keyParser = (,) <$> keyParser <*> keyParser 271 | numKeys (a,b) = numKeys a + numKeys b 272 | 273 | instance (MRKey a, MRKey b, MRKey c) => MRKey (a,b,c) where 274 | toCompKey (a,b,c) = toCompKey a ++ toCompKey b ++ toCompKey c 275 | keyParser = (,,) <$> keyParser <*> keyParser <*> keyParser 276 | numKeys (a,b,c) = numKeys a + numKeys b + numKeys c 277 | 278 | instance (MRKey a, MRKey b, MRKey c, MRKey d) => MRKey (a,b,c,d) where 279 | toCompKey (a,b,c,d) = toCompKey a ++ toCompKey b ++ toCompKey c ++ toCompKey d 280 | keyParser = (,,,) <$> keyParser <*> keyParser <*> keyParser <*> keyParser 281 | numKeys (a,b,c,d) = numKeys a + numKeys b + numKeys c + numKeys d 282 | 283 | 284 | 285 | ------------------------------------------------------------------------------- 286 | -- | Do something with m-r output before writing it to a tap. 
287 | (>.>) :: MapReduce a b -> Conduit b (ResourceT IO) c -> MapReduce a c 288 | (MapReduce o p m c r) >.> f = MapReduce o p m c r' 289 | where 290 | r' = case r of 291 | Left r'' -> Left $ r'' =$= f 292 | Right conv -> Right $ conv =$= f 293 | 294 | 295 | ------------------------------------------------------------------------------- 296 | -- | Do something with the m-r input before starting the map stage. 297 | (<.<) :: Conduit c (ResourceT IO) a -> MapReduce a b -> MapReduce c b 298 | f <.< (MapReduce o p m c r) = MapReduce o p (f =$= m) c r 299 | 300 | 301 | ------------------------------------------------------------------------------- 302 | -- | A packaged MapReduce step. Make one of these for each distinct 303 | -- map-reduce step in your overall 'Controller' flow. 304 | data MapReduce a b = forall k v. MRKey k => MapReduce { 305 | _mrOptions :: MROptions 306 | -- ^ Hadoop and MapReduce options affecting only this specific 307 | -- job. 308 | , _mrInPrism :: Prism' B.ByteString v 309 | -- ^ A serialization scheme for values between the map-reduce 310 | -- steps. 311 | , _mrMapper :: Mapper a k v 312 | , _mrCombiner :: Maybe (Reducer k v (k,v)) 313 | , _mrReducer :: Either (Reducer k v b) (Conduit v (ResourceT IO) b) 314 | -- ^ Either a reducer or a final value converter for a map-only 315 | -- MapReduce job. 316 | } 317 | 318 | -------------------------------------------------------------------------------- 319 | mrOptions :: Lens' (MapReduce a b) MROptions 320 | mrOptions f (MapReduce o p m c r) = (\ o' -> MapReduce o' p m c r) <$> f o 321 | 322 | 323 | -- | Tap is a data source/sink definition that *knows* how to serve 324 | -- records of type 'a'. 325 | -- 326 | -- It comes with knowledge on how to decode ByteString to target type 327 | -- and can be used both as a sink (to save data form MR output) or 328 | -- source (to feed MR programs). 329 | -- 330 | -- Usually, you just define the various data sources and destinations 331 | -- your MapReduce program is going to need: 332 | -- 333 | -- > customers = 'tap' "s3n://my-bucket/customers" (csvProtocol def) 334 | data Tap a = Tap 335 | { _tapLocation :: [FilePath] 336 | , _tapProto :: Protocol' a 337 | } 338 | makeLenses ''Tap 339 | 340 | 341 | -- | If two 'location's are the same, we consider two Taps equal. 342 | instance Eq (Tap a) where 343 | a == b = _tapLocation a == _tapLocation b 344 | 345 | 346 | -- | Construct a 'DataDef' 347 | tap :: FilePath -> Protocol' a -> Tap a 348 | tap fp p = Tap [fp] p 349 | 350 | taps :: [FilePath] -> Protocol' a -> Tap a 351 | taps fp p = Tap fp p 352 | 353 | 354 | ------------------------------------------------------------------------------- 355 | -- | Does given file belong to tap? 356 | belongsToTap :: Tap a -> FilePath -> Bool 357 | belongsToTap t fn = any (`isInfixOf` fn) stem 358 | where 359 | stem = map (takeWhile (/= '*')) (t ^. tapLocation) 360 | 361 | 362 | 363 | ------------------------------------------------------------------------------- 364 | concatTaps :: [Tap a] -> Tap a 365 | concatTaps ts = Tap locs newP 366 | where 367 | locs = concatMap _tapLocation ts 368 | newP = Protocol enc dec 369 | 370 | dec = do 371 | fn <- liftIO $ getFileName 372 | case find (flip belongsToTap fn) ts of 373 | Nothing -> error "Unexpected: Can't determine tap in concatTaps." 374 | Just t -> t ^. (tapProto . protoDec) 375 | 376 | enc = head ts ^. tapProto . 
protoEnc 377 | 378 | 379 | ------------------------------------------------------------------------------- 380 | -- | Given a tap directory, enumerate and load all files inside. 381 | -- Caution: This is meant only as a way to load small files, or else 382 | -- you'll fill up your memory. 383 | readTap :: RunContext -> Tap a -> IO [a] 384 | readTap rc t = do 385 | fs <- concat <$> forM (_tapLocation t) (hdfsLs rc) 386 | let chk fp = not (elem (fp ^. filePath . filename) [".", ".."]) && 387 | (fp ^. fileSize) > 0 388 | let fs' = filter chk fs 389 | runResourceT $ 390 | inp (map _filePath fs') 391 | =$= (t ^. tapProto . protoDec) 392 | $$ C.consume 393 | where 394 | 395 | policy = capDelay 10000000 $ 396 | exponentialBackoff 50000 <> limitRetries 10 397 | 398 | pullOne sem chan fp = 399 | bracket_ (waitQSem sem) (signalQSem sem) $ 400 | recoverAll policy $ const $ do 401 | a <- runResourceT $ hdfsCat rc fp $$ C.consume 402 | writeChan chan (Just (B.concat a)) 403 | 404 | inp :: [FilePath] -> Producer (ResourceT IO) B.ByteString 405 | inp fs = do 406 | sem <- liftIO $ newQSem 10 407 | chan <- liftIO newChan 408 | a <- liftIO $ async $ do 409 | mapConcurrently (pullOne sem chan) fs 410 | writeChan chan Nothing 411 | liftIO $ link a 412 | sourceChan chan 413 | 414 | 415 | 416 | ------------------------------------------------------------------------------- 417 | -- | Combine two taps intelligently into the Either sum type. 418 | -- 419 | -- Matches on the prefix path given as part of each tap. It would 420 | -- therefore fail to work properly on self joins where the same data 421 | -- location is used in both taps. 422 | mergeTaps :: Tap a -> Tap b -> Tap (Either a b) 423 | mergeTaps ta tb = Tap (_tapLocation ta ++ _tapLocation tb) newP 424 | where 425 | newP = Protocol enc dec 426 | 427 | dec = do 428 | fn <- liftIO getFileName 429 | if belongsToTap ta fn 430 | then (ta ^. tapProto . protoDec) =$= C.map Left 431 | else (tb ^. tapProto . protoDec) =$= C.map Right 432 | 433 | as = ta ^. (tapProto . protoEnc) 434 | bs = tb ^. (tapProto . protoEnc) 435 | 436 | enc = awaitForever $ \ res -> 437 | case res of 438 | Left a -> yield a =$= as 439 | Right b -> yield b =$= bs 440 | 441 | 442 | ------------------------------------------------------------------------------ 443 | -- | Conduit that takes in hdfs filenames and outputs the file 444 | -- contents. Will unpack .gz files automatically. 445 | readHdfsFile 446 | :: RunContext 447 | -> Conduit B.ByteString (ResourceT IO) (FilePath, B.ByteString) 448 | readHdfsFile settings = awaitForever $ \s3Uri -> do 449 | let uriStr = B.unpack s3Uri 450 | getFile = hdfsLocalStream settings uriStr 451 | outStream = if isSuffixOf "gz" uriStr 452 | then getFile =$= ungzip 453 | else getFile 454 | outStream =$= C.map (\ s -> (uriStr, s)) 455 | 456 | 457 | ------------------------------------------------------------------------------ 458 | -- | Tap for handling file lists. Hadoop can't process raw binary data 459 | -- because it splits on newlines. This tap allows you to get around that 460 | -- limitation by instead making your input a list of file paths that contain 461 | -- binary data. Then the file names get split by hadoop and each map job 462 | -- reads from those files as its first step. 
463 | fileListTap 464 | :: RunContext 465 | -> FilePath 466 | -- ^ A file containing a list of files to be used as input 467 | -> Tap (FilePath, B.ByteString) 468 | fileListTap settings loc = tap loc (Protocol enc dec) 469 | where 470 | enc = error "You should never use a fileListTap as output!" 471 | dec = linesConduit =$= readHdfsFile settings 472 | 473 | 474 | ------------------------------------------------------------------------------- 475 | -- | Sink objects into multiple output files through concurrent 476 | -- file-write processes behind the scenes. Work-around for Hadoop 477 | -- Streaming limitations in having to sink output into a single 478 | -- provided HDFS path. 479 | fanOutTap 480 | :: RunContext 481 | -> FilePath 482 | -- ^ Static location where fanout statistics will be written via 483 | -- the regular hadoop output. 484 | -> FilePath 485 | -- ^ A temporary location where in-progress files can be kept. 486 | -> (a -> FilePath) 487 | -- ^ Decision dispatch of where each object should go. Make sure 488 | -- to provide fully qualified hdfs directory paths; a unique token 489 | -- will be appended to each file based on the node producing it. 490 | -> Conduit a (ResourceT IO) B.ByteString 491 | -- ^ How to serialize each object. Make sure this conduit provides 492 | -- for all the typical requirements: One record per line, no 493 | -- newlines inside the record, etc. 494 | -> FanOutSink 495 | -- ^ How to sink the fanout, exposed here for flexibility. 496 | -> Tap a 497 | fanOutTap rc loc tmp dispatch encoder sink = tap loc (Protocol enc dec) 498 | where 499 | dec = error "fanOutTap can't be used to read input." 500 | 501 | enc = do 502 | hn <- liftIO mkUniqueHostToken 503 | let dispatch' a = dispatch a & basename %~ (<> "_" <> hn) 504 | fo <- liftIO $ hdfsFanOut rc tmp 505 | register $ liftIO $ fanCloseAll fo 506 | sink dispatch' conv fo 507 | stats <- liftIO $ fanStats fo 508 | (forM_ (M.toList stats) $ \ (fp, cnt) -> yield (map B.pack [fp, (show cnt)])) 509 | =$= fromCSV def 510 | 511 | conv a = liftM mconcat $ 512 | C.sourceList [a] =$= 513 | encoder $$ 514 | C.consume 515 | 516 | 517 | ------------------------------------------------------------------------------- 518 | mkUniqueHostToken :: IO String 519 | mkUniqueHostToken = do 520 | tk <- randomToken 64 521 | (toS . Base16.encode . toS . Crypto.hash . toS . (++ tk)) 522 | <$> getHostName 523 | 524 | 525 | newtype AppLabel = AppLabel T.Text 526 | deriving (Eq,Show,Read,Ord) 527 | 528 | 529 | ------------------------------------------------------------------------------- 530 | mkAppLabel :: T.Text -> AppLabel 531 | mkAppLabel txt 532 | | T.all chk (toS txt) = AppLabel txt 533 | | otherwise = error "Application labels can only be lowercase alphanumeric characters" 534 | where 535 | chk c = all ($ c) [isLower, isAlphaNum, not . isSpace] 536 | 537 | instance IsString AppLabel where fromString = mkAppLabel . toS 538 | 539 | 540 | 541 | data ContState = ContState { 542 | _csApp :: AppLabel 543 | , _csMRCount :: ! Int 544 | -- ^ MR run count; one for each 'connect'. 545 | , _csMRVars :: ! (M.Map String B.ByteString) 546 | -- ^ Arbitrary key-val store that's communicated to nodes. 547 | , _csDynId :: ! Int 548 | -- ^ Keeps increasing count of dynamic taps in the order they are 549 | -- created in the Controller monad. Needed so we can communicate 550 | -- the tap locations to MR nodes. 551 | , _csRunOnceId :: ! Int 552 | -- ^ Increasing count of run-once cache items so we can 553 | -- communicate to remote nodes. 
554 | , _csShortCircuit :: Bool 555 | -- ^ Used by the remote nodes. When they hit their primary target 556 | -- (the mapper, combiner or the reducer), they should stop 557 | -- executing. 558 | } 559 | 560 | makeLenses ''ContState 561 | 562 | 563 | instance Default ContState where 564 | def = ContState (AppLabel "_") 0 M.empty 0 0 False 565 | 566 | 567 | ------------------------------------------------------------------------------- 568 | -- | load controller varibles back up in worker nodes 569 | loadState 570 | :: (MonadState ContState m, MonadIO m) 571 | => RunContext 572 | -> FilePath 573 | -> m () 574 | loadState settings runToken = do 575 | fn <- hdfsTempFilePath settings runToken 576 | tmp <- liftIO $ hdfsGet settings fn 577 | (app, st) <- liftIO $ withLocalFile settings tmp $ \ local -> do 578 | !st <- readFile local <&> read 579 | -- removeFile local 580 | return st 581 | csMRVars %= M.union st 582 | csApp .= app 583 | 584 | 585 | ------------------------------------------------------------------------------- 586 | -- | Write state from orchestrator for later load by worker nodes 587 | writeState 588 | :: (MonadIO m, MonadState ContState m) 589 | => RunContext 590 | -> FilePath 591 | -> m () 592 | writeState settings runToken = do 593 | remote <- hdfsTempFilePath settings runToken 594 | let local = LocalFile runToken 595 | 596 | st <- use csMRVars 597 | app <- use csApp 598 | 599 | withLocalFile settings local $ \ lfp -> 600 | liftIO $ writeFile lfp (show (app, st)) 601 | 602 | -- put settings file into a file named after the 603 | -- randomly generated token. 604 | liftIO $ hdfsPut settings local remote 605 | 606 | 607 | 608 | 609 | ------------------------------------------------------------------------------- 610 | data ConI a where 611 | Connect :: forall i o. MapReduce i o 612 | -> [Tap i] -> Tap o 613 | -> Maybe String 614 | -> ConI () 615 | 616 | MakeTap :: Protocol' a -> ConI (Tap a) 617 | 618 | BinaryDirTap 619 | :: FilePath 620 | -> (FilePath -> Bool) 621 | -> ConI (Tap (FilePath, B.ByteString)) 622 | 623 | ConIO :: IO a -> ConI a 624 | -- GADTS don't support haddocks yet https://github.com/haskell/haddock/issues/43 625 | -- General IO action; both orchestrator and nodes perform the action 626 | 627 | OrchIO :: IO a -> ConI () 628 | -- Only the orchestrator performs action 629 | 630 | NodeIO :: IO a -> ConI a 631 | -- Only the nodes perform action 632 | 633 | SetVal :: String -> B.ByteString -> ConI () 634 | 635 | GetVal :: String -> ConI (Maybe B.ByteString) 636 | 637 | RunOnce :: Serialize a => IO a -> ConI a 638 | -- Only run on orchestrator, then make available to all the 639 | -- nodes via HDFS. 640 | 641 | 642 | -- | All MapReduce steps are integrated in the 'Controller' monad. 643 | -- 644 | -- Warning: We do have an 'io' combinator as an escape valve for you 645 | -- to use. However, you need to be careful how you use the result of 646 | -- an IO computation. Remember that the same 'main' function will run 647 | -- on both the main orchestrator process and on each and every 648 | -- map/reduce node. 649 | newtype Controller a = Controller (Program ConI a) 650 | deriving (Functor, Applicative, Monad) 651 | 652 | 653 | 654 | ------------------------------------------------------------------------------- 655 | -- | Connect a MapReduce program to a set of inputs, returning the 656 | -- output tap that was implicity generated (on hdfs) in the process. 
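-------------------------------------------------------------------------------
-- | Editor's example (not part of the original module): a small 'Controller'
-- flow wired together with the combinators defined below. A sketch only;
-- 'countStep', 'sumStep', 'resultTap' and 'runCtx' are hypothetical stand-ins
-- for real 'MapReduce' steps, an output 'Tap' and a 'RunContext'.
--
-- > wordCount :: Controller ()
-- > wordCount = do
-- >   docs   <- binaryDirTap "hdfs://data/docs" (".txt" `isSuffixOf`)
-- >   counts <- connect' countStep [docs] (csvProtocol def) (Just "count words")
-- >   connect sumStep [counts] resultTap (Just "sum counts")
-- >
-- > -- The same executable orchestrates the flow and runs as mapper/reducer
-- > -- on the nodes; see 'hadoopMain' further below.
-- > main :: IO ()
-- > main = hadoopMain [("wordcount", wordCount)] runCtx RSReRun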
657 | connect' 658 | :: MapReduce a b 659 | -- ^ MapReduce step to run 660 | -> [Tap a] 661 | -- ^ Input files 662 | -> Protocol' b 663 | -- ^ Serialization protocol to be used on the output 664 | -> Maybe String 665 | -- ^ A custom name for the job 666 | -> Controller (Tap b) 667 | connect' mr inp p nm = do 668 | out <- makeTap p 669 | connect mr inp out nm 670 | return out 671 | 672 | 673 | ------------------------------------------------------------------------------- 674 | -- | Connect a typed MapReduce program you supply with a list of 675 | -- sources and a destination. 676 | connect :: MapReduce a b -> [Tap a] -> Tap b -> Maybe String -> Controller () 677 | connect mr inp outp nm = Controller $ singleton $ Connect mr inp outp nm 678 | 679 | 680 | ------------------------------------------------------------------------------- 681 | makeTap :: Protocol' a -> Controller (Tap a) 682 | makeTap p = Controller $ singleton $ MakeTap p 683 | 684 | 685 | ------------------------------------------------------------------------------- 686 | -- | Set a persistent variable in Controller state. This variable will 687 | -- be set during main M-R job controller loop and communicated to all 688 | -- the map and reduce nodes and will be available there. 689 | setVal :: String -> B.ByteString -> Controller () 690 | setVal k v = Controller $ singleton $ SetVal k v 691 | 692 | 693 | ------------------------------------------------------------------------------- 694 | -- | Get varible from Controller state 695 | getVal :: String -> Controller (Maybe B.ByteString) 696 | getVal k = Controller $ singleton $ GetVal k 697 | 698 | 699 | ------------------------------------------------------------------------------- 700 | -- | Creates a tap for a directory of binary files. 701 | binaryDirTap 702 | :: FilePath 703 | -- ^ A root location to list files under 704 | -> (FilePath -> Bool) 705 | -- ^ A filter condition to refine the listing 706 | -> Controller (Tap (FilePath, B.ByteString)) 707 | binaryDirTap loc filt = Controller $ singleton $ BinaryDirTap loc filt 708 | 709 | 710 | ------------------------------------------------------------------------------- 711 | -- | Perform an IO operation both on the orchestrator and on the worker nodes. 712 | io :: IO a -> Controller a 713 | io f = Controller $ singleton $ ConIO f 714 | 715 | 716 | ------------------------------------------------------------------------------- 717 | -- | Perform an IO operation only on the orchestrator 718 | orchIO :: IO a -> Controller () 719 | orchIO = Controller . singleton . OrchIO 720 | 721 | 722 | -- | Perform an IO action in orchestrator to obtain value, then cache it on HDFS and 723 | -- magically make it available to nodes during their runtime. 724 | runOnce :: Serialize a => IO a -> Controller a 725 | runOnce = Controller . singleton . RunOnce 726 | 727 | 728 | ------------------------------------------------------------------------------- 729 | -- | Perform an IO operation only on the worker nodes. 730 | nodeIO :: IO a -> Controller a 731 | nodeIO = Controller . singleton . NodeIO 732 | 733 | 734 | ------------------------------------------------------------------------------- 735 | newMRKey :: MonadState ContState m => m String 736 | newMRKey = do 737 | i <- gets _csMRCount 738 | csMRCount %= (+1) 739 | return $! 
show i 740 | 741 | 742 | ------------------------------------------------------------------------------- 743 | -- | Grab list of files in destination, write into a file, put file on 744 | -- HDFS so it is shared and return the (local, hdfs) paths. 745 | setupBinaryDir 746 | :: RunContext 747 | -> FilePath 748 | -> (FilePath -> Bool) 749 | -> IO (LocalFile, FilePath) 750 | setupBinaryDir settings loc chk = do 751 | localFile <- randomLocalFile 752 | hdfsFile <- randomRemoteFile settings 753 | 754 | files <- hdfsLs settings loc <&> map _filePath 755 | let files' = filter chk files 756 | withLocalFile settings localFile $ \ f -> writeFile f (unlines files') 757 | 758 | hdfsPut settings localFile hdfsFile 759 | 760 | return (localFile, hdfsFile) 761 | 762 | 763 | tapLens :: Int -> Lens' ContState (Maybe B.ByteString) 764 | tapLens curId = csMRVars.at ("tap_" <> show curId) 765 | 766 | runCacheLens :: Int -> Lens' ContState (Maybe B.ByteString) 767 | runCacheLens curId = csMRVars.at ("runOnce_" <> show curId) 768 | 769 | 770 | pickTapId :: MonadState ContState m => m Int 771 | pickTapId = pickIdWith csDynId 772 | 773 | 774 | pickRunCacheId :: MonadState ContState m => m Int 775 | pickRunCacheId = pickIdWith csRunOnceId 776 | 777 | 778 | ------------------------------------------------------------------------------- 779 | -- | Monotonically increasing counter. 780 | pickIdWith :: MonadState ContState m => Lens' ContState Int -> m Int 781 | pickIdWith l = do 782 | curId <- use l 783 | l %= (+1) 784 | return curId 785 | 786 | 787 | ------------------------------------------------------------------------------- 788 | -- | Interpreter for the central job control process 789 | orchestrate 790 | :: (MonadMask m, MonadIO m, Applicative m) 791 | => Controller a 792 | -> RunContext 793 | -> RerunStrategy 794 | -> ContState 795 | -> m (Either Text a) 796 | orchestrate (Controller p) settings rr s = do 797 | bracket 798 | (liftIO $ openFile "hadron.log" AppendMode) 799 | (liftIO . hClose) 800 | (\_h -> do echoInfo () "Initiating orchestration..." 801 | evalStateT (runExceptT (go p)) s) 802 | where 803 | go = eval . O.view 804 | 805 | eval (Return a) = return a 806 | eval (i :>>= f) = eval' i >>= go .
f 807 | 808 | eval' :: (Functor m, MonadIO m) => ConI a -> ExceptT Text (StateT ContState m) a 809 | 810 | eval' (ConIO f) = liftIO f 811 | 812 | eval' (OrchIO f) = void $ liftIO f 813 | 814 | eval' (NodeIO _) = return (error "NodeIO can't be used in the orchestrator decision path") 815 | 816 | -- evaluate the function, write its result into HDFS for later retrieval 817 | eval' (RunOnce f) = do 818 | a <- liftIO f 819 | 820 | curId <- pickRunCacheId 821 | 822 | runCacheLens curId .= Just (encode a) 823 | 824 | -- loc <- liftIO $ randomRemoteFile settings 825 | -- curId <- pickRunCacheId 826 | -- runCacheLens curId .= Just (B.pack loc) 827 | 828 | -- tmp <- randomFileName 829 | -- liftIO $ withLocalFile settings tmp $ \ fn -> 830 | -- B.writeFile fn (encode a) 831 | -- liftIO $ hdfsPut settings tmp loc 832 | 833 | return a 834 | 835 | 836 | eval' (MakeTap tp) = do 837 | loc <- liftIO $ randomRemoteFile settings 838 | 839 | curId <- pickTapId 840 | tapLens curId .= Just (B.pack loc) 841 | 842 | return $ Tap [loc] tp 843 | 844 | eval' (BinaryDirTap loc filt) = do 845 | (_, hdfsFile) <- liftIO $ setupBinaryDir settings loc filt 846 | 847 | -- remember location of the file from the original loc 848 | -- string 849 | curId <- pickTapId 850 | tapLens curId .= Just (B.pack hdfsFile) 851 | 852 | return $ fileListTap settings hdfsFile 853 | 854 | 855 | eval' (SetVal k v) = csMRVars . at k .= Just v 856 | eval' (GetVal k) = use (csMRVars . at k) 857 | 858 | eval' (Connect (MapReduce mro _ _ _ rd) inp outp nm) = go' 859 | where 860 | go' = do 861 | mrKey <- newMRKey 862 | 863 | let info = sl "Key" mrKey <> sl "Name" nm 864 | 865 | echoInfo info "Launching MR job" 866 | 867 | chk <- liftIO $ mapM (hdfsFileExists settings) (_tapLocation outp) 868 | case any id chk of 869 | False -> do 870 | echoInfo info "Destination file does not exist. Proceeding." 871 | go'' mrKey 872 | True -> 873 | case rr of 874 | RSFail -> echo ErrorS info $ ls $ 875 | "Destination exists: " <> head (_tapLocation outp) 876 | RSSkip -> echoInfo info $ 877 | "Destination exists. Skipping " <> 878 | ls (intercalate ", " (_tapLocation outp)) 879 | RSReRun -> do 880 | echoInfo info $ ls $ 881 | "Destination file exists, will delete and rerun: " <> 882 | head (_tapLocation outp) 883 | _ <- liftIO $ mapM_ (hdfsDeletePath settings) (_tapLocation outp) 884 | go'' mrKey 885 | 886 | echoInfo info "MR job complete" 887 | 888 | 889 | go'' mrKey = do 890 | 891 | -- serialize current state to HDFS, to be read by 892 | -- individual mappers reducers of this step. 893 | runToken <- liftIO $ randomToken 64 894 | 895 | writeState settings runToken 896 | 897 | let mrs = mrOptsToRunOpts mro 898 | launchMapReduce settings mrKey runToken $ mrs 899 | & mrsInput .~ concatMap _tapLocation inp 900 | & mrsOutput .~ head (_tapLocation outp) 901 | & mrsJobName .~ nm 902 | & (if onlyMap then mrsNumReduce .~ Just 0 else id) 903 | 904 | onlyMap = case rd of 905 | Left{} -> False 906 | Right{} -> True 907 | 908 | 909 | data Phase = Map | Combine | Reduce 910 | 911 | 912 | ------------------------------------------------------------------------------- 913 | -- | What to do when we notice that a destination file already exists. 914 | data RerunStrategy 915 | = RSFail 916 | -- ^ Fail and log the problem. 917 | | RSReRun 918 | -- ^ Delete the file and rerun the analysis 919 | | RSSkip 920 | -- ^ Consider the analaysis already done and skip. 
921 | deriving (Eq,Show,Read,Ord) 922 | 923 | instance Default RerunStrategy where 924 | def = RSFail 925 | 926 | 927 | ------------------------------------------------------------------------------- 928 | -- | Decode key produced by the Map stage. Errors are simply raised as 929 | -- key marshalling errors are unacceptable. 930 | decodeKey :: MRKey k => (CompositeKey, v) -> (k, v) 931 | decodeKey (k,v) = (k', v) 932 | where 933 | k' = either mkErr id $ fromCompKey k 934 | mkErr e = error ("Stage could not decode Map's output: " ++ show e) 935 | 936 | 937 | encodeKey :: MRKey k => (k, v) -> (CompositeKey, v) 938 | encodeKey = first toCompKey 939 | 940 | 941 | 942 | data NodeError 943 | = NodeRunComplete 944 | -- ^ Single short circuiting in node workers; map/reduce/combine 945 | -- has been completed. 946 | deriving (Eq,Show,Read,Ord,Typeable) 947 | makePrisms ''NodeError 948 | 949 | instance Exception NodeError 950 | 951 | class AsNodeError t where 952 | _NodeError :: Prism' t NodeError 953 | 954 | instance AsNodeError NodeError where _NodeError = id 955 | instance AsNodeError SomeException where _NodeError = exception 956 | 957 | 958 | ------------------------------------------------------------------------------- 959 | -- | The main entry point. Use this function to produce a command line 960 | -- program that encapsulates everything. 961 | -- 962 | -- When run without arguments, the program will orchestrate the entire 963 | -- MapReduce job flow. The same program also doubles as the actual 964 | -- mapper/reducer executable when called with right arguments, though 965 | -- you don't have to worry about that. 966 | hadoopMain 967 | :: ( MonadThrow m, MonadMask m 968 | , MonadIO m, Functor m, Applicative m ) 969 | => [(AppLabel, Controller ())] 970 | -- ^ Hadoop streaming applications that can be run. First element 971 | -- of tuple is used to lookup the right application to run from 972 | -- the command line. 973 | -> RunContext 974 | -- ^ Hadoop environment info. 975 | -> RerunStrategy 976 | -- ^ What to do if destination files already exist. 977 | -> m () 978 | hadoopMain conts settings rr = do 979 | args <- liftIO getArgs 980 | case args of 981 | [nm] -> do 982 | let nm' = mkAppLabel (toS nm) 983 | case lookup nm' conts of 984 | Nothing -> error (show nm <> " is not a known MapReduce application") 985 | Just cont -> do 986 | _ <- orchestrate cont settings rr (def { _csApp = nm' }) 987 | echoInfo () ("Completed MR application " <> ls nm) 988 | [runToken, arg] -> workNode settings conts runToken arg 989 | _ -> error "You must provide the name of the MR application to initiate orchestration." 990 | 991 | 992 | 993 | 994 | ------------------------------------------------------------------------------- 995 | mkArgs :: IsString [a] => [a] -> [(Phase, [a])] 996 | mkArgs mrKey = [ (Map, "mapper_" ++ mrKey) 997 | , (Reduce, "reducer_" ++ mrKey) 998 | , (Combine, "combiner_" ++ mrKey) ] 999 | 1000 | 1001 | 1002 | ------------------------------------------------------------------------------- 1003 | -- | Interpret the Controller in the context of a Hadoop worker node. 1004 | -- In this mode, the objective is to find the mapper, combiner or the 1005 | -- reducer that we are supposed to be executing as. 1006 | workNode 1007 | :: forall m. 
(MonadIO m, MonadThrow m, MonadMask m, Functor m) 1008 | => RunContext 1009 | -> [(AppLabel, Controller ())] 1010 | -> String 1011 | -> String 1012 | -> m () 1013 | workNode settings conts runToken arg = do 1014 | handling (exception._NodeRunComplete) (const $ return ()) $ do 1015 | void $ flip evalStateT def $ do 1016 | loadState settings runToken 1017 | l <- use csApp 1018 | case lookup l conts of 1019 | Nothing -> error ("App not found in worker node: " <> show l) 1020 | Just (Controller p) -> interpretWithMonad go' p 1021 | where 1022 | 1023 | -- A short-circuiting wrapper for go. We hijack the exception 1024 | -- system to implement shortcircuting here. It may be a better 1025 | -- idea to use ContT. 1026 | go' :: ConI b -> StateT ContState m b 1027 | go' c = do 1028 | chk <- use csShortCircuit 1029 | case chk of 1030 | True -> throwM NodeRunComplete 1031 | False -> go c 1032 | 1033 | go :: ConI b -> StateT ContState m b 1034 | 1035 | go (ConIO f) = liftIO f 1036 | 1037 | go (OrchIO _) = return () 1038 | 1039 | go (NodeIO f) = liftIO f 1040 | 1041 | go (MakeTap lp) = do 1042 | curId <- pickTapId 1043 | dynLoc <- use $ tapLens curId 1044 | case dynLoc of 1045 | Nothing -> error $ 1046 | "Dynamic location can't be determined for MakTap at index " <> 1047 | show curId 1048 | Just loc' -> return $ Tap ([B.unpack loc']) lp 1049 | 1050 | go (BinaryDirTap loc _) = do 1051 | 1052 | -- remember location of the file from the original loc 1053 | -- string 1054 | curId <- pickTapId 1055 | dynLoc <- use $ tapLens curId 1056 | case dynLoc of 1057 | Nothing -> error $ 1058 | "Dynamic location can't be determined for BinaryDirTap at: " <> loc 1059 | Just loc' -> return $ fileListTap settings $ B.unpack loc' 1060 | 1061 | -- setting in map-reduce phase is a no-op... There's nobody to 1062 | -- communicate it to. 1063 | go (SetVal _ _) = return () 1064 | go (GetVal k) = use (csMRVars . at k) 1065 | 1066 | go (RunOnce _) = do 1067 | curId <- pickRunCacheId 1068 | bs <- use (runCacheLens curId) 1069 | 1070 | either error return $ 1071 | note "RunOnce cache missing on remote node" bs >>= decode 1072 | 1073 | go (Connect (MapReduce mro mrInPrism mp comb rd) inp outp nm) = do 1074 | mrKey <- newMRKey 1075 | 1076 | let dec = do 1077 | fn <- getFileName 1078 | let t = find (flip belongsToTap fn) inp 1079 | return $ case t of 1080 | Nothing -> head inp ^. tapProto . protoDec 1081 | Just t' -> t' ^. tapProto . protoDec 1082 | 1083 | let enc = outp ^. tapProto . protoEnc 1084 | 1085 | mp' = case rd of 1086 | Left _ -> mapRegular 1087 | Right conv -> do 1088 | setLineBuffering 1089 | dec' <- liftIO $ dec 1090 | runResourceT $ sourceHandle stdin 1091 | =$= dec' 1092 | =$= mp 1093 | =$= C.map snd 1094 | =$= conv 1095 | =$= enc 1096 | $$ sinkHandle stdout 1097 | 1098 | mapRegular = do 1099 | dec' <- liftIO dec 1100 | mapperWith mrInPrism (dec' =$= mp =$= C.map encodeKey) 1101 | 1102 | red = case rd of 1103 | Right _ -> error "Unexpected: Reducer called for a map-only job." 1104 | Left f -> do 1105 | setLineBuffering 1106 | runResourceT $ 1107 | reducer mro mrInPrism (C.map decodeKey =$= f) 1108 | =$= enc 1109 | $$ sinkHandle stdout 1110 | 1111 | comb' = case comb of 1112 | Nothing -> error "Unexpected: No combiner supplied." 
1113 | Just c -> combiner mro mrInPrism (C.map decodeKey =$= c =$= C.map encodeKey) 1114 | 1115 | -- error message maker for caught exceptions 1116 | mkErr :: Maybe FilePath -> String -> SomeException -> b 1117 | mkErr file stage e = error $ 1118 | "Exception raised during " <> stage <> 1119 | " in MR Job #" <> mrKey <> 1120 | maybe "" (\nm' -> " (" <> nm' <> ") ") nm <> 1121 | maybe "" (" while processing file " <>) file <> 1122 | ": " <> show e 1123 | 1124 | 1125 | case find ((== arg) . snd) $ mkArgs mrKey of 1126 | 1127 | Just (Map, _) -> do 1128 | liftIO $ do 1129 | curFile <- getFileName 1130 | catching exception mp' (mkErr (Just curFile) "mapper") 1131 | csShortCircuit .= True 1132 | 1133 | 1134 | Just (Reduce, _) -> do 1135 | liftIO $ catching exception red (mkErr Nothing "reducer") 1136 | csShortCircuit .= True 1137 | 1138 | 1139 | Just (Combine, _) -> do 1140 | liftIO $ catching exception comb' (mkErr Nothing "combiner") 1141 | csShortCircuit .= True 1142 | 1143 | Nothing -> return () 1144 | 1145 | 1146 | -- -- | TODO: See if this works. Objective is to increase type safety of 1147 | -- -- join inputs. Notice how we have an existential on a. 1148 | -- -- 1149 | -- -- A join definition that ultimately produces objects of type b. 1150 | -- data JoinDef b = forall a. JoinDef { 1151 | -- joinTap :: Tap a 1152 | -- , joinType :: JoinType 1153 | -- , joinMap :: Conduit a IO (JoinKey, b) 1154 | -- } 1155 | 1156 | 1157 | ------------------------------------------------------------------------------- 1158 | -- | A convenient way to express map-sde multi-way join operations 1159 | -- into a single data type. All you need to supply is the map 1160 | -- operation for each tap, the reduce step is assumed to be the 1161 | -- Monoidal 'mconcat'. 1162 | -- 1163 | -- 'joinMR' is probably easier to use if you can get by with an inner 1164 | -- join. 1165 | joinStep 1166 | :: forall k b a. 1167 | (Show b, Monoid b, Serialize b, 1168 | MRKey k) 1169 | => [(Tap a, JoinType, Mapper a k b)] 1170 | -- ^ Dataset definitions and how to map each dataset. 1171 | -> MapReduce a b 1172 | joinStep fs = MapReduce mro pSerialize mp Nothing (Left rd) 1173 | where 1174 | showBS = B.pack . show 1175 | n = numKeys (undefined :: k) 1176 | 1177 | mro = joinOpts { _mroPart = Partition (n+1) n } 1178 | 1179 | locations :: [FilePath] 1180 | locations = concatMap (view (_1 . tapLocation)) fs 1181 | 1182 | taps' :: [Tap a] 1183 | taps' = concatMap ((\t -> replicate (length (_tapLocation t)) t) . view _1) fs 1184 | 1185 | locations' = map B.pack locations 1186 | 1187 | dataSets :: [(FilePath, DataSet)] 1188 | dataSets = map (\ (loc, i) -> (loc, DataSet (showBS i))) $ 1189 | zip locations ([0..] 
:: [Int]) 1190 | 1191 | dsIx :: M.Map FilePath DataSet 1192 | dsIx = M.fromList dataSets 1193 | 1194 | tapIx :: M.Map DataSet (Tap a) 1195 | tapIx = M.fromList $ zip (map snd dataSets) taps' 1196 | 1197 | getTapDS :: Tap a -> [DataSet] 1198 | getTapDS t = mapMaybe (flip M.lookup dsIx) (_tapLocation t) 1199 | 1200 | 1201 | fs' :: [(DataSet, JoinType)] 1202 | fs' = concatMap (\ (t, jt, _) -> for (getTapDS t) $ \ ds -> (ds, jt) ) fs 1203 | for = flip map 1204 | 1205 | 1206 | -- | get dataset name from a given input filename 1207 | getDS nm = fromMaybe (error "Can't identify current tap from filename.") $ do 1208 | let nm' = B.pack nm 1209 | curLoc <- find (\l -> length (B.indices l nm') > 0) locations' 1210 | M.lookup (B.unpack curLoc) dsIx 1211 | 1212 | 1213 | -- | get the conduit for given dataset name 1214 | mkMap' ds = fromMaybe (error "Can't identify current tap in IX.") $ do 1215 | t <- M.lookup ds tapIx 1216 | cond <- find ((== t) . view _1) fs 1217 | return $ (cond ^. _3) =$= C.map (_1 %~ toCompKey) 1218 | 1219 | mp = joinMapper getDS mkMap' 1220 | 1221 | rd = joinReducer fs' 1222 | 1223 | 1224 | -- ------------------------------------------------------------------------------- 1225 | -- -- | A generic map-reduce function that should be good enough for most 1226 | -- -- cases. 1227 | -- mapReduce 1228 | -- :: forall a k v b. (MRKey k, Serialize v) 1229 | -- => (a -> MaybeT IO [(k, v)]) 1230 | -- -- ^ Common map key 1231 | -- -> (k -> b -> v -> IO b) 1232 | -- -- ^ Left fold in reduce stage 1233 | -- -> b 1234 | -- -- ^ A starting point for fold 1235 | -- -> MapReduce a (k,b) 1236 | -- mapReduce mp rd a0 = MapReduce mro pSerialize m Nothing r 1237 | -- where 1238 | -- n = numKeys (undefined :: k) 1239 | -- mro = def { _mroPart = Partition n n } 1240 | 1241 | -- m :: Mapper a k v 1242 | -- m = awaitForever $ \ a -> runMaybeT $ hoist (lift . lift) (mp a) >>= lift . C.sourceList 1243 | 1244 | -- r :: Reducer k v (k,b) 1245 | -- r = do 1246 | -- (k, b) <- C.foldM step (Nothing, a0) 1247 | -- case k of 1248 | -- Nothing -> return () 1249 | -- Just k' -> yield (k', b) 1250 | 1251 | 1252 | -- step (_, acc) (k, v) = do 1253 | -- !b <- liftIO $ rd k acc v 1254 | -- return (Just k, b) 1255 | 1256 | 1257 | -- ------------------------------------------------------------------------------- 1258 | -- -- | Deduplicate input objects that have the same key value; the first 1259 | -- -- object seen for each key will be kept. 1260 | -- firstBy 1261 | -- :: forall a k. (Serialize a, MRKey k) 1262 | -- => (a -> MaybeT IO [k]) 1263 | -- -- ^ Key making function 1264 | -- -> MapReduce a a 1265 | -- firstBy f = mapReduce mp rd Nothing >.> (C.map snd =$= C.catMaybes) 1266 | -- where 1267 | -- mp :: a -> MaybeT IO [(k, a)] 1268 | -- mp a = do 1269 | -- k <- f a 1270 | -- return $ zip k (repeat a) 1271 | 1272 | -- rd :: k -> Maybe a -> a -> IO (Maybe a) 1273 | -- rd _ Nothing a = return $! Just a 1274 | -- rd _ acc _ = return $! acc 1275 | 1276 | 1277 | -- ------------------------------------------------------------------------------- 1278 | -- -- | A generic map-only MR step. 
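-------------------------------------------------------------------------------
-- | Editor's example (not part of the original module): using 'joinStep'
-- defined above for a two-way map-side join. A sketch only; 'customersTap',
-- 'ordersTap', the key type 'CustomerId' (assumed to have an 'MRKey'
-- instance) and the monoidal 'CustomerView' payload are all hypothetical.
-- Note that every tap in a join must share the same input type.
--
-- > customerJoin :: MapReduce B.ByteString CustomerView
-- > customerJoin = joinStep
-- >   [ (customersTap, JRequired, customerMapper)  -- inner side
-- >   , (ordersTap,    JOptional, orderMapper)     -- outer side
-- >   ]
-- >
-- > -- Each mapper emits (join key, payload); payloads from all sides are
-- > -- 'mconcat'ed per key during the reduce.
-- > customerMapper, orderMapper :: Mapper B.ByteString CustomerId CustomerView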
1279 | -- mapMR :: (Serialize b) => (v -> IO [b]) -> MapReduce v b 1280 | -- mapMR f = MapReduce def pSerialize mp Nothing rd 1281 | -- where 1282 | -- mp = do 1283 | -- rng <- liftIO mkRNG 1284 | -- awaitForever $ \ a -> do 1285 | -- t <- liftIO $ randomToken 2 rng 1286 | -- res <- liftIO $ f a 1287 | -- mapM_ (\x -> yield (t, x)) res 1288 | -- rd = C.map snd 1289 | 1290 | 1291 | -- ------------------------------------------------------------------------------- 1292 | -- -- | Do somthing with only the first row we see, putting the result in 1293 | -- -- the given HDFS destination. 1294 | -- oneSnap 1295 | -- :: RunContext 1296 | -- -> FilePath 1297 | -- -> (a -> B.ByteString) 1298 | -- -> Conduit a IO a 1299 | -- oneSnap settings s3fp f = do 1300 | -- h <- await 1301 | -- case h of 1302 | -- Nothing -> return () 1303 | -- Just h' -> do 1304 | -- liftIO $ putHeaders (f h') 1305 | -- yield h' 1306 | -- awaitForever yield 1307 | -- where 1308 | -- putHeaders x = do 1309 | -- tmp <- randomFileName 1310 | -- withLocalFile settings tmp $ \ fn -> B.writeFile fn x 1311 | -- chk <- hdfsFileExists settings s3fp 1312 | -- when (not chk) $ void $ hdfsPut settings tmp s3fp 1313 | -- withLocalFile settings tmp removeFile 1314 | 1315 | 1316 | ------------------------------------------------------------------------------- 1317 | -- | Monoidal inner (map-side) join for two types. Each type is mapped 1318 | -- into the common monoid, which is then collapsed during reduce. 1319 | -- 1320 | -- Make sure an incoming 'Left' stays 'Left' and a 'Right' stays a 1321 | -- 'Right'. 1322 | -- 1323 | -- TODO: Wrap around this with a better API so the user doesn't have 1324 | -- to care. 1325 | joinMR 1326 | :: forall a b k v. (MRKey k, Monoid v, Serialize v) 1327 | => Mapper (Either a b) k (Either v v) 1328 | -- ^ Mapper for the input 1329 | -> MapReduce (Either a b) v 1330 | joinMR mp = MapReduce mro pSerialize mp' Nothing (Left red) 1331 | where 1332 | mro = def { _mroPart = Partition (n+1) n } 1333 | n = numKeys (undefined :: k) 1334 | 1335 | -- add to key so we know for sure all Lefts arrive before 1336 | -- Rights. 1337 | 1338 | mp' :: Mapper (Either a b) CompositeKey (Either v v) 1339 | mp' = mp =$= C.map modMap 1340 | 1341 | modMap (k, Left v) = (toCompKey k ++ ["1"], Left v) 1342 | modMap (k, Right v) = (toCompKey k ++ ["2"], Right v) 1343 | 1344 | -- cache lefts, start emitting upon seeing the first right. 1345 | red = go [] 1346 | where 1347 | go ls' = do 1348 | inc <- await 1349 | case inc of 1350 | Nothing -> return () 1351 | Just (_, Left r) -> go $! (r:ls') 1352 | Just (_, Right b) -> do 1353 | mapM_ yield [mappend a b | a <- ls'] 1354 | go ls' 1355 | -------------------------------------------------------------------------------- /src/Hadron/Join.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE RankNTypes #-} 5 | {-# LANGUAGE RecordWildCards #-} 6 | 7 | module Hadron.Join 8 | ( 9 | 10 | DataDefs 11 | , DataSet (..) 12 | , JoinType (..) 13 | , JoinKey 14 | 15 | , joinMain 16 | , joinMapper 17 | , joinReducer 18 | , joinOpts 19 | 20 | -- * TODO: To be put into an Internal module 21 | , JoinAcc (..) 
22 | , bufToStr 23 | 24 | ) where 25 | 26 | ------------------------------------------------------------------------------- 27 | import Control.Lens 28 | import qualified Data.ByteString.Char8 as B 29 | import Data.Conduit 30 | import qualified Data.Conduit.List as C 31 | import Data.Default 32 | import Data.Hashable 33 | import qualified Data.HashMap.Strict as HM 34 | import Data.List 35 | import Data.Monoid as M 36 | import Data.Ord 37 | import Data.Serialize 38 | import Data.String 39 | import qualified Data.Vector as V 40 | import GHC.Generics 41 | ------------------------------------------------------------------------------- 42 | import Hadron.Basic 43 | ------------------------------------------------------------------------------- 44 | 45 | 46 | type DataDefs = [(DataSet, JoinType)] 47 | 48 | data JoinType = JRequired | JOptional 49 | deriving (Eq,Show,Read,Ord) 50 | 51 | 52 | newtype DataSet = DataSet { getDataSet :: B.ByteString } 53 | deriving (Eq,Show,Read,Ord,Serialize,Generic,Hashable,IsString) 54 | 55 | type JoinKey = B.ByteString 56 | 57 | 58 | -- | We are either buffering input rows or have started streaming, as 59 | -- we think we're now receiving the last table we were expecting. 60 | data JoinAcc a = 61 | Buffering { 62 | bufData :: ! (HM.HashMap DataSet [a]) 63 | -- ^ Buffer of in-memory retained data. We have to retain (n-1) of 64 | -- the input datasets and we can then start emitting rows in 65 | -- constant-space for the nth dataset. 66 | } 67 | | Streaming { strStems :: V.Vector a } 68 | 69 | deriving (Eq,Show) 70 | 71 | 72 | instance Default (JoinAcc a) where 73 | def = Buffering mempty 74 | 75 | 76 | 77 | ------------------------------------------------------------------------------- 78 | -- | Convert a buffering state to a ready-to-stream state. Once this 79 | -- conversion is done, we'll start emitting output rows immediately 80 | -- and in constant space. 81 | bufToStr 82 | :: Monoid a 83 | => DataDefs 84 | -- ^ Table definitions for the current join 85 | -> JoinAcc a 86 | -- ^ Buffering 87 | -> JoinAcc a 88 | -- ^ Streaming 89 | bufToStr defs Buffering{..} = Streaming rs 90 | where 91 | rs = V.fromList $ maybe [] (map M.mconcat . sequence) groups 92 | 93 | -- | Maybe will reduce to Nothing if any of the Inner joins is 94 | -- missing. 95 | groups = mapM (flip HM.lookup data' . fst) defs 96 | 97 | data' = foldl' step bufData defs 98 | 99 | step m (_, JRequired) = m 100 | step m (ds, JOptional) = HM.insertWith insMissing ds [mempty] m 101 | 102 | insMissing new [] = new 103 | insMissing _ old = old 104 | bufToStr _ _ = error "bufToStr can only convert a Buffering to a Streaming" 105 | 106 | 107 | -- | Given a new row in the final dataset of the joinset, emit all the 108 | -- joined rows immediately. 109 | emitStream :: (Monad m, Monoid b) => JoinAcc b -> b -> ConduitM i b m () 110 | emitStream Streaming{..} a = V.mapM_ (yield . mappend a) strStems 111 | emitStream _ _ = error "emitStream can't be called unless it's in Streaming mode." 
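-------------------------------------------------------------------------------
-- | Editor's example (not part of the original module): how 'bufToStr'
-- behaves once buffering ends, evaluated by hand from the definitions above.
-- A sketch; the dataset names are made up and rows are plain @[String]@
-- lists for illustration.
--
-- > defs :: DataDefs
-- > defs = [ (DataSet "customers", JRequired)
-- >        , (DataSet "orders",    JOptional) ]
-- >
-- > -- Only the required table has buffered rows; the optional one is padded
-- > -- with 'mempty', so one stem survives:
-- > bufToStr defs (Buffering (HM.fromList [(DataSet "customers", [["alice"]])]))
-- >   == Streaming (V.fromList [["alice"]])
-- >
-- > -- Had a *required* table been missing instead, 'sequence' would collapse
-- > -- the groups to 'Nothing' and no rows would ever be emitted.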
112 | 113 | 114 | ------------------------------------------------------------------------------- 115 | joinOpts :: MROptions 116 | joinOpts = def { _mroPart = (Partition 2 1) } 117 | 118 | 119 | ------------------------------------------------------------------------------- 120 | -- | Make join reducer from given table definitions 121 | joinReducer 122 | :: (Show r, Monoid r) 123 | => [(DataSet, JoinType)] 124 | -- ^ Table definitions 125 | -> Reducer CompositeKey r r 126 | joinReducer fs = red def 127 | where 128 | red ja = do 129 | next <- await 130 | case next of 131 | Nothing -> joinFinalize fs ja 132 | Just x -> do 133 | ja' <- joinReduceStep fs ja x 134 | red $! ja' 135 | 136 | 137 | ------------------------------------------------------------------------------- 138 | joinFinalize 139 | :: (Monad m, Monoid b) 140 | => [(DataSet, JoinType)] 141 | -> JoinAcc b 142 | -> ConduitM i b m () 143 | 144 | -- we're still in buffering, so nothing has been emitted yet. one of 145 | -- the tables (and definitely the last table) did not have any input 146 | -- at all. we'll try to emit in case the last table is not a required 147 | -- table. 148 | -- 149 | -- notice that unlike other calls to bufToStr, we include ALL the 150 | -- tables here so that if the last table was required, it'll all 151 | -- collapse to an empty list and nothing will be emitted. 152 | joinFinalize fs buf@Buffering{} = 153 | let str = bufToStr fs buf 154 | in emitStream str mempty 155 | 156 | -- we're already in streaming, so we've been emitting output in 157 | -- real-time. nothing left to do at this point. 158 | joinFinalize _ Streaming{} = return () 159 | 160 | 161 | 162 | ------------------------------------------------------------------------------- 163 | -- | Make a step function for a join operation 164 | joinReduceStep 165 | :: (Monad m, Monoid b) 166 | => DataDefs 167 | -> JoinAcc b 168 | -> (CompositeKey, b) 169 | -> ConduitM i b m (JoinAcc b) 170 | joinReduceStep fs buf@Buffering{..} (k, x) = 171 | 172 | -- Accumulate until you start seeing the last table. We'll start 173 | -- emitting immediately after that. 174 | case ds' == lastDataSet of 175 | False -> -- traceShow accumulate $ 176 | return $! accumulate 177 | True -> 178 | let xs = filter ((/= ds') . fst) fs 179 | in joinReduceStep fs (bufToStr xs buf) (k,x) 180 | 181 | where 182 | 183 | fs' = sortBy (comparing fst) fs 184 | 185 | lastDataSet = fst $ last fs' 186 | 187 | accumulate = 188 | Buffering { bufData = HM.insertWith add ds' [x] bufData 189 | } 190 | 191 | add new old = new ++ old 192 | ds = last k 193 | ds' = DataSet ds 194 | 195 | joinReduceStep _ str@Streaming{} (_,x) = emitStream str x >> return str 196 | 197 | 198 | 199 | -- | Helper for easy construction of specialized join mapper. 200 | -- 201 | -- This mapper identifies the active dataset from the currently 202 | -- streaming filename and uses filename to determine how the mapping 203 | -- shoudl be done. 
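-------------------------------------------------------------------------------
-- | Editor's example (not part of the original module): the two functions a
-- caller typically hands to 'joinMapper' below. A sketch only; the dataset
-- names, the 'CustomerView' monoid and the helpers 'customerId',
-- 'fromCustomer' and 'fromOrder' are hypothetical.
--
-- > nameDataSet :: String -> DataSet
-- > nameDataSet fp
-- >   | "customers" `isInfixOf` fp = DataSet "customers"
-- >   | otherwise                  = DataSet "orders"
-- >
-- > -- Map every dataset into the same (join key, payload) shape; the key is
-- > -- a 'CompositeKey', i.e. a list of ByteString segments.
-- > toCommon :: DataSet -> Mapper B.ByteString CompositeKey CustomerView
-- > toCommon (DataSet "customers") = C.map (\ ln -> ([customerId ln], fromCustomer ln))
-- > toCommon _                     = C.map (\ ln -> ([customerId ln], fromOrder ln))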
204 | joinMapper 205 | :: (String -> DataSet) 206 | -- ^ Infer dataset from current filename 207 | -> (DataSet -> Mapper a CompositeKey r) 208 | -- ^ Given a dataset, map it to a common data type 209 | -> Mapper a CompositeKey r 210 | joinMapper getDS mkMap = do 211 | fi <- getFileName 212 | let ds = getDS fi 213 | mkMap ds =$= C.map (go ds) 214 | where 215 | go ds (jk, a) = (jk ++ [getDataSet ds], a) 216 | 217 | 218 | 219 | ------------------------ 220 | -- A Main Application -- 221 | ------------------------ 222 | 223 | 224 | ------------------------------------------------------------------------------- 225 | -- | Make a stand-alone program that can act as a mapper and reducer, 226 | -- performing the join defined here. 227 | -- 228 | -- For proper higher level operation, see the 'Controller' module. 229 | joinMain 230 | :: (Serialize r, Monoid r, Show r) 231 | => DataDefs 232 | -- ^ Define your tables 233 | -> (String -> DataSet) 234 | -- ^ Infer dataset from input filename 235 | -> (DataSet -> Mapper B.ByteString CompositeKey r) 236 | -- ^ Map input stream to a join key and the common-denominator 237 | -- uniform data type we know how to 'mconcat'. 238 | -> Prism' B.ByteString r 239 | -- ^ Choose serialization method for final output. 240 | -> IO () 241 | joinMain fs getDS mkMap out = mapReduceMain joinOpts pSerialize mp rd 242 | where 243 | mp = joinMapper getDS mkMap 244 | 245 | rd = joinReducer fs =$= C.mapMaybe (firstOf (re out)) 246 | -------------------------------------------------------------------------------- /src/Hadron/Logger.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE RecordWildCards #-} 3 | 4 | module Hadron.Logger 5 | ( module Hadron.Logger 6 | , module Katip 7 | ) where 8 | 9 | ------------------------------------------------------------------------------- 10 | import Control.Monad.Trans 11 | import Data.IORef 12 | import Katip 13 | import System.IO 14 | import System.IO.Unsafe 15 | ------------------------------------------------------------------------------- 16 | 17 | 18 | runLog :: MonadIO m => KatipT m b -> m b 19 | runLog m = liftIO (readIORef _ioLogger) >>= flip runKatipT m 20 | 21 | 22 | 23 | ------------------------------------------------------------------------------- 24 | _ioLogger :: IORef LogEnv 25 | _ioLogger = unsafePerformIO $ do 26 | le <- initLogEnv "hadron" "-" 27 | hSetBuffering stderr LineBuffering 28 | s <- mkHandleScribe ColorIfTerminal stderr InfoS V3 29 | newIORef =<< registerScribe "stderr" s defaultScribeSettings le 30 | {-# NOINLINE _ioLogger #-} 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/Hadron/OutputFixer.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | ------------------------------------------------------------------------------- 4 | import Data.Conduit 5 | import Data.Conduit.Binary 6 | import Data.Conduit.List 7 | import System.IO 8 | ------------------------------------------------------------------------------- 9 | import Hadron 10 | ------------------------------------------------------------------------------- 11 | 12 | 13 | main :: IO () 14 | main = runResourceT $ 15 | sourceHandle stdin $= 16 | (protoDec linesProtocol) $= 17 | (protoEnc linesProtocol) $$ 18 | sinkHandle stdout 19 | -------------------------------------------------------------------------------- /src/Hadron/Protocol.hs: 
-------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE RankNTypes #-} 4 | {-# LANGUAGE TemplateHaskell #-} 5 | 6 | module Hadron.Protocol 7 | ( 8 | Protocol (..) 9 | , Protocol' 10 | , protoEnc, protoDec 11 | , prismToProtocol 12 | , filterP 13 | 14 | , base64SerProtocol 15 | , base64SafeCopyProtocol 16 | , idProtocol 17 | , linesProtocol 18 | , gzipProtocol 19 | , showProtocol 20 | , csvProtocol 21 | 22 | -- * Serialization Prisms 23 | , pSerialize 24 | , pSafeCopy 25 | , pShow 26 | 27 | -- * Serialization Utils 28 | , serialize 29 | , deserialize 30 | 31 | -- * Utils 32 | , lineC 33 | , linesConduit 34 | , mkKey 35 | 36 | , eitherPrism 37 | , eitherProtocol 38 | 39 | 40 | ) where 41 | 42 | ------------------------------------------------------------------------------- 43 | import Blaze.ByteString.Builder 44 | import Control.Applicative 45 | import Control.Category 46 | 47 | import Control.Error 48 | import Control.Lens 49 | import Control.Monad 50 | import Control.Monad.Trans.Resource 51 | import Data.Attoparsec.ByteString.Char8 (Parser, endOfLine, takeTill) 52 | import qualified Data.ByteString.Base64 as Base64 53 | import qualified Data.ByteString.Char8 as B 54 | import Data.Conduit 55 | import Data.Conduit.Attoparsec 56 | import Data.Conduit.Blaze 57 | import qualified Data.Conduit.List as C 58 | import Data.Conduit.Zlib (gzip, ungzip) 59 | import Data.CSV.Conduit 60 | import Data.Monoid 61 | import qualified Data.SafeCopy as SC 62 | import qualified Data.Serialize as Ser 63 | import Data.String 64 | import Prelude hiding (id, (.)) 65 | ------------------------------------------------------------------------------- 66 | import Hadron.Types 67 | ------------------------------------------------------------------------------- 68 | 69 | 70 | ------------------- 71 | -- Serialization -- 72 | ------------------- 73 | 74 | 75 | -- | Like 'Protocol' but fixes the source format as 'ByteString'. 76 | type Protocol' a = Protocol B.ByteString a 77 | 78 | 79 | -- | A 'Protocol's is a serialization strategy when we're dealing with 80 | -- streams of records, allowing for arbitrary data formats, delimiters 81 | -- and other potential obstructions. 82 | -- 83 | -- Most of the time we'll be using 'Protocol\''s. 84 | data Protocol b a = Protocol { 85 | _protoEnc :: Conduit a (ResourceT IO) b 86 | , _protoDec :: Conduit b (ResourceT IO) a 87 | } 88 | makeLenses ''Protocol 89 | 90 | 91 | instance Category Protocol where 92 | id = Protocol (C.map id) (C.map id) 93 | p1 . p2 = Protocol { _protoEnc = _protoEnc p1 =$= _protoEnc p2 94 | , _protoDec = _protoDec p2 =$= _protoDec p1} 95 | 96 | 97 | -- ------------------------------------------------------------------------------- 98 | -- -- | Get an InputStream based encode action out of the protocol. 99 | -- protoEncIS :: Protocol b a -> InputStream a -> IO (InputStream b) 100 | -- protoEncIS p is = do 101 | -- (is', os') <- S.makeChanPipe 64 102 | -- os'' <- (p ^. protoEnc) os' 103 | -- async $ S.connect is os'' 104 | -- return is' 105 | 106 | 107 | ------------------------------------------------------------------------------- 108 | -- | Filter elements in both directions of the protocol. 
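-------------------------------------------------------------------------------
-- | Editor's example (not part of the original module): 'Protocol' values
-- compose with '(.)' from "Control.Category" (this module hides the Prelude
-- version; user code needs the same imports). A sketch: gzip on the wire,
-- Base64-serialized records inside.
--
-- > gzSerProtocol :: Ser.Serialize a => Protocol' a
-- > gzSerProtocol = base64SerProtocol . gzipProtocol
-- >
-- > -- Encoding turns records into Base64 lines and then gzips them;
-- > -- decoding runs the same pipeline in reverse (ungzip, then parse lines).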
109 | filterP :: (a -> Bool) -> Protocol b a -> Protocol b a 110 | filterP f (Protocol enc dec) = Protocol enc' dec' 111 | where 112 | enc' = C.filter f =$= enc 113 | dec' = dec =$= C.filter f 114 | 115 | 116 | 117 | ------------------------------------------------------------------------------- 118 | -- | Lift 'Prism' to work with a newline-separated stream of objects. 119 | -- 120 | -- It is assumed that the prism you supply to this function does not 121 | -- add newlines itself. You need to make them newline-free for this to 122 | -- work properly. 123 | prismToProtocol :: Prism' B.ByteString a -> Protocol' a 124 | prismToProtocol p = 125 | Protocol { _protoEnc = C.map (review p) =$= write 126 | , _protoDec = linesConduit =$= C.map (mkErr . preview p) } 127 | where 128 | write = C.map (\x -> fromByteString x `mappend` nl) =$= 129 | builderToByteString 130 | nl = fromByteString "\n" 131 | mkErr = fromMaybe $ 132 | error "Unexpected: Prism could not decode incoming value." 133 | 134 | 135 | ------------------------------------------------------------------------------- 136 | -- | Basically 'id' from Control.Category. Just pass the incoming 137 | -- ByteString through. 138 | idProtocol :: Protocol' B.ByteString 139 | idProtocol = id 140 | 141 | 142 | -- | A simple serialization strategy that works on lines of strings. 143 | linesProtocol :: Protocol' B.ByteString 144 | linesProtocol = Protocol { _protoEnc = C.map (\x -> B.concat [x, "\n"]) 145 | , _protoDec = linesConduit } 146 | 147 | 148 | ------------------------------------------------------------------------------- 149 | -- | Channel the 'Serialize' instance through 'Base64' encoding to 150 | -- make it newline-safe, then turn into newline-separated stream. 151 | base64SerProtocol :: Ser.Serialize a => Protocol' a 152 | base64SerProtocol = prismToProtocol pSerialize 153 | 154 | 155 | ------------------------------------------------------------------------------- 156 | -- | Channel the 'Serialize' instance through 'Base64' encoding to 157 | -- make it newline-safe, then turn into newline-separated stream. 158 | base64SafeCopyProtocol :: SC.SafeCopy a => Protocol' a 159 | base64SafeCopyProtocol = prismToProtocol pSafeCopy 160 | 161 | 162 | ------------------------------------------------------------------------------- 163 | -- | Encode and decode a gzip stream 164 | gzipProtocol :: Protocol B.ByteString B.ByteString 165 | gzipProtocol = Protocol gzip ungzip 166 | 167 | 168 | ------------------------------------------------------------------------------- 169 | -- | Protocol for converting to/from any stream type 'b' and CSV type 170 | -- 'a'. 171 | csvProtocol :: (CSV b a) => CSVSettings -> Protocol b a 172 | csvProtocol cset = Protocol (fromCSV cset) (intoCSV cset) 173 | 174 | 175 | ------------------------------------------------------------------------------- 176 | -- | Use 'Show'/'Read' instances to stream-serialize. You must be 177 | -- careful not to have any newline characters inside, or the stream 178 | -- will get confused. 179 | -- 180 | -- This is meant for debugging more than anything. Do not use it in 181 | -- serious matters. Use 'serProtocol' instead. 182 | showProtocol :: (Read a, Show a) => Protocol' a 183 | showProtocol = prismToProtocol pShow 184 | 185 | 186 | -- | Helper for reliable serialization through Base64 encoding so it 187 | -- is newline-free. 188 | serialize :: Ser.Serialize a => a -> B.ByteString 189 | serialize = Base64.encode . 
Ser.encode 190 | 191 | 192 | -- | Helper for reliable deserialization 193 | deserialize :: Ser.Serialize c => B.ByteString -> Either String c 194 | deserialize = Ser.decode <=< Base64.decode 195 | 196 | 197 | -- | Serialize with the 'Serialize' instance coupled with Base64 198 | -- encoding, so it's free of restrictied characters. 199 | -- 200 | -- Any 'Prism' can be used as follows: 201 | -- 202 | -- >>> import Control.Lens 203 | -- 204 | -- To decode ByteString into target object: 205 | -- 206 | -- >>> firstOf myPrism byteStr 207 | -- Just a 208 | -- 209 | -- To encode an object into ByteString: 210 | -- 211 | -- >>> firstOf (re myPrism) myObject 212 | -- Just byteStr 213 | pSerialize :: Ser.Serialize a => Prism' B.ByteString a 214 | pSerialize = prism' serialize (hush . deserialize) 215 | 216 | 217 | ------------------------------------------------------------------------------- 218 | pSafeCopy :: SC.SafeCopy a => Prism' B.ByteString a 219 | pSafeCopy = prism' 220 | (Base64.encode . Ser.runPut . SC.safePut) 221 | (hush . (Ser.runGet SC.safeGet <=< Base64.decode)) 222 | 223 | 224 | -- | Serialize with the Show/Read instances 225 | pShow :: (Show a, Read a) => Prism' B.ByteString a 226 | pShow = prism 227 | (B.pack . show) 228 | (\x -> maybe (Left x) Right . readMay . B.unpack $ x) 229 | 230 | 231 | -- | Parse a line of input and eat a tab character that may be at the 232 | -- very end of the line. This tab is put by hadoop if this file is the 233 | -- result of a previous M/R that didn't have any value in the reduce 234 | -- step. 235 | parseLine :: Parser B.ByteString 236 | parseLine = ln <* endOfLine 237 | where 238 | ln = do 239 | x <- takeTill (== '\n') 240 | return $ if B.length x > 0 && B.last x == '\t' 241 | then B.init x 242 | else x 243 | 244 | 245 | -- | Turn incoming stream into a stream of lines. This will 246 | -- automatically eat tab characters at the end of the line. 247 | linesConduit :: MonadThrow m => Conduit B.ByteString m B.ByteString 248 | linesConduit = conduitParser parseLine =$= C.map snd 249 | 250 | 251 | -- | Parse lines of (key,value) for hadoop reduce stage 252 | lineC :: MonadThrow m 253 | => Int 254 | -- ^ Number of key segments (usually just 1), but may be higher 255 | -- if you're using multiple parts in your key. 256 | -> Conduit B.ByteString m (CompositeKey, B.ByteString) 257 | lineC n = linesConduit =$= C.map ppair 258 | where 259 | ppair line = (k, v) 260 | where 261 | k = take n spl 262 | v = B.intercalate "\t" $ drop n spl 263 | -- ^ Re-assemble remaining segments to restore 264 | -- correctness. 265 | spl = B.split '\t' line 266 | 267 | 268 | 269 | ------------------------------------------------------------------------------- 270 | -- | Only works when a and b are disjoint types. 
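-------------------------------------------------------------------------------
-- | Editor's example (not part of the original module): the usual way to get
-- a newline-safe 'Protocol'' for your own record type is to go through
-- 'pSerialize' (Base64-encoded 'Serialize'). A sketch; the 'Visit' type is
-- hypothetical and assumes a 'Generic'-derived cereal instance (requires
-- DeriveGeneric and an import of "GHC.Generics").
--
-- > data Visit = Visit { vUser :: B.ByteString, vCount :: Int }
-- >   deriving (Show, Generic)
-- > instance Ser.Serialize Visit
-- >
-- > visitProtocol :: Protocol' Visit
-- > visitProtocol = prismToProtocol pSerialize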
271 | eitherPrism :: Prism' B.ByteString a -> Prism' B.ByteString b -> Prism' B.ByteString (Either a b) 272 | eitherPrism f g = prism' enc dec 273 | where 274 | enc (Left a) = review f a 275 | enc (Right b) = review g b 276 | 277 | dec bs = (preview f bs <&> Left) 278 | `mplus` 279 | (preview g bs <&> Right) 280 | 281 | 282 | ------------------------------------------------------------------------------- 283 | eitherProtocol 284 | :: Prism' B.ByteString a 285 | -> Prism' B.ByteString b 286 | -> Protocol' (Either a b) 287 | eitherProtocol f g = prismToProtocol (eitherPrism f g) 288 | 289 | 290 | 291 | -------------------------------------------------------------------------------- /src/Hadron/Run.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses #-} 2 | {-# LANGUAGE NoMonomorphismRestriction #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE RankNTypes #-} 5 | {-# LANGUAGE RecordWildCards #-} 6 | {-# LANGUAGE TemplateHaskell #-} 7 | 8 | ----------------------------------------------------------------------------- 9 | -- | 10 | -- Module : 11 | -- Copyright : 12 | -- License : 13 | -- 14 | -- Maintainer : 15 | -- Stability : experimental 16 | -- 17 | -- An operational run layer that either passes commands on to hadoop 18 | -- or runs things locally. 19 | ---------------------------------------------------------------------------- 20 | 21 | 22 | module Hadron.Run 23 | ( RunContext (..) 24 | , L.LocalRunSettings (..) 25 | , H.HadoopEnv (..) 26 | , H.clouderaDemo 27 | , H.amazonEMR 28 | 29 | , H.PartitionStrategy (..) 30 | , H.numSegs 31 | , H.eqSegs 32 | , H.Comparator (..) 33 | 34 | , H.HadoopRunOpts (..) 35 | , H.mrSettings 36 | 37 | , H.Codec 38 | , H.gzipCodec 39 | , H.snappyCodec 40 | 41 | , launchMapReduce 42 | 43 | , hdfsTempFilePath 44 | , hdfsFileExists 45 | , hdfsDeletePath 46 | , hdfsLs 47 | , hdfsPut 48 | , hdfsMkdir 49 | , hdfsCat 50 | , hdfsGet 51 | , hdfsLocalStream 52 | 53 | , randomRemoteFile 54 | , randomLocalFile 55 | 56 | , L.LocalFile (..) 57 | , L.randomFileName 58 | , withLocalFile 59 | , withRandomLocalFile 60 | 61 | , module Hadron.Run.FanOut 62 | , hdfsFanOut 63 | 64 | ) where 65 | 66 | 67 | ------------------------------------------------------------------------------- 68 | import Control.Error 69 | import Control.Lens 70 | import Control.Monad 71 | import Control.Monad.Morph 72 | import Control.Monad.Trans 73 | import Control.Monad.Trans.Resource 74 | import qualified Data.ByteString.Char8 as B 75 | import Data.Conduit 76 | import Data.Conduit.Binary (sourceFile) 77 | import Data.Text (Text) 78 | import System.Directory 79 | import System.FilePath.Posix 80 | ------------------------------------------------------------------------------- 81 | import Hadron.Run.FanOut 82 | import qualified Hadron.Run.Hadoop as H 83 | import Hadron.Run.Local (LocalFile (..)) 84 | import qualified Hadron.Run.Local as L 85 | import Hadron.Utils 86 | ------------------------------------------------------------------------------- 87 | 88 | 89 | ------------------------------------------------------------------------------- 90 | -- | Dispatch on the type of run 91 | data RunContext 92 | = LocalRun L.LocalRunSettings 93 | -- ^ Development mode: Emulate a hadoop run locally on this 94 | -- machine 95 | | HadoopRun H.HadoopEnv L.LocalRunSettings 96 | -- ^ Production mode: Actually run on hadoop. However, some 97 | -- utilites use local facilities so we still force you to have a 98 | -- local policy. 
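-------------------------------------------------------------------------------
-- | Editor's example (not part of the original module): picking a run mode.
-- A sketch; 'localSettings' is a hypothetical 'L.LocalRunSettings' value
-- (its fields live in "Hadron.Run.Local" and are not shown here).
--
-- > devCtx, prodCtx :: RunContext
-- > devCtx  = LocalRun localSettings                  -- emulate hadoop locally
-- > prodCtx = HadoopRun H.clouderaDemo localSettings  -- run on a real cluster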
99 | makePrisms ''RunContext 100 | 101 | 102 | lset :: RunContext -> L.LocalRunSettings 103 | lset (LocalRun s) = s 104 | lset (HadoopRun _ s) = s 105 | 106 | 107 | ------------------------------------------------------------------------------- 108 | launchMapReduce 109 | :: (Functor m, MonadIO m) 110 | => RunContext 111 | -> String 112 | -> String 113 | -> H.HadoopRunOpts 114 | -> ExceptT Text m () 115 | launchMapReduce (LocalRun env) mrKey token opts = 116 | ExceptT . L.runLocal env . runExceptT $ (L.localMapReduce env mrKey token opts) 117 | launchMapReduce (HadoopRun env _) mrKey token opts = 118 | H.hadoopMapReduce env mrKey token opts 119 | 120 | 121 | ------------------------------------------------------------------------------- 122 | hdfsFileExists :: RunContext -> FilePath -> IO Bool 123 | hdfsFileExists (LocalRun env) fp = L.runLocal env (L.hdfsFileExists (LocalFile fp)) 124 | hdfsFileExists (HadoopRun env _) fp = H.hdfsFileExists env fp 125 | 126 | 127 | ------------------------------------------------------------------------------- 128 | hdfsDeletePath :: RunContext -> FilePath -> IO () 129 | hdfsDeletePath rc fp = case rc of 130 | LocalRun lrs -> L.runLocal lrs (L.hdfsDeletePath (LocalFile fp)) 131 | HadoopRun he _ -> H.hdfsDeletePath he fp 132 | 133 | 134 | ------------------------------------------------------------------------------- 135 | hdfsLs :: RunContext -> FilePath -> IO [File] 136 | hdfsLs rc fp = case rc of 137 | LocalRun lrs -> L.runLocal lrs (L.hdfsLs (LocalFile fp)) 138 | HadoopRun he _ -> H.hdfsLs he fp 139 | 140 | 141 | ------------------------------------------------------------------------------- 142 | hdfsPut :: RunContext -> L.LocalFile -> FilePath -> IO () 143 | hdfsPut rc f1 f2 = case rc of 144 | LocalRun lrs -> L.runLocal lrs (L.hdfsPut f1 (LocalFile f2)) 145 | HadoopRun e _ -> withLocalFile rc f1 $ \ lf -> H.hdfsPut e lf f2 146 | 147 | 148 | ------------------------------------------------------------------------------- 149 | hdfsFanOut 150 | :: RunContext 151 | -> FilePath 152 | -- ^ A temporary folder where in-progress files can be placed 153 | -> IO FanOut 154 | hdfsFanOut rc tmp = case rc of 155 | LocalRun lcs -> L.runLocal lcs $ L.hdfsFanOut tmp 156 | HadoopRun e _ -> H.hdfsFanOut e tmp 157 | 158 | 159 | ------------------------------------------------------------------------------- 160 | hdfsMkdir :: RunContext -> FilePath -> IO () 161 | hdfsMkdir rc fp = case rc of 162 | LocalRun lcs -> L.runLocal lcs (L.hdfsMkdir (LocalFile fp)) 163 | HadoopRun he _ -> H.hdfsMkdir he fp 164 | 165 | 166 | ------------------------------------------------------------------------------- 167 | hdfsCat 168 | :: RunContext 169 | -> FilePath 170 | -> Producer (ResourceT IO) B.ByteString 171 | hdfsCat rc fp = case rc of 172 | LocalRun lcs -> hoist (hoist (L.runLocal lcs)) $ L.hdfsCat (LocalFile fp) 173 | HadoopRun{} -> hdfsLocalStream rc fp 174 | 175 | 176 | ------------------------------------------------------------------------------- 177 | -- | Copy a file from HDFS into local. 178 | hdfsGet :: RunContext -> FilePath -> IO LocalFile 179 | hdfsGet rc fp = do 180 | local <- L.randomFileName 181 | case rc of 182 | LocalRun _ -> return (LocalFile fp) 183 | HadoopRun h _ -> do 184 | withLocalFile rc local $ \ lf -> H.hdfsGet h fp lf 185 | return local 186 | 187 | 188 | ------------------------------------------------------------------------------- 189 | -- | Copy a file down to local FS, then stream its content. 
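-------------------------------------------------------------------------------
-- | Editor's example (not part of the original module): the helpers above all
-- dispatch on 'RunContext', so the same code runs against the local
-- filesystem emulation and against a real HDFS. A sketch with made-up paths.
--
-- > refreshReport :: RunContext -> IO ()
-- > refreshReport ctx = do
-- >   exists <- hdfsFileExists ctx "reports/latest.csv"
-- >   when exists $ hdfsDeletePath ctx "reports/latest.csv"
-- >   hdfsMkdir ctx "reports"
-- >   hdfsPut ctx (LocalFile "latest.csv") "reports/latest.csv"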
190 | hdfsLocalStream 191 | :: RunContext 192 | -> FilePath 193 | -> Producer (ResourceT IO) B.ByteString 194 | hdfsLocalStream env fp = case env of 195 | LocalRun{} -> hdfsCat env fp 196 | HadoopRun _ _ -> do 197 | random <- liftIO $ hdfsGet env fp 198 | withLocalFile env random $ \ local -> do 199 | register $ removeFile local 200 | sourceFile local 201 | 202 | 203 | 204 | -- ------------------------------------------------------------------------------- 205 | -- -- | Stream contents of a folder one by one from HDFS. 206 | -- hdfsLocalStreamMulti 207 | -- :: (MonadIO m, MonadThrow m, MonadBase base m, PrimMonad base) 208 | -- => HadoopEnv 209 | -- -> FilePath 210 | -- -- ^ Location / glob pattern 211 | -- -> (FilePath -> Bool) 212 | -- -- ^ File filter based on name 213 | -- -> Source m ByteString 214 | -- hdfsLocalStreamMulti hs loc chk = do 215 | -- fs <- liftIO $ hdfsLs hs loc <&> filter chk 216 | -- lfs <- liftIO $ mapConcurrently (hdfsGet hs) fs 217 | -- forM_ (zip lfs fs) $ \ (local, fp) -> do 218 | -- h <- liftIO $ catching _IOException 219 | -- (openFile local ReadMode) 220 | -- (\e -> error $ "hdfsLocalStream failed with open file: " <> show e) 221 | -- let getFile = sourceHandle h 222 | -- if isSuffixOf "gz" fp 223 | -- then getFile =$= ungzip 224 | -- else getFile 225 | -- liftIO $ do 226 | -- hClose h 227 | -- removeFile local 228 | 229 | 230 | randomLocalFile :: MonadIO m => m LocalFile 231 | randomLocalFile = L.randomFileName 232 | 233 | 234 | randomRemoteFile :: RunContext -> IO FilePath 235 | randomRemoteFile env = case env of 236 | LocalRun{} -> _unLocalFile `liftM` L.randomFileName 237 | HadoopRun e _ -> H.randomFilename e 238 | 239 | 240 | ------------------------------------------------------------------------------- 241 | -- | Given a filename, produce an HDFS path for me in our temporary folder. 242 | hdfsTempFilePath :: MonadIO m => RunContext -> FilePath -> m FilePath 243 | hdfsTempFilePath env fp = case env of 244 | LocalRun{} -> return fp 245 | HadoopRun{} -> return $ H.tmpRoot fp 246 | 247 | 248 | ------------------------------------------------------------------------------- 249 | -- | Helper to work with relative paths using Haskell functions like 250 | -- 'readFile' and 'writeFile'. 251 | withLocalFile 252 | :: MonadIO m => RunContext -> LocalFile -> (FilePath -> m b) -> m b 253 | withLocalFile rs fp f = L.withLocalFile (lset rs) fp f 254 | 255 | 256 | ------------------------------------------------------------------------------- 257 | withRandomLocalFile :: MonadIO m => RunContext -> (FilePath -> m b) -> m LocalFile 258 | withRandomLocalFile rc f = do 259 | fp <- randomLocalFile 260 | withLocalFile rc fp f 261 | return fp 262 | -------------------------------------------------------------------------------- /src/Hadron/Run/FanOut.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE ExistentialQuantification #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE RankNTypes #-} 5 | {-# LANGUAGE RecordWildCards #-} 6 | {-# LANGUAGE TemplateHaskell #-} 7 | 8 | ----------------------------------------------------------------------------- 9 | -- | 10 | -- Module : Hadron.Run.FanOut 11 | -- Copyright : Soostone Inc, 2015 12 | -- License : BSD3 13 | -- 14 | -- Maintainer : Ozgun Ataman 15 | -- Stability : experimental 16 | -- 17 | -- Haskell-native ability to stream output to multiple files in Hadoop 18 | -- Streaming. 
19 | ---------------------------------------------------------------------------- 20 | 21 | 22 | module Hadron.Run.FanOut 23 | ( FanOut 24 | , mkFanOut 25 | , fanWrite 26 | , fanClose 27 | , fanCloseAll 28 | 29 | , FanOutSink 30 | , sinkFanOut 31 | , sequentialSinkFanout 32 | , fanStats 33 | ) where 34 | 35 | ------------------------------------------------------------------------------- 36 | import Control.Applicative as A 37 | import Control.Concurrent.MVar 38 | import Control.Lens 39 | import Control.Monad 40 | import Control.Monad.Trans 41 | import qualified Data.ByteString.Char8 as B 42 | import Data.Conduit 43 | import qualified Data.Conduit.List as C 44 | import qualified Data.Map.Strict as M 45 | import Data.Monoid 46 | import System.FilePath.Lens 47 | import System.IO 48 | ------------------------------------------------------------------------------- 49 | import Hadron.Utils 50 | ------------------------------------------------------------------------------- 51 | 52 | 53 | -- | An open file handle 54 | data FileHandle = FileHandle { 55 | _fhHandle :: !Handle 56 | , _fhFin :: IO () 57 | , _fhCount :: !Int 58 | , _fhPendingCount :: !Int 59 | } 60 | makeLenses ''FileHandle 61 | 62 | 63 | -- | Concurrent multi-output manager. 64 | data FanOut = FanOut { 65 | _fanFiles :: MVar (M.Map FilePath FileHandle) 66 | , _fanCreate :: FilePath -> IO (Handle, IO ()) 67 | } 68 | makeLenses ''FanOut 69 | 70 | 71 | data FileChunk = FileChunk { 72 | _chunkOrig :: !FilePath 73 | , _chunkTarget :: !FilePath 74 | , _chunkCnt :: !Int 75 | } 76 | 77 | makeLenses ''FileChunk 78 | 79 | 80 | ------------------------------------------------------------------------------- 81 | -- | Make a new fanout manager that will use given process creator. 82 | -- Process is expected to pipe its stdin into the desired location. 83 | mkFanOut 84 | :: (FilePath -> IO (Handle, IO ())) 85 | -- ^ Open a handle for a given target path 86 | -> IO FanOut 87 | mkFanOut f = FanOut A.<$> newMVar M.empty <*> pure f 88 | 89 | 90 | ------------------------------------------------------------------------------- 91 | -- | Write into a file. A new process will be spawned if this is the 92 | -- first time writing into this file. 93 | fanWrite :: FanOut -> FilePath -> B.ByteString -> IO () 94 | fanWrite fo fp bs = modifyMVar_ (fo ^. fanFiles) go 95 | where 96 | 97 | go !m | Just fh <- M.lookup fp m = do 98 | B.hPut (fh ^. fhHandle) bs 99 | let newCount = fh ^. fhPendingCount + B.length bs 100 | upFun <- case (newCount >= chunk) of 101 | True -> do 102 | hFlush (fh ^. fhHandle) 103 | return $ fhPendingCount .~ 0 104 | False -> return $ fhPendingCount .~ newCount 105 | return $! M.insert fp (fh & upFun . (fhCount %~ (+1))) m 106 | 107 | go !m = do 108 | (r, p) <- (fo ^. fanCreate) fp 109 | go $! M.insert fp (FileHandle r p 0 0) m 110 | 111 | 112 | chunk = 1024 * 4 113 | 114 | ------------------------------------------------------------------------------- 115 | closeHandle :: FileHandle -> IO () 116 | closeHandle fh = do 117 | hFlush $ fh ^. fhHandle 118 | hClose $ fh ^. fhHandle 119 | fh ^. fhFin 120 | 121 | 122 | ------------------------------------------------------------------------------- 123 | -- | Close a specific file. 124 | fanClose :: FanOut -> FilePath -> IO () 125 | fanClose fo fp = modifyMVar_ (fo ^. fanFiles) $ \ m -> case m ^. at fp of 126 | Nothing -> return m 127 | Just fh -> do 128 | closeHandle fh 129 | return $! 
m & at fp .~ Nothing 130 | 131 | 132 | ------------------------------------------------------------------------------- 133 | -- | Close all files. The same FanOut can be used after this, which 134 | -- would spawn new processes to write into files. 135 | fanCloseAll :: FanOut -> IO () 136 | fanCloseAll fo = modifyMVar_ (fo ^. fanFiles) $ \m -> do 137 | forM_ (M.toList m) $ \ (_fp, fh) -> closeHandle fh 138 | return M.empty 139 | 140 | 141 | ------------------------------------------------------------------------------- 142 | -- | Grab # of writes into each file so far. 143 | fanStats :: FanOut -> IO (M.Map FilePath Int) 144 | fanStats fo = do 145 | m <- modifyMVar (fo ^. fanFiles) $ \ m -> return (m,m) 146 | return $ M.map (^. fhCount) m 147 | 148 | 149 | ------------------------------------------------------------------------------- 150 | -- | Sink a stream into 'FanOut'. 151 | sinkFanOut :: FanOutSink 152 | sinkFanOut dispatch conv fo = C.foldM go 0 153 | where 154 | go !i a = do 155 | bs <- conv a 156 | liftIO (fanWrite fo (dispatch a) bs) 157 | return $! i + 1 158 | 159 | 160 | ------------------------------------------------------------------------------- 161 | -- | A fanout that keeps only a single file open at a time. Each time 162 | -- the target filename changes, this will close/finalize the file and 163 | -- start the new file. 164 | sequentialSinkFanout :: FanOutSink 165 | sequentialSinkFanout dispatch conv fo = 166 | liftM fst $ C.foldM go (0, Nothing) 167 | where 168 | go (!i, !chunk0) a = do 169 | bs <- conv a 170 | let fp = dispatch a 171 | 172 | let goNew = do 173 | tk <- liftIO (randomToken 16) 174 | let fp' = fp & basename %~ (<> "_" <> tk) 175 | liftIO $ fanWrite fo fp' bs 176 | return $! (i+1, Just (FileChunk fp fp' (B.length bs))) 177 | 178 | case chunk0 of 179 | Nothing -> goNew 180 | Just c@FileChunk{..} -> case fp == view chunkOrig c of 181 | False -> do 182 | liftIO $ fanClose fo (view chunkTarget c) 183 | goNew 184 | True -> do 185 | liftIO $ fanWrite fo (view chunkTarget c) bs 186 | return $! (i+1, Just $! c & chunkCnt %~ (+ (B.length bs))) 187 | 188 | 189 | 190 | type FanOutSink = forall a m. MonadIO m => (a -> FilePath) -> (a -> m B.ByteString) -> FanOut -> Consumer a m Int 191 | 192 | 193 | ------------------------------------------------------------------------------- 194 | -- test :: IO () 195 | -- test = do 196 | -- fo <- mkFanOut 197 | -- (\ fp -> (,) <$> openFile fp AppendMode <*> pure (return ())) 198 | -- fanWrite fo "test1" "foo" 199 | -- fanWrite fo "test1" "bar" 200 | -- fanWrite fo "test1" "tak" 201 | -- print =<< fanStats fo 202 | -- fanCloseAll fo 203 | -- fanWrite fo "test1" "tak" 204 | -------------------------------------------------------------------------------- /src/Hadron/Run/Hadoop.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE MultiParamTypeClasses #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE RankNTypes #-} 5 | {-# LANGUAGE RecordWildCards #-} 6 | {-# LANGUAGE TemplateHaskell #-} 7 | 8 | ----------------------------------------------------------------------------- 9 | -- | 10 | -- Module : Hadron.Run.Hadoop 11 | -- Copyright : Soostone Inc 12 | -- License : BSD3 13 | -- 14 | -- Maintainer : Ozgun Ataman 15 | -- Stability : experimental 16 | -- 17 | -- Deal with the hadoop command line program. 
18 | ---------------------------------------------------------------------------- 19 | 20 | 21 | module Hadron.Run.Hadoop 22 | ( HadoopEnv (..) 23 | , hsBin, hsJar 24 | , clouderaDemo 25 | , amazonEMR 26 | 27 | , PartitionStrategy (..) 28 | , numSegs 29 | , eqSegs 30 | , Comparator (..) 31 | 32 | , HadoopRunOpts (..) 33 | , mrSettings 34 | , mrsInput, mrsOutput, mrsPart, mrsNumMap, mrsNumReduce 35 | , mrsCombine, mrsCompress, mrsOutSep, mrsJobName, mrsComparator 36 | , mrsTaskTimeout 37 | 38 | , Codec 39 | , gzipCodec 40 | , snappyCodec 41 | 42 | -- * Hadoop Command Line Wrappers 43 | , hadoopMapReduce 44 | , hdfsFileExists 45 | , hdfsDeletePath 46 | , hdfsLs, parseLS 47 | , hdfsPut 48 | , hdfsFanOut 49 | , hdfsFanOutStream 50 | , hdfsMkdir 51 | , tmpRoot 52 | , hdfsCat 53 | , hdfsGet 54 | , hdfsChmod 55 | , randomFilename 56 | 57 | ) where 58 | 59 | ------------------------------------------------------------------------------- 60 | import Control.Applicative as A 61 | import Control.Error 62 | import Control.Lens 63 | import Control.Monad 64 | import Control.Monad.Trans 65 | import Control.Monad.Trans.Resource 66 | import Crypto.Hash.MD5 67 | import qualified Data.ByteString.Base16 as Base16 68 | import Data.ByteString.Char8 (ByteString) 69 | import qualified Data.ByteString.Char8 as B 70 | import Data.Conduit 71 | import Data.Conduit.Binary (sourceHandle) 72 | import Data.Default 73 | import Data.List 74 | import Data.List.LCS.HuntSzymanski 75 | import Data.Monoid 76 | import Data.String.Conv 77 | import qualified Data.Text as T 78 | import System.Directory 79 | import System.Environment 80 | import System.Exit 81 | import System.FilePath 82 | import System.FilePath.Lens 83 | import System.IO 84 | import System.Process 85 | ------------------------------------------------------------------------------- 86 | import Hadron.Logger 87 | import Hadron.Run.FanOut 88 | import Hadron.Utils 89 | ------------------------------------------------------------------------------- 90 | 91 | 92 | ------------------------------------------------------------------------------- 93 | echo :: (A.Applicative m, MonadIO m) => Severity -> LogStr -> m () 94 | echo sev msg = runLog $ logMsg "Run.Hadoop" sev msg 95 | 96 | 97 | ------------------------------------------------------------------------------- 98 | echoInfo :: (Applicative m, MonadIO m) => LogStr -> m () 99 | echoInfo = echo InfoS 100 | 101 | 102 | 103 | data HadoopEnv = HadoopEnv { 104 | _hsBin :: String 105 | , _hsJar :: String 106 | } 107 | 108 | 109 | -- | Settings for the cloudera demo VM. 110 | clouderaDemo :: HadoopEnv 111 | clouderaDemo = HadoopEnv { 112 | _hsBin = "hadoop" 113 | , _hsJar = "/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.2.0.jar" 114 | } 115 | 116 | 117 | -- | Settings for Amazon's EMR instances. 118 | amazonEMR :: HadoopEnv 119 | amazonEMR = HadoopEnv { 120 | _hsBin = "/home/hadoop/bin/hadoop" 121 | , _hsJar = "/home/hadoop/contrib/streaming/hadoop-streaming.jar" 122 | } 123 | 124 | 125 | instance Default HadoopEnv where 126 | def = clouderaDemo 127 | 128 | 129 | data PartitionStrategy 130 | = NoPartition 131 | -- ^ Expect a single key segment emitted from the 'Mapper'. 132 | | Partition { 133 | keySegs :: Int 134 | -- ^ Total segments comprising a key. 135 | , partSegs :: Int 136 | -- ^ First N key segments used for partitioning. All keys that 137 | -- share these segments will be routed to the same reducer. 138 | } 139 | -- ^ Expect a composite key emitted from the 'Mapper'. 
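-- Illustrative sketch (added for exposition, not part of this module): with a
-- composite key made of two segments, say a user id followed by a day,
-- partitioning on the first segment only routes all records for a given user
-- to the same reducer while still sorting by both segments. Given the flag
-- construction further below in this module, this value would translate to
-- stream.num.map.output.key.fields=2 and
-- mapreduce.partition.keypartitioner.options=-k1,1.
--
-- byUserThenDay :: PartitionStrategy
-- byUserThenDay = Partition { keySegs = 2, partSegs = 1 }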
140 | 141 | 142 | instance Default PartitionStrategy where def = NoPartition 143 | 144 | 145 | data Comparator 146 | = RegularComp 147 | -- ^ Regular sorting 148 | | NumericComp Int Int Bool 149 | -- ^ Numeric sorting spanning fields i to j, True=reversed 150 | 151 | instance Default Comparator where def = RegularComp 152 | 153 | 154 | ------------------------------------------------------------------------------- 155 | -- | Number of total key segments. 156 | numSegs :: PartitionStrategy -> Int 157 | numSegs NoPartition = 1 158 | numSegs Partition{..} = keySegs 159 | 160 | 161 | ------------------------------------------------------------------------------- 162 | -- | Number of key segments that constitute input object equality, for 163 | -- hadoop partitions. 164 | eqSegs :: PartitionStrategy -> Int 165 | eqSegs NoPartition = 1 166 | eqSegs Partition{..} = partSegs 167 | 168 | 169 | type Codec = String 170 | 171 | ------------------------------------------------------------------------------- 172 | gzipCodec :: Codec 173 | gzipCodec = "org.apache.hadoop.io.compress.GzipCodec" 174 | 175 | snappyCodec :: Codec 176 | snappyCodec = "org.apache.hadoop.io.compress.SnappyCodec" 177 | 178 | 179 | type MapReduceKey = String 180 | type RunToken = String 181 | 182 | 183 | ------------------------------------------------------------------------------- 184 | -- | Useful reference for hadoop flags: 185 | -- 186 | -- @http://hadoop.apache.org/docs/r2.4.1/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml@ 187 | data HadoopRunOpts = HadoopRunOpts { 188 | _mrsInput :: [String] 189 | , _mrsOutput :: String 190 | , _mrsPart :: PartitionStrategy 191 | , _mrsNumMap :: Maybe Int 192 | , _mrsNumReduce :: Maybe Int 193 | , _mrsTaskTimeout :: Maybe Int 194 | , _mrsCombine :: Bool 195 | , _mrsCompress :: Maybe Codec 196 | , _mrsOutSep :: Maybe Char 197 | -- ^ A separator to be used in reduce output. It is sometimes 198 | -- useful to specify one to trick Hadoop. 
199 | , _mrsJobName :: Maybe String 200 | , _mrsComparator :: Comparator 201 | } 202 | makeLenses ''HadoopRunOpts 203 | 204 | instance Default HadoopRunOpts where 205 | def = HadoopRunOpts [] "" def Nothing Nothing Nothing False Nothing Nothing Nothing def 206 | 207 | -- | A simple starting point to defining 'HadoopRunOpts' 208 | mrSettings 209 | :: [String] 210 | -- ^ Input files 211 | -> String 212 | -- ^ Output files 213 | -> HadoopRunOpts 214 | mrSettings ins out = def { _mrsInput = ins, _mrsOutput = out } 215 | 216 | 217 | 218 | ------------------------------------------------------------------------------- 219 | hadoopMapReduce 220 | :: (Functor m, MonadIO m) 221 | => HadoopEnv 222 | -> MapReduceKey 223 | -> RunToken 224 | -> HadoopRunOpts 225 | -> ExceptT T.Text m () 226 | hadoopMapReduce HadoopEnv{..} mrKey runToken HadoopRunOpts{..} = do 227 | exec <- scriptIO getExecutablePath 228 | prog <- scriptIO getProgName 229 | echoInfo $ "Launching Hadoop job for MR key: " <> ls mrKey 230 | 231 | let args = mkArgs exec prog 232 | 233 | echoInfo $ "Hadoop arguments: " <> ls (intercalate " " args) 234 | 235 | (code, out, eout) <- scriptIO $ readProcessWithExitCode _hsBin args "" 236 | case code of 237 | ExitSuccess -> return () 238 | e -> do 239 | echo ErrorS $ ls $ intercalate "\n" 240 | [ "Hadoop job failed.", "StdOut:" 241 | , out, "", "StdErr:", eout] 242 | 243 | hoistEither $ Left $ T.pack $ "MR job failed with: " ++ show e 244 | where 245 | mkArgs exec prog = 246 | [ "jar", _hsJar] ++ 247 | comp ++ numMap ++ numRed ++ timeout ++ outSep ++ jobName ++ 248 | comparator ++ part ++ 249 | inputs ++ 250 | [ "-output", _mrsOutput] ++ 251 | mkStage prog "mapper" ++ 252 | mkStage prog "reducer" ++ 253 | if _mrsCombine then mkStage prog "combiner" else [] ++ 254 | [ "-file", exec ] 255 | 256 | 257 | mkStage prog stage = 258 | [ "-" ++ stage 259 | , "\"" ++ prog ++ " " ++ runToken ++ " " ++ stage ++ "_" ++ mrKey ++ "\"" 260 | ] 261 | 262 | jobName = maybe [] (\nm -> ["-D", "mapreduce.job.name=\"" <> nm <>"\""]) 263 | _mrsJobName 264 | 265 | 266 | inputs = concatMap mkInput _mrsInput 267 | mkInput i = ["-input", i] 268 | 269 | numMap = maybe [] (\x -> ["-D", "mapreduce.job.maps=" ++ show x]) _mrsNumMap 270 | numRed = maybe [] (\x -> ["-D", "mapreduce.job.reduces=" ++ show x]) _mrsNumReduce 271 | 272 | timeout = maybe [] (\x -> ["-D", "mapreduce.task.timeout=" ++ show x]) _mrsTaskTimeout 273 | 274 | comp = 275 | case _mrsCompress of 276 | Just codec -> [ "-D", "mapreduce.output.fileoutputformat.compress=true" 277 | , "-D", "mapreduce.output.fileoutputformat.compress.codec=" ++ codec 278 | -- , "-D", "mapred.compress.map.output=true" 279 | -- , "-D", "mapred.map.output.compression.codec=" ++ mrsCodec 280 | ] 281 | Nothing -> [] 282 | 283 | part = case _mrsPart of 284 | NoPartition -> [] 285 | Partition{..} -> 286 | [ "-D", "stream.num.map.output.key.fields=" ++ show keySegs 287 | , "-D", "mapreduce.partition.keypartitioner.options=-k1," ++ show partSegs 288 | , "-partitioner", "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner" 289 | ] 290 | 291 | 292 | comparator = case _mrsComparator of 293 | RegularComp -> [] 294 | NumericComp st end rev -> 295 | [ "-D", "mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator" 296 | , "-D", "mapred.text.key.comparator.options=-k" <> 297 | show st <> "," <> show end <> "n" <> 298 | if rev then "r" else "" 299 | ] 300 | 301 | outSep = case _mrsOutSep of 302 | Nothing -> [] 303 | Just sep -> 304 | [ "-D", 
"stream.reduce.output.field.separator=" ++ [sep] 305 | , "-D", "mapred.textoutputformat.separator=" ++ [sep] ] 306 | ++ (if _mrsNumReduce == Just 0 307 | then [ "-D", "stream.map.output.field.separator=" ++ [sep]] 308 | else []) 309 | 310 | 311 | ------------------------------------------------------------------------------- 312 | -- | Check if the target file is present. 313 | hdfsFileExists :: HadoopEnv -> FilePath -> IO Bool 314 | hdfsFileExists HadoopEnv{..} p = do 315 | res <- rawSystem _hsBin ["fs", "-stat", p] 316 | return $ case res of 317 | ExitSuccess -> True 318 | ExitFailure{} -> False 319 | 320 | 321 | 322 | ------------------------------------------------------------------------------- 323 | -- | Check if the target file is present. 324 | hdfsDeletePath :: HadoopEnv -> FilePath -> IO () 325 | hdfsDeletePath HadoopEnv{..} p = void $ 326 | rawSystem _hsBin ["fs", "-rmr", "-skipTrash", p] 327 | 328 | 329 | 330 | 331 | ------------------------------------------------------------------------------- 332 | -- | List a directory's contents 333 | hdfsLs :: HadoopEnv -> FilePath -> IO [File] 334 | hdfsLs HadoopEnv{..} p = do 335 | (res,out,_) <- readProcessWithExitCode _hsBin ["fs", "-lsr", p] "" 336 | return $ case res of 337 | ExitSuccess -> parseLS p out 338 | ExitFailure{} -> [] 339 | 340 | 341 | ------------------------------------------------------------------------------- 342 | -- | TODO: The lcs function does not guarantee contiguous-common 343 | -- regions, so this function may behave strangely. We should figure 344 | -- out a way to use longest-common-prefix like semantics. 345 | parseLS :: String -> String -> [File] 346 | parseLS pat out = filter isOK $ map clean $ mapMaybe parseLs $ lines out 347 | where 348 | pat' = T.pack pat 349 | prefix = takeWhile (/= '*') pat 350 | isOK x = x ^. filePath . to (isPrefixOf prefix) 351 | clean f = f & filePath %~ (T.unpack begin <>) 352 | where 353 | shared = T.pack $ lcs pat (f ^. filePath) 354 | (begin, _) = T.breakOn shared pat' 355 | 356 | 357 | ------------------------------------------------------------------------------- 358 | -- | Copy file from a location to a location 359 | hdfsPut :: HadoopEnv -> FilePath -> FilePath -> IO () 360 | hdfsPut HadoopEnv{..} localPath hdfsPath = void $ 361 | rawSystem _hsBin ["fs", "-put", localPath, hdfsPath] 362 | 363 | 364 | ------------------------------------------------------------------------------- 365 | -- | Create a new multiple output file manager. 366 | hdfsFanOutStream :: HadoopEnv -> FilePath -> IO FanOut 367 | hdfsFanOutStream env@HadoopEnv{..} tmp = mkFanOut mkP 368 | where 369 | 370 | mkTmp fp = tmp fp ^. filename 371 | 372 | mkP fp = do 373 | (Just h, _, _, ph) <- createProcess $ (proc _hsBin ["fs", "-put", "-", mkTmp fp]) 374 | { std_in = CreatePipe } 375 | hSetBuffering h LineBuffering 376 | let fin = do void $ waitForProcess ph 377 | hdfsMkdir env (fp ^. directory) 378 | void $ rawSystem _hsBin ["fs", "-mv", mkTmp fp, fp] 379 | return (h, fin) 380 | 381 | 382 | ------------------------------------------------------------------------------- 383 | hdfsFanOut :: HadoopEnv -> FilePath -> IO FanOut 384 | hdfsFanOut env@HadoopEnv{..} tmp = mkFanOut mkHandle 385 | where 386 | 387 | mkTmp fp = tmp (toS . Base16.encode . toS . hash . toS $ fp) 388 | 389 | -- write into a temp file loc until we know the stage is 390 | -- complete without failure 391 | mkHandle fp = do 392 | let fp' = mkTmp fp 393 | createDirectoryIfMissing True (fp' ^. 
directory) 394 | h <- openFile fp' AppendMode 395 | let fin = runResourceT $ do 396 | a <- register $ removeFile fp' 397 | liftIO $ hdfsMkdir env (fp ^. directory) 398 | liftIO $ hdfsPut env fp' fp 399 | release a 400 | return (h, fin) 401 | 402 | 403 | 404 | ------------------------------------------------------------------------------- 405 | -- | Create HDFS directory if missing 406 | hdfsMkdir :: HadoopEnv -> String -> IO () 407 | hdfsMkdir HadoopEnv{..} fp = void $ rawSystem _hsBin ["fs", "-mkdir", "-p", fp] 408 | 409 | 410 | ------------------------------------------------------------------------------- 411 | -- | Apply recursive permissions to given 412 | hdfsChmod 413 | :: HadoopEnv 414 | -> String 415 | -- ^ Target path 416 | -> String 417 | -- ^ Permissions string 418 | -> IO ExitCode 419 | hdfsChmod HadoopEnv{..} fp mode = rawSystem _hsBin ["fs", "-chmod", "-R", mode, fp] 420 | 421 | 422 | ------------------------------------------------------------------------------- 423 | -- | Stream data directly from HDFS using @hdfs cat@. 424 | -- 425 | -- NOTE: It appears that this function may output a header before the file 426 | -- contents. Be careful! 427 | hdfsCat :: MonadIO m => HadoopEnv -> FilePath -> Producer m ByteString 428 | hdfsCat HadoopEnv{..} p = do 429 | (inH, outH, _, _) <- liftIO $ do 430 | let cp = (proc _hsBin ["fs", "-cat", p]) { std_in = CreatePipe 431 | , std_out = CreatePipe 432 | , std_err = Inherit } 433 | createProcess cp 434 | maybe (return ()) (liftIO . hClose) inH 435 | maybe exit sourceHandle outH 436 | where 437 | exit = error $ concat ["Could not open file ", p, ". Skipping...."] 438 | 439 | 440 | 441 | tmpRoot :: FilePath 442 | tmpRoot = "/tmp/hadron/" 443 | 444 | 445 | ------------------------------------------------------------------------------ 446 | -- | Generates a random filename in the /tmp/hadron directory. 447 | randomFilename :: HadoopEnv -> IO FilePath 448 | randomFilename settings = do 449 | tk <- randomToken 64 450 | hdfsMkdir settings tmpRoot 451 | return $ B.unpack $ B.concat [B.pack tmpRoot, B.pack tk] 452 | 453 | 454 | ------------------------------------------------------------------------------- 455 | -- | Copy file from HDFS to a temporary local file whose name is returned. 
456 | hdfsGet :: HadoopEnv -> FilePath -> FilePath -> IO () 457 | hdfsGet HadoopEnv{..} p local = do 458 | (res,out,e) <- readProcessWithExitCode _hsBin ["fs", "-get", p, local] "" 459 | case res of 460 | ExitFailure i -> error $ "hdfsGet failed: " <> show i <> ".\n" <> out <> "\n" <> e 461 | ExitSuccess -> return () 462 | 463 | 464 | 465 | 466 | ------------------------------------------------------------------------------- 467 | makeLenses ''HadoopEnv 468 | ------------------------------------------------------------------------------- 469 | -------------------------------------------------------------------------------- /src/Hadron/Run/Local.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | {-# LANGUAGE NoMonomorphismRestriction #-} 5 | {-# LANGUAGE OverloadedStrings #-} 6 | {-# LANGUAGE RankNTypes #-} 7 | {-# LANGUAGE RecordWildCards #-} 8 | {-# LANGUAGE TemplateHaskell #-} 9 | 10 | ----------------------------------------------------------------------------- 11 | -- | 12 | -- Module : 13 | -- Copyright : 14 | -- License : 15 | -- 16 | -- Maintainer : 17 | -- Stability : experimental 18 | -- 19 | -- Emulate all hadoop operations locally 20 | ---------------------------------------------------------------------------- 21 | 22 | module Hadron.Run.Local where 23 | 24 | ------------------------------------------------------------------------------- 25 | import Control.Applicative as A 26 | import Control.Error 27 | import Control.Lens 28 | import Control.Monad.Reader 29 | import Control.Monad.Trans.Resource 30 | import qualified Data.ByteString.Char8 as B 31 | import Data.Conduit 32 | import Data.Conduit.Binary (sourceFile) 33 | import Data.Default 34 | import Data.Hashable 35 | import Data.List 36 | import Data.Monoid 37 | import Data.Text (Text) 38 | import qualified Data.Text as T 39 | import System.Directory 40 | import System.Environment 41 | import System.Exit 42 | import System.FilePath.Glob 43 | import System.FilePath.Lens 44 | import System.FilePath.Posix 45 | import System.IO 46 | import System.Process 47 | ------------------------------------------------------------------------------- 48 | import Hadron.Logger 49 | import Hadron.Run.FanOut 50 | import qualified Hadron.Run.Hadoop as H 51 | import Hadron.Utils 52 | ------------------------------------------------------------------------------- 53 | #if MIN_VERSION_base(4, 7, 0) 54 | #else 55 | import System.Posix.Env 56 | #endif 57 | 58 | 59 | 60 | newtype LocalFile = LocalFile { _unLocalFile :: FilePath } 61 | deriving (Eq,Show,Read,Ord) 62 | makeLenses ''LocalFile 63 | 64 | 65 | data LocalRunSettings = LocalRunSettings { 66 | _lrsTempPath :: FilePath 67 | -- ^ Root of the "file system" during a localrun 68 | } 69 | makeLenses ''LocalRunSettings 70 | 71 | 72 | instance Default LocalRunSettings where 73 | def = LocalRunSettings "tmp" 74 | 75 | 76 | type Local = ReaderT LocalRunSettings IO 77 | 78 | runLocal :: r -> ReaderT r m a -> m a 79 | runLocal env f = runReaderT f env 80 | 81 | 82 | ------------------------------------------------------------------------------- 83 | path :: (MonadIO m, MonadReader LocalRunSettings m) => LocalFile -> m FilePath 84 | path (LocalFile fp) = do 85 | root <- view lrsTempPath 86 | let p = root fp 87 | dir = p ^. 
directory 88 | liftIO $ createDirectoryIfMissing True dir 89 | return p 90 | 91 | 92 | ------------------------------------------------------------------------------- 93 | getRecursiveDirectoryContents :: FilePath -> IO [FilePath] 94 | getRecursiveDirectoryContents dir0 = go dir0 95 | where 96 | go dir = do 97 | fs <- liftM (map (dir ) . filter (not . flip elem [".", ".."])) $ 98 | getDirectoryContents' dir 99 | fs' <- filterM (fmap not . doesDirectoryExist) fs 100 | fss <- mapM go fs 101 | return $ fs' ++ concat fss 102 | 103 | 104 | ------------------------------------------------------------------------------- 105 | -- | A version that return [] instead of an error when directory does not exit. 106 | getDirectoryContents' :: FilePath -> IO [FilePath] 107 | getDirectoryContents' fp = do 108 | chk <- doesDirectoryExist fp 109 | case chk of 110 | False -> return [] 111 | True -> getDirectoryContents fp 112 | 113 | 114 | ------------------------------------------------------------------------------- 115 | -- | Recursive contents if given a directory, assumed glob pattern if not. 116 | getInputFiles :: FilePath -> IO [FilePath] 117 | getInputFiles fp = do 118 | chk <- doesDirectoryExist fp 119 | case chk of 120 | True -> glob (fp "**") 121 | False -> glob fp 122 | 123 | 124 | ------------------------------------------------------------------------------- 125 | localMapReduce 126 | :: (Functor m, MonadIO m) 127 | => LocalRunSettings 128 | -> String -- ^ MapReduceKey 129 | -> String -- ^ RunToken 130 | -> H.HadoopRunOpts 131 | -> ExceptT Text m () 132 | localMapReduce lrs mrKey token H.HadoopRunOpts{..} = do 133 | exPath <- scriptIO getExecutablePath 134 | echoInfo $ "Launching Hadoop job for MR key: " <> ls mrKey 135 | 136 | 137 | expandedInput <- liftIO $ liftM concat $ forM _mrsInput $ \ inp -> 138 | withLocalFile lrs (LocalFile inp) getInputFiles 139 | 140 | 141 | let enableCompress = case _mrsCompress of 142 | Nothing -> False 143 | Just x -> isInfixOf "Gzip" x 144 | 145 | -- Are the input files already compressed? 146 | inputCompressed file = isInfixOf ".gz" file 147 | 148 | 149 | outFile <- liftIO $ withLocalFile lrs (LocalFile _mrsOutput) $ \ fp -> 150 | case fp ^. extension . to null of 151 | False -> return fp 152 | True -> do 153 | createDirectoryIfMissing True fp 154 | return $ fp ("0000.out" ++ if enableCompress then ".gz" else "") 155 | 156 | 157 | let pipe = " | " 158 | 159 | maybeCompress = if enableCompress 160 | then pipe <> "gzip" 161 | else "" 162 | 163 | maybeGunzip fp = (if inputCompressed fp then ("gunzip" <> pipe) else "") 164 | 165 | maybeReducer = case _mrsNumReduce of 166 | Just 0 -> "" 167 | _ -> pipe <> exPath <> " " <> token <> " " <> "reducer_" <> mrKey 168 | 169 | 170 | 171 | -- map over each file individually and write results into a temp file 172 | mapFile infile = clearExit . scriptIO . 
withTmpMapFile infile $ \ fp -> do 173 | echoInfo ("Running command: " <> ls (command fp)) 174 | #if MIN_VERSION_base(4, 7, 0) 175 | setEnv "mapreduce_map_input_file" infile 176 | #else 177 | setEnv "mapreduce_map_input_file" infile True 178 | #endif 179 | system (command fp) 180 | where 181 | command fp = 182 | "cat " <> infile <> pipe <> 183 | maybeGunzip infile <> 184 | exPath <> " " <> token <> " " <> "mapper_" <> mrKey <> 185 | " > " <> fp 186 | 187 | 188 | -- a unique temp file for each input file 189 | withTmpMapFile infile f = liftIO $ 190 | withLocalFile lrs (LocalFile ((show (hash infile)) <> "_mapout")) f 191 | 192 | 193 | getTempMapFiles = mapM (flip withTmpMapFile return) expandedInput 194 | 195 | -- concat all processed map output, sort and run through the reducer 196 | reduceFiles = do 197 | fs <- getTempMapFiles 198 | echoInfo ("Running command: " <> ls (command fs)) 199 | scriptIO $ createDirectoryIfMissing True (outFile ^. directory) 200 | clearExit $ scriptIO $ system (command fs) 201 | where 202 | command fs = 203 | "cat " <> intercalate " " fs <> pipe <> 204 | ("sort -t$'\t' -k1," <> show (H.numSegs _mrsPart)) <> 205 | maybeReducer <> 206 | maybeCompress <> 207 | " > " <> outFile 208 | 209 | removeTempFiles = scriptIO $ do 210 | fs <- getTempMapFiles 211 | mapM_ removeFile fs 212 | 213 | 214 | echoInfo "Mapping over individual local input files." 215 | mapM_ mapFile expandedInput 216 | 217 | echoInfo "Executing reduce stage." 218 | reduceFiles 219 | 220 | removeTempFiles 221 | 222 | 223 | ------------------------------------------------------------------------------- 224 | echo :: (A.Applicative m, MonadIO m) => Severity -> LogStr -> m () 225 | echo sev msg = runLog $ logMsg "Local" sev msg 226 | 227 | 228 | ------------------------------------------------------------------------------- 229 | echoInfo :: (Applicative m, MonadIO m) => LogStr -> m () 230 | echoInfo = echo InfoS 231 | 232 | 233 | ------------------------------------------------------------------------------- 234 | -- | Fail if command not successful. 235 | clearExit 236 | :: (Functor m, MonadIO m) 237 | => ExceptT Text m ExitCode 238 | -> ExceptT Text m () 239 | clearExit f = do 240 | res <- f 241 | case res of 242 | ExitSuccess -> echoInfo "Command successful." 243 | e -> do 244 | echo ErrorS $ ls $ "Command failed: " ++ show e 245 | hoistEither $ Left $ T.pack $ "Command failed with: " ++ show e 246 | 247 | 248 | ------------------------------------------------------------------------------- 249 | -- | Check if the target file is present. 250 | hdfsFileExists 251 | :: (MonadIO m, MonadReader LocalRunSettings m) 252 | => LocalFile 253 | -> m Bool 254 | hdfsFileExists p = liftIO . chk =<< path p 255 | where 256 | chk fp = (||) <$> doesFileExist fp <*> doesDirectoryExist fp 257 | 258 | 259 | ------------------------------------------------------------------------------- 260 | hdfsDeletePath 261 | :: (MonadIO m, MonadReader LocalRunSettings m) 262 | => LocalFile 263 | -> m () 264 | hdfsDeletePath p = do 265 | fp <- path p 266 | liftIO $ do 267 | chk <- doesDirectoryExist fp 268 | when chk (removeDirectoryRecursive fp) 269 | chk2 <- doesFileExist fp 270 | when chk2 (removeFile fp) 271 | 272 | 273 | ------------------------------------------------------------------------------- 274 | hdfsLs 275 | :: (MonadIO m, MonadReader LocalRunSettings m) 276 | => LocalFile -> m [File] 277 | hdfsLs p = do 278 | fs <- liftIO . 
getDirectoryContents' =<< path p 279 | return $ map (File "" 1 "" "") $ map (_unLocalFile p ) fs 280 | 281 | 282 | 283 | ------------------------------------------------------------------------------- 284 | hdfsPut 285 | :: (MonadIO m, MonadReader LocalRunSettings m) 286 | => LocalFile 287 | -> LocalFile 288 | -> m () 289 | hdfsPut src dest = do 290 | src' <- path src 291 | dest' <- path dest 292 | liftIO $ copyFile src' dest' 293 | 294 | 295 | ------------------------------------------------------------------------------- 296 | -- | Create a new multiple output file manager. 297 | hdfsFanOut 298 | :: (MonadIO m, MonadReader LocalRunSettings m) 299 | => FilePath 300 | -- ^ Temporary file location. 301 | -> m FanOut 302 | hdfsFanOut tmp = do 303 | env <- ask 304 | liftIO $ mkFanOut (mkHandle env) 305 | where 306 | 307 | mkTmp fp = tmp fp 308 | 309 | -- write into a temp file loc until we know the stage is 310 | -- complete without failure 311 | mkHandle env fp = do 312 | fp' <- runLocal env $ path (LocalFile (mkTmp fp)) 313 | createDirectoryIfMissing True (fp' ^. directory) 314 | h <- openFile fp' AppendMode 315 | return (h, fin env fp) 316 | 317 | -- move temp file to its final destination 318 | fin env fp0 = do 319 | temp <- runLocal env $ path (LocalFile (mkTmp fp0)) 320 | dest <- runLocal env $ path (LocalFile fp0) 321 | createDirectoryIfMissing True (dest ^. directory) 322 | renameFile temp dest 323 | 324 | 325 | ------------------------------------------------------------------------------- 326 | hdfsMkdir 327 | :: (MonadIO m, MonadReader LocalRunSettings m) 328 | => LocalFile 329 | -> m () 330 | hdfsMkdir p = liftIO . createDirectoryIfMissing True =<< path p 331 | 332 | 333 | ------------------------------------------------------------------------------- 334 | hdfsCat :: LocalFile -> Producer (ResourceT Local) B.ByteString 335 | hdfsCat p = sourceFile =<< (lift . lift) (path p) 336 | 337 | 338 | ------------------------------------------------------------------------------- 339 | hdfsGet 340 | :: (MonadIO m, MonadReader LocalRunSettings m) 341 | => LocalFile 342 | -> m LocalFile 343 | hdfsGet fp = do 344 | target <- randomFileName 345 | hdfsPut fp target 346 | return target 347 | 348 | 349 | hdfsLocalStream :: LocalFile -> Producer (ResourceT Local) B.ByteString 350 | hdfsLocalStream = hdfsCat 351 | 352 | 353 | ------------------------------------------------------------------------------- 354 | randomFileName :: MonadIO m => m LocalFile 355 | randomFileName = LocalFile `liftM` liftIO (randomToken 64) 356 | 357 | 358 | 359 | ------------------------------------------------------------------------------- 360 | -- | Helper to work with relative paths using Haskell functions like 361 | -- 'readFile' and 'writeFile'. 362 | withLocalFile 363 | :: MonadIO m 364 | => LocalRunSettings 365 | -> LocalFile 366 | -- ^ A relative path in our working folder 367 | -> (FilePath -> m b) 368 | -- ^ What to do with the absolute path. 
369 | -> m b 370 | withLocalFile settings fp f = f =<< runLocal settings (path fp) 371 | -------------------------------------------------------------------------------- /src/Hadron/Streams.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE RankNTypes #-} 3 | {-# LANGUAGE RecordWildCards #-} 4 | {-# LANGUAGE TemplateHaskell #-} 5 | 6 | module Hadron.Streams where 7 | 8 | ------------------------------------------------------------------------------- 9 | import Control.Applicative 10 | import Control.Concurrent.Async 11 | import Control.Concurrent.BoundedChan 12 | import Control.Lens 13 | import Control.Monad 14 | import Control.Monad.Trans 15 | import Data.Attoparsec.ByteString.Char8 (Parser, endOfInput, 16 | endOfLine, takeTill) 17 | import qualified Data.ByteString.Char8 as B 18 | import qualified Data.Conduit as C 19 | import qualified Data.Conduit.List as C 20 | import Data.IORef 21 | import Data.Maybe 22 | import System.IO.Streams (InputStream, OutputStream) 23 | import qualified System.IO.Streams as S 24 | import qualified System.IO.Streams.Attoparsec as S 25 | ------------------------------------------------------------------------------- 26 | 27 | {-| 28 | 29 | Thinking about io-streams, as things are inverted in some sense: 30 | 31 | - An InputStream knows how to produce input when user asks for it. 32 | User can read from it. 33 | 34 | - An OutputStream knows how to consume the input it will be fed by the 35 | user. User can write to it. 36 | 37 | -} 38 | 39 | 40 | 41 | ------------------------------------------------------------------------------- 42 | -- | Create a new stream for each item in the first stream and drain 43 | -- results into a single stream. 44 | bindStream :: InputStream a -> (a -> IO (InputStream b)) -> IO (InputStream b) 45 | bindStream i f = do 46 | ref <- newIORef Nothing 47 | S.makeInputStream (loop ref) 48 | where 49 | loop ref = do 50 | !acc <- readIORef ref 51 | case acc of 52 | Just is -> do 53 | n <- S.read is 54 | case n of 55 | Nothing -> writeIORef ref Nothing >> loop ref 56 | Just _ -> return n 57 | Nothing -> do 58 | next <- S.read i 59 | case next of 60 | Nothing -> return Nothing 61 | Just x -> do 62 | !is <- f x 63 | writeIORef ref (Just is) 64 | loop ref 65 | 66 | 67 | 68 | ------------------------------------------------------------------------------- 69 | -- | Very inefficient. Use a conduit to contramap an OutputStream. 70 | contraMapConduit :: C.Conduit b IO a -> S.OutputStream a -> IO (S.OutputStream b) 71 | contraMapConduit c s = S.makeOutputStream $ \ i -> case i of 72 | Nothing -> S.write Nothing s 73 | Just r -> do 74 | xs <- C.sourceList [r] C.$= c C.$$ C.consume 75 | mapM_ (flip S.write s . 
Just) xs 76 | 77 | 78 | ------------------------------------------------------------------------------- 79 | mapMaybeS :: (a -> Maybe b) -> InputStream a -> IO (InputStream b) 80 | mapMaybeS f s = S.makeInputStream g 81 | where 82 | g = do 83 | next <- S.read s 84 | case next of 85 | Nothing -> return Nothing 86 | Just a -> case f a of 87 | Nothing -> g 88 | a' -> return a' 89 | 90 | 91 | ------------------------------------------------------------------------------- 92 | contramapMaybe :: (a -> Maybe b) -> OutputStream b -> IO (OutputStream a) 93 | contramapMaybe f out = S.makeOutputStream go 94 | where 95 | go nxt = case nxt of 96 | Nothing -> S.write Nothing out 97 | Just x -> case f x of 98 | Nothing -> return () 99 | x' -> S.write x' out 100 | 101 | 102 | -- | Parse a line of input and eat a tab character that may be at the 103 | -- very end of the line. This tab is put by hadoop if this file is the 104 | -- result of a previous M/R that didn't have any value in the reduce 105 | -- step. 106 | parseLine :: Parser (Maybe B.ByteString) 107 | parseLine = (endOfInput >> pure Nothing) <|> (Just <$> (ln <* endOfLine)) 108 | where 109 | ln = do 110 | x <- takeTill (== '\n') 111 | return $ if B.length x > 0 && B.last x == '\t' 112 | then B.init x 113 | else x 114 | 115 | 116 | -- | Turn incoming stream into a stream of lines. This will 117 | -- automatically eat tab characters at the end of the line. 118 | streamLines :: InputStream B.ByteString -> IO (InputStream B.ByteString) 119 | streamLines = S.parserToInputStream parseLine 120 | 121 | 122 | ------------------------------------------------------------------------------- 123 | -- | Apply a conduit to input stream. 124 | conduitStream 125 | :: MonadIO m 126 | => (forall b. m b -> IO b) 127 | -- ^ Need a monad morphism to IO for the context. 128 | -> C.Conduit i m o 129 | -> S.InputStream i 130 | -> m (S.InputStream o) 131 | conduitStream run c i = consumeSource run (inputStreamToProducer i C.$= c) 132 | 133 | 134 | ------------------------------------------------------------------------------- 135 | consumeSource :: MonadIO m => (forall b. m b -> IO b) -> C.Source m a -> m (S.InputStream a) 136 | consumeSource run s = do 137 | ref <- liftIO $ newBoundedChan 512 138 | liftIO . async . run $ (s C.$$ go ref) 139 | liftIO $ S.makeInputStream (readChan ref) 140 | where 141 | go ref = do 142 | r <- C.await 143 | liftIO $ writeChan ref r 144 | case r of 145 | Nothing -> return () 146 | Just _ -> go ref 147 | 148 | 149 | 150 | ------------------------------------------------------------------------------- 151 | inputStreamToProducer :: MonadIO m => S.InputStream a -> C.Producer m a 152 | inputStreamToProducer s = go 153 | where 154 | go = do 155 | x <- liftIO $ S.read s 156 | case x of 157 | Nothing -> return () 158 | Just x' -> C.yield x' >> go 159 | 160 | 161 | ------------------------------------------------------------------------------- 162 | outputStreamToConsumer :: MonadIO m => S.OutputStream a -> C.Consumer a m () 163 | outputStreamToConsumer s = go 164 | where 165 | go = do 166 | r <- C.await 167 | liftIO $ S.write r s 168 | case r of 169 | Nothing -> return () 170 | Just _ -> go 171 | 172 | 173 | data EmitAcc x a = EmitAcc { 174 | _acBuffer :: [x] 175 | , _acDone :: ! Bool 176 | , _acAcc :: ! a 177 | } 178 | makeLenses ''EmitAcc 179 | 180 | 181 | ------------------------------------------------------------------------------- 182 | -- | Fold while also possibly returning elements to emit each step. 
183 | -- The IO action can be used at anytime to obtain the state of the 184 | -- accumulator. 185 | emitFoldM 186 | :: (b -> Maybe i -> IO (b, [a])) 187 | -> b 188 | -> InputStream i 189 | -> IO (InputStream a, IO b) 190 | emitFoldM f a0 is = do 191 | ref <- newIORef $ EmitAcc [] False a0 192 | is' <- S.makeInputStream (loop ref) 193 | return (is', liftM _acAcc (readIORef ref)) 194 | 195 | where 196 | 197 | loop ref = do 198 | !EmitAcc{..} <- readIORef ref 199 | 200 | case _acBuffer of 201 | 202 | -- previous results in buffer; stream them out 203 | (x:rest) -> do 204 | modifyIORef' ref $ acBuffer .~ rest 205 | return (Just x) 206 | 207 | -- buffer empty; step the input stream 208 | [] -> do 209 | case _acDone of 210 | True -> return Nothing 211 | False -> do 212 | inc <- S.read is 213 | (!acc', !xs) <- f _acAcc inc 214 | modifyIORef' ref $ 215 | (acAcc .~ acc') . 216 | (acBuffer .~ xs) . 217 | if isNothing inc then acDone .~ True else id 218 | loop ref 219 | 220 | 221 | -------------------------------------------------------------------------------- /src/Hadron/Streams/Bounded.hs: -------------------------------------------------------------------------------- 1 | -- | Stream utilities for working with concurrent channels. 2 | 3 | {-# LANGUAGE BangPatterns #-} 4 | 5 | module Hadron.Streams.Bounded 6 | ( -- * Channel conversions 7 | inputToChan 8 | , chanToInput 9 | , chanToOutput 10 | , makeChanPipe 11 | ) where 12 | 13 | ------------------------------------------------------------------------------ 14 | import Control.Applicative ((<$>), (<*>)) 15 | import Control.Concurrent.BoundedChan 16 | import Prelude hiding (read) 17 | ------------------------------------------------------------------------------ 18 | import System.IO.Streams.Internal (InputStream, OutputStream, 19 | makeInputStream, 20 | makeOutputStream, read) 21 | 22 | 23 | ------------------------------------------------------------------------------ 24 | -- | Writes the contents of an input stream to a channel until the input stream 25 | -- yields end-of-stream. 26 | inputToChan :: InputStream a -> BoundedChan (Maybe a) -> IO () 27 | inputToChan is ch = go 28 | where 29 | go = do 30 | mb <- read is 31 | writeChan ch mb 32 | maybe (return $! ()) (const go) mb 33 | 34 | 35 | ------------------------------------------------------------------------------ 36 | -- | Turns a 'Chan' into an input stream. 37 | -- 38 | chanToInput :: BoundedChan (Maybe a) -> IO (InputStream a) 39 | chanToInput ch = makeInputStream $! readChan ch 40 | 41 | 42 | ------------------------------------------------------------------------------ 43 | -- | Turns a 'Chan' into an output stream. 44 | -- 45 | chanToOutput :: BoundedChan (Maybe a) -> IO (OutputStream a) 46 | chanToOutput = makeOutputStream . writeChan 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- 51 | -- | Create a new pair of streams using an underlying 'Chan'. Everything written 52 | -- to the 'OutputStream' will appear as-is on the 'InputStream'. 53 | -- 54 | -- Since reading from the 'InputStream' and writing to the 'OutputStream' are 55 | -- blocking calls, be sure to do so in different threads. 
56 | makeChanPipe :: Int -> IO (InputStream a, OutputStream a) 57 | makeChanPipe n = do 58 | chan <- newBoundedChan n 59 | (,) <$> chanToInput chan <*> chanToOutput chan 60 | -------------------------------------------------------------------------------- /src/Hadron/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE RankNTypes #-} 3 | {-# LANGUAGE RecordWildCards #-} 4 | {-# LANGUAGE TemplateHaskell #-} 5 | 6 | module Hadron.Types where 7 | 8 | ------------------------------------------------------------------------------- 9 | import Control.Lens 10 | import Control.Monad.Trans.Resource 11 | import qualified Data.ByteString.Char8 as B 12 | import Data.Conduit 13 | import Data.Default 14 | ------------------------------------------------------------------------------- 15 | import Hadron.Run.Hadoop 16 | ------------------------------------------------------------------------------- 17 | 18 | type Key = B.ByteString 19 | 20 | type CompositeKey = [Key] 21 | 22 | 23 | 24 | -- | Useful when making a key from multiple pieces of data 25 | mkKey :: [B.ByteString] -> B.ByteString 26 | mkKey = B.intercalate "|" 27 | 28 | 29 | ------------------------------------------------------------------------------- 30 | -- | A 'Mapper' parses and converts the unbounded incoming stream of 31 | -- input data into a stream of (key, value) pairs. 32 | -- 33 | -- A mapper is responsible for signaling its own "Nothing" to signify 34 | -- it is done writing. Hadron will not automatically mark EOF on the 35 | -- OutputStream, which is basically 'stdout'. 36 | type Mapper a k b = Conduit a (ResourceT IO) (k, b) 37 | 38 | 39 | ------------------------------------------------------------------------------- 40 | -- | A reducer takes an incoming stream of (key, value) pairs and 41 | -- emits zero or more output objects of type 'r'. 42 | -- 43 | -- Note that this framework guarantees your reducer function (i.e. the 44 | -- conduit you supply here) will see ONLY keys that are deemed 45 | -- 'equivalent' based on the 'MROptions' you supply. Different keys 46 | -- will be given to individual and isolated invocations of your 47 | -- reducer function. This is a very central abstraction (and one of 48 | -- the few major ones) provided by this framework. 49 | -- 50 | -- It does not matter if you supply your own EOF signals via Nothing 51 | -- as Hadron will simply discard them before relaying over to 'stdout' 52 | -- and supply its own EOF based on when its input is finished. 53 | type Reducer k a r = Conduit (k, a) (ResourceT IO) r 54 | 55 | 56 | data ReduceErrorStrategy 57 | = ReduceErrorReThrow 58 | | ReduceErrorSkipKey 59 | | ReduceErrorRetry 60 | deriving (Eq,Show,Read,Ord) 61 | 62 | 63 | ------------------------------------------------------------------------------- 64 | -- | Options for a single-step MR job. 65 | data MROptions = MROptions { 66 | _mroPart :: PartitionStrategy 67 | -- ^ Number of segments to expect in incoming keys. Affects both 68 | -- hadron program's understanding of key AND Hadoop's distribution 69 | -- of map output to reducers. 70 | , _mroComparator :: Comparator 71 | , _mroNumMap :: Maybe Int 72 | -- ^ Number of map tasks; 'Nothing' leaves it to Hadoop to decide. 73 | , _mroNumReduce :: Maybe Int 74 | -- ^ Number of reduce tasks; 'Nothing' leaves it to Hadoop to 75 | -- decide. 76 | , _mroTaskTimeout :: Maybe Int 77 | -- ^ Timeout in miliseconds for Hadoop tasks. Set to 0 to disable 78 | -- the timeout. 
79 | , _mroCompress :: Maybe String 80 | -- ^ Whether to use compression on reduce output. 81 | , _mroOutSep :: Maybe Char 82 | -- ^ Separator to be communicated to Hadoop for the reduce output. 83 | -- Sets both the 'stream.reduce.output.field.separator' and 84 | -- 'mapred.textoutputformat.separator' parameters. Sometimes 85 | -- useful to trick Hadoop into agreeing that the reduce output has 86 | -- both a key and a value, therefore avoiding the trailing 87 | -- separator forcefully inserted by Hadoop. 88 | -- 89 | -- If you're outputting CSV for example, you may want to specify 90 | -- 'Just ,' here so that with 2 fields Hadoop will think you 91 | -- already have the key-value pair. 92 | , _mroReduceError :: ReduceErrorStrategy 93 | -- ^ What to do on reducer error. 94 | } 95 | 96 | makeLenses ''MROptions 97 | 98 | 99 | instance Default MROptions where 100 | def = MROptions NoPartition RegularComp Nothing Nothing Nothing Nothing Nothing 101 | ReduceErrorReThrow 102 | 103 | 104 | ------------------------------------------------------------------------------- 105 | -- | Obtain baseline Hadoop run-time options from provided step options 106 | mrOptsToRunOpts :: MROptions -> HadoopRunOpts 107 | mrOptsToRunOpts MROptions{..} = def { _mrsPart = _mroPart 108 | , _mrsNumMap = _mroNumMap 109 | , _mrsNumReduce = _mroNumReduce 110 | , _mrsCompress = _mroCompress 111 | , _mrsOutSep = _mroOutSep 112 | , _mrsComparator = _mroComparator 113 | , _mrsTaskTimeout = _mroTaskTimeout } 114 | 115 | -------------------------------------------------------------------------------- /src/Hadron/Utils.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE TemplateHaskell #-} 3 | 4 | module Hadron.Utils where 5 | 6 | 7 | ------------------------------------------------------------------------------- 8 | import Control.Applicative as A 9 | import Control.Lens 10 | import Control.Monad 11 | import Control.Monad.Trans 12 | import Data.Conduit 13 | import Data.List.Split 14 | import Numeric 15 | import Safe 16 | import System.Random (randomRIO) 17 | ------------------------------------------------------------------------------- 18 | 19 | 20 | 21 | -- | Perform a given monadic action once every X elements flowing 22 | -- through this conduit. 23 | performEvery 24 | :: (Monad m) 25 | => Integer 26 | -- ^ Once every N items that flow throught he pipe. 27 | -> (Integer -> m ()) 28 | -> ConduitM a a m () 29 | performEvery n f = go 1 30 | where 31 | go !i = do 32 | x <- await 33 | case x of 34 | Nothing -> return () 35 | Just x' -> do 36 | when (i `mod` n == 0) $ lift (f i) 37 | yield $! x' 38 | go $! i + 1 39 | 40 | 41 | 42 | ------------------------------------------------------------------------------- 43 | data File = File { 44 | _filePerms :: String 45 | , _fileSize :: Int 46 | , _fileDate :: String 47 | , _fileTime :: String 48 | , _filePath :: String 49 | } deriving (Eq,Show,Read,Ord) 50 | makeLenses ''File 51 | 52 | 53 | parseLs :: String -> Maybe File 54 | parseLs str = 55 | let xs = split (dropDelims . condense $ oneOf "\t ") str 56 | in File A.<$> xs !? 0 57 | <*> (xs !? 2 >>= readMay) 58 | <*> xs !? 3 59 | <*> xs !? 4 60 | <*> xs !? 5 61 | 62 | 63 | ------------------------------------------------------------------------------- 64 | -- | Generate a random token 65 | randomToken :: Int -> IO String 66 | randomToken n = do 67 | is <- sequence . 
take n $ repeat mk 68 | return $ concat $ map (flip showHex "") is 69 | where 70 | mk :: IO Int 71 | mk = randomRIO (0,15) 72 | 73 | 74 | ------------------------------------------------------------------------------- 75 | (!?) :: [a] -> Int -> Maybe a 76 | (!?) = atMay 77 | -------------------------------------------------------------------------------- /stack-lts-2.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-2.22 2 | packages: 3 | - '.' 4 | # TODO: have to drop this and depend on an officially released version of lcs 5 | - location: https://github.com/juanpaucar/lcs/archive/40cc3bc32b5cd96d78f224850ec2d4428051b8b1.zip 6 | extra-dep: true 7 | extra-deps: 8 | - katip-0.2.0.0 9 | - string-conv-0.1.1 10 | - retry-0.7.3 11 | - microlens-0.4.4.0 12 | - microlens-th-0.4.0.0 13 | - errors-2.1.2 14 | - unexceptionalio-0.3.0 15 | 16 | # Override default flag values for local packages and extra-deps 17 | flags: 18 | hadron: 19 | lib-Werror: True 20 | 21 | # Extra package databases containing global packages 22 | extra-package-dbs: [] 23 | -------------------------------------------------------------------------------- /stack-lts-3.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-3.22 2 | packages: 3 | - '.' 4 | # TODO: have to drop this and depend on an officially released version of lcs 5 | - location: https://github.com/juanpaucar/lcs/archive/40cc3bc32b5cd96d78f224850ec2d4428051b8b1.zip 6 | extra-dep: true 7 | extra-deps: 8 | - katip-0.2.0.0 9 | - string-conv-0.1.1 10 | - retry-0.7.3 11 | 12 | # Override default flag values for local packages and extra-deps 13 | flags: 14 | hadron: 15 | lib-Werror: True 16 | 17 | # Extra package databases containing global packages 18 | extra-package-dbs: [] 19 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-5.12 2 | packages: 3 | - '.' 
4 | # TODO: have to drop this and depend on an officially released version of lcs 5 | - location: https://github.com/juanpaucar/lcs/archive/40cc3bc32b5cd96d78f224850ec2d4428051b8b1.zip 6 | extra-dep: true 7 | extra-deps: 8 | - katip-0.2.0.0 9 | - string-conv-0.1.1 10 | # Override default flag values for local packages and extra-deps 11 | flags: 12 | hadron: 13 | lib-Werror: True 14 | 15 | # Extra package databases containing global packages 16 | extra-package-dbs: [] 17 | -------------------------------------------------------------------------------- /test/Tests.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleInstances #-} 2 | {-# LANGUAGE MultiParamTypeClasses #-} 3 | {-# LANGUAGE TemplateHaskell #-} 4 | {-# OPTIONS_GHC -fno-warn-orphans #-} 5 | module Main where 6 | 7 | 8 | ------------------------------------------------------------------------------- 9 | import Control.Arrow 10 | import Data.DeriveTH 11 | import Data.List 12 | import Data.Ord 13 | import Data.Time 14 | import Test.Framework 15 | import Test.Framework.Providers.QuickCheck2 16 | import Test.QuickCheck 17 | ------------------------------------------------------------------------------- 18 | import Hadron.Controller 19 | ------------------------------------------------------------------------------- 20 | 21 | 22 | main :: IO () 23 | main = defaultMain 24 | [ testProperty "MRKey UTCTime instance obeys ordering" prop_utcMrKeySort 25 | ] 26 | 27 | 28 | ------------------------------------------------------------------------------- 29 | prop_utcMrKeySort :: [UTCTime] -> Bool 30 | prop_utcMrKeySort ds = sortBy (comparing fst) ds' == sortBy (comparing snd) ds' 31 | where 32 | ds' = map (id &&& toCompKey) ds 33 | 34 | 35 | ------------------------------------------------------------------------------- 36 | instance Arbitrary DiffTime where 37 | arbitrary = secondsToDiffTime `fmap` arbitrary 38 | 39 | 40 | $(derives [makeArbitrary] [''UTCTime, ''Day]) 41 | 42 | --------------------------------------------------------------------------------
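As a closing illustration (not from the repository), a property in the spirit of the suite above could pin down the six-column listing layout that Hadron.Utils.parseLs expects: column 0 is permissions, column 2 the size, columns 3 and 4 the date and time, and column 5 the path. The sample line and the extra import are assumptions made for this sketch; it could be appended to the defaultMain list with testProperty.

-- hypothetical addition to test/Tests.hs; assumes:
--   import Hadron.Utils (File (..), parseLs)
prop_parseLsColumns :: Bool
prop_parseLsColumns =
    parseLs "-rw-r--r-- 1 2048 2016-05-01 10:42 /tmp/hadron/part-00000"
      == Just (File "-rw-r--r--" 2048 "2016-05-01" "10:42" "/tmp/hadron/part-00000")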