├── .gitignore ├── DataflowAgent ├── Async.fs ├── DataflowAgent.fs └── DataflowAgent.fsproj ├── FSharpDataflow.sln ├── README.md └── WebCrawler ├── Prog.fs ├── Program.fs ├── WebCrawler.fsproj └── WebCrawler └── WebCrawler.fsproj /.gitignore: -------------------------------------------------------------------------------- 1 | #ignore thumbnails created by windows 2 | Thumbs.db 3 | #Ignore files build by Visual Studio 4 | *.obj 5 | *.exe 6 | *.pdb 7 | *.user 8 | *.aps 9 | *.pch 10 | *.vspscc 11 | *_i.c 12 | *_p.c 13 | *.ncb 14 | *.suo 15 | *.tlb 16 | *.tlh 17 | *.bak 18 | *.cache 19 | *.ilk 20 | *.log 21 | [Bb]in 22 | [Dd]ebug*/ 23 | *.lib 24 | *.sbr 25 | obj/ 26 | [Rr]elease*/ 27 | _ReSharper*/ 28 | [Tt]est[Rr]esult* 29 | build/ 30 | deploy/ 31 | docs/ 32 | nuget/ 33 | test/ 34 | *_Spliced.* 35 | *.vsp 36 | -------------------------------------------------------------------------------- /DataflowAgent/Async.fs: -------------------------------------------------------------------------------- 1 | // ---------------------------------------------------------------------------- 2 | // F# TPL Dataflow MailboxProcessor implementation 3 | // (c) David Thomas 2012, Available under Apache 2.0 license. 4 | // ---------------------------------------------------------------------------- 5 | namespace FSharp.Dataflow 6 | open System 7 | open System.Threading 8 | open System.Threading.Tasks 9 | 10 | [] 11 | type AsyncReplyChannel<'Reply> internal (replyf : 'Reply -> unit) = 12 | member x.Reply(reply) = replyf(reply) 13 | 14 | [] 15 | type internal AsyncResultCell<'a>() = 16 | let source = new TaskCompletionSource<'a>() 17 | 18 | member x.RegisterResult result = source.SetResult(result) 19 | 20 | member x.AsyncWaitResult = 21 | Async.FromContinuations(fun (cont,_,_) -> 22 | let apply = fun (task:Task<_>) -> cont (task.Result) 23 | source.Task.ContinueWith(apply) |> ignore) 24 | 25 | member x.GetWaitHandle(timeout:int) = 26 | async { let waithandle = source.Task.Wait(timeout) 27 | return waithandle } 28 | 29 | member x.GrabResult() = source.Task.Result 30 | 31 | member x.TryWaitResultSynchronously(timeout:int) = 32 | //early completion check 33 | if source.Task.IsCompleted then 34 | Some source.Task.Result 35 | //now force a wait for the task to complete 36 | else 37 | if source.Task.Wait(timeout) then 38 | Some source.Task.Result 39 | else None -------------------------------------------------------------------------------- /DataflowAgent/DataflowAgent.fs: -------------------------------------------------------------------------------- 1 | // ---------------------------------------------------------------------------- 2 | // F# TPL Dataflow MailboxProcessor implementation 3 | // (c) David Thomas 2012, Available under Apache 2.0 license. 4 | // ---------------------------------------------------------------------------- 5 | namespace FSharp.Dataflow 6 | open System 7 | open System.Threading 8 | open System.Threading.Tasks 9 | open System.Threading.Tasks.Dataflow 10 | #nowarn "40" 11 | 12 | [] 13 | type DataflowAgent<'Msg>(initial, ?cancelToken, ?dataflowOptions) = 14 | let cancellationToken = defaultArg cancelToken Async.DefaultCancellationToken 15 | let mutable started = false 16 | let errorEvent = new Event() 17 | let options = defaultArg dataflowOptions <| DataflowBlockOptions() 18 | let incomingMessages = new BufferBlock<'Msg>(options) 19 | let mutable defaultTimeout = Timeout.Infinite 20 | 21 | member x.CurrentQueueLength() = incomingMessages.Count 22 | 23 | member x.DefaultTimeout 24 | with get() = defaultTimeout 25 | and set(value) = defaultTimeout <- value 26 | 27 | [] 28 | member this.Error = errorEvent.Publish 29 | 30 | member x.Start() = 31 | if started 32 | then raise (new InvalidOperationException("Already Started.")) 33 | else 34 | started <- true 35 | let comp = async { try do! initial x 36 | with error -> errorEvent.Trigger error } 37 | Async.Start(computation = comp, cancellationToken = cancellationToken) 38 | 39 | member x.Receive(?timeout) = 40 | Async.AwaitTask <| incomingMessages.ReceiveAsync() 41 | 42 | member x.TryReceive(?timeout) = 43 | let ts = TimeSpan.FromMilliseconds(float <| defaultArg timeout defaultTimeout) 44 | Async.AwaitTask <| incomingMessages.ReceiveAsync(ts) 45 | .ContinueWith(fun (tt:Task<_>) -> 46 | if tt.IsCanceled || tt.IsFaulted then None 47 | else Some tt.Result) 48 | 49 | member x.Post(item) = 50 | let posted = incomingMessages.Post(item) 51 | if not posted then 52 | raise (InvalidOperationException("Incoming message buffer full.")) 53 | 54 | member x.TryPostAndReply(replyChannelMsg, ?timeout) :'Reply option = 55 | let timeout = defaultArg timeout defaultTimeout 56 | let resultCell = AsyncResultCell<_>() 57 | let msg = replyChannelMsg(new AsyncReplyChannel<_>(fun reply -> resultCell.RegisterResult(reply))) 58 | if incomingMessages.Post(msg) then 59 | resultCell.TryWaitResultSynchronously(timeout) 60 | else None 61 | 62 | member x.PostAndReply(replyChannelMsg, ?timeout) : 'Reply = 63 | match x.TryPostAndReply(replyChannelMsg, ?timeout = timeout) with 64 | | None -> raise (TimeoutException("PostAndReply timed out")) 65 | | Some result -> result 66 | 67 | member x.PostAndTryAsyncReply(replyChannelMsg, ?timeout): Async<'Reply option> = 68 | let timeout = defaultArg timeout defaultTimeout 69 | let resultCell = AsyncResultCell<_>() 70 | let msg = replyChannelMsg(new AsyncReplyChannel<_>(fun reply -> resultCell.RegisterResult(reply))) 71 | let posted = incomingMessages.Post(msg) 72 | if posted then 73 | match timeout with 74 | | Threading.Timeout.Infinite -> 75 | async { let! result = resultCell.AsyncWaitResult 76 | return Some(result) } 77 | | _ -> 78 | async { let! ok = resultCell.GetWaitHandle(timeout) 79 | let res = (if ok then Some(resultCell.GrabResult()) else None) 80 | return res } 81 | else async{return None} 82 | 83 | member x.PostAndAsyncReply( replyChannelMsg, ?timeout) = 84 | let timeout = defaultArg timeout defaultTimeout 85 | match timeout with 86 | | Threading.Timeout.Infinite -> 87 | let resCell = AsyncResultCell<_>() 88 | let msg = replyChannelMsg (AsyncReplyChannel<_>(fun reply -> resCell.RegisterResult(reply) )) 89 | let posted = incomingMessages.Post(msg) 90 | if posted then 91 | resCell.AsyncWaitResult 92 | else 93 | raise (InvalidOperationException("Incoming message buffer full.")) 94 | | _ -> 95 | let asyncReply = x.PostAndTryAsyncReply(replyChannelMsg, timeout=timeout) 96 | async { let! res = asyncReply 97 | match res with 98 | | None -> return! raise (TimeoutException("PostAndAsyncReply TimedOut")) 99 | | Some res -> return res } 100 | 101 | member x.TryScan((scanner: 'Msg -> Async<_> option), timeout): Async<_ option> = 102 | let ts = TimeSpan.FromMilliseconds( float timeout) 103 | let rec loopformsg = async { 104 | let! msg = Async.AwaitTask <| incomingMessages.ReceiveAsync(ts) 105 | .ContinueWith(fun (tt:Task<_>) -> 106 | if tt.IsCanceled || tt.IsFaulted then None 107 | else Some tt.Result) 108 | match msg with 109 | | Some m-> let res = scanner m 110 | match res with 111 | | None -> return! loopformsg 112 | | Some res -> return! res 113 | | None -> return None} 114 | loopformsg 115 | 116 | member x.Scan(scanner, timeout) = 117 | async { let! res = x.TryScan(scanner, timeout) 118 | match res with 119 | | None -> return raise(TimeoutException("Scan TimedOut")) 120 | | Some res -> return res } 121 | 122 | static member Start(initial, ?cancellationToken, ?dataflowOptions) = 123 | let dfa = DataflowAgent<'Msg>(initial, ?cancelToken = cancellationToken, ?dataflowOptions = dataflowOptions) 124 | dfa.Start();dfa -------------------------------------------------------------------------------- /DataflowAgent/DataflowAgent.fsproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Debug 5 | x86 6 | 8.0.30703 7 | 2.0 8 | {5667053f-c57d-49b5-ac6d-5db216363519} 9 | Library 10 | TPLDF_Agent 11 | TPLDF_Agent 12 | v4.0 13 | 14 | 15 | DataflowAgent 16 | 17 | 18 | true 19 | full 20 | false 21 | false 22 | bin\Debug\ 23 | DEBUG;TRACE 24 | 3 25 | AnyCPU 26 | bin\Debug\TPLDF_Agent.XML 27 | 28 | 29 | 30 | 31 | pdbonly 32 | true 33 | true 34 | bin\Release\ 35 | TRACE 36 | 3 37 | x86 38 | bin\Release\TPLDF_Agent.XML 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | C:\Program Files (x86)\Microsoft Corporation\TPL Dataflow\System.Threading.Tasks.Dataflow.dll 54 | 55 | 56 | 63 | -------------------------------------------------------------------------------- /FSharpDataflow.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 11.00 3 | # Visual Studio 2010 4 | Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "DataflowAgent", "DataflowAgent\DataflowAgent.fsproj", "{5667053F-C57D-49B5-AC6D-5DB216363519}" 5 | EndProject 6 | Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "WebCrawler", "WebCrawler\WebCrawler.fsproj", "{C63D7083-9367-4927-9A61-EDAE18E38999}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x86 = Debug|x86 11 | Release|x86 = Release|x86 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {5667053F-C57D-49B5-AC6D-5DB216363519}.Debug|x86.ActiveCfg = Debug|x86 15 | {5667053F-C57D-49B5-AC6D-5DB216363519}.Debug|x86.Build.0 = Debug|x86 16 | {5667053F-C57D-49B5-AC6D-5DB216363519}.Release|x86.ActiveCfg = Release|x86 17 | {5667053F-C57D-49B5-AC6D-5DB216363519}.Release|x86.Build.0 = Release|x86 18 | {C63D7083-9367-4927-9A61-EDAE18E38999}.Debug|x86.ActiveCfg = Debug|x86 19 | {C63D7083-9367-4927-9A61-EDAE18E38999}.Debug|x86.Build.0 = Debug|x86 20 | {C63D7083-9367-4927-9A61-EDAE18E38999}.Release|x86.ActiveCfg = Release|x86 21 | {C63D7083-9367-4927-9A61-EDAE18E38999}.Release|x86.Build.0 = Release|x86 22 | EndGlobalSection 23 | GlobalSection(SolutionProperties) = preSolution 24 | HideSolutionNode = FALSE 25 | EndGlobalSection 26 | EndGlobal 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | An implementation of MailboxProcessor using TPL Dataflow. 2 | 3 | Have a look at the following posts for details: 4 | 5 | [TDF Agents I] (http://7sharpnine.com/2012/01/22/2012-01-22-fsharp-dataflow-agents-i//) 6 | [TDF Agents II] (http://7sharpnine.com/2012/01/30/2012-01-24-fsharp-dataflow-agents-ii/) 7 | [TDF Agents III] (http://7sharpnine.com/2012/02/20/2012-02-19-fsharp-dataflow-agents-iii/) 8 | 9 | TODO: Add XmlDoc comments for the member functions. Most of the functions are the equivelent of the MailboxProcessor's apart from `Scan` and `TryScan` 10 | -------------------------------------------------------------------------------- /WebCrawler/Prog.fs: -------------------------------------------------------------------------------- 1 | module Prog 2 | 3 | /// Message type used by the agent - contains queueing 4 | /// of work items and notification of completion 5 | type internal ThrottlingAgentMessage = 6 | | Completed 7 | | Work of Async 8 | 9 | /// Represents an agent that runs operations in concurrently. When the number 10 | /// of concurrent operations exceeds 'limit', they are queued and processed later 11 | type ThrottlingAgent(limit) = 12 | let agent =MailboxProcessor.Start(fun agent -> 13 | 14 | /// Represents a state when the agent is blocked 15 | let rec waiting () = 16 | // Use 'Scan' to wait for completion of some work 17 | agent.Scan(function 18 | | Completed -> Some(working (limit - 1)) 19 | | _ -> None) 20 | 21 | /// Represents a state when the agent is working 22 | and working count = async { 23 | // Receive any message 24 | let! msg = agent.Receive() 25 | match msg with 26 | | Completed -> 27 | // Decrement the counter of work items 28 | return! working (count - 1) 29 | | Work work -> 30 | // Start the work item & continue in blocked/working state 31 | async { try do! work 32 | finally agent.Post(Completed) } 33 | |> Async.Start 34 | if count < limit then return! working (count + 1) 35 | else return! waiting () } 36 | 37 | // Start in working state with zero running work items 38 | working 0) 39 | 40 | /// Queue the specified asynchronous workflow for processing 41 | member x.DoWork(work) = agent.Post(Work work) -------------------------------------------------------------------------------- /WebCrawler/Program.fs: -------------------------------------------------------------------------------- 1 | open System 2 | open System.Collections.Concurrent 3 | open System.Collections.Generic 4 | open System.IO 5 | open System.Net 6 | open System.Text.RegularExpressions 7 | open FSharp.Dataflow 8 | open System.Threading.Tasks.Dataflow 9 | module Helpers = 10 | 11 | type Agent<'T> = DataflowAgent<'T> 12 | //type Agent<'T> = MailboxProcessor<'T> 13 | type Message = 14 | | Done 15 | | Mailbox of Agent 16 | | Stop 17 | | Url of string option 18 | | Start of AsyncReplyChannel 19 | 20 | // Gates the number of crawling agents. 21 | [] 22 | let Gate = 10 23 | 24 | // Extracts links from HTML. 25 | let extractLinks html = 26 | let pattern1 = "(?i)href\\s*=\\s*(\"|\')/?((?!#.*|/\B|mailto:|location\.|javascript:)[^\"\']+)(\"|\')" 27 | let pattern2 = "(?i)^https?" 28 | 29 | let links = 30 | [ 31 | for x in Regex(pattern1).Matches(html) do 32 | yield x.Groups.[2].Value 33 | ] |> List.filter (fun x -> Regex(pattern2).IsMatch(x)) 34 | links 35 | 36 | // Fetches a Web page. 37 | let fetch (url : string) = 38 | try 39 | let req = WebRequest.Create(url) :?> HttpWebRequest 40 | req.UserAgent <- "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)" 41 | req.Timeout <- 5000 42 | use resp = req.GetResponse() 43 | let content = resp.ContentType 44 | let isHtml = Regex("html").IsMatch(content) 45 | match isHtml with 46 | | true -> use stream = resp.GetResponseStream() 47 | use reader = new StreamReader(stream) 48 | let html = reader.ReadToEnd() 49 | Some html 50 | | false -> None 51 | with 52 | | _ -> None 53 | 54 | let collectLinks url = 55 | let html = fetch url 56 | match html with 57 | | Some x -> extractLinks x 58 | | None -> [] 59 | 60 | open Helpers 61 | 62 | let sw = System.Diagnostics.Stopwatch() 63 | let crawl url limit = 64 | // Concurrent queue for saving collected urls. 65 | let q = ConcurrentQueue() 66 | 67 | // Holds crawled URLs. 68 | let set = HashSet() 69 | 70 | // Creates a mailbox that synchronizes printing to the console (so 71 | // that two calls to 'printfn' do not interleave when printing) 72 | let printer = 73 | Agent.Start(fun x -> async { 74 | while true do 75 | let! str = x.Receive() 76 | printfn "%s" str }) 77 | // Hides standard 'printfn' function (formats the string using 78 | // 'kprintf' and then posts the result to the printer agent. 79 | let printfn fmt = 80 | Printf.kprintf printer.Post fmt 81 | 82 | let supervisor = 83 | Agent.Start(fun x -> async { 84 | // The agent expects to receive 'Start' message first - the message 85 | // carries a reply channel that is used to notify the caller 86 | // when the agent completes crawling. 87 | let! start = x.Receive() 88 | let repl = 89 | match start with 90 | | Start repl -> repl 91 | | _ -> failwith "Expected Start message!" 92 | 93 | let rec loop run = 94 | async { 95 | let! msg = x.Receive() 96 | match msg with 97 | | Mailbox(mailbox) -> 98 | let count = set.Count 99 | if count < limit - 1 && run then 100 | let url = q.TryDequeue() 101 | match url with 102 | | true, str -> if not (set.Contains str) then 103 | let set'= set.Add str 104 | mailbox.Post <| Url(Some str) 105 | return! loop run 106 | else 107 | mailbox.Post <| Url None 108 | return! loop run 109 | 110 | | _ -> mailbox.Post <| Url None 111 | return! loop run 112 | else 113 | mailbox.Post Stop 114 | return! loop run 115 | | Stop -> return! loop false 116 | | Start _ -> failwith "Unexpected start message!" 117 | | Url _ -> failwith "Unexpected URL message!" 118 | | Done -> printfn "Supervisor is done." 119 | //(x :> IDisposable).Dispose() 120 | // Notify the caller that the agent has completed 121 | repl.Reply(()) 122 | } 123 | do! loop true }) 124 | 125 | 126 | let urlCollector = 127 | Agent.Start(fun y -> 128 | let rec loop count = 129 | async { 130 | let! msg = y.TryReceive(6000) 131 | match msg with 132 | | Some message -> 133 | match message with 134 | | Url u -> 135 | match u with 136 | | Some url -> q.Enqueue url 137 | return! loop count 138 | | None -> return! loop count 139 | | _ -> 140 | match count with 141 | | Gate -> supervisor.Post Done 142 | //(y :> IDisposable).Dispose() 143 | printfn "URL collector is done." 144 | sw.Stop() 145 | printfn "****** %i ms ******" sw.ElapsedMilliseconds 146 | | _ -> return! loop (count + 1) 147 | | None -> supervisor.Post Stop 148 | return! loop count 149 | } 150 | loop 1) 151 | 152 | /// Initializes a crawling agent. 153 | let crawler id = 154 | Agent.Start(initial = (fun inbox -> 155 | let rec loop() = 156 | async { 157 | let! msg = inbox.Receive() 158 | match msg with 159 | | Url x -> 160 | match x with 161 | | Some url -> 162 | let links = collectLinks url 163 | printfn "%s crawled by agent %d." url id 164 | for link in links do 165 | urlCollector.Post <| Url (Some link) 166 | supervisor.Post(Mailbox(inbox)) 167 | return! loop() 168 | | None -> supervisor.Post(Mailbox(inbox)) 169 | return! loop() 170 | | _ -> urlCollector.Post Done 171 | printfn "Agent %d is done." id 172 | //(inbox :> IDisposable).Dispose() 173 | } 174 | loop()), dataflowOptions = DataflowBlockOptions(BoundedCapacity=10)) 175 | 176 | // Send 'Start' message to the main agent. The result 177 | // is asynchronous workflow that will complete when the 178 | // agent crawling completes 179 | let result = supervisor.PostAndAsyncReply(Start) 180 | // Spawn the crawlers. 181 | let crawlers = 182 | [ 183 | for i in 1 .. Gate do 184 | yield crawler i 185 | ] 186 | 187 | // Post the first messages. 188 | crawlers.Head.Post <| Url (Some url) 189 | crawlers.Tail |> List.iter (fun ag -> ag.Post <| Url None) 190 | result 191 | 192 | sw.Start() 193 | crawl "http://news.google.com" 50 194 | |> Async.RunSynchronously 195 | 196 | Console.ReadKey() |> ignore -------------------------------------------------------------------------------- /WebCrawler/WebCrawler.fsproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Debug 5 | x86 6 | 8.0.30703 7 | 2.0 8 | c63d7083-9367-4927-9a61-edae18e38999 9 | Exe 10 | WebCrawler 11 | WebCrawler 12 | v4.0 13 | Client 14 | WebCrawler 15 | 16 | 17 | true 18 | full 19 | false 20 | false 21 | bin\Debug\ 22 | DEBUG;TRACE 23 | 3 24 | x86 25 | bin\Debug\WebCrawler.XML 26 | 27 | 28 | pdbonly 29 | true 30 | true 31 | bin\Release\ 32 | TRACE 33 | 3 34 | x86 35 | bin\Release\WebCrawler.XML 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | ..\..\..\Program Files (x86)\Microsoft Corporation\TPL Dataflow\System.Threading.Tasks.Dataflow.dll 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | DataflowAgent 53 | {5667053f-c57d-49b5-ac6d-5db216363519} 54 | True 55 | 56 | 57 | 58 | 59 | 66 | -------------------------------------------------------------------------------- /WebCrawler/WebCrawler/WebCrawler.fsproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Debug 5 | x86 6 | 8.0.30703 7 | 2.0 8 | {ac623873-957b-427f-9248-823b2869da43} 9 | Exe 10 | WebCrawler 11 | WebCrawler 12 | v4.0 13 | Client 14 | WebCrawler 15 | 16 | 17 | true 18 | full 19 | false 20 | false 21 | bin\Debug\ 22 | DEBUG;TRACE 23 | 3 24 | x86 25 | bin\Debug\WebCrawler.XML 26 | 27 | 28 | pdbonly 29 | true 30 | true 31 | bin\Release\ 32 | TRACE 33 | 3 34 | x86 35 | bin\Release\WebCrawler.XML 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 56 | --------------------------------------------------------------------------------