├── test
    ├── test_helper.exs
    └── crawler_test.exs
├── .gitignore
├── README.md
├── mix.lock
├── lib
    ├── parser.ex
    ├── sitemap.ex
    └── crawler.ex
├── mix.exs
└── config
    └── config.exs

/test/test_helper.exs:
--------------------------------------------------------------------------------
1 | ExUnit.start()
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /_build
2 | /deps
3 | erl_crash.dump
4 | *.ez
5 | crawler
6 | 
--------------------------------------------------------------------------------
/test/crawler_test.exs:
--------------------------------------------------------------------------------
1 | defmodule CrawlerTest do
2 |   use ExUnit.Case
3 | 
4 |   test "the truth" do
5 |     assert 1 + 1 == 2
6 |   end
7 | 
8 |   test "parse_link joins relative links onto the base with exactly one slash" do
9 |     assert Parser.parse_link("/about", "https://github.com") == "https://github.com/about"
10 |     assert Parser.parse_link("about", "https://github.com") == "https://github.com/about"
11 |     assert Parser.parse_link("/about", "https://github.com/") == "https://github.com/about"
12 |   end
13 | 
14 |   test "parse_link keeps absolute links on the same authority only" do
15 |     assert Parser.parse_link("https://github.com/x", "https://github.com") == "https://github.com/x"
16 |     assert Parser.parse_link("https://example.com/x", "https://github.com") == nil
17 |   end
18 | 
19 |   test "parse_link rejects links that are not pages to crawl" do
20 |     assert Parser.parse_link("mailto:someone@example.com", "https://github.com") == nil
21 |     assert Parser.parse_link("javascript:void(0)", "https://github.com") == nil
22 |     assert Parser.parse_link("#top", "https://github.com") == nil
23 |     assert Parser.parse_link("", "https://github.com") == nil
24 |   end
25 | end
26 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Crawler
2 | =======
3 | 
4 | A web crawler in Elixir
5 | 
6 | ```sh
7 | mix escript.build
8 | ./crawler --url=https://github.com
9 | ```
10 | 
--------------------------------------------------------------------------------
/mix.lock:
--------------------------------------------------------------------------------
1 | %{"floki": {:hex, :floki, "0.3.2"},
2 |   "httpotion": {:hex, :httpotion, "2.1.0"},
3 |   "ibrowse": {:git, "git://github.com/cmullaparthi/ibrowse.git", "d2e369ff42666c3574b8b7ec26f69027895c4d94", [tag: "v4.1.1"]},
4 |   "json": {:hex, :json, "0.3.2"},
5 |   "mochiweb": {:hex, :mochiweb, "2.12.2"},
6 |   "poison": {:hex, :poison, "1.4.0"}}
7 | 
--------------------------------------------------------------------------------
/lib/parser.ex:
--------------------------------------------------------------------------------
1 | defmodule Parser do
2 |   @moduledoc """
3 |   Extracts, from an HTML page, the links that belong to the site being crawled.
4 |   """
5 | 
6 |   @doc """
7 |   Returns the list of crawlable links found in `page_body`.
8 | 
9 |   The `href` of every `<a>` element is passed through `parse_link/2`. Links
10 |   that it rejects are dropped and duplicates are removed, so a page that
11 |   repeats a link reports it only once.
12 |   """
13 |   def parse(base, page_body) do
14 |     page_body
15 |     |> Floki.find("a")
16 |     |> Floki.attribute("href")
17 |     |> Enum.map(&parse_link(&1, base))
18 |     |> Enum.reject(&is_nil/1)
19 |     |> Enum.uniq()
20 |   end
21 | 
22 |   @doc """
23 |   Turns one `href` value into a URL to crawl, or `nil` when it must be skipped.
24 | 
25 |     * an empty or fragment-only link (`""`, `"#top"`) refers to the page it
26 |       was found on and yields `nil`
27 |     * a link with a scheme but no authority (`mailto:`, `javascript:`,
28 |       `tel:` ...) is not a page and yields `nil`
29 |     * a link without an authority is joined onto `base` with exactly one `/`
30 |     * a link whose authority equals the authority of `base` is returned as is
31 |     * any other link points to a different site and yields `nil`
32 | 
33 |   Relative links are resolved against `base`, not against the page they were
34 |   found on, because that page is not known here.
35 |   """
36 |   def parse_link("", _base), do: nil
37 |   def parse_link("#" <> _fragment, _base), do: nil
38 | 
39 |   def parse_link(link, base) do
40 |     %{authority: base_authority} = URI.parse(base)
41 | 
42 |     case URI.parse(link) do
43 |       %{scheme: scheme, authority: nil} when is_binary(scheme) ->
44 |         nil
45 |       %{authority: nil} ->
46 |         join(base, link)
47 |       %{authority: ^base_authority} ->
48 |         link
49 |       _ ->
50 |         nil
51 |     end
52 |   end
53 | 
54 |   # Concatenates `base` and `link` with a single slash between them, whether
55 |   # or not `base` ends with one and whether or not `link` starts with one.
56 |   defp join(base, link) do
57 |     root = String.replace(base, ~r{/+\z}, "")
58 | 
59 |     case link do
60 |       "/" <> _ -> root <> link
61 |       _ -> root <> "/" <> link
62 |     end
63 |   end
64 | end
65 | 
--------------------------------------------------------------------------------
/lib/sitemap.ex:
--------------------------------------------------------------------------------
1 | defmodule SiteMap do
2 |   @moduledoc """
3 |   Agent registered under this module's name that maps each page URL to the
4 |   list of links found on that page.
5 |   """
6 | 
7 |   @doc """
8 |   Starts the agent with an empty map.
9 |   """
10 |   def start_link do
11 |     Agent.start_link(fn -> %{} end, name: __MODULE__)
12 |   end
13 | 
14 |   @doc """
15 |   Returns `true` when an entry for `name` exists.
16 |   """
17 |   def has_page?(name) do
18 |     Agent.get(__MODULE__, fn map -> Map.has_key?(map, name) end)
19 |   end
20 | 
21 |   @doc """
22 |   Reserves `name` for the caller.
23 | 
24 |   Returns `true` and stores an empty link list when `name` had no entry yet,
25 |   `false` otherwise. The check and the insert happen in one agent call, so
26 |   a given page is claimed by at most one caller.
27 |   """
28 |   def claim_page(name) do
29 |     Agent.get_and_update(__MODULE__, fn map ->
30 |       if Map.has_key?(map, name) do
31 |         {false, map}
32 |       else
33 |         {true, Map.put(map, name, [])}
34 |       end
35 |     end)
36 |   end
37 | 
38 |   @doc """
39 |   Stores `links` as the entry for `name`, replacing any previous entry.
40 |   """
41 |   def put_page(name, links) do
42 |     IO.puts "Saving links for " <> name
43 | 
44 |     Agent.update(__MODULE__, fn map -> Map.put(map, name, links) end)
45 |   end
46 | 
47 |   @doc """
48 |   Returns the whole map encoded as a JSON string.
49 |   """
50 |   def to_json do
51 |     Agent.get(__MODULE__, fn map ->
52 |       {:ok, json} = JSON.encode(map)
53 |       json
54 |     end)
55 |   end
56 | end
57 | 
--------------------------------------------------------------------------------
/mix.exs:
--------------------------------------------------------------------------------
1 | defmodule Crawler.Mixfile do
2 |   use Mix.Project
3 | 
4 |   def project do
5 |     [app: :crawler,
6 |      version: "0.0.1",
7 |      elixir: "~> 1.0",
8 |      build_embedded: Mix.env == :prod,
9 |      start_permanent: Mix.env == :prod,
10 |      escript: [main_module: Crawler],
11 |      deps: deps()]
12 |   end
13 | 
14 |   # Configuration for the OTP application
15 |   #
16 |   # Type `mix help compile.app` for more information
17 |   def application do
18 |     [applications: [:logger, :httpotion]]
19 |   end
20 | 
21 |   # Dependencies can be Hex packages:
22 |   #
23 |   #   {:mydep, "~> 0.3.0"}
24 |   #
25 |   # Or git/path repositories:
26 |   #
27 |   #   {:mydep, git: "https://github.com/elixir-lang/mydep.git", tag: "0.1.0"}
28 |   #
29 |   # Type `mix help deps` for more examples and options
30 |   defp deps do
31 |     [
32 |       {:ibrowse, github: "cmullaparthi/ibrowse", tag: "v4.1.1"},
33 |       {:httpotion, "~> 2.1.0"},
34 |       {:floki, "~> 0.3"},
35 |       {:json, "~> 0.3.0"}
36 |     ]
37 |   end
38 | end
39 | 
--------------------------------------------------------------------------------
/lib/crawler.ex:
--------------------------------------------------------------------------------
1 | defmodule Crawler do
2 |   @moduledoc """
3 |   Command-line crawler: follows the same-site links reachable from a start
4 |   URL and prints the collected site map as JSON.
5 |   """
6 | 
7 |   @doc """
8 |   Escript entry point. Reads the start URL from the `--url` switch in `args`
9 |   and crawls it; prints a usage line when the switch is missing.
10 |   """
11 |   def main(args) do
12 |     case parse_args(args) do
13 |       nil -> IO.puts(:stderr, "usage: crawler --url=URL")
14 |       url -> crawl(url)
15 |     end
16 |   end
17 | 
18 |   @doc """
19 |   Crawls the site starting at `url`, which also serves as the base that
20 |   links are compared against, then prints the site map as JSON.
21 |   """
22 |   def crawl(url) do
23 |     SiteMap.start_link()
24 | 
25 |     get_all(url, [url])
26 | 
27 |     # No extra waiting is needed here: get_all/2 awaits every task it starts
28 |     # before it returns.
29 |     IO.puts SiteMap.to_json()
30 |   end
31 | 
32 |   @doc """
33 |   Fetches every URL in `stack`, one after the other, each in its own task.
34 |   """
35 |   def get_all(_, []) do
36 |     {:ok}
37 |   end
38 | 
39 |   def get_all(base, stack) do
40 |     Enum.each stack, fn url ->
41 |       task = Task.async(fn -> get(base, url) end)
42 |       # The task runs the whole recursive crawl below `url`, not a single
43 |       # request, so the wait must not be capped at a fixed number of seconds.
44 |       Task.await(task, :infinity)
45 |     end
46 |   end
47 | 
48 |   @doc """
49 |   Fetches `url`, stores its links in `SiteMap`, and crawls those links that
50 |   no one has claimed yet.
51 | 
52 |   A page that cannot be fetched is stored with an empty link list.
53 |   """
54 |   def get(base, url) do
55 |     IO.puts "getting " <> url
56 | 
57 |     # Register the page before fetching so links back to it are skipped.
58 |     SiteMap.put_page(url, [])
59 | 
60 |     links =
61 |       case fetch_body(url) do
62 |         {:ok, body} -> Parser.parse(base, body)
63 |         :error -> []
64 |       end
65 | 
66 |     SiteMap.put_page(url, links)
67 | 
68 |     # Claiming marks each link as taken in the same step that checks it, so
69 |     # a page reached through several parents is fetched only once.
70 |     to_fetch = Enum.filter(links, &SiteMap.claim_page/1)
71 | 
72 |     get_all(base, to_fetch)
73 |   end
74 | 
75 |   # Returns `{:ok, body}` for a completed request, or `:error` after
76 |   # reporting the failure, so one unreachable link does not abort the crawl.
77 |   defp fetch_body(url) do
78 |     %{body: body} = HTTPotion.get(url)
79 |     {:ok, body}
80 |   rescue
81 |     error in [HTTPotion.HTTPError] ->
82 |       IO.puts(:stderr, "failed to fetch " <> url <> ": " <> Exception.message(error))
83 |       :error
84 |   end
85 | 
86 |   # Returns the value of the `--url` switch, or `nil` when it was not given.
87 |   defp parse_args(args) do
88 |     {options, _, _} = OptionParser.parse(args, switches: [url: :string])
89 |     options[:url]
90 |   end
91 | end
92 | 
--------------------------------------------------------------------------------
/config/config.exs:
--------------------------------------------------------------------------------
1 | # This file is responsible for configuring your application
2 | # and its dependencies with the aid of the Mix.Config module.
3 | use Mix.Config
4 | 
5 | # This configuration is loaded before any dependency and is restricted
6 | # to this project. If another project depends on this project, this
7 | # file won't be loaded nor affect the parent project. For this reason,
8 | # if you want to provide default values for your application for third-
9 | # party users, it should be done in your mix.exs file.
10 | 
11 | # Sample configuration:
12 | #
13 | #     config :logger, :console,
14 | #       level: :info,
15 | #       format: "$date $time [$level] $metadata$message\n",
16 | #       metadata: [:user_id]
17 | 
18 | # It is also possible to import configuration files, relative to this
19 | # directory. For example, you can emulate configuration per environment
20 | # by uncommenting the line below and defining dev.exs, test.exs and such.
21 | # Configuration from the imported file will override the ones defined
22 | # here (which is why it is important to import them last).
23 | #
24 | #     import_config "#{Mix.env}.exs"
25 | 
--------------------------------------------------------------------------------