├── test
├── test_helper.exs
├── html
│ ├── shift_jis.html
│ └── utf8.html
├── metainvestigator
│ └── meta_test.exs
└── metainvestigator_test.exs
├── .gitignore
├── LICENSE
├── config
└── config.exs
├── mix.exs
├── mix.lock
├── lib
├── metainvestigator
│ └── meta.ex
└── metainvestigator.ex
└── README.md
/test/test_helper.exs:
--------------------------------------------------------------------------------
1 | ExUnit.start()
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /deps
2 | /_build
3 | erl_crash.dump
4 | *.ez
5 |
--------------------------------------------------------------------------------
/test/html/shift_jis.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nekova/metainvestigator/HEAD/test/html/shift_jis.html
--------------------------------------------------------------------------------
/test/html/utf8.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | MetaInvestigator
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |

15 |

16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/test/metainvestigator/meta_test.exs:
--------------------------------------------------------------------------------
defmodule MetaInvestigator.MetaTest do
  # Each test only parses the compile-time fixture below, so the cases do not
  # share any mutable state and can run concurrently.
  use ExUnit.Case, async: true
  import MetaInvestigator.Meta

  # UTF-8 fixture, read once when this module is compiled.
  # (The Shift_JIS fixture was previously loaded here as well but never used,
  # which triggered an "attribute was set but never used" warning.)
  @html File.read! "test/html/utf8.html"

  test "charset" do
    assert charset(@html) == "utf-8"
  end

  test "keywords" do
    assert keywords(@html) == "This is keywords"
  end

  test "og_title" do
    assert og_title(@html) == "MetaInvestigator in Test"
  end

  test "og_image" do
    assert og_image(@html) == "http://img.example.gif"
  end

  test "og_type" do
    assert og_type(@html) == "article"
  end

  test "og_url" do
    assert og_url(@html) == "http://example.com"
  end
end
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015 nekova
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining
4 | a copy of this software and associated documentation files (the
5 | "Software"), to deal in the Software without restriction, including
6 | without limitation the rights to use, copy, modify, merge, publish,
7 | distribute, sublicense, and/or sell copies of the Software, and to
8 | permit persons to whom the Software is furnished to do so, subject to
9 | the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/config/config.exs:
--------------------------------------------------------------------------------
1 | # This file is responsible for configuring your application
2 | # and its dependencies with the aid of the Mix.Config module.
3 | use Mix.Config
4 | # This configuration is loaded before any dependency and is restricted
5 | # to this project. If another project depends on this project, this
6 | # file won't be loaded nor affect the parent project. For this reason,
7 | # if you want to provide default values for your application for third-
8 | # party users, it should be done in your mix.exs file.
9 |
10 | # Sample configuration:
11 | #
12 | # config :logger, :console,
13 | # level: :info,
14 | # format: "$date $time [$level] $metadata$message\n",
15 | # metadata: [:user_id]
16 |
17 | # It is also possible to import configuration files, relative to this
18 | # directory. For example, you can emulate configuration per environment
19 | # by uncommenting the line below and defining dev.exs, test.exs and such.
20 | # Configuration from the imported file will override the ones defined
21 | # here (which is why it is important to import them last).
22 | #
23 | # import_config "#{Mix.env}.exs"
24 |
--------------------------------------------------------------------------------
/mix.exs:
--------------------------------------------------------------------------------
defmodule MetaInvestigator.Mixfile do
  use Mix.Project

  # Values shared by the project definition below.
  @version "0.0.3"
  @repo_url "https://github.com/nekova/metainvestigator"

  # Project definition: application name, version, supported Elixir versions,
  # Hex metadata and dependencies.
  def project do
    [
      app: :metainvestigator,
      version: @version,
      elixir: "~> 1.0",
      description: description(),
      package: package(),
      deps: deps()
    ]
  end

  # Configuration for the OTP application.
  #
  # Type `mix help compile.app` for more information.
  def application do
    [applications: [:logger]]
  end

  # Dependencies are either Hex packages ({:name, "~> x.y"}) or git/path
  # repositories ({:name, git: "..."}).
  #
  # Type `mix help deps` for more examples and options.
  defp deps do
    [
      {:elixir_mbcs, git: "https://github.com/woxtu/elixir-mbcs.git"},
      {:floki, "~> 0.6"},
      {:httpoison, "~> 0.7.2", only: :dev}
    ]
  end

  # Package description (kept with its trailing newline).
  defp description do
    "A library for web scraping, inspired by MetaInspector\n"
  end

  # Package metadata: maintainers, licence and project links.
  defp package do
    [
      maintainers: ["nekova"],
      licenses: ["MIT"],
      links: %{github: @repo_url}
    ]
  end
end
50 |
--------------------------------------------------------------------------------
/mix.lock:
--------------------------------------------------------------------------------
1 | %{"elixir_mbcs": {:git, "https://github.com/woxtu/elixir-mbcs.git", "884c97098cc08988dd0555ffd3ac564079d74457", []},
2 | "floki": {:hex, :floki, "0.7.1", "64e6d266b4817711b24b9d2e62b55f4af673d791fdd30462f961b94f45c8b1e9", [:mix], [{:mochiweb, "~> 2.12.2", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm"},
3 | "hackney": {:hex, :hackney, "1.3.2", "43bd07ab88753f5e136e38fddd2a09124bee25733b03361eeb459d0173fc17ab", [:make, :rebar], [{:idna, "~> 1.0.2", [hex: :idna, repo: "hexpm", optional: false]}, {:ssl_verify_hostname, "~> 1.0.5", [hex: :ssl_verify_hostname, repo: "hexpm", optional: false]}], "hexpm"},
4 | "httpoison": {:hex, :httpoison, "0.7.5", "7f4a1dc1245f6a4a7d944786b75a44c94056bd54830299229e4b57fd75cb9daa", [:mix], [{:hackney, "~> 1.3.1", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
5 | "idna": {:hex, :idna, "1.0.2", "397e3d001c002319da75759b0a81156bf11849c71d565162436d50020cb7265e", [:make], [], "hexpm"},
6 | "mbcs": {:git, "https://github.com/nekova/erlang-mbcs.git", "7bc88e28624abd6780073241e3bbb01b874b6a16", []},
7 | "mochiweb": {:hex, :mochiweb, "2.12.2", "80804ad342afa3d7f3524040d4eed66ce74b17a555de454ac85b07c479928e46", [:make, :rebar], [], "hexpm"},
8 | "ssl_verify_hostname": {:hex, :ssl_verify_hostname, "1.0.5", "2e73e068cd6393526f9fa6d399353d7c9477d6886ba005f323b592d389fb47be", [:make], [], "hexpm"}}
9 |
--------------------------------------------------------------------------------
/test/metainvestigator_test.exs:
--------------------------------------------------------------------------------
defmodule MetaInvestigatorTest do
  use ExUnit.Case
  import MetaInvestigator

  # UTF-8 fixture, read once when this module is compiled.
  @html File.read! "test/html/utf8.html"

  # Shift_JIS fixture; its raw bytes are intentionally not valid UTF-8.
  @shift_jis File.read! "test/html/shift_jis.html"

  # Expected result of `fetch/1` for the UTF-8 fixture. `fetch/1` returns a
  # plain map, so this is a map literal rather than a %MetaInvestigator{}.
  @valid_response %{
    best_image: "http://img.example.gif", best_title: "MetaInvestigator in Test",
    images: ["http://i.example.jpg", "http://duplicate.example.png"],
    meta: %MetaInvestigator.Meta{charset: "utf-8", keywords: "This is keywords",
      og_image: "http://img.example.gif", og_title: "MetaInvestigator in Test",
      og_type: "article", og_url: "http://example.com"}, title: "MetaInvestigator"
  }

  test "fetch" do
    assert fetch(@html) == @valid_response
  end

  test "title" do
    assert title(@html) == "MetaInvestigator"
  end

  test "images" do
    assert length(images(@html)) == 2
  end

  test "best_title" do
    assert best_title(@html) == "MetaInvestigator in Test"
  end

  test "best_image" do
    assert best_image(@html) == "http://img.example.gif"
  end

  test "to_utf8 with utf8" do
    assert to_utf8(@html) == @html
  end

  test "@html is utf8" do
    assert String.valid?(@html)
  end

  test "@shift_jis is not utf8" do
    refute String.valid?(@shift_jis)
  end

  test "@shift_jis decode to utf8" do
    decoded = to_utf8(@shift_jis)

    # The conversion must change the bytes...
    refute decoded == @shift_jis
    # ...and the result must really be UTF-8; "different from the input"
    # alone would also pass for garbage output.
    assert String.valid?(decoded)
  end
end
53 |
--------------------------------------------------------------------------------
/lib/metainvestigator/meta.ex:
--------------------------------------------------------------------------------
defmodule MetaInvestigator.Meta do
  @moduledoc """
  Extracts `<meta>` information from an HTML document: the charset, the
  keywords and the Open Graph `og:title`, `og:type`, `og:image` and `og:url`
  properties.

  Every extractor returns the first matching attribute value, or `nil` when
  the document contains no matching tag.
  """

  # Each field is `nil` when the corresponding tag is missing from the page.
  @type t :: %__MODULE__{
          charset: String.t() | nil,
          keywords: String.t() | nil,
          og_title: String.t() | nil,
          og_type: String.t() | nil,
          og_url: String.t() | nil,
          og_image: String.t() | nil
        }

  defstruct charset: nil, # The encoding of the website.
            keywords: nil,
            og_title: nil,
            og_type: nil, # The type of the website, usually "article" or "website".
            og_url: nil,
            og_image: nil

  # Open Graph properties for which an `og_<name>/1` function is generated below.
  @metadata ["title", "type", "image", "url"]

  @doc """
  Returns the value of the first `charset` attribute found on a `<meta>` tag,
  or `nil` when there is none.
  """
  @spec charset(String.t()) :: String.t() | nil
  def charset(html) do
    html
    |> Floki.find("meta")
    |> Floki.attribute("charset")
    |> List.first()
  end

  @doc """
  Returns the `content` of the first element whose `name` attribute is
  `"keywords"`, or `nil` when there is none.
  """
  @spec keywords(String.t()) :: String.t() | nil
  def keywords(html) do
    html
    |> Floki.find("[name=\"keywords\"]")
    |> Floki.attribute("content")
    |> List.first()
  end

  @spec og_title(String.t()) :: String.t() | nil
  @spec og_type(String.t()) :: String.t() | nil
  @spec og_image(String.t()) :: String.t() | nil
  @spec og_url(String.t()) :: String.t() | nil

  # Generates og_title/1, og_type/1, og_image/1 and og_url/1 at compile time,
  # one per entry of @metadata.
  for meta <- @metadata do
    @doc "Returns the `content` of the `og:#{meta}` tag, or `nil` when it is absent."
    def unquote(:"og_#{meta}")(html), do: meta_tag_by(html, unquote(meta))
  end

  # Looks up the element carrying `property="og:<attribute>"` and returns its
  # first `content` value (nil when nothing matches). The guard restricts
  # `attribute` to the known entries of @metadata.
  @spec meta_tag_by(String.t(), String.t()) :: String.t() | nil
  defp meta_tag_by(html, attribute) when attribute in @metadata do
    html
    |> Floki.find("[property=\"og:#{attribute}\"]")
    |> Floki.attribute("content")
    |> List.first()
  end

  @doc """
  Runs every extractor of this module on `html` and collects the results in a
  `%MetaInvestigator.Meta{}` struct. Fields whose tag is missing are `nil`.
  """
  @spec meta(String.t()) :: t
  def meta(html) do
    %__MODULE__{
      charset: charset(html),
      keywords: keywords(html),
      og_title: og_title(html),
      og_type: og_type(html),
      og_url: og_url(html),
      og_image: og_image(html)
    }
  end
end
51 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | MetaInvestigator
2 | =============
3 |
4 | MetaInvestigator is a library for web scraping, inspired by [MetaInspector](https://github.com/jaimeiniesta/metainspector).
5 |
6 | You can get its title, images, charset, description, keywords, meta tags...etc
7 |
8 | ## Usage
9 | You can use your favorite HTTP client (HTTPoison, HTTPotion, Tesla, etc.) to download the page; MetaInvestigator only parses the HTML string you pass to it.
10 |
11 | ```elixir
12 | iex(1)> html = HTTPClient.get!("https://github.com/nekova").body
13 | iex(2)> page = MetaInvestigator.fetch(html)
14 | #%{best_image: "https://avatars1.githubusercontent.com/u/3464295?v=3&s=400",
15 | # best_title: "nekova (ಠ_ಠ) · GitHub",
16 | # images: ["https://avatars3.githubusercontent.com/u/3464295?v=3&s=460",
17 | # "https://assets-cdn.github.com/images/spinners/octocat-spinner-128.gif"],
18 | # meta: %MetaInvestigator.Meta{charset: "utf-8", keywords: nil,
19 | # og_image: "https://avatars1.githubusercontent.com/u/3464295?v=3&s=400",
20 | # og_title: "nekova (ಠ_ಠ)", og_type: "profile",
21 | # og_url: "https://github.com/nekova"}, title: "nekova (ಠ_ಠ) · GitHub"}
22 | iex(3)> page.meta.og_image
23 | "https://avatars1.githubusercontent.com/u/3464295?v=3&s=400"
24 | iex(4)> page.best_title
25 | "nekova (ಠ_ಠ) · GitHub"
26 | ```
27 |
28 | You can access each element directly.
29 |
30 | ```elixir
31 | iex(2)> page = MetaInvestigator.title(html)
32 | "nekova (ಠ_ಠ) · GitHub"
33 | iex(3)> page = MetaInvestigator.best_image(html)
34 | "https://avatars1.githubusercontent.com/u/3464295?v=3&s=400"
35 | ```
36 |
37 | ## Installation
38 | First, add MetaInvestigator to your mix.exs dependencies:
39 |
40 | ```elixir
41 | defp deps do
42 | [{:metainvestigator, "~> 0.0.3"}]
43 | end
44 | ```
45 |
46 | and run `$ mix deps.get`. Then add `:metainvestigator` to your application list:
47 |
48 | ```elixir
49 | def application do
50 | [applications: [:metainvestigator]]
51 | end
52 | ```
53 |
54 | ## How to confirm the operation with iex
55 | ```elixir
56 | $> cd metainvestigator
57 | $> iex -S mix
58 | iex(1)> HTTPoison.start
59 | {:ok, [:idna, :hackney, :httpoison]}
60 | iex(2)> html = HTTPoison.get!("URL").body
61 | iex(3)> page = MetaInvestigator.fetch(html)
62 | ```
63 |
64 | ## LICENSE
65 | ```
66 | Copyright © 2015 nekova
67 |
68 | This work is free. You can redistribute it and/or modify it under the
69 | terms of the MIT License. See the LICENSE file for more details.
70 | ```
71 |
--------------------------------------------------------------------------------
/lib/metainvestigator.ex:
--------------------------------------------------------------------------------
defmodule MetaInvestigator do
  @moduledoc """
  A simple library for scraping metadata out of an HTML document.

  It extracts the title and the images of a page and, through
  `MetaInvestigator.Meta`, its charset, keywords and Open Graph tags.

  ## Example

  Collect every element at once:

    * `MetaInvestigator.fetch(html)` : returns a map with all elements

  Or access a single element directly:

    * `MetaInvestigator.title(html)` : returns the title
    * `MetaInvestigator.images(html)` : returns all images
    * `MetaInvestigator.best_title(html)` : returns the best title of the html
    * `MetaInvestigator.best_image(html)` : returns the best image of the html
  """

  alias MetaInvestigator.Meta

  @type html :: String.t()

  # NOTE: `fetch/1` returns a plain map with these same keys, not this struct;
  # the struct is kept so that existing references to it still compile.
  @type t :: %__MODULE__{
          title: String.t() | nil,
          images: [String.t()],
          best_title: String.t() | nil,
          best_image: String.t() | nil,
          meta: Meta.t() | map
        }

  defstruct title: nil,
            images: [], # All images of the website.
            best_title: nil, # The best title of the website: og:title or <title>.
            best_image: nil, # The best image of the website: og:image or the first <img>.
            meta: %{} # A map containing key value pairs of metatags

  @doc """
  Fetches all elements of an HTML document.

  Input that is not valid UTF-8 is first converted with `to_utf8/1`, i.e. it
  is assumed to be Shift_JIS encoded.

  Returns a map with the keys `:title`, `:images`, `:best_title`,
  `:best_image` and `:meta` (a `MetaInvestigator.Meta` struct).
  """
  @spec fetch(binary) :: %{
          title: String.t(),
          images: [String.t()],
          best_title: String.t() | nil,
          best_image: String.t() | nil,
          meta: Meta.t()
        }
  def fetch(html) when is_binary(html) do
    html = if String.valid?(html), do: html, else: to_utf8(html)

    %{
      title: title(html),
      images: images(html),
      best_title: best_title(html),
      best_image: best_image(html),
      meta: Meta.meta(html)
    }
  end

  @doc """
  Returns the text of the `<title>` element inside `<head>`.
  """
  @spec title(html) :: String.t()
  def title(html) do
    html |> Floki.find("head title") |> Floki.text()
  end

  @doc """
  Returns the `src` of every `<img>` element, without duplicates.
  """
  @spec images(html) :: [String.t()]
  def images(html) do
    html |> Floki.find("img") |> Floki.attribute("src") |> Enum.uniq()
  end

  @doc """
  Returns the longer of the `og:title` and the `<title>` text; the `og:title`
  wins a tie. When one of them is missing the other one is returned.
  """
  @spec best_title(html) :: String.t() | nil
  def best_title(html) do
    compare(Meta.og_title(html), title(html))
  end

  @doc """
  Returns the `og:image` when the page declares one, otherwise the first
  `<img>` source, or `nil` when the page has neither.
  """
  @spec best_image(html) :: String.t() | nil
  def best_image(html) do
    # The previous `[og_image] ++ images |> List.first` always returned the
    # head of the list, i.e. nil whenever og:image was missing, even if the
    # page had <img> tags. Fall back explicitly instead.
    Meta.og_image(html) || List.first(images(html))
  end

  @doc """
  Decodes `string` from `encoding` and returns it as a UTF-8 string.

  Only `:shift_jis` (the default) is supported; any other encoding raises a
  `FunctionClauseError` because `decode/2` has no clause for it. The input is
  decoded unconditionally, even when it already is valid UTF-8.
  """
  @spec to_utf8(binary, :shift_jis) :: String.t()
  def to_utf8(string, encoding \\ :shift_jis) do
    # Called on every conversion; presumably makes sure the mbcs codec is
    # running before decoding - TODO confirm against elixir-mbcs.
    Mbcs.start()

    string
    |> :erlang.binary_to_list()
    |> decode(encoding)
    |> to_string()
  end

  @doc """
  Decodes a list of Shift_JIS bytes through `Mbcs.decode!/3`, using the
  `:cp932` code page and asking for a list as the result.
  """
  @spec decode(list, :shift_jis) :: list
  def decode(string, :shift_jis), do: Mbcs.decode!(string, :cp932, return: :list)

  # Picks the preferred of two optional strings: the only non-nil one, or the
  # longer one (the first on a tie) when both are present.
  defp compare(one, nil), do: one
  defp compare(nil, two), do: two

  defp compare(one, two) do
    if String.length(one) >= String.length(two), do: one, else: two
  end
end
84 |
--------------------------------------------------------------------------------