├── test ├── test_helper.exs ├── html │ ├── shift_jis.html │ └── utf8.html ├── metainvestigator │ └── meta_test.exs └── metainvestigator_test.exs ├── .gitignore ├── LICENSE ├── config └── config.exs ├── mix.exs ├── mix.lock ├── lib ├── metainvestigator │ └── meta.ex └── metainvestigator.ex └── README.md /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /deps 2 | /_build 3 | erl_crash.dump 4 | *.ez 5 | -------------------------------------------------------------------------------- /test/html/shift_jis.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nekova/metainvestigator/HEAD/test/html/shift_jis.html -------------------------------------------------------------------------------- /test/html/utf8.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | MetaInvestigator 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | 16 |
17 | 18 | 19 | -------------------------------------------------------------------------------- /test/metainvestigator/meta_test.exs: --------------------------------------------------------------------------------
defmodule MetaInvestigator.MetaTest do
  # Exercises every extractor in MetaInvestigator.Meta against the UTF-8 fixture.
  # async is safe: the tests only read a compile-time constant.
  use ExUnit.Case, async: true

  alias MetaInvestigator.Meta

  # Fixture page, read once at compile time.
  @html File.read!("test/html/utf8.html")

  test "charset" do
    assert Meta.charset(@html) == "utf-8"
  end

  test "keywords" do
    assert Meta.keywords(@html) == "This is keywords"
  end

  test "og_title" do
    assert Meta.og_title(@html) == "MetaInvestigator in Test"
  end

  test "og_image" do
    assert Meta.og_image(@html) == "http://img.example.gif"
  end

  test "og_type" do
    assert Meta.og_type(@html) == "article"
  end

  test "og_url" do
    assert Meta.og_url(@html) == "http://example.com"
  end

  test "meta collects every field into the struct" do
    assert Meta.meta(@html) == %Meta{
             charset: "utf-8",
             keywords: "This is keywords",
             og_title: "MetaInvestigator in Test",
             og_type: "article",
             og_url: "http://example.com",
             og_image: "http://img.example.gif"
           }
  end
end
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 nekova 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | # This file is responsible for configuring your application 2 | # and its dependencies with the aid of the Mix.Config module. 3 | use Mix.Config 4 | # This configuration is loaded before any dependency and is restricted 5 | # to this project. If another project depends on this project, this 6 | # file won't be loaded nor affect the parent project. For this reason, 7 | # if you want to provide default values for your application for third- 8 | # party users, it should be done in your mix.exs file. 9 | 10 | # Sample configuration: 11 | # 12 | # config :logger, :console, 13 | # level: :info, 14 | # format: "$date $time [$level] $metadata$message\n", 15 | # metadata: [:user_id] 16 | 17 | # It is also possible to import configuration files, relative to this 18 | # directory. For example, you can emulate configuration per environment 19 | # by uncommenting the line below and defining dev.exs, test.exs and such. 20 | # Configuration from the imported file will override the ones defined 21 | # here (which is why it is important to import them last). 
22 | # 23 | # import_config "#{Mix.env}.exs" 24 | -------------------------------------------------------------------------------- /mix.exs: --------------------------------------------------------------------------------
defmodule MetaInvestigator.Mixfile do
  use Mix.Project

  # Project definition read by Mix (and by Hex through :description / :package).
  def project do
    [
      app: :metainvestigator,
      version: "0.0.3",
      elixir: "~> 1.0",
      description: description(),
      package: package(),
      deps: deps()
    ]
  end

  # OTP application settings; :logger is the only application listed.
  # See `mix help compile.app` for the available keys.
  def application, do: [applications: [:logger]]

  # Dependencies: elixir_mbcs comes straight from git, floki from Hex, and
  # httpoison is only pulled in for the :dev environment.
  # See `mix help deps` for the accepted formats.
  defp deps do
    [
      {:elixir_mbcs, git: "https://github.com/woxtu/elixir-mbcs.git"},
      {:floki, "~> 0.6"},
      {:httpoison, "~> 0.7.2", only: :dev}
    ]
  end

  # One-line package summary; the trailing newline is part of the value.
  defp description, do: "A library for web scraping, inspired by MetaInspector\n"

  # Package metadata.
  defp package do
    [
      maintainers: ["nekova"],
      licenses: ["MIT"],
      links: %{github: "https://github.com/nekova/metainvestigator"}
    ]
  end
end
-------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{"elixir_mbcs": {:git, "https://github.com/woxtu/elixir-mbcs.git", "884c97098cc08988dd0555ffd3ac564079d74457", []}, 2 | "floki": {:hex, :floki, "0.7.1", "64e6d266b4817711b24b9d2e62b55f4af673d791fdd30462f961b94f45c8b1e9", [:mix], [{:mochiweb, "~> 2.12.2", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm"}, 3 | "hackney": {:hex, :hackney, "1.3.2", "43bd07ab88753f5e136e38fddd2a09124bee25733b03361eeb459d0173fc17ab", [:make, 
:rebar], [{:idna, "~> 1.0.2", [hex: :idna, repo: "hexpm", optional: false]}, {:ssl_verify_hostname, "~> 1.0.5", [hex: :ssl_verify_hostname, repo: "hexpm", optional: false]}], "hexpm"}, 4 | "httpoison": {:hex, :httpoison, "0.7.5", "7f4a1dc1245f6a4a7d944786b75a44c94056bd54830299229e4b57fd75cb9daa", [:mix], [{:hackney, "~> 1.3.1", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"}, 5 | "idna": {:hex, :idna, "1.0.2", "397e3d001c002319da75759b0a81156bf11849c71d565162436d50020cb7265e", [:make], [], "hexpm"}, 6 | "mbcs": {:git, "https://github.com/nekova/erlang-mbcs.git", "7bc88e28624abd6780073241e3bbb01b874b6a16", []}, 7 | "mochiweb": {:hex, :mochiweb, "2.12.2", "80804ad342afa3d7f3524040d4eed66ce74b17a555de454ac85b07c479928e46", [:make, :rebar], [], "hexpm"}, 8 | "ssl_verify_hostname": {:hex, :ssl_verify_hostname, "1.0.5", "2e73e068cd6393526f9fa6d399353d7c9477d6886ba005f323b592d389fb47be", [:make], [], "hexpm"}} 9 | -------------------------------------------------------------------------------- /test/metainvestigator_test.exs: -------------------------------------------------------------------------------- 1 | defmodule MetaInvestigatorTest do 2 | use ExUnit.Case 3 | import MetaInvestigator 4 | 5 | @html File.read! "test/html/utf8.html" 6 | 7 | @shift_jis File.read! 
"test/html/shift_jis.html" 8 | 9 | @valid_response %{ 10 | best_image: "http://img.example.gif", best_title: "MetaInvestigator in Test", 11 | images: ["http://i.example.jpg", "http://duplicate.example.png"], 12 | meta: %MetaInvestigator.Meta{charset: "utf-8", keywords: "This is keywords", 13 | og_image: "http://img.example.gif", og_title: "MetaInvestigator in Test", 14 | og_type: "article", og_url: "http://example.com"}, title: "MetaInvestigator" 15 | } 16 | 17 | test "fetch" do 18 | assert fetch(@html) == @valid_response 19 | end 20 | 21 | test "title" do 22 | assert title(@html) == "MetaInvestigator" 23 | end 24 | 25 | test "images" do 26 | assert length(images(@html)) == 2 27 | end 28 | 29 | test "best_title" do 30 | assert best_title(@html) == "MetaInvestigator in Test" 31 | end 32 | 33 | test "best_image" do 34 | assert best_image(@html) == "http://img.example.gif" 35 | end 36 | 37 | test "to_utf8 with utf8" do 38 | assert to_utf8(@html) == @html 39 | end 40 | 41 | test "@html is utf8" do 42 | assert String.valid?(@html) 43 | end 44 | 45 | test "@shift_jis is not utf8" do 46 | refute String.valid?(@shift_jis) 47 | end 48 | 49 | test "@shift_jis decode to utf8" do 50 | refute to_utf8(@shift_jis) == @shift_jis 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /lib/metainvestigator/meta.ex: --------------------------------------------------------------------------------
defmodule MetaInvestigator.Meta do
  @moduledoc """
  Extracts charset, keywords and Open Graph values from an HTML document.

  Every extractor returns the first matching attribute value, or `nil` when the
  document contains no matching element.
  """

  @type t :: %__MODULE__{
          charset: String.t() | nil,
          keywords: String.t() | nil,
          og_title: String.t() | nil,
          og_type: String.t() | nil,
          og_url: String.t() | nil,
          og_image: String.t() | nil
        }

  defstruct charset: nil,   # The encoding of the website.
            keywords: nil,
            og_title: nil,
            og_type: nil,   # The type of the website, usually "article" or "website".
            og_url: nil,
            og_image: nil

  # Open Graph properties for which an `og_<name>/1` accessor is generated below.
  @metadata ["title", "type", "image", "url"]

  @doc """
  Returns the first `charset` attribute found on a `<meta>` element, or `nil`.
  """
  @spec charset(String.t()) :: String.t() | nil
  def charset(html) do
    html |> Floki.find("meta") |> Floki.attribute("charset") |> List.first()
  end

  @doc """
  Returns the `content` of the first element whose `name` attribute is
  `keywords`, or `nil`.
  """
  @spec keywords(String.t()) :: String.t() | nil
  def keywords(html) do
    html |> Floki.find("[name=\"keywords\"]") |> Floki.attribute("content") |> List.first()
  end

  # Generates og_title/1, og_type/1, og_image/1 and og_url/1.
  for property <- @metadata do
    @doc """
    Returns the `content` of the first element whose `property` attribute is
    `og:#{property}`, or `nil` when there is none.
    """
    @spec unquote(:"og_#{property}")(String.t()) :: String.t() | nil
    def unquote(:"og_#{property}")(html), do: meta_tag_by(html, unquote(property))
  end

  # Shared lookup behind the generated og_* accessors. The guard restricts it to
  # the properties listed in @metadata.
  @spec meta_tag_by(String.t(), String.t()) :: String.t() | nil
  defp meta_tag_by(html, attribute) when attribute in @metadata do
    html
    |> Floki.find("[property=\"og:#{attribute}\"]")
    |> Floki.attribute("content")
    |> List.first()
  end

  @doc """
  Runs every extractor on `html` and collects the results into a
  `%MetaInvestigator.Meta{}` struct. Fields without a matching tag are `nil`.
  """
  @spec meta(String.t()) :: t
  def meta(html) do
    %__MODULE__{
      charset: charset(html),
      keywords: keywords(html),
      og_title: og_title(html),
      og_type: og_type(html),
      og_url: og_url(html),
      og_image: og_image(html)
    }
  end
end
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | MetaInvestigator 2 | ============= 3 | 4 | MetaInvestigator is a library for web scraping, inspired by [MetaInspector](https://github.com/jaimeiniesta/metainspector). 5 | 6 | You can get its title, images, charset, description, keywords, meta tags...etc 7 | 8 | ## Usage 9 | You can use your favorite HTTP Client. 
For example: HTTPoison, HTTPotion, Tesla, etc. 10 | 11 | ```elixir 12 | iex(1)> html = HTTPClient.get!("https://github.com/nekova").body 13 | iex(2)> page = MetaInvestigator.fetch(html) 14 | #%{best_image: "https://avatars1.githubusercontent.com/u/3464295?v=3&s=400", 15 | # best_title: "nekova (ಠ_ಠ) · GitHub", 16 | # images: ["https://avatars3.githubusercontent.com/u/3464295?v=3&s=460", 17 | # "https://assets-cdn.github.com/images/spinners/octocat-spinner-128.gif"], 18 | # meta: %MetaInvestigator.Meta{charset: "utf-8", keywords: nil, 19 | # og_image: "https://avatars1.githubusercontent.com/u/3464295?v=3&s=400", 20 | # og_title: "nekova (ಠ_ಠ)", og_type: "profile", 21 | # og_url: "https://github.com/nekova"}, title: "nekova (ಠ_ಠ) · GitHub"} 22 | iex(3)> page.meta.og_image 23 | "https://avatars1.githubusercontent.com/u/3464295?v=3&s=400" 24 | iex(4)> page.best_title 25 | "nekova (ಠ_ಠ) · GitHub" 26 | ``` 27 | 28 | You can access each element directly. 29 | 30 | ```elixir 31 | iex(2)> page = MetaInvestigator.title(html) 32 | "nekova (ಠ_ಠ) · GitHub" 33 | iex(3)> page = MetaInvestigator.best_image(html) 34 | "https://avatars1.githubusercontent.com/u/3464295?v=3&s=400" 35 | ``` 36 | 37 | ## Installation 38 | First, add MetaInvestigator to your mix.exs dependencies: 39 | 40 | ```elixir 41 | def deps do 42 | [{:metainvestigator, "~> 0.0.3"}] 43 | end 44 | ``` 45 | 46 | and run ```$ mix deps.get```. Then add `:metainvestigator` to your list of applications: 47 | 48 | ```elixir 49 | def application do 50 | [applications: [:metainvestigator]] 51 | end 52 | ``` 53 | 54 | ## How to confirm the operation with iex 55 | ```elixir 56 | $> cd metainvestigator 57 | $> iex -S mix 58 | iex(1)> HTTPoison.start 59 | {:ok, [:idna, :hackney, :httpoison]} 60 | iex(2)> html = HTTPoison.get!("URL").body 61 | iex(3)> page = MetaInvestigator.fetch(html) 62 | ``` 63 | 64 | ## LICENSE 65 | ``` 66 | Copyright © 2015 nekova 67 | 68 | This work is free. You can redistribute it and/or modify it under the 69 | terms of the MIT License. See the LICENSE file for more details. 
70 | ``` 71 | -------------------------------------------------------------------------------- /lib/metainvestigator.ex: --------------------------------------------------------------------------------
defmodule MetaInvestigator do
  @moduledoc """
  This is a simple library for web scraping.

  You can get a page's title, images, charset, keywords and Open Graph meta tags.

  ## Example
  You can get a map that includes all elements
    * MetaInvestigator.fetch(html)      : returns a map

  You can access the following elements directly
    * MetaInvestigator.title(html)      : returns a title
    * MetaInvestigator.images(html)     : returns all images
    * MetaInvestigator.best_title(html) : returns a best title of the html
    * MetaInvestigator.best_image(html) : returns a best image of the html
  """

  alias MetaInvestigator.Meta

  @type t :: %__MODULE__{
          title: String.t() | nil,
          images: list,
          best_title: String.t() | nil,
          best_image: String.t() | nil,
          meta: Meta.t() | map
        }

  defstruct title: nil,
            images: [],       # All images of the website.
            best_title: nil,  # The best title of the website, usually <title> or <og:title>.
            best_image: nil,  # The best image of the website, usually <img> or <og:image>.
            meta: %{}         # A map containing key value pairs of metatags

  @type html :: String.t()

  @typedoc "Shape of the plain map returned by `fetch/1`."
  @type page :: %{
          title: String.t(),
          images: [String.t()],
          best_title: String.t() | nil,
          best_image: String.t() | nil,
          meta: Meta.t()
        }

  @doc """
  Fetch elements inside a HTML.

  Returns a plain map (not a `%MetaInvestigator{}` struct) with the keys
  `:title`, `:images`, `:best_title`, `:best_image` and `:meta`.

  Input that is not valid UTF-8 is first converted with `to_utf8/2`, i.e. it is
  treated as Shift_JIS.
  """
  @spec fetch(html) :: page
  def fetch(html) when is_binary(html) do
    html = if String.valid?(html), do: html, else: to_utf8(html)

    %{
      title: title(html),
      images: images(html),
      best_title: best_title(html),
      best_image: best_image(html),
      meta: Meta.meta(html)
    }
  end

  @doc """
  Returns the text of the elements matched by the `head title` selector.
  """
  @spec title(html) :: String.t()
  def title(html) do
    html |> Floki.find("head title") |> Floki.text()
  end

  @doc """
  Returns the `src` of every `<img>` element, with duplicates removed.
  """
  @spec images(html) :: list
  def images(html) do
    html |> Floki.find("img") |> Floki.attribute("src") |> Enum.uniq()
  end

  @doc """
  Returns the longer of the Open Graph title and the `<title>` text.

  The Open Graph title wins a tie; when only one of the two is present, that one
  is returned.
  """
  @spec best_title(html) :: String.t() | nil
  def best_title(html) do
    compare(Meta.og_title(html), title(html))
  end

  @doc """
  Returns the Open Graph image when the page declares one, otherwise the first
  `<img>` source, otherwise `nil`.
  """
  @spec best_image(html) :: String.t() | nil
  def best_image(html) do
    # The previous `[og] ++ images |> List.first` took the head of
    # `[og | images]`, so a missing og:image produced nil even when the page had
    # <img> tags. Fall back explicitly instead.
    Meta.og_image(html) || List.first(images(html))
  end

  @doc """
  Converts `string` from `encoding` (default `:shift_jis`) to a UTF-8 binary.

  Only `:shift_jis` is supported; any other encoding raises
  `FunctionClauseError` from `decode/2`.
  """
  @spec to_utf8(binary, atom) :: String.t()
  def to_utf8(string, encoding \\ :shift_jis) do
    # NOTE(review): presumably boots the underlying mbcs application before
    # decoding; the return value is ignored. Confirm against elixir-mbcs.
    Mbcs.start()
    string |> :erlang.binary_to_list() |> decode(encoding) |> to_string()
  end

  @doc """
  Decodes a list of bytes with `Mbcs.decode!/3`, asking for a list result.

  `:shift_jis` is decoded using the `:cp932` table.
  """
  @spec decode(list, :shift_jis) :: list
  def decode(string, :shift_jis), do: Mbcs.decode!(string, :cp932, return: :list)

  # Picks between two optional titles: a missing one loses, otherwise the longer
  # one wins and `one` is preferred on a tie.
  defp compare(nil, nil), do: nil
  defp compare(one, nil), do: one
  defp compare(nil, two), do: two

  defp compare(one, two) do
    if String.length(one) >= String.length(two), do: one, else: two
  end
end
--------------------------------------------------------------------------------