├── .gitignore ├── LICENSE ├── README.md ├── config └── config.exs ├── lib ├── html_entities.ex ├── html_entities │ └── util.ex └── html_entities_list.txt ├── mix.exs ├── mix.lock └── test ├── html_entities └── util_test.exs ├── html_entities_test.exs └── test_helper.exs /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where third-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | # Ignore package tarball (built via "mix hex.build"). 23 | html_entities-*.tar 24 | 25 | # Temporary files for e.g. tests 26 | /tmp 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Martin Svalin 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HtmlEntities 2 | 3 | [![Module Version](https://img.shields.io/hexpm/v/html_entities.svg)](https://hex.pm/packages/html_entities) 4 | [![Hex Docs](https://img.shields.io/badge/hex-docs-lightgreen.svg)](https://hexdocs.pm/html_entities/) 5 | [![Total Download](https://img.shields.io/hexpm/dt/html_entities.svg)](https://hex.pm/packages/html_entities) 6 | [![License](https://img.shields.io/hexpm/l/html_entities.svg)](https://github.com/martinsvalin/html_entities/blob/master/LICENSE) 7 | [![Last Updated](https://img.shields.io/github/last-commit/martinsvalin/html_entities.svg)](https://github.com/martinsvalin/html_entities/commits/master) 8 | 9 | Elixir module for decoding and encoding HTML entities in a string. 10 | 11 | Entity names, codepoints and their corresponding characters are copied from 12 | [Wikipedia](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references). 13 | 14 | ## Installation 15 | 16 | Add the dependency to your `mix.exs` file, then run `mix deps.get`. 17 | 18 | ```elixir 19 | defp deps do 20 | [ 21 | {:html_entities, "~> 0.5"} 22 | ] 23 | end 24 | ``` 25 | 26 | ## Usage 27 | 28 | Inside IEx: 29 | 30 | ```elixir 31 | iex> HtmlEntities.decode("Tom &amp; Jerry") 32 | "Tom & Jerry" 33 | iex> HtmlEntities.decode("&#161;Ay, caramba!") 34 | "¡Ay, caramba!" 35 | iex> HtmlEntities.encode("<< KAPOW!!
>>") 36 | "&lt;&lt; KAPOW!! &gt;&gt;" 37 | ``` 38 | 39 | Inside a module: 40 | 41 | ```elixir 42 | defmodule EntityTest do 43 | def non_breaking_space do 44 | HtmlEntities.decode("¡") 45 | end 46 | end 47 | ``` 48 | 49 | ## License 50 | 51 | Copyright (c) 2015 Martin Svalin 52 | 53 | This library is MIT licensed. See the [LICENSE](https://github.com/martinsvalin/html_entities/blob/master/LICENSE) for details. 54 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | -------------------------------------------------------------------------------- /lib/html_entities.ex: --------------------------------------------------------------------------------
defmodule HtmlEntities do
  @moduledoc """
  Decode and encode HTML entities in a string.

  ## Examples

  Decoding:

      iex> "Tom &amp; Jerry" |> HtmlEntities.decode
      "Tom & Jerry"
      iex> "&#161;Ay, caramba!" |> HtmlEntities.decode
      "¡Ay, caramba!"
      iex> "&#337; &#x151;" |> HtmlEntities.decode
      "ő ő"

  Encoding:

      iex> "Tom & Jerry" |> HtmlEntities.encode
      "Tom &amp; Jerry"
      iex> "<< KAPOW!! >>" |> HtmlEntities.encode
      "&lt;&lt; KAPOW!! &gt;&gt;"
  """

  # Entity table, one `name,character,codepoint` row per line. The path is resolved
  # relative to this source file so compilation does not depend on the current working
  # directory, and it is declared as an external resource so that editing the table
  # recompiles this module.
  @entities_file Path.join(__DIR__, "html_entities_list.txt")
  @external_resource @entities_file

  @doc """
  Decode HTML entities in a string.

  Three forms are recognised, each of which must be terminated by `;`:

    * named entities listed in `html_entities_list.txt`, e.g. `&amp;`
    * decimal character references, e.g. `&#38;`
    * hexadecimal character references introduced by a lowercase `x`, e.g. `&#x26;`

  Anything else following an `&` (unknown names, a missing `;`, numbers that are not
  valid Unicode codepoints) is copied to the result unchanged.

  ## Examples

      iex> HtmlEntities.decode("&lt;b&gt; &#38; &#x26;")
      "<b> & &"
      iex> HtmlEntities.decode("&nosuchentity; &#xxxx;")
      "&nosuchentity; &#xxxx;"
  """
  @spec decode(String.t()) :: String.t()
  def decode(string) when is_binary(string) do
    decode(string, "")
  end

  # Walks the input one byte at a time, appending to the accumulator binary. Only an
  # ampersand triggers a lookup; every other byte is copied as-is.
  defp decode(<<"&", rest::binary>>, acc) do
    case decode_entity(rest) do
      {character, rest} -> decode(rest, <<acc::binary, character::binary>>)
      # Not an entity: keep the ampersand and continue with the text right after it.
      :error -> decode(rest, <<acc::binary, ?&>>)
    end
  end

  defp decode(<<head, rest::binary>>, acc) do
    decode(rest, <<acc::binary, head>>)
  end

  defp decode(<<>>, acc) do
    acc
  end

  # Receives the text following an "&". Returns `{decoded_character, remaining_text}`
  # or `:error` when the text does not start with a recognised entity.
  #
  # Hexadecimal reference. The guard insists on a hex digit right after "#x" because
  # Integer.parse/2 would otherwise also accept a leading sign.
  defp decode_entity(<<"#x", c, rest::binary>>)
       when c in ?0..?9 or c in ?a..?f or c in ?A..?F do
    case Integer.parse(<<c, rest::binary>>, 16) do
      {number, ";" <> rest} -> {<<number::utf8>>, rest}
      _ -> :error
    end
  rescue
    # <<number::utf8>> raises for integers that are not encodable codepoints.
    ArgumentError -> :error
  end

  # Decimal reference.
  defp decode_entity(<<"#", rest::binary>>) do
    case Integer.parse(rest, 10) do
      {number, ";" <> rest} -> {<<number::utf8>>, rest}
      _ -> :error
    end
  rescue
    ArgumentError -> :error
  end

  # Named entities: one function clause per table row is generated at compile time, so
  # lookup is plain binary pattern matching on "name;".
  codes = HtmlEntities.Util.load_entities(@entities_file)

  for {name, _character, codepoint} <- codes do
    defp decode_entity(<<unquote(name), ?;, rest::binary>>) do
      {<<unquote(codepoint)::utf8>>, rest}
    end
  end

  defp decode_entity(_), do: :error

  @doc """
  Encode HTML entities in a string.

  Only five characters are replaced: `'`, `"`, `&`, `<` and `>`. Every other byte of the
  input, including multi-byte UTF-8 sequences, is copied through unchanged.

  ## Examples

      iex> HtmlEntities.encode("<a href='x'>")
      "&lt;a href=&#39;x&#39;&gt;"
  """
  @spec encode(String.t()) :: String.t()
  def encode(string) when is_binary(string) do
    # Iterates bytes, not graphemes; all replaced characters are single-byte ASCII, so
    # the bytes of multi-byte characters simply pass through the default branch.
    for <<x <- string>>, into: "" do
      case x do
        ?' -> "&#39;"
        ?" -> "&quot;"
        ?& -> "&amp;"
        ?< -> "&lt;"
        ?> -> "&gt;"
        _ -> <<x>>
      end
    end
  end
end
-------------------------------------------------------------------------------- /lib/html_entities/util.ex: --------------------------------------------------------------------------------
defmodule HtmlEntities.Util do
  @moduledoc """
  Utility functions for managing metadata.

  Putting this code here makes it testable, and allows the code
  generation part of HtmlEntities to be as small as possible.
  """

  @type entity :: {String.t(), String.t(), integer()}

  @doc """
  Load HTML entities from an external file.

  Every line of `filename` is converted with `convert_line_to_entity/1`; the resulting
  list keeps the order of the file.
  """
  @spec load_entities(Path.t()) :: [entity()]
  def load_entities(filename) do
    filename
    |> File.stream!()
    |> Enum.map(&convert_line_to_entity/1)
  end

  @doc """
  Converts one comma-separated line into an entity definition.

  The line must have the shape `name,character,codepoint`; trailing whitespace
  (including the line break) is ignored.

  Raises `MatchError` when the line does not consist of exactly three fields, and
  `ArgumentError` when the codepoint field is not an integer.

  ## Examples

      iex> HtmlEntities.Util.convert_line_to_entity("auml,ä,228")
      {"auml", "ä", 228}
  """
  @spec convert_line_to_entity(String.t()) :: entity()
  def convert_line_to_entity(line) do
    [name, character, codepoint] = line |> String.trim_trailing() |> String.split(",")
    {name, character, String.to_integer(codepoint)}
  end
end
-------------------------------------------------------------------------------- /lib/html_entities_list.txt: -------------------------------------------------------------------------------- 1 | quot,",34 2 | amp,&,38 3 | apos,',39 4 | lt,<,60 5 | gt,>,62 6 | nbsp, ,160 7 | iexcl,¡,161 8 | cent,¢,162 9 | pound,£,163 10 | curren,¤,164 11 | yen,¥,165 12 | brvbar,¦,166 13 | sect,§,167 14 | uml,¨,168 15 | copy,©,169 16 | ordf,ª,170 17 | laquo,«,171 18 | not,¬,172 19 | shy, ,173 20 | reg,®,174 21 | macr,¯,175 22 | deg,°,176 23 | plusmn,±,177 24 | sup2,²,178 25 | sup3,³,179 26 | acute,´,180 27 | micro,µ,181 28 | para,¶,182 29 | middot,·,183 30 | cedil,¸,184 31 | sup1,¹,185 32 | ordm,º,186 33 | raquo,»,187 34 | frac14,¼,188 35 | frac12,½,189 36 | frac34,¾,190 37 | iquest,¿,191 38 | Agrave,À,192 39 | Aacute,Á,193 40 | Acirc,Â,194 41 | Atilde,Ã,195 42 | Auml,Ä,196 43 | Aring,Å,197 44 | AElig,Æ,198 45 | Ccedil,Ç,199 46 | Egrave,È,200 47 | Eacute,É,201 48 | Ecirc,Ê,202 49 | Euml,Ë,203 50 | Igrave,Ì,204 51 | Iacute,Í,205 52 | Icirc,Î,206 53 | Iuml,Ï,207 54 | ETH,Ð,208 55 | Ntilde,Ñ,209 56 | Ograve,Ò,210 57 | Oacute,Ó,211 58 | Ocirc,Ô,212 59 | Otilde,Õ,213 60 | Ouml,Ö,214 61 | times,×,215 62 | Oslash,Ø,216 63 | Ugrave,Ù,217 64 | Uacute,Ú,218 65 | Ucirc,Û,219 66 | Uuml,Ü,220 67 | Yacute,Ý,221 68 | THORN,Þ,222 69 | szlig,ß,223 70 | agrave,à,224 71 | aacute,á,225 72 | acirc,â,226 73 | atilde,ã,227 74 | auml,ä,228 75 | aring,å,229 76 | aelig,æ,230 77 | ccedil,ç,231
78 | egrave,è,232 79 | eacute,é,233 80 | ecirc,ê,234 81 | euml,ë,235 82 | igrave,ì,236 83 | iacute,í,237 84 | icirc,î,238 85 | iuml,ï,239 86 | eth,ð,240 87 | ntilde,ñ,241 88 | ograve,ò,242 89 | oacute,ó,243 90 | ocirc,ô,244 91 | otilde,õ,245 92 | ouml,ö,246 93 | divide,÷,247 94 | oslash,ø,248 95 | ugrave,ù,249 96 | uacute,ú,250 97 | ucirc,û,251 98 | uuml,ü,252 99 | yacute,ý,253 100 | thorn,þ,254 101 | yuml,ÿ,255 102 | OElig,Œ,338 103 | oelig,œ,339 104 | Scaron,Š,352 105 | scaron,š,353 106 | Yuml,Ÿ,376 107 | fnof,ƒ,402 108 | circ,ˆ,710 109 | tilde,˜,732 110 | Alpha,Α,913 111 | Beta,Β,914 112 | Gamma,Γ,915 113 | Delta,Δ,916 114 | Epsilon,Ε,917 115 | Zeta,Ζ,918 116 | Eta,Η,919 117 | Theta,Θ,920 118 | Iota,Ι,921 119 | Kappa,Κ,922 120 | Lambda,Λ,923 121 | Mu,Μ,924 122 | Nu,Ν,925 123 | Xi,Ξ,926 124 | Omicron,Ο,927 125 | Pi,Π,928 126 | Rho,Ρ,929 127 | Sigma,Σ,931 128 | Tau,Τ,932 129 | Upsilon,Υ,933 130 | Phi,Φ,934 131 | Chi,Χ,935 132 | Psi,Ψ,936 133 | Omega,Ω,937 134 | alpha,α,945 135 | beta,β,946 136 | gamma,γ,947 137 | delta,δ,948 138 | epsilon,ε,949 139 | zeta,ζ,950 140 | eta,η,951 141 | theta,θ,952 142 | iota,ι,953 143 | kappa,κ,954 144 | lambda,λ,955 145 | mu,μ,956 146 | nu,ν,957 147 | xi,ξ,958 148 | omicron,ο,959 149 | pi,π,960 150 | rho,ρ,961 151 | sigmaf,ς,962 152 | sigma,σ,963 153 | tau,τ,964 154 | upsilon,υ,965 155 | phi,φ,966 156 | chi,χ,967 157 | psi,ψ,968 158 | omega,ω,969 159 | thetasym,ϑ,977 160 | upsih,ϒ,978 161 | piv,ϖ,982 162 | ensp, ,8194 163 | emsp, ,8195 164 | thinsp, ,8201 165 | zwnj, ,8204 166 | zwj, ,8205 167 | lrm, ,8206 168 | rlm, ,8207 169 | ndash,–,8211 170 | mdash,—,8212 171 | lsquo,‘,8216 172 | rsquo,’,8217 173 | sbquo,‚,8218 174 | ldquo,“,8220 175 | rdquo,”,8221 176 | bdquo,„,8222 177 | dagger,†,8224 178 | Dagger,‡,8225 179 | bull,•,8226 180 | hellip,…,8230 181 | permil,‰,8240 182 | prime,′,8242 183 | Prime,″,8243 184 | lsaquo,‹,8249 185 | rsaquo,›,8250 186 | oline,‾,8254 187 | frasl,⁄,8260 188 | euro,€,8364 189 | image,ℑ,8465 190 | 
weierp,℘,8472 191 | real,ℜ,8476 192 | trade,™,8482 193 | alefsym,ℵ,8501 194 | larr,←,8592 195 | uarr,↑,8593 196 | rarr,→,8594 197 | darr,↓,8595 198 | harr,↔,8596 199 | crarr,↵,8629 200 | lArr,⇐,8656 201 | uArr,⇑,8657 202 | rArr,⇒,8658 203 | dArr,⇓,8659 204 | hArr,⇔,8660 205 | forall,∀,8704 206 | part,∂,8706 207 | exist,∃,8707 208 | empty,∅,8709 209 | nabla,∇,8711 210 | isin,∈,8712 211 | notin,∉,8713 212 | ni,∋,8715 213 | prod,∏,8719 214 | sum,∑,8721 215 | minus,−,8722 216 | lowast,∗,8727 217 | radic,√,8730 218 | prop,∝,8733 219 | infin,∞,8734 220 | ang,∠,8736 221 | and,∧,8743 222 | or,∨,8744 223 | cap,∩,8745 224 | cup,∪,8746 225 | int,∫,8747 226 | there4,∴,8756 227 | sim,∼,8764 228 | cong,≅,8773 229 | asymp,≈,8776 230 | ne,≠,8800 231 | equiv,≡,8801 232 | le,≤,8804 233 | ge,≥,8805 234 | sub,⊂,8834 235 | sup,⊃,8835 236 | nsub,⊄,8836 237 | sube,⊆,8838 238 | supe,⊇,8839 239 | oplus,⊕,8853 240 | otimes,⊗,8855 241 | perp,⊥,8869 242 | sdot,⋅,8901 243 | lceil,⌈,8968 244 | rceil,⌉,8969 245 | lfloor,⌊,8970 246 | rfloor,⌋,8971 247 | lang,〈,9001 248 | rang,〉,9002 249 | loz,◊,9674 250 | spades,♠,9824 251 | clubs,♣,9827 252 | hearts,♥,9829 253 | diams,♦,9830 254 | -------------------------------------------------------------------------------- /mix.exs: --------------------------------------------------------------------------------
defmodule HtmlEntities.Mixfile do
  use Mix.Project

  @source_url "https://github.com/martinsvalin/html_entities"
  @version "0.5.2"

  # Project definition read by Mix: package identity, the supported Elixir range and the
  # metadata used for publishing and for generating documentation.
  def project do
    [
      app: :html_entities,
      version: @version,
      name: "HtmlEntities",
      elixir: "~> 1.3",
      description: "Decode and encode HTML entities in a string.\n",
      package: hex_package(),
      deps: dependencies(),
      docs: ex_doc_options()
    ]
  end

  # OTP application settings: the explicit application list is empty.
  def application do
    [applications: []]
  end

  # Hex package metadata; `files` lists what goes into the published tarball.
  defp hex_package do
    [
      maintainers: ["Martin Svalin", "Dávid Kovács", "Johan Wärlander"],
      files: ~w(lib mix.exs README* LICENSE*),
      licenses: ["MIT"],
      links: %{"GitHub" => @source_url}
    ]
  end

  # ex_doc is the only dependency, limited to the :dev environment and not started at
  # runtime.
  defp dependencies do
    [{:ex_doc, ">= 0.0.0", only: :dev, runtime: false}]
  end

  # ExDoc options: the README is the landing page and source links point at the tag of
  # the current version.
  defp ex_doc_options do
    [
      extras: ["README.md"],
      main: "readme",
      source_url: @source_url,
      source_ref: "v" <> @version
    ]
  end
end
-------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "earmark": {:hex, :earmark, "1.4.1", "07bb382826ee8d08d575a1981f971ed41bd5d7e86b917fd012a93c51b5d28727", [:mix], [], "hexpm", "cdfa03374331187c7b9e86d971423a19138dc1cf9902b26923a657c789673876"}, 3 | "earmark_parser": {:hex, :earmark_parser, "1.4.12", "b245e875ec0a311a342320da0551da407d9d2b65d98f7a9597ae078615af3449", [:mix], [], "hexpm", "711e2cc4d64abb7d566d43f54b78f7dc129308a63bc103fbd88550d2174b3160"}, 4 | "ex_doc": {:hex, :ex_doc, "0.23.0", "a069bc9b0bf8efe323ecde8c0d62afc13d308b1fa3d228b65bca5cf8703a529d", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm", "f5e2c4702468b2fd11b10d39416ddadd2fcdd173ba2a0285ebd92c39827a5a16"}, 5 | "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"}, 6 | "makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex:
:makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"}, 7 | "nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"}, 8 | } 9 | -------------------------------------------------------------------------------- /test/html_entities/util_test.exs: --------------------------------------------------------------------------------
defmodule HtmlEntitiesUtilTest do
  # Pure functions only, so the module can run concurrently with other test modules.
  use ExUnit.Case, async: true
  doctest HtmlEntities.Util
  import HtmlEntities.Util

  test "Comma-separated entity descriptions are converted to tuples" do
    assert convert_line_to_entity("auml,ä,228") == {"auml", "ä", 228}
    assert convert_line_to_entity("aring,å,229") == {"aring", "å", 229}
    assert convert_line_to_entity("ouml,ö,246") == {"ouml", "ö", 246}
  end

  test "Structurally invalid entity descriptions trigger an error" do
    # Too many fields.
    assert_raise MatchError, fn ->
      convert_line_to_entity("auml,ä,228,foo")
    end

    # Too few fields.
    assert_raise MatchError, fn ->
      convert_line_to_entity("auml,ä")
    end

    # Empty line.
    assert_raise MatchError, fn ->
      convert_line_to_entity("")
    end
  end

  test "Trailing whitespace is removed from entity descriptions" do
    assert convert_line_to_entity("euro,€,8364 \t") == {"euro", "€", 8364}
  end
end
-------------------------------------------------------------------------------- /test/html_entities_test.exs: --------------------------------------------------------------------------------
defmodule HtmlEntitiesTest do
  # Pure functions only, so the module can run concurrently with other test modules.
  use ExUnit.Case, async: true
  doctest HtmlEntities
  import HtmlEntities

  describe "decode/1" do
    test "named" do
      assert decode("&amp; &reg;") == "& ®"
    end

    test "numbers" do
      assert decode("perhaps an &#38;?") == "perhaps an &?"
      assert decode("perhaps an &#x26;?") == "perhaps an &?"
      # The expected value spells the non-breaking space as an escape so it cannot be
      # confused with an ordinary space.
      assert decode("non-breaking&#xa0;space") == "non-breaking\u00A0space"
    end

    test "handle consecutive entities (non-greedy)" do
      assert decode("&aring;&auml;&ouml;") == "åäö"
    end

    test "ignore unrecognized entities" do
      assert decode("&nosuchentity;") == "&nosuchentity;"
      assert decode("&#nosuchentity;") == "&#nosuchentity;"
      assert decode("&#xxxx;") == "&#xxxx;"
    end

    test "ignore invalid unicode codepoints" do
      # U+D800 is a surrogate and 1114112 (0x110000) lies above the Unicode range;
      # neither can be encoded as UTF-8, so both references must be left untouched.
      assert decode("&#xD800;") == "&#xD800;"
      assert decode("&#1114112;") == "&#1114112;"
    end
  end

  describe "encode/1" do
    test "don't replace safe UTF-8 characters" do
      assert encode("AbcÅäö€") == "AbcÅäö€"
    end

    test "replace unsafe characters" do
      assert encode("'\"&<>") == "&#39;&quot;&amp;&lt;&gt;"
    end
  end
end
-------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | --------------------------------------------------------------------------------