├── .ocamlformat-ignore ├── dune ├── test ├── dune └── test.ml ├── src └── dune ├── example ├── dune └── example.ml ├── ocp-browser-emoji.png ├── gencode ├── dune └── gencode.ml ├── .gitignore ├── dune-project ├── emoji.opam ├── .ocamlformat ├── README.md └── LICENSE.md /.ocamlformat-ignore: -------------------------------------------------------------------------------- 1 | src/* 2 | -------------------------------------------------------------------------------- /dune: -------------------------------------------------------------------------------- 1 | (mdx 2 | (libraries emoji) 3 | (files README.md)) 4 | -------------------------------------------------------------------------------- /test/dune: -------------------------------------------------------------------------------- 1 | (tests 2 | (names test) 3 | (libraries emoji)) 4 | -------------------------------------------------------------------------------- /src/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (modules emoji) 3 | (public_name emoji)) 4 | -------------------------------------------------------------------------------- /example/dune: -------------------------------------------------------------------------------- 1 | (executable 2 | (name example) 3 | (libraries emoji)) 4 | -------------------------------------------------------------------------------- /ocp-browser-emoji.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fxfactorial/ocaml-emoji/HEAD/ocp-browser-emoji.png -------------------------------------------------------------------------------- /gencode/dune: -------------------------------------------------------------------------------- 1 | (executable 2 | (name gencode) 3 | (modules gencode) 4 | (libraries lambdasoup uutf)) 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.cmi 3 | *.cmt 4 | *.cmx 5 | *.cmxa 6 | *.a 7 | _build 8 | *.txt 9 | *.data 10 | *.log 11 | *.native 12 | api.docdir 13 | *.markdown 14 | _tests/* 15 | testapi.c 16 | *.merlin 17 | full-emoji-list.html 18 | full-emoji-modifiers.html 19 | -------------------------------------------------------------------------------- /test/test.ml: -------------------------------------------------------------------------------- 1 | (* from https://www.unicode.org/emoji/charts/emoji-counts.html *) 2 | let () = 3 | assert (Array.length Emoji.category_smileys_and_emotion = 171); 4 | assert (Array.length Emoji.category_people_and_body = 2418); 5 | assert (Array.length Emoji.category_animals_and_nature = 160); 6 | assert (Array.length Emoji.category_symbols = 224); 7 | assert (Array.length Emoji.all_emojis = 3953) 8 | -------------------------------------------------------------------------------- /example/example.ml: -------------------------------------------------------------------------------- 1 | let print_emojis emojis = 2 | Array.iter print_string emojis; 3 | print_newline () 4 | 5 | let () = 6 | print_string "All emojis:\n"; 7 | print_emojis Emoji.all_emojis; 8 | print_string "Animal & Nature:\n"; 9 | print_emojis Emoji.category_animals_and_nature; 10 | print_string "Animal-reptile:\n"; 11 | print_emojis Emoji.sub_category_animal_reptile; 12 | Printf.printf "face with bags under eyes: %s\n" 13 | Emoji.face_with_bags_under_eyes; 14 | Printf.printf "leafless tree: %s\n" Emoji.leafless_tree; 15 | Printf.printf "root vegetable: %s\n" Emoji.root_vegetable; 16 | Printf.printf "splatter: %s\n" Emoji.splatter; 17 | () 18 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 3.2) 2 | (using mdx 0.3) 3 | (name emoji) 4 | (explicit_js_mode) 5 | (generate_opam_files true) 6 | (license BSD-3-Clause) 7 | (authors "Edgar Aroutiounian ") 8 | (maintainers "Edgar Aroutiounian " "Swrup ") 9 | (source (github fxfactorial/ocaml-emoji)) 10 | (package 11 | (name emoji) 12 | (synopsis "Use emojis by name") 13 | (description "OCaml library providing byte sequences of all the Unicode emoji characters and sequences sourced from unicode.org" ) 14 | (tags (emoji unicode)) 15 | (depends 16 | (ocaml (>= "4.04")) 17 | (mdx :with-test) 18 | (lambdasoup :with-dev-setup) 19 | (uutf :with-dev-setup) 20 | )) 21 | -------------------------------------------------------------------------------- /emoji.opam: -------------------------------------------------------------------------------- 1 | # This file is generated by dune, edit dune-project instead 2 | opam-version: "2.0" 3 | synopsis: "Use emojis by name" 4 | description: 5 | "OCaml library providing byte sequences of all the Unicode emoji characters and sequences sourced from unicode.org" 6 | maintainer: [ 7 | "Edgar Aroutiounian " 8 | "Swrup " 9 | ] 10 | authors: ["Edgar Aroutiounian "] 11 | license: "BSD-3-Clause" 12 | tags: ["emoji" "unicode"] 13 | homepage: "https://github.com/fxfactorial/ocaml-emoji" 14 | bug-reports: "https://github.com/fxfactorial/ocaml-emoji/issues" 15 | depends: [ 16 | "dune" {>= "3.2"} 17 | "ocaml" {>= "4.04"} 18 | "mdx" {with-test} 19 | "lambdasoup" {with-dev-setup} 20 | "uutf" {with-dev-setup} 21 | "odoc" {with-doc} 22 | ] 23 | build: [ 24 | ["dune" "subst"] {dev} 25 | [ 26 | "dune" 27 | "build" 28 | "-p" 29 | name 30 | "-j" 31 | jobs 32 | "@install" 33 | "@runtest" {with-test} 34 | "@doc" {with-doc} 35 | ] 36 | ] 37 | dev-repo: "git+https://github.com/fxfactorial/ocaml-emoji.git" 38 | -------------------------------------------------------------------------------- /.ocamlformat: -------------------------------------------------------------------------------- 1 | version=0.27.0 2 | assignment-operator=end-line 3 | break-cases=fit 4 | break-fun-decl=wrap 5 | break-fun-sig=wrap 6 | break-infix=wrap 7 | break-infix-before-func=false 8 | break-separators=before 9 | break-sequences=true 10 | cases-exp-indent=2 11 | cases-matching-exp-indent=normal 12 | doc-comments=before 13 | doc-comments-padding=2 14 | doc-comments-tag-only=default 15 | dock-collection-brackets=false 16 | exp-grouping=preserve 17 | field-space=loose 18 | if-then-else=compact 19 | indicate-multiline-delimiters=space 20 | indicate-nested-or-patterns=unsafe-no 21 | infix-precedence=indent 22 | leading-nested-match-parens=false 23 | let-and=sparse 24 | let-binding-spacing=compact 25 | let-module=compact 26 | margin=80 27 | max-indent=68 28 | module-item-spacing=sparse 29 | ocp-indent-compat=false 30 | parens-ite=false 31 | parens-tuple=always 32 | parse-docstrings=true 33 | sequence-blank-line=preserve-one 34 | sequence-style=terminator 35 | single-case=compact 36 | space-around-arrays=true 37 | space-around-lists=true 38 | space-around-records=true 39 | space-around-variants=true 40 | type-decl=sparse 41 | wrap-comments=false 42 | wrap-fun-args=true 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Emoji 2 | 3 | OCaml library providing byte sequences of all the Unicode (v17.0) emoji 4 | characters and sequences 5 | 6 | ``` 7 | $ opam install emoji 8 | ``` 9 | 10 | You can see them by printing to the screen: 11 | ```ocaml 12 | # Emoji.distorted_face;; 13 | - : string = "🫪" 14 | # Emoji.hairy_creature;; 15 | - : string = "🫈" 16 | # Emoji.orca;; 17 | - : string = "🫍" 18 | ``` 19 | 20 | You can also get all emojis from the same category or subcategory: 21 | ```ocaml 22 | # let best_animals = Emoji.sub_category_animal_reptile;; 23 | val best_animals : string array = 24 | [|"🐊"; "🐉"; "🐲"; "🦎"; "🦕"; "🐍"; "🦖"; "🐢"|] 25 | ``` 26 | 27 | Using `ocp-browser` shows the emoji 28 | 29 | ![ocp-browser screenshot](./ocp-browser-emoji.png) 30 | 31 | # Development 32 | 33 | To generate `emoji.ml`, first update the source html files: 34 | ``` 35 | $ wget "https://www.unicode.org/emoji/charts/full-emoji-list.html" 36 | $ wget "https://www.unicode.org/emoji/charts/full-emoji-modifiers.html" 37 | ``` 38 | 39 | then run: 40 | ``` 41 | $ dune exec gencode/gencode.exe > src/emoji.ml 42 | ``` 43 | test with `dune runtest`, you can then `dune promote` to apply [mdx](https://github.com/realworldocaml/mdx) changes to keep this file up to date. 44 | 45 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Modified BSD License 2 | ==================== 3 | 4 | _Copyright © 2022, the ocaml-emoji contributors_ 5 | _All rights reserved._ 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 3. Neither the name of the ocaml-emoji nor the 16 | names of its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALLTHE OCAML-EMOJI CONTRIBUTOR BE LIABLE FOR ANY 23 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 26 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | -------------------------------------------------------------------------------- /gencode/gencode.ml: -------------------------------------------------------------------------------- 1 | let emoji_file = "full-emoji-list.html" 2 | 3 | let emoji_modifiers_file = "full-emoji-modifiers.html" 4 | 5 | type emoji = 6 | { code_point : string 7 | ; emoji : string 8 | ; description : string 9 | ; name : string 10 | ; category : string 11 | ; sub_category : string 12 | } 13 | 14 | let string_escape_hex = 15 | let fst_code = Char.code '0' in 16 | let snd_code = Char.code 'a' - 10 in 17 | let nibble_to_hex_char n = 18 | if n < 0 || n >= 16 then 19 | invalid_arg "nibbles must be within the range 0x0 and 0xf" 20 | else if n < 10 then Char.chr (fst_code + n) 21 | else Char.chr (snd_code + n) 22 | in 23 | fun s -> 24 | let buf = Buffer.create (String.length s) in 25 | String.iter 26 | (fun c -> 27 | let c = Char.code c in 28 | Buffer.add_string buf "\\x"; 29 | let n = (c land 0xf0) lsr 4 in 30 | Buffer.add_char buf (nibble_to_hex_char n); 31 | let n = c land 0x0f in 32 | Buffer.add_char buf (nibble_to_hex_char n) ) 33 | s; 34 | Buffer.contents buf 35 | 36 | (* leading ints are illegal in 37 | * OCaml identifiers so we prepend 38 | * them with a '_' *) 39 | let wrap_leading_ints s = match s.[0] with '0' .. '9' -> "_" ^ s | _ -> s 40 | 41 | let to_legal_ident_char c = 42 | if not (Uchar.is_char c) then 43 | (* not a latin1 character, ex: quotation mark (U+2019) in names *) 44 | "_" 45 | else 46 | match Uchar.to_char c with 47 | | '&' -> "and" 48 | | '#' -> "hash" 49 | | '*' -> "star" 50 | | '-' | ' ' | ':' | '.' | ',' | '(' | ')' -> "_" 51 | | '!' -> 52 | (*TODO: use "_" and merge with previous case ? *) 53 | (* only used for ON! arrow and UP! button *) 54 | "" 55 | | '\197' -> "a" (* Å *) 56 | | '\227' -> "a" (* ã *) 57 | | '\231' -> "c" (* ç *) 58 | | '\233' -> "e" (* é *) 59 | | '\237' -> "i" (* í *) 60 | | '\241' -> "n" (* piñata !*) 61 | | '\244' -> "o" (* ô *) 62 | | '\252' -> "u" (* ü *) 63 | | ('_' | '\'' | '0' .. '9' | 'a' .. 'z') as c -> String.make 1 c 64 | | 'A' .. 'Z' as c -> String.make 1 (Char.lowercase_ascii c) 65 | | c -> failwith (Format.sprintf "unhandled character: '%c'" c) 66 | 67 | let deduplicate_underscores s = 68 | let buf = Buffer.create (String.length s) in 69 | let _was_underscore : bool = 70 | String.fold_left 71 | (fun was_underscore c -> 72 | let is_underscore = c = '_' in 73 | if not (was_underscore && is_underscore) then Buffer.add_char buf c; 74 | is_underscore ) 75 | false s 76 | in 77 | Buffer.contents buf 78 | 79 | let identifier_of_description s = 80 | let decoder = Uutf.decoder ~encoding:`UTF_8 (`String s) in 81 | let buf = Buffer.create (String.length s) in 82 | begin 83 | try 84 | while true do 85 | match Uutf.decode decoder with 86 | | `Uchar u -> Buffer.add_string buf (to_legal_ident_char u) 87 | | `End -> raise Exit 88 | | `Await -> () 89 | | `Malformed e -> failwith e 90 | done 91 | with Exit -> () 92 | end; 93 | Buffer.contents buf |> wrap_leading_ints |> deduplicate_underscores 94 | 95 | let just_innard s = s |> Soup.trimmed_texts |> String.concat "" 96 | 97 | type skin_tone = 98 | | Light 99 | | Medium_light 100 | | Medium 101 | | Medium_dark 102 | | Dark 103 | 104 | let skin_tone_of_order_count = function 105 | | 1 -> Light 106 | | 2 -> Medium_light 107 | | 3 -> Medium 108 | | 4 -> Medium_dark 109 | | 5 -> Dark 110 | | _ -> failwith "skin_tone_of_order_count failure" 111 | 112 | let skin_tone_of_code_point code_point = 113 | let skin_tone_to_code_point o = 114 | match o with 115 | | Light -> "U+1F3FB" 116 | | Medium_light -> "U+1F3FC" 117 | | Medium -> "U+1F3FD" 118 | | Medium_dark -> "U+1F3FE" 119 | | Dark -> "U+1F3FF" 120 | in 121 | let skin_tones = [| Light; Medium_light; Medium; Medium_dark; Dark |] in 122 | let chars = String.split_on_char ' ' code_point in 123 | match List.nth_opt chars 1 with 124 | | None -> failwith "skin_tone_of_code_point failure: invalid code_point size" 125 | | Some code -> ( 126 | let opt = 127 | Array.find_opt 128 | (fun o -> String.equal code (skin_tone_to_code_point o)) 129 | skin_tones 130 | in 131 | match opt with 132 | | None -> failwith "skin_tone_of_code_point failure" 133 | | Some o -> o ) 134 | 135 | (* HACK; missing skin tones in description: 136 | some emoji have the same description because of skin tones missing from description 137 | we change the description to contains the skin tones 138 | to guess the skin tone we relies on the fact that skin tones are given in a 139 | specific order in the html 140 | and we double check by looking for specific code that define skin tones *) 141 | let fix_incomplete_skin_tones_description = 142 | let skin_tones = 143 | [| "" 144 | ; " light skin tone" 145 | ; " medium-light skin tone" 146 | ; " medium skin tone" 147 | ; " medium-dark skin tone" 148 | ; " dark skin tone" 149 | |] 150 | in 151 | let ht = Hashtbl.create 0x100000 in 152 | fun code_point s -> 153 | let duplicate_count = 154 | match Hashtbl.find_opt ht s with 155 | | None -> 0 156 | | Some count -> 157 | let order_count_guess = skin_tone_of_order_count count in 158 | let code_point_guess = skin_tone_of_code_point code_point in 159 | begin 160 | if order_count_guess = code_point_guess then () 161 | else 162 | failwith 163 | (Format.sprintf "failed to guess skin tones: %s" code_point) 164 | end; 165 | count 166 | in 167 | Hashtbl.replace ht s (duplicate_count + 1); 168 | let missing_skin_tone = skin_tones.(duplicate_count) in 169 | s ^ missing_skin_tone 170 | 171 | let parse_row (l, category, sub_category) el = 172 | match Soup.select_one "th" el with 173 | | Some el -> ( 174 | if List.mem "rchars" (Soup.classes el) then 175 | (* not an emoji row *) 176 | (l, category, sub_category) 177 | else 178 | (* title row *) 179 | let title = 180 | match Soup.select_one "a" el with 181 | | None -> failwith "no link in category row" 182 | | Some a -> ( 183 | match Soup.attribute "name" a with 184 | | None -> failwith "no name in category link" 185 | | Some name -> identifier_of_description @@ String.trim name ) 186 | in 187 | match Soup.classes el with 188 | | [] -> failwith "no class name" 189 | | name :: _l -> ( 190 | match name with 191 | | "bighead" -> (l, title, "") 192 | | "mediumhead" -> (l, category, title) 193 | | _ -> failwith "invalid class name" ) ) 194 | | None -> ( 195 | match Soup.select_one "img" el with 196 | | None -> (* not an emoji row *) (l, category, sub_category) 197 | | Some img -> 198 | let code_point = 199 | match Soup.select_one "td.code > a" el with 200 | | None -> failwith "no code_point found" 201 | | Some el -> just_innard el 202 | in 203 | let emoji = 204 | match Soup.attribute "alt" img with 205 | | None -> failwith "no alt on emoji img" 206 | | Some emoji -> emoji 207 | in 208 | let description = 209 | match Soup.select_one "td.name" el with 210 | | None -> failwith "no description found" 211 | | Some el -> just_innard el 212 | in 213 | (* Recently-added emoji are marked by a ⊛ in the name ⊛_⊛^^ *) 214 | let prefix = "⊛" in 215 | let description = 216 | if String.starts_with ~prefix description then 217 | (* its not 1 *) 218 | let prefix_len = String.length prefix in 219 | String.trim 220 | @@ String.sub description prefix_len 221 | (String.length description - prefix_len) 222 | else description 223 | in 224 | (* fix missing skin tones in description *) 225 | let description = 226 | fix_incomplete_skin_tones_description code_point description 227 | in 228 | let name = identifier_of_description description in 229 | 230 | ( { code_point 231 | ; emoji 232 | ; description 233 | ; name 234 | ; category = "category_" ^ category 235 | ; sub_category = "sub_category_" ^ sub_category 236 | } 237 | :: l 238 | , category 239 | , sub_category ) ) 240 | 241 | let parse file = 242 | let chan = open_in file in 243 | Fun.protect 244 | ~finally:(fun () -> close_in chan) 245 | (fun () -> Soup.read_channel chan |> Soup.parse) 246 | 247 | let parsed = parse emoji_file 248 | 249 | let parsed_skin_tones = parse emoji_modifiers_file 250 | 251 | let table = Soup.to_list @@ Soup.select "table > tbody > tr" parsed 252 | 253 | let table_skin_tones = 254 | Soup.to_list @@ Soup.select "table > tbody > tr" parsed_skin_tones 255 | 256 | let table = table @ table_skin_tones 257 | 258 | let init = ([], "", "") 259 | 260 | let emojis, _last_category, _last_sub_category = 261 | List.fold_left parse_row init table 262 | 263 | let emojis = List.sort (fun e1 e2 -> compare e1.name e2.name) emojis 264 | 265 | (* category_name -> (emoji_name -> unit) *) 266 | let cats_table = Hashtbl.create 512 267 | 268 | (* sub_category_name -> (emoji_name -> unit) *) 269 | let subcats_table = Hashtbl.create 512 270 | 271 | let () = 272 | List.iter 273 | (fun { category; sub_category; name; _ } -> 274 | let cat_table = 275 | match Hashtbl.find_opt cats_table category with 276 | | None -> 277 | let cat_table = Hashtbl.create 512 in 278 | Hashtbl.add cats_table category cat_table; 279 | cat_table 280 | | Some cat_table -> cat_table 281 | in 282 | Hashtbl.add cat_table name (); 283 | let subcat_table = 284 | match Hashtbl.find_opt subcats_table sub_category with 285 | | None -> 286 | let subcat_table = Hashtbl.create 512 in 287 | Hashtbl.add subcats_table sub_category subcat_table; 288 | subcat_table 289 | | Some subcat_table -> subcat_table 290 | in 291 | Hashtbl.add subcat_table name () ) 292 | emojis; 293 | 294 | Format.printf 295 | "(** All Emojis defined by the Unicode standard, encoded using UTF-8 *)@\n"; 296 | List.iter 297 | (fun e -> 298 | Format.printf "@\n(** %s (%s): %s *)@\nlet %s = \"%s\"@\n" e.emoji 299 | e.code_point e.description 300 | (identifier_of_description e.description) 301 | (string_escape_hex e.emoji) ) 302 | emojis 303 | 304 | let pp_print_list_to_ocaml_array fmt a = 305 | Format.fprintf fmt "[|%a|]" 306 | (Format.pp_print_list 307 | ~pp_sep:(fun fmt () -> Format.fprintf fmt " ; ") 308 | Format.pp_print_string ) 309 | a 310 | 311 | let subcats = 312 | Hashtbl.fold 313 | (fun name emojis acc -> 314 | (name, List.sort compare @@ List.of_seq @@ Hashtbl.to_seq_keys emojis) 315 | :: acc ) 316 | subcats_table [] 317 | 318 | let subcats = List.sort (fun (n1, _) (n2, _) -> compare n1 n2) subcats 319 | 320 | let () = 321 | Format.printf "@\n(** All sub categories *)@\n"; 322 | List.iter 323 | (fun (name, emojis) -> 324 | Format.printf "@\nlet %s = %a@\n" name pp_print_list_to_ocaml_array emojis ) 325 | subcats 326 | 327 | let cats = 328 | Hashtbl.fold 329 | (fun name emojis acc -> 330 | (name, List.sort compare @@ List.of_seq @@ Hashtbl.to_seq_keys emojis) 331 | :: acc ) 332 | cats_table [] 333 | 334 | let cats = List.sort (fun (n1, _) (n2, _) -> compare n1 n2) cats 335 | 336 | let () = 337 | Format.printf "@\n(** All categories *)@\n"; 338 | List.iter 339 | (fun (cat, emojis) -> 340 | Format.printf "@\nlet %s = %a@\n" cat pp_print_list_to_ocaml_array emojis ) 341 | cats 342 | 343 | let all_names = List.map (fun emoji -> emoji.name) emojis 344 | 345 | let () = 346 | Format.printf "@\n(** All included emojis in an array *)@\n"; 347 | Format.printf "let all_emojis = %a@\n" pp_print_list_to_ocaml_array all_names 348 | --------------------------------------------------------------------------------