├── INSTALL ├── META ├── README.md ├── Makefile ├── LICENSE ├── utf8conv.mli ├── cp1252.txt └── utf8conv.ml /INSTALL: -------------------------------------------------------------------------------- 1 | Standard installation using make and ocamlfind: 2 | 3 | $ make 4 | $ make install 5 | 6 | Uninstall if/when needed: 7 | 8 | $ make uninstall 9 | -------------------------------------------------------------------------------- /META: -------------------------------------------------------------------------------- 1 | name = "utf8conv" 2 | version = "1.0.0" 3 | description = "Conversion from ISO-8859-1/Windows-1252 to UTF-8" 4 | archive(byte) = "utf8conv.cma" 5 | archive(native) = "utf8conv.cmxa" 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Conversion of ISO-8859-1/Windows-1252 text into UTF-8. 2 | 3 | This is a minimalistic module that does not require any configuration file 4 | and only depends on OCaml's standard library. 5 | 6 | Utf8conv does not provide operations on existing UTF-8 data.
7 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: default all opt doc install uninstall clean 2 | default: all opt 3 | all: 4 | ocamlc -c -g utf8conv.mli 5 | ocamlc -c -g utf8conv.ml 6 | ocamlc -a -g -o utf8conv.cma utf8conv.cmo 7 | opt: 8 | ocamlc -c -g utf8conv.mli 9 | ocamlopt -c -g utf8conv.ml 10 | ocamlopt -a -g -o utf8conv.cmxa utf8conv.cmx 11 | doc: 12 | mkdir -p html 13 | ocamldoc -html -d html utf8conv.mli 14 | install: 15 | ocamlfind install utf8conv META \ 16 | $$(ls *.mli *.cm[ioxa] *.cmxa *.o *.a 2>/dev/null) 17 | uninstall: 18 | ocamlfind remove utf8conv 19 | clean: 20 | rm -f *.cm[ioxa] *.o *.cmxa *.a *~ 21 | rm -rf html 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 Martin Jambon 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 3. The name of the author may not be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
18 | IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /utf8conv.mli: -------------------------------------------------------------------------------- 1 | (** 2 | Conversion of ISO-8859-1/Windows-1252 text into UTF-8. 3 | 4 | This is a minimalistic module that does not require any configuration file 5 | and only depends on OCaml's standard library. 6 | *) 7 | 8 | val utf8_of_windows1252 : 9 | ?pos: int -> 10 | ?len: int -> 11 | ?undefined: (char -> string) -> 12 | string -> string 13 | (** Converts an ASCII, ISO-8859-1 or Windows-1252 string or substring 14 | into a UTF-8 string. 15 | 16 | @param pos position of the beginning of the input substring. 17 | The default is 0, the beginning of the string. 18 | @param len length of the input substring. 19 | The default length extends until the end of the string. 20 | @param undefined function for converting invalid bytes. The default 21 | is to raise a [Failure] exception with a useful 22 | error message. 23 | *) 24 | 25 | val is_ascii : ?pos: int -> ?len: int -> string -> bool 26 | (** Returns true if and only if the given string or substring 27 | contains only 7-bit characters. *) 28 | 29 | val is_iso88591 : ?pos: int -> ?len: int -> string -> bool 30 | (** This function returns true if and only if the given string or substring 31 | contains only legal ISO-8859-1-encoded characters 32 | (all printable characters plus all ASCII control characters). 
33 | 34 | ISO-8859-1 is the default encoding assumed by HTTP 1.1. 35 | However, it is often confused with Windows-1252, which is a superset 36 | of ISO-8859-1. Most HTTP user agents assume Windows-1252 as the 37 | default encoding because the encoding of any ISO-8859-1 character is 38 | also valid under Windows-1252. 39 | 40 | See also [is_windows1252]. 41 | *) 42 | 43 | val is_windows1252 : ?pos: int -> ?len: int -> string -> bool 44 | (** This function returns true if and only if the given string or substring 45 | contains only legal Windows-1252-encoded characters 46 | (all printable characters plus all ASCII control characters). 47 | 48 | Windows-1252 is also known as CP-1252. It adds a few printable 49 | characters to ISO-8859-1 in the 128-159 range. Please note that 50 | the Unicode code points of these extra characters are not equal 51 | to their byte values. 52 | 53 | See also [is_iso88591]. 54 | *) 55 | 56 | val escape : 57 | ?pos: int -> 58 | ?len: int -> 59 | ?noquotes: bool -> 60 | string -> string 61 | (** Produces a valid OCaml string literal where non-printable and non-ASCII 62 | bytes are escaped using the hexadecimal notation 63 | except for the usual ['\n'], ['\t'], ['\r'] and ['\b']. 64 | [escape s] can be used as a substitute for [Printf.sprintf "%S" s] 65 | which uses the decimal notation in its escape sequences. 66 | 67 | @param noquotes omit leading and trailing double-quotes. Default: false.
68 | *) 69 | -------------------------------------------------------------------------------- /cp1252.txt: -------------------------------------------------------------------------------- 1 | # Retrieved on 2011-09-17 from 2 | # http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT 3 | # 4 | # 5 | # Name: cp1252 to Unicode table 6 | # Unicode version: 2.0 7 | # Table version: 2.01 8 | # Table format: Format A 9 | # Date: 04/15/98 10 | # 11 | # Contact: Shawn.Steele@microsoft.com 12 | # 13 | # General notes: none 14 | # 15 | # Format: Three tab-separated columns 16 | # Column #1 is the cp1252 code (in hex) 17 | # Column #2 is the Unicode (in hex as 0xXXXX) 18 | # Column #3 is the Unicode name (follows a comment sign, '#') 19 | # 20 | # The entries are in cp1252 order 21 | # 22 | 0x00 0x0000 #NULL 23 | 0x01 0x0001 #START OF HEADING 24 | 0x02 0x0002 #START OF TEXT 25 | 0x03 0x0003 #END OF TEXT 26 | 0x04 0x0004 #END OF TRANSMISSION 27 | 0x05 0x0005 #ENQUIRY 28 | 0x06 0x0006 #ACKNOWLEDGE 29 | 0x07 0x0007 #BELL 30 | 0x08 0x0008 #BACKSPACE 31 | 0x09 0x0009 #HORIZONTAL TABULATION 32 | 0x0A 0x000A #LINE FEED 33 | 0x0B 0x000B #VERTICAL TABULATION 34 | 0x0C 0x000C #FORM FEED 35 | 0x0D 0x000D #CARRIAGE RETURN 36 | 0x0E 0x000E #SHIFT OUT 37 | 0x0F 0x000F #SHIFT IN 38 | 0x10 0x0010 #DATA LINK ESCAPE 39 | 0x11 0x0011 #DEVICE CONTROL ONE 40 | 0x12 0x0012 #DEVICE CONTROL TWO 41 | 0x13 0x0013 #DEVICE CONTROL THREE 42 | 0x14 0x0014 #DEVICE CONTROL FOUR 43 | 0x15 0x0015 #NEGATIVE ACKNOWLEDGE 44 | 0x16 0x0016 #SYNCHRONOUS IDLE 45 | 0x17 0x0017 #END OF TRANSMISSION BLOCK 46 | 0x18 0x0018 #CANCEL 47 | 0x19 0x0019 #END OF MEDIUM 48 | 0x1A 0x001A #SUBSTITUTE 49 | 0x1B 0x001B #ESCAPE 50 | 0x1C 0x001C #FILE SEPARATOR 51 | 0x1D 0x001D #GROUP SEPARATOR 52 | 0x1E 0x001E #RECORD SEPARATOR 53 | 0x1F 0x001F #UNIT SEPARATOR 54 | 0x20 0x0020 #SPACE 55 | 0x21 0x0021 #EXCLAMATION MARK 56 | 0x22 0x0022 #QUOTATION MARK 57 | 0x23 0x0023 #NUMBER SIGN 58 | 0x24 0x0024 #DOLLAR SIGN 59 | 
0x25 0x0025 #PERCENT SIGN 60 | 0x26 0x0026 #AMPERSAND 61 | 0x27 0x0027 #APOSTROPHE 62 | 0x28 0x0028 #LEFT PARENTHESIS 63 | 0x29 0x0029 #RIGHT PARENTHESIS 64 | 0x2A 0x002A #ASTERISK 65 | 0x2B 0x002B #PLUS SIGN 66 | 0x2C 0x002C #COMMA 67 | 0x2D 0x002D #HYPHEN-MINUS 68 | 0x2E 0x002E #FULL STOP 69 | 0x2F 0x002F #SOLIDUS 70 | 0x30 0x0030 #DIGIT ZERO 71 | 0x31 0x0031 #DIGIT ONE 72 | 0x32 0x0032 #DIGIT TWO 73 | 0x33 0x0033 #DIGIT THREE 74 | 0x34 0x0034 #DIGIT FOUR 75 | 0x35 0x0035 #DIGIT FIVE 76 | 0x36 0x0036 #DIGIT SIX 77 | 0x37 0x0037 #DIGIT SEVEN 78 | 0x38 0x0038 #DIGIT EIGHT 79 | 0x39 0x0039 #DIGIT NINE 80 | 0x3A 0x003A #COLON 81 | 0x3B 0x003B #SEMICOLON 82 | 0x3C 0x003C #LESS-THAN SIGN 83 | 0x3D 0x003D #EQUALS SIGN 84 | 0x3E 0x003E #GREATER-THAN SIGN 85 | 0x3F 0x003F #QUESTION MARK 86 | 0x40 0x0040 #COMMERCIAL AT 87 | 0x41 0x0041 #LATIN CAPITAL LETTER A 88 | 0x42 0x0042 #LATIN CAPITAL LETTER B 89 | 0x43 0x0043 #LATIN CAPITAL LETTER C 90 | 0x44 0x0044 #LATIN CAPITAL LETTER D 91 | 0x45 0x0045 #LATIN CAPITAL LETTER E 92 | 0x46 0x0046 #LATIN CAPITAL LETTER F 93 | 0x47 0x0047 #LATIN CAPITAL LETTER G 94 | 0x48 0x0048 #LATIN CAPITAL LETTER H 95 | 0x49 0x0049 #LATIN CAPITAL LETTER I 96 | 0x4A 0x004A #LATIN CAPITAL LETTER J 97 | 0x4B 0x004B #LATIN CAPITAL LETTER K 98 | 0x4C 0x004C #LATIN CAPITAL LETTER L 99 | 0x4D 0x004D #LATIN CAPITAL LETTER M 100 | 0x4E 0x004E #LATIN CAPITAL LETTER N 101 | 0x4F 0x004F #LATIN CAPITAL LETTER O 102 | 0x50 0x0050 #LATIN CAPITAL LETTER P 103 | 0x51 0x0051 #LATIN CAPITAL LETTER Q 104 | 0x52 0x0052 #LATIN CAPITAL LETTER R 105 | 0x53 0x0053 #LATIN CAPITAL LETTER S 106 | 0x54 0x0054 #LATIN CAPITAL LETTER T 107 | 0x55 0x0055 #LATIN CAPITAL LETTER U 108 | 0x56 0x0056 #LATIN CAPITAL LETTER V 109 | 0x57 0x0057 #LATIN CAPITAL LETTER W 110 | 0x58 0x0058 #LATIN CAPITAL LETTER X 111 | 0x59 0x0059 #LATIN CAPITAL LETTER Y 112 | 0x5A 0x005A #LATIN CAPITAL LETTER Z 113 | 0x5B 0x005B #LEFT SQUARE BRACKET 114 | 0x5C 0x005C #REVERSE SOLIDUS 115 | 0x5D 0x005D 
#RIGHT SQUARE BRACKET 116 | 0x5E 0x005E #CIRCUMFLEX ACCENT 117 | 0x5F 0x005F #LOW LINE 118 | 0x60 0x0060 #GRAVE ACCENT 119 | 0x61 0x0061 #LATIN SMALL LETTER A 120 | 0x62 0x0062 #LATIN SMALL LETTER B 121 | 0x63 0x0063 #LATIN SMALL LETTER C 122 | 0x64 0x0064 #LATIN SMALL LETTER D 123 | 0x65 0x0065 #LATIN SMALL LETTER E 124 | 0x66 0x0066 #LATIN SMALL LETTER F 125 | 0x67 0x0067 #LATIN SMALL LETTER G 126 | 0x68 0x0068 #LATIN SMALL LETTER H 127 | 0x69 0x0069 #LATIN SMALL LETTER I 128 | 0x6A 0x006A #LATIN SMALL LETTER J 129 | 0x6B 0x006B #LATIN SMALL LETTER K 130 | 0x6C 0x006C #LATIN SMALL LETTER L 131 | 0x6D 0x006D #LATIN SMALL LETTER M 132 | 0x6E 0x006E #LATIN SMALL LETTER N 133 | 0x6F 0x006F #LATIN SMALL LETTER O 134 | 0x70 0x0070 #LATIN SMALL LETTER P 135 | 0x71 0x0071 #LATIN SMALL LETTER Q 136 | 0x72 0x0072 #LATIN SMALL LETTER R 137 | 0x73 0x0073 #LATIN SMALL LETTER S 138 | 0x74 0x0074 #LATIN SMALL LETTER T 139 | 0x75 0x0075 #LATIN SMALL LETTER U 140 | 0x76 0x0076 #LATIN SMALL LETTER V 141 | 0x77 0x0077 #LATIN SMALL LETTER W 142 | 0x78 0x0078 #LATIN SMALL LETTER X 143 | 0x79 0x0079 #LATIN SMALL LETTER Y 144 | 0x7A 0x007A #LATIN SMALL LETTER Z 145 | 0x7B 0x007B #LEFT CURLY BRACKET 146 | 0x7C 0x007C #VERTICAL LINE 147 | 0x7D 0x007D #RIGHT CURLY BRACKET 148 | 0x7E 0x007E #TILDE 149 | 0x7F 0x007F #DELETE 150 | 0x80 0x20AC #EURO SIGN 151 | 0x81 #UNDEFINED 152 | 0x82 0x201A #SINGLE LOW-9 QUOTATION MARK 153 | 0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK 154 | 0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK 155 | 0x85 0x2026 #HORIZONTAL ELLIPSIS 156 | 0x86 0x2020 #DAGGER 157 | 0x87 0x2021 #DOUBLE DAGGER 158 | 0x88 0x02C6 #MODIFIER LETTER CIRCUMFLEX ACCENT 159 | 0x89 0x2030 #PER MILLE SIGN 160 | 0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON 161 | 0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK 162 | 0x8C 0x0152 #LATIN CAPITAL LIGATURE OE 163 | 0x8D #UNDEFINED 164 | 0x8E 0x017D #LATIN CAPITAL LETTER Z WITH CARON 165 | 0x8F #UNDEFINED 166 | 0x90 #UNDEFINED 167 | 0x91 0x2018 
#LEFT SINGLE QUOTATION MARK 168 | 0x92 0x2019 #RIGHT SINGLE QUOTATION MARK 169 | 0x93 0x201C #LEFT DOUBLE QUOTATION MARK 170 | 0x94 0x201D #RIGHT DOUBLE QUOTATION MARK 171 | 0x95 0x2022 #BULLET 172 | 0x96 0x2013 #EN DASH 173 | 0x97 0x2014 #EM DASH 174 | 0x98 0x02DC #SMALL TILDE 175 | 0x99 0x2122 #TRADE MARK SIGN 176 | 0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON 177 | 0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 178 | 0x9C 0x0153 #LATIN SMALL LIGATURE OE 179 | 0x9D #UNDEFINED 180 | 0x9E 0x017E #LATIN SMALL LETTER Z WITH CARON 181 | 0x9F 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS 182 | 0xA0 0x00A0 #NO-BREAK SPACE 183 | 0xA1 0x00A1 #INVERTED EXCLAMATION MARK 184 | 0xA2 0x00A2 #CENT SIGN 185 | 0xA3 0x00A3 #POUND SIGN 186 | 0xA4 0x00A4 #CURRENCY SIGN 187 | 0xA5 0x00A5 #YEN SIGN 188 | 0xA6 0x00A6 #BROKEN BAR 189 | 0xA7 0x00A7 #SECTION SIGN 190 | 0xA8 0x00A8 #DIAERESIS 191 | 0xA9 0x00A9 #COPYRIGHT SIGN 192 | 0xAA 0x00AA #FEMININE ORDINAL INDICATOR 193 | 0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 194 | 0xAC 0x00AC #NOT SIGN 195 | 0xAD 0x00AD #SOFT HYPHEN 196 | 0xAE 0x00AE #REGISTERED SIGN 197 | 0xAF 0x00AF #MACRON 198 | 0xB0 0x00B0 #DEGREE SIGN 199 | 0xB1 0x00B1 #PLUS-MINUS SIGN 200 | 0xB2 0x00B2 #SUPERSCRIPT TWO 201 | 0xB3 0x00B3 #SUPERSCRIPT THREE 202 | 0xB4 0x00B4 #ACUTE ACCENT 203 | 0xB5 0x00B5 #MICRO SIGN 204 | 0xB6 0x00B6 #PILCROW SIGN 205 | 0xB7 0x00B7 #MIDDLE DOT 206 | 0xB8 0x00B8 #CEDILLA 207 | 0xB9 0x00B9 #SUPERSCRIPT ONE 208 | 0xBA 0x00BA #MASCULINE ORDINAL INDICATOR 209 | 0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 210 | 0xBC 0x00BC #VULGAR FRACTION ONE QUARTER 211 | 0xBD 0x00BD #VULGAR FRACTION ONE HALF 212 | 0xBE 0x00BE #VULGAR FRACTION THREE QUARTERS 213 | 0xBF 0x00BF #INVERTED QUESTION MARK 214 | 0xC0 0x00C0 #LATIN CAPITAL LETTER A WITH GRAVE 215 | 0xC1 0x00C1 #LATIN CAPITAL LETTER A WITH ACUTE 216 | 0xC2 0x00C2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX 217 | 0xC3 0x00C3 #LATIN CAPITAL LETTER A WITH TILDE 218 | 0xC4 
0x00C4 #LATIN CAPITAL LETTER A WITH DIAERESIS 219 | 0xC5 0x00C5 #LATIN CAPITAL LETTER A WITH RING ABOVE 220 | 0xC6 0x00C6 #LATIN CAPITAL LETTER AE 221 | 0xC7 0x00C7 #LATIN CAPITAL LETTER C WITH CEDILLA 222 | 0xC8 0x00C8 #LATIN CAPITAL LETTER E WITH GRAVE 223 | 0xC9 0x00C9 #LATIN CAPITAL LETTER E WITH ACUTE 224 | 0xCA 0x00CA #LATIN CAPITAL LETTER E WITH CIRCUMFLEX 225 | 0xCB 0x00CB #LATIN CAPITAL LETTER E WITH DIAERESIS 226 | 0xCC 0x00CC #LATIN CAPITAL LETTER I WITH GRAVE 227 | 0xCD 0x00CD #LATIN CAPITAL LETTER I WITH ACUTE 228 | 0xCE 0x00CE #LATIN CAPITAL LETTER I WITH CIRCUMFLEX 229 | 0xCF 0x00CF #LATIN CAPITAL LETTER I WITH DIAERESIS 230 | 0xD0 0x00D0 #LATIN CAPITAL LETTER ETH 231 | 0xD1 0x00D1 #LATIN CAPITAL LETTER N WITH TILDE 232 | 0xD2 0x00D2 #LATIN CAPITAL LETTER O WITH GRAVE 233 | 0xD3 0x00D3 #LATIN CAPITAL LETTER O WITH ACUTE 234 | 0xD4 0x00D4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX 235 | 0xD5 0x00D5 #LATIN CAPITAL LETTER O WITH TILDE 236 | 0xD6 0x00D6 #LATIN CAPITAL LETTER O WITH DIAERESIS 237 | 0xD7 0x00D7 #MULTIPLICATION SIGN 238 | 0xD8 0x00D8 #LATIN CAPITAL LETTER O WITH STROKE 239 | 0xD9 0x00D9 #LATIN CAPITAL LETTER U WITH GRAVE 240 | 0xDA 0x00DA #LATIN CAPITAL LETTER U WITH ACUTE 241 | 0xDB 0x00DB #LATIN CAPITAL LETTER U WITH CIRCUMFLEX 242 | 0xDC 0x00DC #LATIN CAPITAL LETTER U WITH DIAERESIS 243 | 0xDD 0x00DD #LATIN CAPITAL LETTER Y WITH ACUTE 244 | 0xDE 0x00DE #LATIN CAPITAL LETTER THORN 245 | 0xDF 0x00DF #LATIN SMALL LETTER SHARP S 246 | 0xE0 0x00E0 #LATIN SMALL LETTER A WITH GRAVE 247 | 0xE1 0x00E1 #LATIN SMALL LETTER A WITH ACUTE 248 | 0xE2 0x00E2 #LATIN SMALL LETTER A WITH CIRCUMFLEX 249 | 0xE3 0x00E3 #LATIN SMALL LETTER A WITH TILDE 250 | 0xE4 0x00E4 #LATIN SMALL LETTER A WITH DIAERESIS 251 | 0xE5 0x00E5 #LATIN SMALL LETTER A WITH RING ABOVE 252 | 0xE6 0x00E6 #LATIN SMALL LETTER AE 253 | 0xE7 0x00E7 #LATIN SMALL LETTER C WITH CEDILLA 254 | 0xE8 0x00E8 #LATIN SMALL LETTER E WITH GRAVE 255 | 0xE9 0x00E9 #LATIN SMALL LETTER E WITH ACUTE 256 | 
0xEA 0x00EA #LATIN SMALL LETTER E WITH CIRCUMFLEX 257 | 0xEB 0x00EB #LATIN SMALL LETTER E WITH DIAERESIS 258 | 0xEC 0x00EC #LATIN SMALL LETTER I WITH GRAVE 259 | 0xED 0x00ED #LATIN SMALL LETTER I WITH ACUTE 260 | 0xEE 0x00EE #LATIN SMALL LETTER I WITH CIRCUMFLEX 261 | 0xEF 0x00EF #LATIN SMALL LETTER I WITH DIAERESIS 262 | 0xF0 0x00F0 #LATIN SMALL LETTER ETH 263 | 0xF1 0x00F1 #LATIN SMALL LETTER N WITH TILDE 264 | 0xF2 0x00F2 #LATIN SMALL LETTER O WITH GRAVE 265 | 0xF3 0x00F3 #LATIN SMALL LETTER O WITH ACUTE 266 | 0xF4 0x00F4 #LATIN SMALL LETTER O WITH CIRCUMFLEX 267 | 0xF5 0x00F5 #LATIN SMALL LETTER O WITH TILDE 268 | 0xF6 0x00F6 #LATIN SMALL LETTER O WITH DIAERESIS 269 | 0xF7 0x00F7 #DIVISION SIGN 270 | 0xF8 0x00F8 #LATIN SMALL LETTER O WITH STROKE 271 | 0xF9 0x00F9 #LATIN SMALL LETTER U WITH GRAVE 272 | 0xFA 0x00FA #LATIN SMALL LETTER U WITH ACUTE 273 | 0xFB 0x00FB #LATIN SMALL LETTER U WITH CIRCUMFLEX 274 | 0xFC 0x00FC #LATIN SMALL LETTER U WITH DIAERESIS 275 | 0xFD 0x00FD #LATIN SMALL LETTER Y WITH ACUTE 276 | 0xFE 0x00FE #LATIN SMALL LETTER THORN 277 | 0xFF 0x00FF #LATIN SMALL LETTER Y WITH DIAERESIS 278 | -------------------------------------------------------------------------------- /utf8conv.ml: -------------------------------------------------------------------------------- 1 | open Printf 2 | (* [get_len pos opt_len s] checks an optional pos/len substring specification against [s] and returns the effective length: [slen - pos] when [opt_len] is [None], the given length otherwise. [pos] must be non-negative and smaller than the length of [s], except that [pos = 0] is accepted for the empty string. Raises [Failure] with a descriptive message when [pos] or the length is out of bounds. *) 3 | let get_len pos opt_len s = 4 | let slen = String.length s in 5 | if pos < 0 || (pos >= slen && not (pos = 0)) then 6 | failwith (sprintf 7 | "Utf8conv: out-of-bounds pos argument \ 8 | (string length: %i, pos: %i)" 9 | slen pos); 10 | let len = 11 | match opt_len with 12 | None -> slen - pos 13 | | Some n -> (* compare [n] with the remaining length: [pos + n] would wrap around for a huge [n] and let an out-of-range length through *) 14 | if n < 0 || n > slen - pos then 15 | failwith (sprintf 16 | "Utf8conv: out-of-bounds len argument \ 17 | (string length: %i, pos: %i, len: %i)" 18 | slen pos n) 19 | else 20 | n 21 | in 22 | len 23 | (* [add_hex_char buf c] appends to [buf] the backslash-x escape of byte [c], written with two uppercase hexadecimal digits. *) 24 | let add_hex_char buf c = 25 | bprintf buf "\\x%02X" (Char.code c) 26 | (* [escape ?pos ?len ?noquotes s] renders the substring of [s] selected by [pos] and [len] (validated by [get_len]) as an OCaml string literal. Backslash and double quote are backslash-escaped; backspace, tab, line feed and carriage return use their short escapes; printable ASCII bytes 0x20-0x7e are copied as is; every other byte goes through [add_hex_char]. The surrounding double quotes are emitted unless [noquotes] is true. *) 27 | let escape ?(pos = 0) ?len ?(noquotes = false) s = 28 | let len = get_len
pos len s in 29 | let buf = Buffer.create (2 * len) in 30 | if not noquotes then 31 | Buffer.add_char buf '\"'; 32 | for i = pos to pos + len - 1 do 33 | let c = s.[i] in 34 | match c with 35 | '\\' -> Buffer.add_string buf "\\\\" 36 | | '\"' -> Buffer.add_string buf "\\\"" 37 | (* control characters, then printable ASCII, then DEL and 8-bit bytes *) 38 | | '\x00'..'\x07' -> add_hex_char buf c 39 | | '\x08' -> Buffer.add_string buf "\\b" 40 | | '\x09' -> Buffer.add_string buf "\\t" 41 | | '\x0a' -> Buffer.add_string buf "\\n" 42 | | '\x0b'..'\x0c' -> add_hex_char buf c 43 | | '\x0d' -> Buffer.add_string buf "\\r" 44 | | '\x0e'..'\x1f' -> add_hex_char buf c 45 | | '\x20'..'\x7e' -> Buffer.add_char buf c 46 | | '\x7f'..'\xff' -> add_hex_char buf c 47 | done; 48 | if not noquotes then 49 | Buffer.add_char buf '\"'; 50 | Buffer.contents buf 51 | 52 | (* Windows-1252 -> Unicode, UTF-8, name *) 53 | let windows1252 = [| 54 | (*0x00*) Some 0x0000, Some "\x00", 55 | "NULL"; 56 | (*0x01*) Some 0x0001, Some "\x01", 57 | "START OF HEADING"; 58 | (*0x02*) Some 0x0002, Some "\x02", 59 | "START OF TEXT"; 60 | (*0x03*) Some 0x0003, Some "\x03", 61 | "END OF TEXT"; 62 | (*0x04*) Some 0x0004, Some "\x04", 63 | "END OF TRANSMISSION"; 64 | (*0x05*) Some 0x0005, Some "\x05", 65 | "ENQUIRY"; 66 | (*0x06*) Some 0x0006, Some "\x06", 67 | "ACKNOWLEDGE"; 68 | (*0x07*) Some 0x0007, Some "\x07", 69 | "BELL"; 70 | (*0x08*) Some 0x0008, Some "\b", 71 | "BACKSPACE"; 72 | (*0x09*) Some 0x0009, Some "\t", 73 | "HORIZONTAL TABULATION"; 74 | (*0x0a*) Some 0x000a, Some "\n", 75 | "LINE FEED"; 76 | (*0x0b*) Some 0x000b, Some "\x0B", 77 | "VERTICAL TABULATION"; 78 | (*0x0c*) Some 0x000c, Some "\x0C", 79 | "FORM FEED"; 80 | (*0x0d*) Some 0x000d, Some "\r", 81 | "CARRIAGE RETURN"; 82 | (*0x0e*) Some 0x000e, Some "\x0E", 83 | "SHIFT OUT"; 84 | (*0x0f*) Some 0x000f, Some "\x0F", 85 | "SHIFT IN"; 86 | (*0x10*) Some 0x0010, Some "\x10", 87 | "DATA LINK ESCAPE"; 88 | (*0x11*) Some 0x0011, Some "\x11", 89 | "DEVICE CONTROL ONE"; 90 | (*0x12*) Some 0x0012, Some "\x12", 91 | "DEVICE
CONTROL TWO"; 92 | (*0x13*) Some 0x0013, Some "\x13", 93 | "DEVICE CONTROL THREE"; 94 | (*0x14*) Some 0x0014, Some "\x14", 95 | "DEVICE CONTROL FOUR"; 96 | (*0x15*) Some 0x0015, Some "\x15", 97 | "NEGATIVE ACKNOWLEDGE"; 98 | (*0x16*) Some 0x0016, Some "\x16", 99 | "SYNCHRONOUS IDLE"; 100 | (*0x17*) Some 0x0017, Some "\x17", 101 | "END OF TRANSMISSION BLOCK"; 102 | (*0x18*) Some 0x0018, Some "\x18", 103 | "CANCEL"; 104 | (*0x19*) Some 0x0019, Some "\x19", 105 | "END OF MEDIUM"; 106 | (*0x1a*) Some 0x001a, Some "\x1A", 107 | "SUBSTITUTE"; 108 | (*0x1b*) Some 0x001b, Some "\x1B", 109 | "ESCAPE"; 110 | (*0x1c*) Some 0x001c, Some "\x1C", 111 | "FILE SEPARATOR"; 112 | (*0x1d*) Some 0x001d, Some "\x1D", 113 | "GROUP SEPARATOR"; 114 | (*0x1e*) Some 0x001e, Some "\x1E", 115 | "RECORD SEPARATOR"; 116 | (*0x1f*) Some 0x001f, Some "\x1F", 117 | "UNIT SEPARATOR"; 118 | (*0x20*) Some 0x0020, Some " ", 119 | "SPACE"; 120 | (*0x21*) Some 0x0021, Some "!", 121 | "EXCLAMATION MARK"; 122 | (*0x22*) Some 0x0022, Some "\"", 123 | "QUOTATION MARK"; 124 | (*0x23*) Some 0x0023, Some "#", 125 | "NUMBER SIGN"; 126 | (*0x24*) Some 0x0024, Some "$", 127 | "DOLLAR SIGN"; 128 | (*0x25*) Some 0x0025, Some "%", 129 | "PERCENT SIGN"; 130 | (*0x26*) Some 0x0026, Some "&", 131 | "AMPERSAND"; 132 | (*0x27*) Some 0x0027, Some "'", 133 | "APOSTROPHE"; 134 | (*0x28*) Some 0x0028, Some "(", 135 | "LEFT PARENTHESIS"; 136 | (*0x29*) Some 0x0029, Some ")", 137 | "RIGHT PARENTHESIS"; 138 | (*0x2a*) Some 0x002a, Some "*", 139 | "ASTERISK"; 140 | (*0x2b*) Some 0x002b, Some "+", 141 | "PLUS SIGN"; 142 | (*0x2c*) Some 0x002c, Some ",", 143 | "COMMA"; 144 | (*0x2d*) Some 0x002d, Some "-", 145 | "HYPHEN-MINUS"; 146 | (*0x2e*) Some 0x002e, Some ".", 147 | "FULL STOP"; 148 | (*0x2f*) Some 0x002f, Some "/", 149 | "SOLIDUS"; 150 | (*0x30*) Some 0x0030, Some "0", 151 | "DIGIT ZERO"; 152 | (*0x31*) Some 0x0031, Some "1", 153 | "DIGIT ONE"; 154 | (*0x32*) Some 0x0032, Some "2", 155 | "DIGIT TWO"; 156 | (*0x33*) Some 
0x0033, Some "3", 157 | "DIGIT THREE"; 158 | (*0x34*) Some 0x0034, Some "4", 159 | "DIGIT FOUR"; 160 | (*0x35*) Some 0x0035, Some "5", 161 | "DIGIT FIVE"; 162 | (*0x36*) Some 0x0036, Some "6", 163 | "DIGIT SIX"; 164 | (*0x37*) Some 0x0037, Some "7", 165 | "DIGIT SEVEN"; 166 | (*0x38*) Some 0x0038, Some "8", 167 | "DIGIT EIGHT"; 168 | (*0x39*) Some 0x0039, Some "9", 169 | "DIGIT NINE"; 170 | (*0x3a*) Some 0x003a, Some ":", 171 | "COLON"; 172 | (*0x3b*) Some 0x003b, Some ";", 173 | "SEMICOLON"; 174 | (*0x3c*) Some 0x003c, Some "<", 175 | "LESS-THAN SIGN"; 176 | (*0x3d*) Some 0x003d, Some "=", 177 | "EQUALS SIGN"; 178 | (*0x3e*) Some 0x003e, Some ">", 179 | "GREATER-THAN SIGN"; 180 | (*0x3f*) Some 0x003f, Some "?", 181 | "QUESTION MARK"; 182 | (*0x40*) Some 0x0040, Some "@", 183 | "COMMERCIAL AT"; 184 | (*0x41*) Some 0x0041, Some "A", 185 | "LATIN CAPITAL LETTER A"; 186 | (*0x42*) Some 0x0042, Some "B", 187 | "LATIN CAPITAL LETTER B"; 188 | (*0x43*) Some 0x0043, Some "C", 189 | "LATIN CAPITAL LETTER C"; 190 | (*0x44*) Some 0x0044, Some "D", 191 | "LATIN CAPITAL LETTER D"; 192 | (*0x45*) Some 0x0045, Some "E", 193 | "LATIN CAPITAL LETTER E"; 194 | (*0x46*) Some 0x0046, Some "F", 195 | "LATIN CAPITAL LETTER F"; 196 | (*0x47*) Some 0x0047, Some "G", 197 | "LATIN CAPITAL LETTER G"; 198 | (*0x48*) Some 0x0048, Some "H", 199 | "LATIN CAPITAL LETTER H"; 200 | (*0x49*) Some 0x0049, Some "I", 201 | "LATIN CAPITAL LETTER I"; 202 | (*0x4a*) Some 0x004a, Some "J", 203 | "LATIN CAPITAL LETTER J"; 204 | (*0x4b*) Some 0x004b, Some "K", 205 | "LATIN CAPITAL LETTER K"; 206 | (*0x4c*) Some 0x004c, Some "L", 207 | "LATIN CAPITAL LETTER L"; 208 | (*0x4d*) Some 0x004d, Some "M", 209 | "LATIN CAPITAL LETTER M"; 210 | (*0x4e*) Some 0x004e, Some "N", 211 | "LATIN CAPITAL LETTER N"; 212 | (*0x4f*) Some 0x004f, Some "O", 213 | "LATIN CAPITAL LETTER O"; 214 | (*0x50*) Some 0x0050, Some "P", 215 | "LATIN CAPITAL LETTER P"; 216 | (*0x51*) Some 0x0051, Some "Q", 217 | "LATIN CAPITAL LETTER Q"; 218 
| (*0x52*) Some 0x0052, Some "R", 219 | "LATIN CAPITAL LETTER R"; 220 | (*0x53*) Some 0x0053, Some "S", 221 | "LATIN CAPITAL LETTER S"; 222 | (*0x54*) Some 0x0054, Some "T", 223 | "LATIN CAPITAL LETTER T"; 224 | (*0x55*) Some 0x0055, Some "U", 225 | "LATIN CAPITAL LETTER U"; 226 | (*0x56*) Some 0x0056, Some "V", 227 | "LATIN CAPITAL LETTER V"; 228 | (*0x57*) Some 0x0057, Some "W", 229 | "LATIN CAPITAL LETTER W"; 230 | (*0x58*) Some 0x0058, Some "X", 231 | "LATIN CAPITAL LETTER X"; 232 | (*0x59*) Some 0x0059, Some "Y", 233 | "LATIN CAPITAL LETTER Y"; 234 | (*0x5a*) Some 0x005a, Some "Z", 235 | "LATIN CAPITAL LETTER Z"; 236 | (*0x5b*) Some 0x005b, Some "[", 237 | "LEFT SQUARE BRACKET"; 238 | (*0x5c*) Some 0x005c, Some "\\", 239 | "REVERSE SOLIDUS"; 240 | (*0x5d*) Some 0x005d, Some "]", 241 | "RIGHT SQUARE BRACKET"; 242 | (*0x5e*) Some 0x005e, Some "^", 243 | "CIRCUMFLEX ACCENT"; 244 | (*0x5f*) Some 0x005f, Some "_", 245 | "LOW LINE"; 246 | (*0x60*) Some 0x0060, Some "`", 247 | "GRAVE ACCENT"; 248 | (*0x61*) Some 0x0061, Some "a", 249 | "LATIN SMALL LETTER A"; 250 | (*0x62*) Some 0x0062, Some "b", 251 | "LATIN SMALL LETTER B"; 252 | (*0x63*) Some 0x0063, Some "c", 253 | "LATIN SMALL LETTER C"; 254 | (*0x64*) Some 0x0064, Some "d", 255 | "LATIN SMALL LETTER D"; 256 | (*0x65*) Some 0x0065, Some "e", 257 | "LATIN SMALL LETTER E"; 258 | (*0x66*) Some 0x0066, Some "f", 259 | "LATIN SMALL LETTER F"; 260 | (*0x67*) Some 0x0067, Some "g", 261 | "LATIN SMALL LETTER G"; 262 | (*0x68*) Some 0x0068, Some "h", 263 | "LATIN SMALL LETTER H"; 264 | (*0x69*) Some 0x0069, Some "i", 265 | "LATIN SMALL LETTER I"; 266 | (*0x6a*) Some 0x006a, Some "j", 267 | "LATIN SMALL LETTER J"; 268 | (*0x6b*) Some 0x006b, Some "k", 269 | "LATIN SMALL LETTER K"; 270 | (*0x6c*) Some 0x006c, Some "l", 271 | "LATIN SMALL LETTER L"; 272 | (*0x6d*) Some 0x006d, Some "m", 273 | "LATIN SMALL LETTER M"; 274 | (*0x6e*) Some 0x006e, Some "n", 275 | "LATIN SMALL LETTER N"; 276 | (*0x6f*) Some 0x006f, Some "o", 277 
| "LATIN SMALL LETTER O"; 278 | (*0x70*) Some 0x0070, Some "p", 279 | "LATIN SMALL LETTER P"; 280 | (*0x71*) Some 0x0071, Some "q", 281 | "LATIN SMALL LETTER Q"; 282 | (*0x72*) Some 0x0072, Some "r", 283 | "LATIN SMALL LETTER R"; 284 | (*0x73*) Some 0x0073, Some "s", 285 | "LATIN SMALL LETTER S"; 286 | (*0x74*) Some 0x0074, Some "t", 287 | "LATIN SMALL LETTER T"; 288 | (*0x75*) Some 0x0075, Some "u", 289 | "LATIN SMALL LETTER U"; 290 | (*0x76*) Some 0x0076, Some "v", 291 | "LATIN SMALL LETTER V"; 292 | (*0x77*) Some 0x0077, Some "w", 293 | "LATIN SMALL LETTER W"; 294 | (*0x78*) Some 0x0078, Some "x", 295 | "LATIN SMALL LETTER X"; 296 | (*0x79*) Some 0x0079, Some "y", 297 | "LATIN SMALL LETTER Y"; 298 | (*0x7a*) Some 0x007a, Some "z", 299 | "LATIN SMALL LETTER Z"; 300 | (*0x7b*) Some 0x007b, Some "{", 301 | "LEFT CURLY BRACKET"; 302 | (*0x7c*) Some 0x007c, Some "|", 303 | "VERTICAL LINE"; 304 | (*0x7d*) Some 0x007d, Some "}", 305 | "RIGHT CURLY BRACKET"; 306 | (*0x7e*) Some 0x007e, Some "~", 307 | "TILDE"; 308 | (*0x7f*) Some 0x007f, Some "\x7F", 309 | "DELETE"; 310 | (*0x80*) Some 0x20ac, Some "\xE2\x82\xAC", 311 | "EURO SIGN"; 312 | (*0x81*) None, None, 313 | "UNDEFINED"; 314 | (*0x82*) Some 0x201a, Some "\xE2\x80\x9A", 315 | "SINGLE LOW-9 QUOTATION MARK"; 316 | (*0x83*) Some 0x0192, Some "\xC6\x92", 317 | "LATIN SMALL LETTER F WITH HOOK"; 318 | (*0x84*) Some 0x201e, Some "\xE2\x80\x9E", 319 | "DOUBLE LOW-9 QUOTATION MARK"; 320 | (*0x85*) Some 0x2026, Some "\xE2\x80\xA6", 321 | "HORIZONTAL ELLIPSIS"; 322 | (*0x86*) Some 0x2020, Some "\xE2\x80\xA0", 323 | "DAGGER"; 324 | (*0x87*) Some 0x2021, Some "\xE2\x80\xA1", 325 | "DOUBLE DAGGER"; 326 | (*0x88*) Some 0x02c6, Some "\xCB\x86", 327 | "MODIFIER LETTER CIRCUMFLEX ACCENT"; 328 | (*0x89*) Some 0x2030, Some "\xE2\x80\xB0", 329 | "PER MILLE SIGN"; 330 | (*0x8a*) Some 0x0160, Some "\xC5\xA0", 331 | "LATIN CAPITAL LETTER S WITH CARON"; 332 | (*0x8b*) Some 0x2039, Some "\xE2\x80\xB9", 333 | "SINGLE LEFT-POINTING ANGLE 
QUOTATION MARK"; 334 | (*0x8c*) Some 0x0152, Some "\xC5\x92", 335 | "LATIN CAPITAL LIGATURE OE"; 336 | (*0x8d*) None, None, 337 | "UNDEFINED"; 338 | (*0x8e*) Some 0x017d, Some "\xC5\xBD", 339 | "LATIN CAPITAL LETTER Z WITH CARON"; 340 | (*0x8f*) None, None, 341 | "UNDEFINED"; 342 | (*0x90*) None, None, 343 | "UNDEFINED"; 344 | (*0x91*) Some 0x2018, Some "\xE2\x80\x98", 345 | "LEFT SINGLE QUOTATION MARK"; 346 | (*0x92*) Some 0x2019, Some "\xE2\x80\x99", 347 | "RIGHT SINGLE QUOTATION MARK"; 348 | (*0x93*) Some 0x201c, Some "\xE2\x80\x9C", 349 | "LEFT DOUBLE QUOTATION MARK"; 350 | (*0x94*) Some 0x201d, Some "\xE2\x80\x9D", 351 | "RIGHT DOUBLE QUOTATION MARK"; 352 | (*0x95*) Some 0x2022, Some "\xE2\x80\xA2", 353 | "BULLET"; 354 | (*0x96*) Some 0x2013, Some "\xE2\x80\x93", 355 | "EN DASH"; 356 | (*0x97*) Some 0x2014, Some "\xE2\x80\x94", 357 | "EM DASH"; 358 | (*0x98*) Some 0x02dc, Some "\xCB\x9C", 359 | "SMALL TILDE"; 360 | (*0x99*) Some 0x2122, Some "\xE2\x84\xA2", 361 | "TRADE MARK SIGN"; 362 | (*0x9a*) Some 0x0161, Some "\xC5\xA1", 363 | "LATIN SMALL LETTER S WITH CARON"; 364 | (*0x9b*) Some 0x203a, Some "\xE2\x80\xBA", 365 | "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK"; 366 | (*0x9c*) Some 0x0153, Some "\xC5\x93", 367 | "LATIN SMALL LIGATURE OE"; 368 | (*0x9d*) None, None, 369 | "UNDEFINED"; 370 | (*0x9e*) Some 0x017e, Some "\xC5\xBE", 371 | "LATIN SMALL LETTER Z WITH CARON"; 372 | (*0x9f*) Some 0x0178, Some "\xC5\xB8", 373 | "LATIN CAPITAL LETTER Y WITH DIAERESIS"; 374 | (*0xa0*) Some 0x00a0, Some "\xC2\xA0", 375 | "NO-BREAK SPACE"; 376 | (*0xa1*) Some 0x00a1, Some "\xC2\xA1", 377 | "INVERTED EXCLAMATION MARK"; 378 | (*0xa2*) Some 0x00a2, Some "\xC2\xA2", 379 | "CENT SIGN"; 380 | (*0xa3*) Some 0x00a3, Some "\xC2\xA3", 381 | "POUND SIGN"; 382 | (*0xa4*) Some 0x00a4, Some "\xC2\xA4", 383 | "CURRENCY SIGN"; 384 | (*0xa5*) Some 0x00a5, Some "\xC2\xA5", 385 | "YEN SIGN"; 386 | (*0xa6*) Some 0x00a6, Some "\xC2\xA6", 387 | "BROKEN BAR"; 388 | (*0xa7*) Some 0x00a7, Some 
"\xC2\xA7", 389 | "SECTION SIGN"; 390 | (*0xa8*) Some 0x00a8, Some "\xC2\xA8", 391 | "DIAERESIS"; 392 | (*0xa9*) Some 0x00a9, Some "\xC2\xA9", 393 | "COPYRIGHT SIGN"; 394 | (*0xaa*) Some 0x00aa, Some "\xC2\xAA", 395 | "FEMININE ORDINAL INDICATOR"; 396 | (*0xab*) Some 0x00ab, Some "\xC2\xAB", 397 | "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK"; 398 | (*0xac*) Some 0x00ac, Some "\xC2\xAC", 399 | "NOT SIGN"; 400 | (*0xad*) Some 0x00ad, Some "\xC2\xAD", 401 | "SOFT HYPHEN"; 402 | (*0xae*) Some 0x00ae, Some "\xC2\xAE", 403 | "REGISTERED SIGN"; 404 | (*0xaf*) Some 0x00af, Some "\xC2\xAF", 405 | "MACRON"; 406 | (*0xb0*) Some 0x00b0, Some "\xC2\xB0", 407 | "DEGREE SIGN"; 408 | (*0xb1*) Some 0x00b1, Some "\xC2\xB1", 409 | "PLUS-MINUS SIGN"; 410 | (*0xb2*) Some 0x00b2, Some "\xC2\xB2", 411 | "SUPERSCRIPT TWO"; 412 | (*0xb3*) Some 0x00b3, Some "\xC2\xB3", 413 | "SUPERSCRIPT THREE"; 414 | (*0xb4*) Some 0x00b4, Some "\xC2\xB4", 415 | "ACUTE ACCENT"; 416 | (*0xb5*) Some 0x00b5, Some "\xC2\xB5", 417 | "MICRO SIGN"; 418 | (*0xb6*) Some 0x00b6, Some "\xC2\xB6", 419 | "PILCROW SIGN"; 420 | (*0xb7*) Some 0x00b7, Some "\xC2\xB7", 421 | "MIDDLE DOT"; 422 | (*0xb8*) Some 0x00b8, Some "\xC2\xB8", 423 | "CEDILLA"; 424 | (*0xb9*) Some 0x00b9, Some "\xC2\xB9", 425 | "SUPERSCRIPT ONE"; 426 | (*0xba*) Some 0x00ba, Some "\xC2\xBA", 427 | "MASCULINE ORDINAL INDICATOR"; 428 | (*0xbb*) Some 0x00bb, Some "\xC2\xBB", 429 | "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK"; 430 | (*0xbc*) Some 0x00bc, Some "\xC2\xBC", 431 | "VULGAR FRACTION ONE QUARTER"; 432 | (*0xbd*) Some 0x00bd, Some "\xC2\xBD", 433 | "VULGAR FRACTION ONE HALF"; 434 | (*0xbe*) Some 0x00be, Some "\xC2\xBE", 435 | "VULGAR FRACTION THREE QUARTERS"; 436 | (*0xbf*) Some 0x00bf, Some "\xC2\xBF", 437 | "INVERTED QUESTION MARK"; 438 | (*0xc0*) Some 0x00c0, Some "\xC3\x80", 439 | "LATIN CAPITAL LETTER A WITH GRAVE"; 440 | (*0xc1*) Some 0x00c1, Some "\xC3\x81", 441 | "LATIN CAPITAL LETTER A WITH ACUTE"; 442 | (*0xc2*) Some 0x00c2, Some 
"\xC3\x82", 443 | "LATIN CAPITAL LETTER A WITH CIRCUMFLEX"; 444 | (*0xc3*) Some 0x00c3, Some "\xC3\x83", 445 | "LATIN CAPITAL LETTER A WITH TILDE"; 446 | (*0xc4*) Some 0x00c4, Some "\xC3\x84", 447 | "LATIN CAPITAL LETTER A WITH DIAERESIS"; 448 | (*0xc5*) Some 0x00c5, Some "\xC3\x85", 449 | "LATIN CAPITAL LETTER A WITH RING ABOVE"; 450 | (*0xc6*) Some 0x00c6, Some "\xC3\x86", 451 | "LATIN CAPITAL LETTER AE"; 452 | (*0xc7*) Some 0x00c7, Some "\xC3\x87", 453 | "LATIN CAPITAL LETTER C WITH CEDILLA"; 454 | (*0xc8*) Some 0x00c8, Some "\xC3\x88", 455 | "LATIN CAPITAL LETTER E WITH GRAVE"; 456 | (*0xc9*) Some 0x00c9, Some "\xC3\x89", 457 | "LATIN CAPITAL LETTER E WITH ACUTE"; 458 | (*0xca*) Some 0x00ca, Some "\xC3\x8A", 459 | "LATIN CAPITAL LETTER E WITH CIRCUMFLEX"; 460 | (*0xcb*) Some 0x00cb, Some "\xC3\x8B", 461 | "LATIN CAPITAL LETTER E WITH DIAERESIS"; 462 | (*0xcc*) Some 0x00cc, Some "\xC3\x8C", 463 | "LATIN CAPITAL LETTER I WITH GRAVE"; 464 | (*0xcd*) Some 0x00cd, Some "\xC3\x8D", 465 | "LATIN CAPITAL LETTER I WITH ACUTE"; 466 | (*0xce*) Some 0x00ce, Some "\xC3\x8E", 467 | "LATIN CAPITAL LETTER I WITH CIRCUMFLEX"; 468 | (*0xcf*) Some 0x00cf, Some "\xC3\x8F", 469 | "LATIN CAPITAL LETTER I WITH DIAERESIS"; 470 | (*0xd0*) Some 0x00d0, Some "\xC3\x90", 471 | "LATIN CAPITAL LETTER ETH"; 472 | (*0xd1*) Some 0x00d1, Some "\xC3\x91", 473 | "LATIN CAPITAL LETTER N WITH TILDE"; 474 | (*0xd2*) Some 0x00d2, Some "\xC3\x92", 475 | "LATIN CAPITAL LETTER O WITH GRAVE"; 476 | (*0xd3*) Some 0x00d3, Some "\xC3\x93", 477 | "LATIN CAPITAL LETTER O WITH ACUTE"; 478 | (*0xd4*) Some 0x00d4, Some "\xC3\x94", 479 | "LATIN CAPITAL LETTER O WITH CIRCUMFLEX"; 480 | (*0xd5*) Some 0x00d5, Some "\xC3\x95", 481 | "LATIN CAPITAL LETTER O WITH TILDE"; 482 | (*0xd6*) Some 0x00d6, Some "\xC3\x96", 483 | "LATIN CAPITAL LETTER O WITH DIAERESIS"; 484 | (*0xd7*) Some 0x00d7, Some "\xC3\x97", 485 | "MULTIPLICATION SIGN"; 486 | (*0xd8*) Some 0x00d8, Some "\xC3\x98", 487 | "LATIN CAPITAL LETTER O WITH 
STROKE"; 488 | (*0xd9*) Some 0x00d9, Some "\xC3\x99", 489 | "LATIN CAPITAL LETTER U WITH GRAVE"; 490 | (*0xda*) Some 0x00da, Some "\xC3\x9A", 491 | "LATIN CAPITAL LETTER U WITH ACUTE"; 492 | (*0xdb*) Some 0x00db, Some "\xC3\x9B", 493 | "LATIN CAPITAL LETTER U WITH CIRCUMFLEX"; 494 | (*0xdc*) Some 0x00dc, Some "\xC3\x9C", 495 | "LATIN CAPITAL LETTER U WITH DIAERESIS"; 496 | (*0xdd*) Some 0x00dd, Some "\xC3\x9D", 497 | "LATIN CAPITAL LETTER Y WITH ACUTE"; 498 | (*0xde*) Some 0x00de, Some "\xC3\x9E", 499 | "LATIN CAPITAL LETTER THORN"; 500 | (*0xdf*) Some 0x00df, Some "\xC3\x9F", 501 | "LATIN SMALL LETTER SHARP S"; 502 | (*0xe0*) Some 0x00e0, Some "\xC3\xA0", 503 | "LATIN SMALL LETTER A WITH GRAVE"; 504 | (*0xe1*) Some 0x00e1, Some "\xC3\xA1", 505 | "LATIN SMALL LETTER A WITH ACUTE"; 506 | (*0xe2*) Some 0x00e2, Some "\xC3\xA2", 507 | "LATIN SMALL LETTER A WITH CIRCUMFLEX"; 508 | (*0xe3*) Some 0x00e3, Some "\xC3\xA3", 509 | "LATIN SMALL LETTER A WITH TILDE"; 510 | (*0xe4*) Some 0x00e4, Some "\xC3\xA4", 511 | "LATIN SMALL LETTER A WITH DIAERESIS"; 512 | (*0xe5*) Some 0x00e5, Some "\xC3\xA5", 513 | "LATIN SMALL LETTER A WITH RING ABOVE"; 514 | (*0xe6*) Some 0x00e6, Some "\xC3\xA6", 515 | "LATIN SMALL LETTER AE"; 516 | (*0xe7*) Some 0x00e7, Some "\xC3\xA7", 517 | "LATIN SMALL LETTER C WITH CEDILLA"; 518 | (*0xe8*) Some 0x00e8, Some "\xC3\xA8", 519 | "LATIN SMALL LETTER E WITH GRAVE"; 520 | (*0xe9*) Some 0x00e9, Some "\xC3\xA9", 521 | "LATIN SMALL LETTER E WITH ACUTE"; 522 | (*0xea*) Some 0x00ea, Some "\xC3\xAA", 523 | "LATIN SMALL LETTER E WITH CIRCUMFLEX"; 524 | (*0xeb*) Some 0x00eb, Some "\xC3\xAB", 525 | "LATIN SMALL LETTER E WITH DIAERESIS"; 526 | (*0xec*) Some 0x00ec, Some "\xC3\xAC", 527 | "LATIN SMALL LETTER I WITH GRAVE"; 528 | (*0xed*) Some 0x00ed, Some "\xC3\xAD", 529 | "LATIN SMALL LETTER I WITH ACUTE"; 530 | (*0xee*) Some 0x00ee, Some "\xC3\xAE", 531 | "LATIN SMALL LETTER I WITH CIRCUMFLEX"; 532 | (*0xef*) Some 0x00ef, Some "\xC3\xAF", 533 | "LATIN SMALL 
LETTER I WITH DIAERESIS"; 534 | (*0xf0*) Some 0x00f0, Some "\xC3\xB0", 535 | "LATIN SMALL LETTER ETH"; 536 | (*0xf1*) Some 0x00f1, Some "\xC3\xB1", 537 | "LATIN SMALL LETTER N WITH TILDE"; 538 | (*0xf2*) Some 0x00f2, Some "\xC3\xB2", 539 | "LATIN SMALL LETTER O WITH GRAVE"; 540 | (*0xf3*) Some 0x00f3, Some "\xC3\xB3", 541 | "LATIN SMALL LETTER O WITH ACUTE"; 542 | (*0xf4*) Some 0x00f4, Some "\xC3\xB4", 543 | "LATIN SMALL LETTER O WITH CIRCUMFLEX"; 544 | (*0xf5*) Some 0x00f5, Some "\xC3\xB5", 545 | "LATIN SMALL LETTER O WITH TILDE"; 546 | (*0xf6*) Some 0x00f6, Some "\xC3\xB6", 547 | "LATIN SMALL LETTER O WITH DIAERESIS"; 548 | (*0xf7*) Some 0x00f7, Some "\xC3\xB7", 549 | "DIVISION SIGN"; 550 | (*0xf8*) Some 0x00f8, Some "\xC3\xB8", 551 | "LATIN SMALL LETTER O WITH STROKE"; 552 | (*0xf9*) Some 0x00f9, Some "\xC3\xB9", 553 | "LATIN SMALL LETTER U WITH GRAVE"; 554 | (*0xfa*) Some 0x00fa, Some "\xC3\xBA", 555 | "LATIN SMALL LETTER U WITH ACUTE"; 556 | (*0xfb*) Some 0x00fb, Some "\xC3\xBB", 557 | "LATIN SMALL LETTER U WITH CIRCUMFLEX"; 558 | (*0xfc*) Some 0x00fc, Some "\xC3\xBC", 559 | "LATIN SMALL LETTER U WITH DIAERESIS"; 560 | (*0xfd*) Some 0x00fd, Some "\xC3\xBD", 561 | "LATIN SMALL LETTER Y WITH ACUTE"; 562 | (*0xfe*) Some 0x00fe, Some "\xC3\xBE", 563 | "LATIN SMALL LETTER THORN"; 564 | (*0xff*) Some 0x00ff, Some "\xC3\xBF", 565 | "LATIN SMALL LETTER Y WITH DIAERESIS"; 566 | |]

(* [is_ascii ?pos ?len s] is [true] when every byte of the selected
   substring of [s] has a code below 0x80, and [false] as soon as one
   byte with code 0x80 or above is met.

   [pos] defaults to 0.  The effective length is obtained from [get_len],
   which is defined earlier in this file (presumably it resolves the
   optional [len] and validates the range -- confirm there). *)
let is_ascii ?(pos = 0) ?len s =
  let len = get_len pos len s in
  let stop = pos + len in
  (* Left-to-right scan that short-circuits on the first high byte. *)
  let rec scan i =
    i >= stop || (Char.code s.[i] < 0x80 && scan (i + 1))
  in
  scan pos

(* [is_iso88591 ?pos ?len s] is [true] when no byte of the selected
   substring of [s] lies in the range 0x80..0x9f, and [false] otherwise.
   Every other byte value is accepted.

   [pos] defaults to 0; the effective length comes from [get_len]. *)
let is_iso88591 ?(pos = 0) ?len s =
  let len = get_len pos len s in
  let stop = pos + len in
  let rec scan i =
    i >= stop
    || (let code = Char.code s.[i] in
        (* Reject exactly the codes 0x80..0x9f. *)
        (code < 0x80 || code > 0x9f) && scan (i + 1))
  in
  scan pos

(* [is_windows1252 ?pos ?len s] is [true] when the selected substring of
   [s] contains none of the bytes 0x81, 0x8d, 0x8f, 0x90 and 0x9d, and
   [false] otherwise.  The visible part of the [windows1252] table marks
   0x8d, 0x8f, 0x90 and 0x9d as "UNDEFINED"; 0x81 is presumably marked the
   same way in the part of the table not shown here.

   [pos] defaults to 0; the effective length comes from [get_len]. *)
let is_windows1252 ?(pos = 0) ?len s =
  let len = get_len pos len s in
  let stop = pos + len in
  let rec scan i =
    i >= stop
    || (match Char.code s.[i] with
        | 0x81 | 0x8d | 0x8f | 0x90 | 0x9d -> false
        | _ -> scan (i + 1))
  in
  scan pos

(* [utf8_of_windows1252 ?pos ?len ?undefined s] builds a new string from
   the selected substring of [s]:

   - bytes with code below 0x80 are copied unchanged;
   - any other byte is replaced by the UTF-8 string stored in the second
     component of its [windows1252] table entry (the table is indexed by
     byte code);
   - when that component is [None], the byte is replaced by the result of
     [undefined c].

   [pos] defaults to 0; the effective length comes from [get_len].
   The default [undefined] raises [Failure] with a message giving the
   offending code in hexadecimal. *)
let utf8_of_windows1252
    ?(pos = 0)
    ?len
    ?(undefined =
      fun c ->
        failwith
          (sprintf "Utf8conv.utf8_of_windows1252: \
                    undefined character code 0x%02x" (Char.code c))
    )
    s =

  let len = get_len pos len s in
  let stop = pos + len in
  (* Initial capacity of twice the input length, as a starting guess. *)
  let out = Buffer.create (2 * len) in
  (* Copy the untouched ASCII run [first, past) into the output, if any. *)
  let emit_run first past =
    if past > first then
      Buffer.add_substring out s first (past - first)
  in
  (* [run_start] is the index of the first byte not yet written out;
     everything in [run_start, i) is plain ASCII waiting to be copied in
     one go. *)
  let rec convert run_start i =
    if i >= stop then
      emit_run run_start stop
    else
      let c = s.[i] in
      let code = Char.code c in
      if code < 0x80 then
        convert run_start (i + 1)
      else begin
        (* Write the pending ASCII first so output order follows input. *)
        emit_run run_start i;
        let encoded =
          match windows1252.(code) with
          | _, Some utf8, _ -> utf8
          | _, None, _ -> undefined c
        in
        Buffer.add_string out encoded;
        (* The byte at [i] is consumed; the next run starts right after. *)
        convert (i + 1) (i + 1)
      end
  in
  convert pos pos;
  Buffer.contents out
--------------------------------------------------------------------------------