├── .eslintrc ├── .npmignore ├── AUTHORS ├── .gitignore ├── .travis.yml ├── appveyor.yml ├── package.json ├── LICENSE.txt ├── test ├── performance.test.js └── charset.test.js ├── History.md ├── index.js └── README.md /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "eslint-config-egg" 3 | } 4 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | test/ 2 | coverage.html 3 | lib-cov/ 4 | Makefile 5 | .travis.yml 6 | logo.png 7 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # Ordered by date of first contribution. 2 | 3 | fengmk2 (https://github.com/fengmk2) 4 | Oleg Slobodskoi (https://github.com/kof) 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | coverage 2 | *.seed 3 | *.log 4 | *.csv 5 | *.dat 6 | *.out 7 | *.pid 8 | *.gz 9 | 10 | pids 11 | logs 12 | results 13 | 14 | node_modules 15 | npm-debug.log 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: node_js 3 | node_js: 4 | - '4' 5 | - '6' 6 | - '8' 7 | install: 8 | - npm i npminstall && npminstall 9 | script: 10 | - npm run ci 11 | after_script: 12 | - npminstall codecov && codecov 13 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | - nodejs_version: '4' 4 | - nodejs_version: '6' 5 | - nodejs_version: '8' 6 | 7 | install: 8 | - ps: Install-Product node $env:nodejs_version 9 | - npm i npminstall && node_modules\.bin\npminstall 10 | 11 | test_script: 12 | - node --version 13 | - npm --version 14 | - npm run test 15 | 16 | build: off 17 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "charset", 3 | "version": "1.0.1", 4 | "description": "Get the content charset from header and html content-type.", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "npm run lint && egg-bin test", 8 | "ci": "npm run lint && egg-bin cov", 9 | "lint": "eslint test *.js" 10 | }, 11 | "dependencies": {}, 12 | "devDependencies": { 13 | "egg-bin": "1", 14 | "egg-ci": "^1.1.0", 15 | "eslint": "4", 16 | "eslint-config-egg": "5" 17 | }, 18 | "homepage": "https://github.com/node-modules/charset", 19 | "repository": { 20 | "type": "git", 21 | "url": "git://github.com/node-modules/charset.git" 22 | }, 23 | "keywords": [ 24 | "charset", 25 | "content-type", 26 | "ContentType", 27 | "Content-Type", 28 | "xml", 29 | "encoding" 30 | ], 31 | "engines": { 32 | "node": ">=4.0.0" 33 | }, 34 | "ci": { 35 | "version": "4, 6, 8" 36 | }, 37 | "author": "fengmk2 (https://fengmk2.com)", 38 | "license": "MIT" 39 | } -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | This software is licensed under the MIT License. 2 | 3 | Copyright (C) 2012 - 2015 fengmk2 4 | Copyright (c) node-modules and other contributors 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /test/performance.test.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const assert = require('assert'); 4 | const charset = require('..'); 5 | 6 | describe('performance.test.js', () => { 7 | function genstr(len, chr) { 8 | let result = ''; 9 | for (let i = 0; i < len; i++) { 10 | result += chr; 11 | } 12 | return result; 13 | } 14 | const longspace = genstr(800000, ' '); 15 | const longa = genstr(800000, 'a'); 16 | 17 | it('should run fast', () => { 18 | assert(charset('encoding=' + longspace + '"utf8') === null); 19 | }); 20 | 21 | it('should ignore space > 10', () => { 22 | assert(charset('encoding=' + genstr(0, ' ') + '"utf8') === 'utf8'); 23 | assert(charset('encoding=' + genstr(9, ' ') + '"utf8') === 'utf8'); 24 | assert(charset('encoding=' + genstr(10, ' ') + '"utf8') === 'utf8'); 25 | assert(charset('encoding=' + genstr(11, ' ') + '"utf8') === null); 26 | }); 27 | 28 | it('should ignore charset length > 100', () => { 29 | assert(charset('encoding=' + genstr(11, 'a')) === genstr(11, 'a')); 30 | assert(charset('encoding=' + genstr(99, 'a')) === genstr(99, 'a')); 31 | assert(charset('encoding=' + genstr(100, 'a')) === genstr(100, 'a')); 32 | assert(charset('encoding=' + genstr(101, 'a')) === genstr(100, 'a')); 33 | assert(charset('encoding=' + longa) === genstr(100, 'a')); 34 | }); 35 | }); 36 | -------------------------------------------------------------------------------- /History.md: -------------------------------------------------------------------------------- 1 | 2 | 1.0.1 / 2017-09-07 3 | ================== 4 | 5 | **fixes** 6 | * [[`effda0c`](http://github.com/node-modules/charset/commit/effda0c48c51b47a47f4cad7db0c51ee7407cc1b)] - fix: limit match string (#11) (fengmk2 <>) 7 | 8 | **others** 9 | * [[`5ba8a49`](http://github.com/node-modules/charset/commit/5ba8a4942f069d40584cf5a1a7938ff8dc92bcc9)] - test: use npm scripts instead of Makefile (#9) (fengmk2 <>) 10 | * [[`4787184`](http://github.com/node-modules/charset/commit/47871846e3e738c8cca18c5021e6784c0777ef8f)] - fix example with jschardet (Xu Jingxin <>) 11 | * [[`c2f94ef`](http://github.com/node-modules/charset/commit/c2f94ef9cfbaef2e0ff546b4bffcccc1e0beac52)] - add combine example with jschardet. (fengmk2 <>) 12 | 13 | 1.0.0 / 2014-09-17 14 | ================== 15 | 16 | * add peek size, default is 512. fixed #4 17 | 18 | 0.1.0 / 2014-07-05 19 | ================== 20 | 21 | * support `charset(content-type-string)` 22 | * update AUTHORS with new version of contributors 23 | 24 | 0.0.2 / 2014-01-19 25 | ================== 26 | 27 | * add contributors 28 | * #1 #2 read charset from encoding="utf8" for xml, handle spaces between = and inside of utf8 (@kof) 29 | 30 | 0.0.1 / 2012-10-08 31 | ================== 32 | 33 | * first commit for charset. 34 | * Initial commit 35 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const CHARTSET_RE = /(?:charset|encoding)\s{0,10}=\s{0,10}['"]? {0,10}([\w\-]{1,100})/i; 4 | 5 | module.exports = charset; 6 | 7 | /** 8 | * guest data charset from req.headers, xml, html content-type meta tag 9 | * headers: 10 | * 'content-type': 'text/html;charset=gbk' 11 | * meta tag: 12 | * 13 | * xml file: 14 | * 15 | * 16 | * @param {Object} obj `Content-Type` String, or `res.headers`, or `res` Object 17 | * @param {Buffer} [data] content buffer 18 | * @param {Number} [peekSize] max content peek size, default is 512 19 | * @return {String} charset, lower case, e.g.: utf8, gbk, gb2312, .... 20 | * If can\'t guest, return null 21 | * @api public 22 | */ 23 | function charset(obj, data, peekSize) { 24 | let matchs = null; 25 | let end = 0; 26 | if (data) { 27 | peekSize = peekSize || 512; 28 | // https://github.com/node-modules/charset/issues/4 29 | end = data.length > peekSize ? peekSize : data.length; 30 | } 31 | // charset('text/html;charset=gbk') 32 | let contentType = obj; 33 | if (contentType && typeof contentType !== 'string') { 34 | // charset(res.headers) 35 | let headers = obj; 36 | if (obj.headers) { 37 | // charset(res) 38 | headers = obj.headers; 39 | } 40 | contentType = headers['content-type'] || headers['Content-Type']; 41 | } 42 | if (contentType) { 43 | // guest from obj first 44 | matchs = CHARTSET_RE.exec(contentType); 45 | } 46 | if (!matchs && end > 0) { 47 | // guest from content body (html/xml) header 48 | contentType = data.slice(0, end).toString(); 49 | matchs = CHARTSET_RE.exec(contentType); 50 | } 51 | let cs = null; 52 | if (matchs) { 53 | cs = matchs[1].toLowerCase(); 54 | if (cs === 'utf-8') { 55 | cs = 'utf8'; 56 | } 57 | } 58 | return cs; 59 | } 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | charset 2 | ======= 3 | 4 | [![NPM version][npm-image]][npm-url] 5 | [![build status][travis-image]][travis-url] 6 | [![Test coverage][codecov-image]][codecov-url] 7 | [![David deps][david-image]][david-url] 8 | [![npm download][download-image]][download-url] 9 | 10 | [npm-image]: https://img.shields.io/npm/v/charset.svg?style=flat-square 11 | [npm-url]: https://npmjs.org/package/charset 12 | [travis-image]: https://img.shields.io/travis/node-modules/charset.svg?style=flat-square 13 | [travis-url]: https://travis-ci.org/node-modules/charset 14 | [codecov-image]: https://codecov.io/gh/node-modules/charset/branch/master/graph/badge.svg 15 | [codecov-url]: https://codecov.io/gh/node-modules/charset 16 | [david-image]: https://img.shields.io/david/node-modules/charset.svg?style=flat-square 17 | [david-url]: https://david-dm.org/node-modules/charset 18 | [download-image]: https://img.shields.io/npm/dm/charset.svg?style=flat-square 19 | [download-url]: https://npmjs.org/package/charset 20 | 21 | ![logo](https://raw.github.com/node-modules/charset/master/logo.png) 22 | 23 | Get the content charset from header and html content-type. 24 | 25 | ## Install 26 | 27 | ```bash 28 | $ npm install charset --save 29 | ``` 30 | 31 | ## Usage 32 | 33 | ### Detect charset from http client response and content 34 | 35 | ```js 36 | var charset = require('charset'); 37 | var http = require('http'); 38 | 39 | http.get('http://nodejs.org', function (res) { 40 | res.on('data', function (chunk) { 41 | console.log(charset(res.headers, chunk)); 42 | // or `console.log(charset(res, chunk));` 43 | res.destroy(); 44 | }); 45 | }); 46 | ``` 47 | 48 | Stdout will should log: `utf8` . 49 | 50 | ### Detect from String 51 | 52 | ```js 53 | charset(res.headers['content-type']); 54 | ``` 55 | 56 | ### Detect combine with [jschardet] 57 | 58 | As you know, `charset` only detect from http response headers and html content-type meta tag. 59 | You can combine with [jschardet] to help you detect the finally charset. 60 | 61 | This example codes come from [stackoverflow#12326688](http://stackoverflow.com/a/18712021/2496088): 62 | 63 | ```js 64 | var request = require('request'); 65 | var charset = require('charset'); 66 | var jschardet = require('jschardet'); 67 | 68 | request({ 69 | url: 'http://www.example.com', 70 | encoding: null 71 | }, function (err, res, body) { 72 | if (err) { 73 | throw err; 74 | } 75 | enc = charset(res.headers, body); 76 | enc = enc || jschardet.detect(body).encoding.toLowerCase(); 77 | console.log(enc); 78 | }); 79 | ``` 80 | 81 | ## License 82 | 83 | [MIT](LICENSE.txt) 84 | 85 | [jschardet]: https://github.com/aadsm/jschardet 86 | -------------------------------------------------------------------------------- /test/charset.test.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const assert = require('assert'); 4 | const fs = require('fs'); 5 | const charset = require('../'); 6 | 7 | const testContent = fs.readFileSync(__dirname + '/test.txt'); 8 | const testContent2 = fs.readFileSync(__dirname + '/test2.txt'); 9 | 10 | describe('charset.test.js', function() { 11 | it('should get charset from headers', function() { 12 | assert(charset({ 13 | 'content-type': 'text/html;charset=gBk', 14 | }, new Buffer('')) === 'gbk'); 15 | assert(charset({ 16 | 'content-type': 'text/html;charset=UTF8', 17 | }, new Buffer('')) === 'utf8'); 18 | assert(charset({ 19 | 'content-type': 'text/html;charset=UTF-8', 20 | }, new Buffer('')) === 'utf8'); 21 | assert(charset({ 22 | 'content-type': 'text/html;charset=gb2312', 23 | }, new Buffer('')) === 'gb2312'); 24 | assert(charset({ 25 | 'Content-Type': 'text/html;Charset=UTF-8', 26 | }) === 'utf8'); 27 | }); 28 | 29 | it('should get charset from res', function() { 30 | const res = { 31 | headers: { 32 | 'content-type': 'text/html;charset=gb2312', 33 | }, 34 | }; 35 | assert(charset(res) === 'gb2312'); 36 | }); 37 | 38 | it('should get charset from Content-Type string', function() { 39 | assert(charset('text/html;charset=gb2312') === 'gb2312'); 40 | }); 41 | 42 | it('should get charset from body', function() { 43 | assert(charset({}, new Buffer('')) === 'gbk'); 44 | assert(charset({}, new Buffer('')) === 'utf8'); 45 | assert(charset({}, testContent) === 'utf8'); 46 | // work for string body 47 | assert(charset(null, testContent.toString()) === 'utf8'); 48 | }); 49 | 50 | it('should get charset from xml header', function() { 51 | assert(charset({}, new Buffer('')) === 'utf8'); 52 | }); 53 | 54 | it('should get charset with white space chars around "="', function() { 55 | assert(charset({}, new Buffer('')) === 'utf8'); 56 | assert(charset({}, new Buffer('').toString()) === 'utf8'); 57 | }); 58 | 59 | it('should get charset with white space chars around charset', function() { 60 | assert(charset({}, new Buffer('')) === 'utf8'); 61 | }); 62 | 63 | it('should get null when charset not word, number and -', function() { 64 | assert(!charset({ 65 | 'content-type': 'text/html;charset=中文编码', 66 | }, new Buffer(''))); 67 | assert(!charset({ 68 | 'content-type': 'text/html;charset=|||', 69 | }, new Buffer(''))); 70 | }); 71 | 72 | it('should get null when charset not in top 500 bytes data', function() { 73 | assert(!charset({}, testContent2)); 74 | }); 75 | }); 76 | --------------------------------------------------------------------------------