├── Gemfile
├── lib
    ├── charlock_holmes
    │   ├── version.rb
    │   ├── string.rb
    │   └── encoding_detector.rb
    └── charlock_holmes.rb
├── test
    ├── fixtures
    │   ├── foo.pdf
    │   ├── octocat.ai
    │   ├── hello_world
    │   ├── octocat.gif
    │   ├── octocat.jpg
    │   ├── octocat.png
    │   ├── octocat.psd
    │   ├── utf16be.html
    │   ├── utf32be.html
    │   ├── utf32le.html
    │   ├── AnsiGraph.psm1
    │   ├── TwigExtensionsDate.es.yml
    │   ├── sierpinski.ps
    │   ├── ISO-2022-KR.txt
    │   ├── repl2.cljs
    │   ├── laholator.py
    │   ├── cl-messagepack.lisp
    │   ├── core.rkt
    │   ├── utf8.html
    │   └── vimrc
    ├── helper.rb
    ├── converter_test.rb
    ├── string_methods_test.rb
    ├── transliterator_test.rb
    └── encoding_detector_test.rb
├── .gitignore
├── .travis.yml
├── Rakefile
├── ext
    └── charlock_holmes
    │   ├── ext.c
    │   ├── common.h
    │   ├── converter.c
    │   ├── extconf.rb
    │   ├── transliterator.cpp
    │   └── encoding_detector.c
├── charlock_holmes.gemspec
├── benchmark
    ├── detection.rb
    └── test.txt
├── LICENSE
└── README.md


/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 | 
3 | gemspec
4 | 


--------------------------------------------------------------------------------
/lib/charlock_holmes/version.rb:
--------------------------------------------------------------------------------
1 | module CharlockHolmes
2 |   VERSION = "0.7.5"
3 | end
4 | 


--------------------------------------------------------------------------------
/test/fixtures/foo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/foo.pdf


--------------------------------------------------------------------------------
/test/fixtures/octocat.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/octocat.ai


--------------------------------------------------------------------------------
/test/fixtures/hello_world:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/hello_world


--------------------------------------------------------------------------------
/test/fixtures/octocat.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/octocat.gif


--------------------------------------------------------------------------------
/test/fixtures/octocat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/octocat.jpg


--------------------------------------------------------------------------------
/test/fixtures/octocat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/octocat.png


--------------------------------------------------------------------------------
/test/fixtures/octocat.psd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/octocat.psd


--------------------------------------------------------------------------------
/test/fixtures/utf16be.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/utf16be.html


--------------------------------------------------------------------------------
/test/fixtures/utf32be.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/utf32be.html


--------------------------------------------------------------------------------
/test/fixtures/utf32le.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/utf32le.html


--------------------------------------------------------------------------------
/test/fixtures/AnsiGraph.psm1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mistydemeo/charlock_holmes/master/test/fixtures/AnsiGraph.psm1


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /Gemfile.lock
2 | .bundle/
3 | tmp/
4 | vendor/
5 | *.bundle
6 | ext/charlock_holmes/dst
7 | *.a
8 | ext/charlock_holmes/src/file-*
9 | ext/charlock_holmes/src/mkmf.log


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | dist: trusty
 2 | addons:
 3 |   apt:
 4 |     packages:
 5 |     - libicu-dev
 6 | language: ruby
 7 | rvm:
 8 |   - 2.5
 9 |   - 2.4
10 |   - 2.3
11 |   - 2.2
12 |   - 2.1
13 |   - 2.0.0
14 | 


--------------------------------------------------------------------------------
/lib/charlock_holmes.rb:
--------------------------------------------------------------------------------
1 | require 'charlock_holmes/charlock_holmes'
2 | require 'charlock_holmes/encoding_detector'
3 | require 'charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4 | 
5 | # require this if you want the String monkey patches
6 | # require 'charlock_holmes/string'
7 | 


--------------------------------------------------------------------------------
/test/fixtures/TwigExtensionsDate.es.yml:
--------------------------------------------------------------------------------
1 | date.year: '%year% año|%year% años'
2 | date.month: '%month% mes|%month% meses'
3 | date.day: '%day% día|%day% días'
4 | date.hour: '%hour% hora|%hour% horas'
5 | date.minute: '%minute% minuto|%minute% minutos'
6 | date.second: '%second% segundo|%second% segundos'
7 | date.new: 'menos de un minuto'
8 | date.and: ' y '


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require 'rake/testtask'
 2 | 
 3 | Rake::TestTask.new do |t|
 4 |   t.pattern = "test/**/*_test.rb"
 5 | end
 6 | 
 7 | task :default => :test
 8 | 
 9 | gem 'rake-compiler', '>= 0.7.5'
10 | require "rake/extensiontask"
11 | 
12 | Rake::ExtensionTask.new 'charlock_holmes' do |ext|
13 |   ext.lib_dir = File.join 'lib', 'charlock_holmes'
14 | end
15 | 
16 | Rake::Task[:test].prerequisites << :compile


--------------------------------------------------------------------------------
/ext/charlock_holmes/ext.c:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | 
 3 | extern void _init_charlock_encoding_detector();
 4 | extern void _init_charlock_converter();
 5 | extern void _init_charlock_transliterator();
 6 | 
 7 | VALUE rb_mCharlockHolmes;
 8 | /* Defines the top-level CharlockHolmes module, then runs each component's initializer. */
 9 | void Init_charlock_holmes() {
10 | 	rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
11 | 
12 | 	_init_charlock_encoding_detector();
13 | 	_init_charlock_converter();
14 | 	_init_charlock_transliterator();
15 | }


--------------------------------------------------------------------------------
/test/helper.rb:
--------------------------------------------------------------------------------
 1 | # Basic test environment.
 2 | 
 3 | # make sure RubyGems and Bundler are loaded before requiring the gem
 4 | require 'rubygems' if !defined?(Gem)
 5 | require 'bundler/setup'
 6 | 
 7 | require 'charlock_holmes'
 8 | 
 9 | # bring in minitest
10 | require 'minitest/autorun'
11 | 
12 | if Minitest.const_defined?('Test')
13 |   # We're on Minitest 5+. Nothing to do here.
14 | else
15 |   # Minitest 4 doesn't have Minitest::Test yet.
16 |   Minitest::Test = MiniTest::Unit::TestCase
17 | end
18 | # Returns an open File for fixtures/<name> (resolved relative to this file); callers must read/close it.
19 | def fixture(name)
20 |   path = File.expand_path "../fixtures/#{name}", __FILE__
21 |   File.new path
22 | end
23 | 
24 | # put lib and test dirs directly on load path
25 | $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
26 | $LOAD_PATH.unshift File.expand_path('..', __FILE__)
27 | 


--------------------------------------------------------------------------------
/charlock_holmes.gemspec:
--------------------------------------------------------------------------------
 1 | require './lib/charlock_holmes/version' unless defined? CharlockHolmes::VERSION
 2 | 
 3 | Gem::Specification.new "charlock_holmes", CharlockHolmes::VERSION do |s|
 4 |   s.license = "MIT"
 5 |   s.authors = ["Brian Lopez", "Vicent Martí"]
 6 |   s.email = "seniorlopez@gmail.com"
 7 |   s.extensions = ["ext/charlock_holmes/extconf.rb"]
 8 |   s.files = `git ls-files ext lib`.split("\n")
 9 |   s.homepage = "https://github.com/brianmario/charlock_holmes"
10 |   s.rdoc_options = ["--charset=UTF-8"]
11 |   s.summary = "Character encoding detection, brought to you by ICU"
12 |   s.description = "charlock_holmes provides binary and text detection as well as text transcoding using libicu"
13 |   s.required_ruby_version = '>= 1.9.3'
14 | 
15 |   # tests
16 |   s.add_development_dependency 'rake-compiler', ">= 0.7.5"
17 |   s.add_development_dependency 'minitest'
18 |   # benchmarks
19 |   s.add_development_dependency 'chardet'
20 | end
21 | 


--------------------------------------------------------------------------------
/test/fixtures/sierpinski.ps:
--------------------------------------------------------------------------------
 1 | %!PS-Adobe-3.0
 2 | %%Creator: Aaron Puchert
 3 | %%Title: The Sierpinski triangle
 4 | %%Pages: 1
 5 | %%PageOrder: Ascend
 6 | 
 7 | %%BeginProlog
 8 | % PAGE SETTINGS
 9 | /pageset {
10 |   28.3464566 28.3464566 scale    % set cm = 1
11 |   0.5 0.5 translate
12 |   0 setlinewidth
13 | } def
14 | 
15 | % sierpinski(n) draws a sierpinski triangle of order n
16 | /sierpinski {
17 | dup 0 gt {
18 |   [0.5 0 0 0.5 0 0] concat dup 1 sub sierpinski
19 |   [1 0 0 1 1 0] concat dup 1 sub sierpinski
20 |   [1 0 0 1 -1 1] concat dup 1 sub sierpinski
21 |   [2 0 0 2 0 -1] concat
22 | } {
23 |   newpath
24 |     0 0 moveto
25 |     1 0 lineto
26 |     0 1 lineto
27 |   closepath
28 |   fill
29 | } ifelse pop} def
30 | %%EndProlog
31 | 
32 | %%BeginSetup
33 | << /PageSize [596 843] >> setpagedevice  % A4
34 | %%EndSetup
35 | 
36 | %%Page: Test 1
37 | pageset
38 | [20 0 10 300 sqrt 0 0] concat
39 | 9 sierpinski
40 | showpage
41 | %%EOF
42 | 


--------------------------------------------------------------------------------
/benchmark/detection.rb:
--------------------------------------------------------------------------------
 1 | $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
 2 | 
 3 | RUBY_19 = !!(RUBY_VERSION =~ /1.9/)
 4 | 
 5 | require 'charlock_holmes'
 6 | 
 7 | # the chardet gem isn't compatible with 1.9
 8 | require 'UniversalDetector' unless RUBY_19
 9 | 
10 | require 'benchmark'
11 | 
12 | CONTENT = File.read(File.expand_path('../test.txt', __FILE__))
13 | 
14 | TIMES = 100
15 | DETECTOR = CharlockHolmes::EncodingDetector.new
16 | 
17 | Benchmark.bmbm do |x|
18 |   # new detector every iteration
19 |   x.report 'singleton call' do
20 |     TIMES.times do
21 |       CharlockHolmes::EncodingDetector.detect CONTENT
22 |     end
23 |   end
24 | 
25 |   # shared detector for all iterations
26 |   x.report 'reusing a single detector' do
27 |     TIMES.times do
28 |       DETECTOR.detect CONTENT
29 |     end
30 |   end
31 | 
32 |   unless RUBY_19
33 |     x.report 'chardet' do
34 |       TIMES.times do
35 |         UniversalDetector.chardet CONTENT
36 |       end
37 |     end
38 |   end
39 | end
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Brian Lopez
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ext/charlock_holmes/common.h:
--------------------------------------------------------------------------------
 1 | #ifndef CHARLOCK_COMMON_H
 2 | #define CHARLOCK_COMMON_H
 3 | 
 4 | // tell rbx not to use its caching compat layer
 5 | // by doing this we're making a promise to RBX that
 6 | // we'll never modify the pointers we get back from RSTRING_PTR
 7 | #define RSTRING_NOT_MODIFIED
 8 | 
 9 | #include <ruby.h>
10 | #ifdef HAVE_RUBY_ENCODING_H
11 | #include <ruby/encoding.h>
12 | #endif
13 | 
14 | static inline VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
15 | {
16 | #ifdef HAVE_RUBY_ENCODING_H
17 | 	return rb_external_str_new_with_enc(str, len, (rb_encoding *)encoding);
18 | #else
19 | 	return rb_str_new(str, len);
20 | #endif
21 | }
22 | 
23 | static inline VALUE charlock_new_str(const char *str, size_t len)
24 | {
25 | #ifdef HAVE_RUBY_ENCODING_H
26 | 	return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
27 | #else
28 | 	return rb_str_new(str, len);
29 | #endif
30 | }
31 | 
32 | static inline VALUE charlock_new_str2(const char *str)
33 | {
34 | #ifdef HAVE_RUBY_ENCODING_H
35 | 	return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
36 | #else
37 | 	return rb_str_new2(str);
38 | #endif
39 | }
40 | 
41 | #endif
42 | 


--------------------------------------------------------------------------------
/test/fixtures/ISO-2022-KR.txt:
--------------------------------------------------------------------------------
 1 | $)C#
 2 | # Out-AnsiGraph.psm1
 3 | # Author:       xcud
 4 | # History:
 5 | #       v0.1 September 21, 2009 initial version
 6 | #
 7 | # PS Example> ps | select -first 5 | sort -property VM | 
 8 | #             Out-AnsiGraph ProcessName, VM
 9 | #                 AEADISRV  14508032
10 | #                  audiodg  50757632
11 | #                  conhost  73740288
12 | # AppleMobileDeviceService  92061696
13 | #                    btdna  126443520
14 | #
15 | function Out-AnsiGraph($Parameter1=$null) {
16 | 	BEGIN {
17 | 		$q = new-object Collections.queue
18 | 		$max = 0; $namewidth = 0;
19 | 	}
20 | 
21 | 	PROCESS {
22 | 		if($_) {
23 | 			$name = $_.($Parameter1[0]);
24 | 			$val = $_.($Parameter1[1])
25 | 			if($max -lt $val) { $max = $val}		 
26 | 			if($namewidth -lt $name.length) { 
27 | 				$namewidth = $name.length }
28 | 			$q.enqueue(@($name, $val))			
29 | 		}
30 | 	}
31 | 
32 | 	END {
33 | 		$q | %{
34 | 			$graph = ""; 0..($_[1]/$max*20) | 
35 | 				%{ $graph += "" }
36 | 			$name = "{0,$namewidth}" -f $_[0]
37 | 			"$name $graph " + $_[1]
38 | 		}
39 | 
40 | 	}
41 | }
42 | 
43 | Export-ModuleMember Out-AnsiGraph


--------------------------------------------------------------------------------
/lib/charlock_holmes/string.rb:
--------------------------------------------------------------------------------
 1 | require 'charlock_holmes' unless defined? CharlockHolmes
 2 | 
 3 | class String
 4 |   # Attempt to detect the encoding of this string
 5 |   #
 6 |   # Returns: a Hash with :encoding, :language, :type and :confidence
 7 |   def detect_encoding(hint_enc=nil)
 8 |     detector = CharlockHolmes::EncodingDetector.new
 9 |     detector.detect(self, hint_enc)
10 |   end
11 | 
12 |   # Attempt to detect the encoding of this string, and return
13 |   # a list with all the possible encodings that match it.
14 |   #
15 |   # Returns: an Array with zero or more Hashes,
16 |   #          each one of them with :encoding, :language, :type and :confidence
17 |   def detect_encodings(hint_enc=nil)
18 |     detector = CharlockHolmes::EncodingDetector.new
19 |     detector.detect_all(self, hint_enc)
20 |   end
21 | 
22 |   if method_defined? :force_encoding
23 |     # Attempt to detect the encoding of this string
24 |     # then set the encoding to what was detected ala `force_encoding`
25 |     #
26 |     # Returns: self
27 |     def detect_encoding!(hint_enc=nil)
28 |       if detected = self.detect_encoding(hint_enc)
29 |         self.force_encoding(detected[:ruby_encoding]) if detected[:ruby_encoding]
30 |       end
31 |       self
32 |     end
33 |   end
34 | end
35 | 


--------------------------------------------------------------------------------
/test/converter_test.rb:
--------------------------------------------------------------------------------
 1 | # encoding: utf-8
 2 | require File.expand_path("../helper", __FILE__)
 3 | 
 4 | class ConverterTest < MiniTest::Test
 5 |   def test_convert_ascii_from_iso859_1_to_utf16_and_back
 6 |     input = 'test'
 7 | 
 8 |     output = CharlockHolmes::Converter.convert input, 'ISO-8859-1', 'UTF-16'
 9 |     assert input.bytesize < output.bytesize
10 |     assert input != output
11 | 
12 |     output = CharlockHolmes::Converter.convert output, 'UTF-16', 'ISO-8859-1'
13 |     assert input.bytesize == output.bytesize
14 |     assert input == output
15 |   end
16 | 
17 |   def test_convert_utf8_to_utf16_and_back
18 |     input = 'λ, λ, λ'
19 | 
20 |     output = CharlockHolmes::Converter.convert input, 'UTF-8', 'UTF-16'
21 |     assert input.bytesize < output.bytesize
22 |     assert input != output
23 | 
24 |     output = CharlockHolmes::Converter.convert output, 'UTF-16', 'UTF-8'
25 |     assert input.bytesize == output.bytesize
26 |     assert input == output
27 |   end
28 | 
29 |   def test_params_must_be_strings
30 |     assert_raises TypeError do
31 |       CharlockHolmes::Converter.convert nil, 'UTF-8', 'UTF-16'
32 |     end
33 | 
34 |     assert_raises TypeError do
35 |       CharlockHolmes::Converter.convert 'lol', nil, 'UTF-16'
36 |     end
37 | 
38 |     assert_raises TypeError do
39 |       CharlockHolmes::Converter.convert 'lol', 'UTF-8', nil
40 |     end
41 | 
42 |     begin
43 |       CharlockHolmes::Converter.convert 'lol', 'UTF-8', 'UTF-16'
44 |     rescue Exception => e
45 |       assert_nil e, "#{e.class.name} raised, expected nothing"
46 |     end
47 |   end
48 | end


--------------------------------------------------------------------------------
/ext/charlock_holmes/converter.c:
--------------------------------------------------------------------------------
 1 | #include "unicode/ucnv.h"
 2 | #include "common.h"
 3 | 
 4 | extern VALUE rb_mCharlockHolmes;
 5 | static VALUE rb_cConverter;
 6 | 
 7 | /* Converter.convert(txt, src_enc, dst_enc): transcodes txt from src_enc to dst_enc with ICU.
 8 |  * Raises TypeError for non-String arguments and ArgumentError when ICU reports a failure. */
 9 | static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
10 | 	VALUE rb_out;
11 | 	const char *src_enc, *dst_enc, *src_txt;
12 | 	char *out_buf;
13 | 	void *rb_enc = NULL;
14 | 	int32_t src_len, out_len;
15 | 	UErrorCode status = U_ZERO_ERROR;
16 | 
17 | 	Check_Type(rb_txt, T_STRING);
18 | 	Check_Type(rb_src_enc, T_STRING);
19 | 	Check_Type(rb_dst_enc, T_STRING);
20 | 
21 | 	src_txt = RSTRING_PTR(rb_txt);
22 | 	src_len = (int32_t)RSTRING_LEN(rb_txt);
23 | 	// ICU takes converter names as C strings, so insist on NUL-terminated names
24 | 	src_enc = StringValueCStr(rb_src_enc);
25 | 	dst_enc = StringValueCStr(rb_dst_enc);
26 | 
27 | 	// first determine the size of the output buffer; an empty result is reported
28 | 	// with a warning status instead of U_BUFFER_OVERFLOW_ERROR, which is not a failure
29 | 	out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);
30 | 	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
31 | 		rb_raise(rb_eArgError, "%s", u_errorName(status));
32 | 	}
33 | 	out_buf = malloc(out_len > 0 ? (size_t)out_len : 1);
34 | 	if (out_buf == NULL) {
35 | 		rb_memerror();
36 | 	}
37 | 
38 | 	// now do the actual conversion
39 | 	status = U_ZERO_ERROR;
40 | 	out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
41 | 	if (U_FAILURE(status)) {
42 | 		free(out_buf);
43 | 		rb_raise(rb_eArgError, "%s", u_errorName(status));
44 | 	}
45 | #ifdef HAVE_RUBY_ENCODING_H
46 | 	rb_enc = (void *)rb_enc_find(dst_enc);
47 | #endif
48 | 	rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc);
49 | 	free(out_buf);
50 | 	return rb_out;
51 | }
52 | /* Defines CharlockHolmes::Converter and its singleton method `convert` (3 arguments). */
53 | void _init_charlock_converter() {
54 | 	rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
55 | 
56 | 	rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
57 | }
58 | 


--------------------------------------------------------------------------------
/ext/charlock_holmes/extconf.rb:
--------------------------------------------------------------------------------
 1 | require 'mkmf'
 2 | 
 3 | CWD = File.expand_path(File.dirname(__FILE__))
 4 | def sys(cmd)
 5 |   puts "  -- #{cmd}"
 6 |   unless ret = xsystem(cmd)
 7 |     raise "#{cmd} failed, please report issue on https://github.com/brianmario/charlock_holmes"
 8 |   end
 9 |   ret
10 | end
11 | 
12 | if `which make`.strip.empty?
13 |   STDERR.puts "\n\n"
14 |   STDERR.puts "***************************************************************************************"
15 |   STDERR.puts "*************** make required (apt-get install make build-essential) =( ***************"
16 |   STDERR.puts "***************************************************************************************"
17 |   exit(1)
18 | end
19 | 
20 | ##
21 | # ICU dependency
22 | #
23 | 
24 | dir_config 'icu'
25 | 
26 | rubyopt = ENV.delete("RUBYOPT")
27 | 
28 | icu4c = "/usr"
29 | # detect homebrew installs
30 | if !have_library 'icui18n'
31 |   base = if !`which brew`.empty?
32 |     `brew --cellar`.strip
33 |   elsif File.exists?("/usr/local/Cellar/icu4c")
34 |     '/usr/local/Cellar'
35 |   end
36 | 
37 |   if base and icu4c = Dir[File.join(base, 'icu4c/*')].sort.last
38 |     $INCFLAGS << " -I#{icu4c}/include "
39 |     $LDFLAGS  << " -L#{icu4c}/lib "
40 |   end
41 | end
42 | 
43 | unless have_library 'icui18n' and have_header 'unicode/ucnv.h'
44 |   STDERR.puts "\n\n"
45 |   STDERR.puts "***************************************************************************************"
46 |   STDERR.puts "*********** icu required (brew install icu4c or apt-get install libicu-dev) ***********"
47 |   STDERR.puts "***************************************************************************************"
48 |   exit(1)
49 | end
50 | 
51 | have_library 'z' or abort 'libz missing'
52 | have_library 'icuuc' or abort 'libicuuc missing'
53 | have_library 'icudata' or abort 'libicudata missing'
54 | 
55 | # icu4c might be built in C++11 mode, but it also might not have been
56 | icuconfig = `which icu-config`.chomp
57 | icuconfig = "#{icu4c}/bin/icu-config" if icuconfig.empty?
58 | if File.exist?(icuconfig) && `#{icuconfig} --cxxflags`.include?("c++11")
59 |   $CXXFLAGS << ' -std=c++11'
60 | end
61 | 
62 | $CFLAGS << ' -Wall -funroll-loops'
63 | $CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
64 | 
65 | ENV['RUBYOPT'] = rubyopt
66 | create_makefile 'charlock_holmes/charlock_holmes'
67 | 


--------------------------------------------------------------------------------
/test/string_methods_test.rb:
--------------------------------------------------------------------------------
 1 | require File.expand_path("../helper", __FILE__)
 2 | require 'charlock_holmes/string'
 3 | 
 4 | class StringMethodsTest < MiniTest::Test
 5 |   def test_adds_detect_encoding_method
 6 |     str = 'test'
 7 |     str.respond_to? :detect_encoding
 8 | 
 9 |     detected = str.detect_encoding
10 |     assert_equal 'ISO-8859-1', detected[:encoding]
11 |   end
12 | 
13 |   def test_detect_encoding_accepts_encoding_hint_param
14 |     str = 'test'
15 |     str.respond_to? :detect_encoding
16 | 
17 |     detected = str.detect_encoding 'UTF-8'
18 |     assert_equal 'ISO-8859-1', detected[:encoding]
19 |   end
20 | 
21 |   def test_adds_detect_encodings_method
22 |     str = 'test'
23 |     str.respond_to? :detect_encodings
24 | 
25 |     detected_list = str.detect_encodings
26 |     assert detected_list.is_a? Array
27 | 
28 |     encoding_list = detected_list.map {|d| d[:encoding]}.sort
29 |     assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
30 |   end
31 | 
32 |   def test_detect_encodings_accepts_encoding_hint_param
33 |     str = 'test'
34 |     str.respond_to? :detect_encodings
35 | 
36 |     detected_list = str.detect_encodings 'UTF-8'
37 |     assert detected_list.is_a? Array
38 | 
39 |     encoding_list = detected_list.map {|d| d[:encoding]}.sort
40 |     assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
41 |   end
42 | 
43 |   def test_returns_a_ruby_compatible_encoding_name
44 |     detected = 'test'.detect_encoding
45 |     assert_equal 'ISO-8859-1', detected[:encoding]
46 |     assert_equal 'ISO-8859-1', detected[:ruby_encoding]
47 | 
48 |     not_compat_txt = fixture("ISO-2022-KR.txt").read
49 |     detected = not_compat_txt.detect_encoding
50 |     assert_equal 'ISO-2022-KR', detected[:encoding]
51 |     assert_equal 'binary', detected[:ruby_encoding]
52 |   end
53 | 
54 |   if "".respond_to? :force_encoding
55 |     def test_adds_detect_encoding_bang_method
56 |       str = 'test'
57 |       str.respond_to? :detect_encoding!
58 | 
59 |       str.detect_encoding!
60 |       assert_equal Encoding.find('ISO-8859-1'), str.encoding
61 |     end
62 | 
63 |     def test_sets_a_ruby_compatible_encoding_name
64 |       str1 = 'test'
65 |       str1.detect_encoding!
66 |       assert_equal 'ISO-8859-1', str1.encoding.name
67 | 
68 |       not_compat_txt = fixture("ISO-2022-KR.txt").read
69 |       not_compat_txt.detect_encoding!
70 |       assert_equal 'ASCII-8BIT', not_compat_txt.encoding.name
71 |     end
72 |   end
73 | end
74 | 


--------------------------------------------------------------------------------
/lib/charlock_holmes/encoding_detector.rb:
--------------------------------------------------------------------------------
 1 | module CharlockHolmes
 2 |   class EncodingDetector
 3 |     # Default length for which to scan content for NULL bytes
 4 |     DEFAULT_BINARY_SCAN_LEN = 1024*1024
 5 | 
 6 |     # Length for which to scan content for NULL bytes
 7 |     attr_accessor :binary_scan_length
 8 | 
 9 |     alias :strip_tags? :strip_tags
10 | 
11 |     def initialize(scan_len=DEFAULT_BINARY_SCAN_LEN)
12 |       @binary_scan_length = scan_len
13 |     end
14 | 
15 |     # Attempt to detect the encoding of this string
16 |     #
17 |     # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
18 |     # as well as use the default binary scan length
19 |     #
20 |     # str      - a String, what you want to detect the encoding of
21 |     # hint_enc - an optional String (like "UTF-8"), the encoding name which will
22 |     #            be used as an additional hint to the charset detector
23 |     #
24 |     # Returns: a Hash with :encoding, :language, :type and :confidence
25 |     def self.detect(str, hint_enc=nil)
26 |       new.detect(str, hint_enc)
27 |     end
28 | 
29 |     # Attempt to detect the encoding of this string, and return
30 |     # a list with all the possible encodings that match it.
31 |     #
32 |     # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
33 |     # as well as use the default binary scan length
34 |     #
35 |     # str      - a String, what you want to detect the encoding of
36 |     # hint_enc - an optional String (like "UTF-8"), the encoding name which will
37 |     #            be used as an additional hint to the charset detector
38 |     #
39 |     # Returns: an Array with zero or more Hashes,
40 |     # each one of them with :encoding, :language, :type and :confidence
41 |     def self.detect_all(str, hint_enc=nil)
42 |       new.detect_all(str, hint_enc)
43 |     end
44 | 
45 |     # A mapping table of supported encoding names from EncodingDetector
46 |     # which point to the corresponding supported encoding name in Ruby.
47 |     # Like: {"UTF-8" => "UTF-8", "IBM420_rtl" => "ASCII-8BIT"}
48 |     #
49 |     # Note that encodings that can't be mapped between Charlock and Ruby will resolve
50 |     # to "ASCII-8BIT".
51 |     @encoding_table = {}
52 | 
53 |     def self.encoding_table
54 |       @encoding_table
55 |     end
56 | 
57 |     BINARY = 'binary'
58 | 
59 |     # Builds the encoding table hash (@encoding_table) by running through the list of supported encodings
60 |     # in the ICU detection API and trying to map them to supported encodings in Ruby.
61 |     # This is built dynamically so as to take advantage of ICU upgrades which may have
62 |     # support for more encodings in the future.
63 |     #
64 |     # Returns nothing.
65 |     def self.build_encoding_table
66 |       supported_encodings.each do |name|
67 |         @encoding_table[name] = begin
68 |           ::Encoding.find(name).name
69 |         rescue ArgumentError
70 |           BINARY
71 |         end
72 |       end
73 |     end
74 |     build_encoding_table
75 |   end
76 | end
77 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # CharlockHolmes
  2 | 
  3 | Character encoding detecting library for Ruby using [ICU](http://site.icu-project.org/)
  4 | 
  5 | ## Usage
  6 | 
  7 | First you'll need to require it
  8 | 
  9 | ``` ruby
 10 | require 'charlock_holmes'
 11 | ```
 12 | 
 13 | ## Encoding detection
 14 | 
 15 | ``` ruby
 16 | contents = File.read('test.xml')
 17 | detection = CharlockHolmes::EncodingDetector.detect(contents)
 18 | # => {:encoding => 'UTF-8', :confidence => 100, :type => :text}
 19 | 
 20 | # optionally there will be a :language key as well, but
 21 | # that's mostly only returned for legacy encodings like ISO-8859-1
 22 | ```
 23 | 
 24 | NOTE: `CharlockHolmes::EncodingDetector.detect` will return `nil` if it was unable to find an encoding.
 25 | 
 26 | For binary content, `:type` will be set to `:binary`
 27 | 
 28 | Though it's more efficient to reuse a single detector instance:
 29 | 
 30 | ``` ruby
 31 | detector = CharlockHolmes::EncodingDetector.new
 32 | 
 33 | detection1 = detector.detect(File.read('test.xml'))
 34 | detection2 = detector.detect(File.read('test2.json'))
 35 | 
 36 | # and so on...
 37 | ```
 38 | 
 39 | ### String monkey patch
 40 | 
 41 | Alternatively, you can just use the `detect_encoding` method on the `String` class
 42 | 
 43 | ``` ruby
 44 | require 'charlock_holmes/string'
 45 | 
 46 | contents = File.read('test.xml')
 47 | 
 48 | detection = contents.detect_encoding
 49 | ```
 50 | 
 51 | ### Ruby 1.9 specific
 52 | 
 53 | NOTE: This method only exists on Ruby 1.9+
 54 | 
 55 | If you want to use this library to detect and set the encoding flag on strings, you can use the `detect_encoding!` method on the `String` class
 56 | 
 57 | ``` ruby
 58 | require 'charlock_holmes/string'
 59 | 
 60 | contents = File.read('test.xml')
 61 | 
 62 | # this will detect and set the encoding of `contents`, then return self
 63 | contents.detect_encoding!
 64 | ```
 65 | 
 66 | ## Transcoding
 67 | 
 68 | Being able to detect the encoding of some arbitrary content is nice, but what you probably want is to be able to transcode that content into an encoding your application is using.
 69 | 
 70 | ``` ruby
 71 | content = File.read('test2.txt')
 72 | detection = CharlockHolmes::EncodingDetector.detect(content)
 73 | utf8_encoded_content = CharlockHolmes::Converter.convert content, detection[:encoding], 'UTF-8'
 74 | ```
 75 | 
 76 | The first parameter is the content to transcode, the second is the source encoding (the encoding the content is assumed to be in), and the third parameter is the destination encoding.
 77 | 
 78 | ## Installing
 79 | 
 80 | If the traditional `gem install charlock_holmes` doesn't work, you may need to specify the path to
 81 | your installation of ICU using the `--with-icu-dir` option during the gem install or by configuring Bundler to
 82 | pass those arguments to Gem:
 83 | 
 84 | Configure Bundler to always use the correct arguments when installing:
 85 | 
 86 |     bundle config build.charlock_holmes --with-icu-dir=/path/to/installed/icu4c
 87 | 
 88 | Using Gem to install directly without Bundler:
 89 | 
 90 |     gem install charlock_holmes -- --with-icu-dir=/path/to/installed/icu4c
 91 | 
 92 | 
 93 | ### Homebrew
 94 | 
 95 | If you're installing on Mac OS X then using [Homebrew](http://mxcl.github.com/homebrew/) is
 96 | the easiest way to install ICU.
 97 | 
 98 | However, be warned: it is a keg-only install (see [Homebrew issue #167](https://github.com/mxcl/homebrew/issues/167)
 99 | for more info), meaning RubyGems won't find it when installing unless you specify `--with-icu-dir`.
100 | 
101 | To install ICU with Homebrew:
102 | 
103 |     brew install icu4c
104 | 
105 | Configure Bundler to always use the correct arguments when installing:
106 | 
107 |     bundle config build.charlock_holmes --with-icu-dir=/usr/local/opt/icu4c
108 | 
109 | Using Gem to install directly without Bundler:
110 | 
111 |     gem install charlock_holmes -- --with-icu-dir=/usr/local/opt/icu4c
112 | 


--------------------------------------------------------------------------------
/ext/charlock_holmes/transliterator.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #undef UChar
  3 | 
  4 | #include <string>
  5 | #include <unicode/translit.h>
  6 | 
  7 | extern "C" {
  8 | 
  9 | #ifdef HAVE_RUBY_ENCODING_H
 10 | #include <ruby/encoding.h>
 11 | static VALUE rb_eEncodingCompatibilityError;
 12 | 
 13 | /* Raise Encoding::CompatibilityError unless str is UTF-8, US-ASCII or ASCII-8BIT. */
 14 | static void check_utf8_encoding(VALUE str) {
 15 |   static rb_encoding *accepted[3] = {NULL, NULL, NULL};
 16 |   rb_encoding *actual = rb_enc_get(str);
 17 | 
 18 |   if (!accepted[0]) { /* resolve the encoding handles once, on first use */
 19 |     accepted[0] = rb_utf8_encoding();
 20 |     accepted[1] = rb_usascii_encoding();
 21 |     accepted[2] = rb_ascii8bit_encoding();
 22 |   }
 23 | 
 24 |   for (int i = 0; i < 3; i++)
 25 |     if (actual == accepted[i]) return;
 26 |   rb_raise(rb_eEncodingCompatibilityError,
 27 |     "Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(actual));
 28 | }
 29 | 
 30 | #else
 31 | static void check_utf8_encoding(VALUE str) {} /* no-op when HAVE_RUBY_ENCODING_H is not defined */
 32 | #endif
 33 | 
 34 | extern VALUE rb_mCharlockHolmes;
 35 | static VALUE rb_cTransliterator;
 36 | 
 37 | /*
 38 |  * Transliterator.id_list
 39 |  *
 40 |  * Returns a Ruby array holding the ID of every transliterator ICU lists as
 41 |  * available. Raises ArgumentError with the ICU error name on failure.
 42 |  */
 43 | static VALUE rb_transliterator_id_list(VALUE self) {
 44 |   UErrorCode status = U_ZERO_ERROR;
 45 |   icu::StringEnumeration *id_list;
 46 |   int32_t id_list_size, curr_id_len = 0;
 47 |   const char *curr_id;
 48 |   VALUE rb_ary;
 49 | 
 50 |   id_list = icu::Transliterator::getAvailableIDs(status);
 51 |   if(!U_SUCCESS(status)) {
 52 |     /* rb_raise does not return, so free the enumeration before raising */
 53 |     delete id_list;
 54 |     rb_raise(rb_eArgError, "%s", u_errorName(status));
 55 |   }
 56 | 
 57 |   id_list_size = id_list->count(status);
 58 |   if(!U_SUCCESS(status)) {
 59 |     delete id_list;
 60 |     rb_raise(rb_eArgError, "%s", u_errorName(status));
 61 |   }
 62 | 
 63 |   rb_ary = rb_ary_new2(id_list_size);
 64 | 
 65 |   /* next() hands back NULL once the enumeration is exhausted */
 66 |   while ((curr_id = id_list->next(&curr_id_len, status)) != NULL && U_SUCCESS(status)) {
 67 |     rb_ary_push(rb_ary, charlock_new_str(curr_id, curr_id_len));
 68 |   }
 69 | 
 70 |   delete id_list;
 71 |   if(!U_SUCCESS(status)) {
 72 |     rb_raise(rb_eArgError, "%s", u_errorName(status));
 73 |   }
 74 | 
 75 |   return rb_ary;
 76 | }
 77 | 
 78 | /*
 79 |  * Transliterator.transliterate(text, id)
 80 |  *
 81 |  * Applies the ICU transliterator named by id to text and returns the result.
 82 |  * Both arguments must be Strings accepted by check_utf8_encoding; their
 83 |  * bytes are decoded as UTF-8. Raises ArgumentError with the ICU error name
 84 |  * when the transliterator cannot be created.
 85 |  */
 86 | static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) {
 87 |   UErrorCode status = U_ZERO_ERROR;
 88 |   UParseError p_error;
 89 |   icu::Transliterator *trans;
 90 |   std::string result;
 91 | 
 92 |   Check_Type(rb_txt, T_STRING);
 93 |   Check_Type(rb_id, T_STRING);
 94 |   check_utf8_encoding(rb_txt);
 95 |   check_utf8_encoding(rb_id);
 96 | 
 97 |   /* Decode explicitly as UTF-8: UnicodeString(const char *, int32_t) uses
 98 |    * ICU's default codepage instead. The id is a temporary so that nothing
 99 |    * is left allocated when rb_raise jumps out of this function. */
100 |   trans = icu::Transliterator::createInstance(
101 |     icu::UnicodeString::fromUTF8(icu::StringPiece(RSTRING_PTR(rb_id), (int32_t)RSTRING_LEN(rb_id))),
102 |     UTRANS_FORWARD, p_error, status);
103 |   if(!U_SUCCESS(status)) {
104 |     delete trans;
105 |     rb_raise(rb_eArgError, "%s", u_errorName(status));
106 |   }
107 |   {
108 |     icu::UnicodeString u_txt = icu::UnicodeString::fromUTF8(
109 |       icu::StringPiece(RSTRING_PTR(rb_txt), (int32_t)RSTRING_LEN(rb_txt)));
110 |     icu::StringByteSink<std::string> sink(&result);
111 |     trans->transliterate(u_txt);
112 |     u_txt.toUTF8(sink);
113 |   }
114 |   delete trans;
115 | 
116 |   return charlock_new_str(result.data(), result.length());
117 | }
118 | 
119 | /* Defines CharlockHolmes::Transliterator and its two singleton methods. */
120 | void _init_charlock_transliterator() {
121 |   typedef VALUE (*ruby_method)(...);
122 | #ifdef HAVE_RUBY_ENCODING_H
123 |   rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError"));
124 | #endif
125 |   rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject);
126 |   rb_define_singleton_method(rb_cTransliterator, "id_list", (ruby_method)rb_transliterator_id_list, 0);
127 |   rb_define_singleton_method(rb_cTransliterator, "transliterate", (ruby_method)rb_transliterator_transliterate, 2);
128 | }
129 | 
130 | }
131 | 


--------------------------------------------------------------------------------
/test/fixtures/repl2.cljs:
--------------------------------------------------------------------------------
  1 | ;   Copyright (c) Rich Hickey. All rights reserved.
  2 | ;   The use and distribution terms for this software are covered by the
  3 | ;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
  4 | ;   which can be found in the file epl-v10.html at the root of this distribution.
  5 | ;   By using this software in any fashion, you are agreeing to be bound by
  6 | ;   the terms of this license.
  7 | ;   You must not remove this notice, or any other, from this software.
  8 | 
  9 | (ns clojure.browser.repl2
 10 |   (:require [clojure.browser.net   :as net]
 11 |             [clojure.browser.event :as event]
 12 |             [goog.json :as gjson]))
 13 | 
 14 | ;; Notes
 15 | ;; =====
 16 | ;;
 17 | ;; Using keywords for the service names does not work in Chrome or
 18 | ;; FireFox.
 19 | ;;
 20 | ;; --
 21 | 
 22 | (defn log-obj "Write value to the browser's JavaScript console." [value]
 23 |   (. js/console (log value)))
 24 | 
 25 | ;; Outer/Parent Peer
 26 | ;; =================
 27 | ;;
 28 | ;; The code in this section will be run in the parent page which
 29 | ;; exists in the application's domain. This is where code will be
 30 | ;; evaluated.
 31 | 
 32 | (def parent-channel (atom nil)) ; reset by create-cross-domain-channel, read by evaluate-javascript
 33 | 
 34 | (defn- ensure-string
 35 |   "Returns x unchanged when it is already a string, otherwise (str x)."
 36 |   [x]
 37 |   (if-not (string? x) (str x) x))
 38 | 
 39 | (defn evaluate-javascript
 40 |   "Evaluates the JavaScript source in block, logging it first, and transmits
 41 |   the printed outcome map to the peer as a \"return-value\" message."
 42 |   [block]
 43 |   (log-obj (str "evaluating: " block))
 44 |   (let [outcome (try {:status :success :value (ensure-string (js* "eval(~{block})"))}
 45 |                      (catch js/Error e {:status :exception :value (pr-str e)}))
 46 |         result (pr-str outcome)]
 47 |     (log-obj (str "result: " result))
 48 |     (net/transmit @parent-channel "return-value" result)))
 49 | 
 50 | (defn create-cross-domain-channel
 51 |   "Opens an xpc connection to <url>/repl, registers evaluate-javascript on
 52 |   it, connects it under document.body and stores it in parent-channel."
 53 |   [url]
 54 |   (let [connected (fn [] (log-obj "Parent channel connected."))
 55 |         hide      (fn [iframe] (set! iframe.style.display "none"))
 56 |         chnl      (doto (net/xpc-connection {:peer_uri (str url "/repl")})
 57 |                     (net/register-service "evaluate-javascript" evaluate-javascript)
 58 |                     (net/connect document.body connected hide))]
 59 |     (reset! parent-channel chnl)))
 60 | 
 61 | (defn connect
 62 |   "On the window's load event, creates the cross domain channel to the REPL server at url."
 63 |   [url]
 64 |   (goog.events/listen js/window "load" (fn [] (create-cross-domain-channel url))))
 65 | 
 66 | ;; Inner peer
 67 | ;; =========
 68 | ;;
 69 | ;; The code in this section will be run in the child iframe and can
 70 | ;; communicate with REPL server.
 71 | 
 72 | (def state (atom {:connection nil :url nil})) ; reset by start-repl-connection, read by return-value
 73 | 
 74 | (def child-channel (atom nil)) ; reset by inner-peer-channel, read by the :success listener in start-repl-connection
 75 | 
 76 | (defn transmit-post "POST payload to uri over conn via net/transmit." [conn uri payload]
 77 |   (net/transmit conn uri "POST" payload nil 0))
 78 | 
 79 | (defn start-repl-connection
 80 |   "Start the REPL loop: open an xhr connection, record it together with url in state, forward each :success response text to the child channel, then POST \"ready\"."
 81 |   [url]
 82 |   (let [connection (net/xhr-connection)]
 83 |     (reset! state {:connection connection :url url})
 84 |     (event/listen connection
 85 |                   :success
 86 |                   (fn [e]
 87 |                     (net/transmit @child-channel
 88 |                                   "evaluate-javascript"
 89 |                                   (.getResponseText e/currentTarget ()))))
 90 |     ;; The server expects the string "ready" as the body of the
 91 |     ;; initial request on a new connection.
 92 |     (transmit-post connection url "ready")))
 93 | 
 94 | (defn return-value "Logs val, then POSTs it to the url held in state over its connection." [val]
 95 |   (log-obj (str "sending: " val))
 96 |   (let [{:keys [connection url]} @state] (transmit-post connection url val)))
 97 | 
 98 | ;; I can't get this to work using the clojure.browser.net api.
 99 | 
100 | (defn inner-peer-channel
101 |   "Called from a script in the child iframe: builds the channel from the \"xpc\" URL parameter, then starts the REPL connection after 500 ms."
102 |   [repl-url]
103 |   (let [xpc-param (.getParameterValue (goog.Uri. window.location.href) "xpc")
104 |         chnl (doto (goog.net.xpc.CrossPageChannel. (gjson/parse xpc-param))
105 |                (net/register-service "return-value" return-value)
106 |                (.connect (fn [] (log-obj "Child channel connected."))))]
107 |     (reset! child-channel chnl)
108 |     (js/setTimeout (fn [] (start-repl-connection repl-url)) 500)))
109 | 
110 | 


--------------------------------------------------------------------------------
/test/fixtures/laholator.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # 
  4 | # @author:     starenka
  5 | # @email:      'moc]tod[liamg].T.E[0aknerats'[::-1]
  6 | 
  7 | import warnings, hashlib, simplejson, string
  8 | from os.path import dirname, abspath
  9 | 
 10 | from flask import Flask, render_template, request
 11 | from flaskext.sqlalchemy import SQLAlchemy
 12 | try:
 13 |     from sqlalchemy.exceptions import IntegrityError
 14 | except ImportError:
 15 |     from sqlalchemy.exc import IntegrityError
 16 | 
 17 | #Hey monkey patcher! NLTK's NgramModel is not serializable w/ pickle.HIGHEST_PROTOCOL (2)
 18 | from werkzeug.contrib import cache
 19 | cache.HIGHEST_PROTOCOL = 1
 20 | from werkzeug.contrib.cache import SimpleCache
 21 | 
 22 | from BeautifulSoup import BeautifulSoup
 23 | import nltk
 24 | 
 25 | PUNCT = list(unicode(string.punctuation))
 26 | 
 27 | app = Flask(__name__)
 28 | app.config.from_object('settings')
 29 | cache = SimpleCache()
 30 | 
 31 | app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///%s/db.sqlite3'%abspath(dirname(__file__))
 32 | db = SQLAlchemy(app)
 33 | 
 34 | class Sample(db.Model):
 35 |     id = db.Column(db.Integer, primary_key=True)
 36 |     url = db.Column(db.String(80), unique=True)
 37 |     text = db.Column(db.String())
 38 |     enabled = db.Column(db.Boolean())
 39 | 
 40 |     def __unicode__(self):
 41 |         str = unicode(BeautifulSoup(self.text,convertEntities=BeautifulSoup.HTML_ENTITIES))
 42 |         return nltk.clean_html(str)
 43 | 
 44 |     @classmethod
 45 |     def get_all(self):
 46 |         cached = cache.get('samples')
 47 |         if cached is None:
 48 |             cached = self.query.filter_by(enabled=True).all()
 49 |             cache.set('samples', cached, timeout=app.config['CACHE_MINUTES'] * 60)
 50 |         return cached
 51 | 
 52 | class Output(db.Model):
 53 |     id = db.Column(db.Integer, primary_key=True)
 54 |     hash = db.Column(db.String(128),unique=True)
 55 |     text = db.Column(db.String())
 56 |     params = db.Column(db.String(100))
 57 |     
 58 |     def __init__(self,text,**params):
 59 |         self.hash = hashlib.sha512(text.encode('utf8')).hexdigest()
 60 |         self.text = text
 61 |         self.params = simplejson.dumps(params)
 62 | 
 63 | @app.context_processor
 64 | def base_context():
 65 |     return dict(settings=app.config,
 66 |         hits = Output.query.count() + app.config['INIT_HITS']
 67 |     )
 68 | 
 69 | @app.errorhandler(404)
 70 | def page_not_found(error):
 71 |     return render_template('404.html',title=u"To tady nemáme!"), 404
 72 | 
 73 | @app.route('/faq')
 74 | def faq():
 75 |     return render_template('faq.html',title=u"Často kladené dotazy",samples=Sample.get_all())
 76 | 
 77 | @app.route('/permalink/<hash>')
 78 | def permalink(hash):
 79 |     one = Output.query.filter_by(hash=hash).first_or_404()
 80 |     return render_template('generator.html', title=u"Henrykuj!", 
 81 |                            text=one.text, hash=one.hash,  
 82 |                            **simplejson.loads(one.params)
 83 |     )
 84 | 
 85 | @app.route('/')
 86 | def index():
 87 |     bigrams = request.args.get('bigrams',False)
 88 |     try:
 89 |         words = int(request.args.get('words',app.config['WORDS']))
 90 |         if words > app.config['MAX_WORDS']:
 91 |             words = app.config['MAX_WORDS']
 92 |     except ValueError:
 93 |         words = app.config['WORDS']
 94 | 
 95 |     out = _generate(words,bigrams)
 96 |     output = Output(out,words=words,bigrams=bool(bigrams))
 97 |     try:
 98 |         db.session.add(output)
 99 |         db.session.commit()
100 |     except IntegrityError:
101 |         pass
102 |     
103 |     return render_template('generator.html', title=u"Henrykuj!",
104 |                            text=out, hash=output.hash,
105 |                            words=words, bigrams=bigrams
106 |     )
107 | 
108 | def _get_ngram_model(bigrams):
109 |     #NLTK produces a LOT of warnings - don't mess with my error log
110 |     warnings.simplefilter("ignore")
111 |     cached = cache.get('ngram_model')
112 |     if cached is None:
113 |         samples = Sample.get_all()
114 |         if samples:
115 |             text = [unicode(s) for s in samples]
116 |             tokenizer = nltk.tokenize.WordPunctTokenizer()
117 |             tokenized = tokenizer.tokenize(' '.join(text))
118 |             cached = nltk.NgramModel(3-int(bool(bigrams)), tokenized)
119 |             cache.set('ngram_model', cached, timeout=app.config['CACHE_MINUTES'] * 60)
120 |     return cached
121 | 
122 | def _generate(words,bigrams):
123 |     model = _get_ngram_model(bigrams)
124 |     starts = model.generate(100)[-4:]
125 |     starts = filter(lambda a: a not in PUNCT,starts)
126 |     generated = model.generate(words, starts)
127 |     out = ' '.join(generated).replace(' , ',', ').replace(' . ','. ')
128 |     return '%s%s&hellip;'%(out[0].upper(),out[1:])
129 | 
130 | if __name__ == '__main__':
131 |     app.run()


--------------------------------------------------------------------------------
/test/transliterator_test.rb:
--------------------------------------------------------------------------------
  1 | # encoding: utf-8
  2 | require File.expand_path("../helper", __FILE__)
  3 | 
  4 | class TransliteratorTest < MiniTest::Test
  5 |   DONT_CONVERT = [
  6 |     "Vitrum edere possum; mihi non nocet.", # Latin
  7 |     "Je puis mangier del voirre. Ne me nuit.", # Old French
  8 |     "Kristala jan dezaket, ez dit minik ematen.", # Basque
  9 |     "Kaya kong kumain nang bubog at hindi ako masaktan.", # Tagalog
 10 |     "Ich kann Glas essen, ohne mir weh zu tun.", # German
 11 |     "I can eat glass and it doesn't hurt me.", # English
 12 |   ]
 13 | 
 14 |   CONVERT_PAIRS = {
 15 |     "Je peux manger du verre, ça ne me fait pas de mal." => # French
 16 |       "Je peux manger du verre, ca ne me fait pas de mal.",
 17 |     "Pot să mănânc sticlă și ea nu mă rănește." => # Romanian
 18 |       "Pot sa mananc sticla si ea nu ma raneste.",
 19 |     "Ég get etið gler án þess að meiða mig." => # Icelandic
 20 |       "Eg get etid gler an thess ad meida mig.",
 21 |     "Unë mund të ha qelq dhe nuk më gjen gjë." => # Albanian
 22 |       "Une mund te ha qelq dhe nuk me gjen gje.",
 23 |     "Mogę jeść szkło i mi nie szkodzi." => # Polish
 24 |       "Moge jesc szklo i mi nie szkodzi.",
 25 | #     "Я могу есть стекло, оно мне не вредит." => # Russian
 26 | #       "Ia moghu iest' stieklo, ono mnie nie vriedit.",
 27 | #     "Мога да ям стъкло, то не ми вреди." => # Bulgarian
 28 | #       "Mogha da iam stklo, to nie mi vriedi.",
 29 | #     "ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬" => # Anglo-Saxon
 30 | #       "ic.mag.glas.eotacn.ond.hit.ne.heacrmiacth.me:",
 31 | #     "ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει" => # Classical Greek
 32 | #       "ualon phagein dunamai; touto ou me blaptei",
 33 | #     "मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती" => # Hindi
 34 | #       "maiN kaaNc khaa sktaa huuN aur mujhe usse koii cott nhiiN phuNctii",
 35 | #     "من می توانم بدونِ احساس درد شيشه بخورم" => # Persian
 36 | #       "mn my twnm bdwni Hss drd shyshh bkhwrm",
 37 | #     "أنا قادر على أكل الزجاج و هذا لا يؤلمن" => # Arabic
 38 | #       "'n qdr 'l~ 'kl lzjj w hdh l yw'lmn",
 39 | #     "אני יכול לאכול זכוכית וזה לא מזיק לי" => # Hebrew
 40 | #       "ny ykvl lkvl zkvkyt vzh l mzyq ly",
 41 | #     "ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ" => # Thai
 42 | #       "chankinkracchkaid aetmanaimthamaihchanecchb",
 43 | #     "我能吞下玻璃而不伤身体。" => # Chinese
 44 | #       "Wo Neng Tun Xia Bo Li Er Bu Shang Shen Ti . ",
 45 | #     "私はガラスを食べられます。それは私を傷つけません。" => # Japanese
 46 | #       "Si hagarasuwoShi beraremasu. sorehaSi woShang tukemasen. ",
 47 | #     "⠋⠗⠁⠝⠉⠑" => # Braille
 48 | #       "france",
 49 |       "Schloß - Assunção - Łódź" =>
 50 |         "Schloss - Assuncao - Lodz",
 51 |       "TÜM GOLLER Fb 4-1 Bursa Maç Özeti Íƶle" =>
 52 |         "TUM GOLLER Fb 4-1 Bursa Mac Ozeti Izle",
 53 |       "ßßßßß" => "ssssssssss"
 54 |   }
 55 | 
 56 |   def test_transliterate
 57 |     trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC"
 58 | 
 59 |     DONT_CONVERT.each do |subject|
 60 |       assert_equal subject, trans(subject, trans_id)
 61 |     end
 62 | 
 63 |     CONVERT_PAIRS.each do |before, after|
 64 |       assert_equal after, trans(before, trans_id)
 65 |     end
 66 |   end
 67 | 
 68 |   if "".respond_to? :force_encoding
 69 |     def test_transliterate_id_must_be_utf8_or_ascii
 70 |       trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC".force_encoding('big5')
 71 |       txt = "blah blah blah"
 72 | 
 73 |       assert_raises Encoding::CompatibilityError do
 74 |         trans(txt, trans_id)
 75 |       end
 76 | 
 77 |       trans_id.force_encoding('UTF-8')
 78 |       begin
 79 |         trans(txt, trans_id)
 80 |       rescue Encoding::CompatibilityError => e
 81 |         assert_nil e, "#{e.class.name} raised, expected not to"
 82 |       end
 83 | 
 84 |       trans_id.force_encoding('US-ASCII')
 85 |       begin
 86 |         trans(txt, trans_id)
 87 |       rescue Encoding::CompatibilityError => e
 88 |         assert_nil e, "#{e.class.name} raised, expected not to"
 89 |       end
 90 |     end
 91 | 
 92 |     def test_transliterate_text_must_be_utf8_or_ascii
 93 |       trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC"
 94 |       txt = "blah blah blah".force_encoding('big5')
 95 | 
 96 |       assert_raises Encoding::CompatibilityError do
 97 |         trans(txt, trans_id)
 98 |       end
 99 | 
100 |       txt.force_encoding('UTF-8')
101 |       begin
102 |         trans(txt, trans_id)
103 |       rescue Encoding::CompatibilityError => e
104 |         assert_nil e, "#{e.class.name} raised, expected not to"
105 |       end
106 | 
107 |       txt.force_encoding('US-ASCII')
108 |       begin
109 |         trans(txt, trans_id)
110 |       rescue Encoding::CompatibilityError => e
111 |         assert_nil e, "#{e.class.name} raised, expected not to"
112 |       end
113 |     end
114 |   end
115 | 
116 |   def test_transliterator_id_list_shouldnt_be_empty
117 |     assert !CharlockHolmes::Transliterator.id_list.empty?
118 |   end
119 | 
120 |   def trans(text, id) # shorthand for CharlockHolmes::Transliterator.transliterate used by the tests above
121 |     CharlockHolmes::Transliterator.transliterate(text, id)
122 |   end
123 | end
124 | 


--------------------------------------------------------------------------------
/test/encoding_detector_test.rb:
--------------------------------------------------------------------------------
  1 | # encoding: utf-8
  2 | require File.expand_path("../helper", __FILE__)
  3 | 
  4 | class EncodingDetectorTest < MiniTest::Test
  5 |   def setup
  6 |     @detector = CharlockHolmes::EncodingDetector.new
  7 |   end
  8 | 
  9 |   def test_has_class_level_detect_method
 10 |     CharlockHolmes::EncodingDetector.respond_to? :detect
 11 |     detected = CharlockHolmes::EncodingDetector.detect 'test'
 12 |     assert_equal 'ISO-8859-1', detected[:encoding]
 13 |   end
 14 | 
 15 |   def test_class_level_detect_accepts_encoding_hint
 16 |     CharlockHolmes::EncodingDetector.respond_to? :detect
 17 |     detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
 18 |     assert_equal 'ISO-8859-1', detected[:encoding]
 19 |   end
 20 | 
 21 |   def test_has_class_level_detect_all_method
 22 |     CharlockHolmes::EncodingDetector.respond_to? :detect_all
 23 |     detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
 24 |     assert detected_list.is_a? Array
 25 | 
 26 |     encoding_list = detected_list.map {|d| d[:encoding]}.sort
 27 |     assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
 28 |   end
 29 | 
 30 |   def test_class_level_detect_all_method_accepts_encoding_hint
 31 |     CharlockHolmes::EncodingDetector.respond_to? :detect_all
 32 |     detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
 33 |     assert detected_list.is_a? Array
 34 | 
 35 |     encoding_list = detected_list.map {|d| d[:encoding]}.sort
 36 |     assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
 37 |   end
 38 | 
 39 |   def test_has_detect_method
 40 |     @detector.respond_to? :detect
 41 |     detected = @detector.detect 'test'
 42 |     assert_equal 'ISO-8859-1', detected[:encoding]
 43 |   end
 44 | 
 45 |   def test_detect_accepts_encoding_hint
 46 |     @detector.respond_to? :detect
 47 |     detected = @detector.detect 'test', 'UTF-8'
 48 |     assert_equal 'ISO-8859-1', detected[:encoding]
 49 |   end
 50 | 
 51 |   def test_has_detect_all_method
 52 |     @detector.respond_to? :detect_all
 53 |     detected_list = @detector.detect_all 'test'
 54 |     assert detected_list.is_a? Array
 55 | 
 56 |     encoding_list = detected_list.map {|d| d[:encoding]}.sort
 57 |     assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
 58 |   end
 59 | 
 60 |   def test_detect_all_accepts_encoding_hint
 61 |     @detector.respond_to? :detect_all
 62 |     detected_list = @detector.detect_all 'test', 'UTF-8'
 63 |     assert detected_list.is_a? Array
 64 | 
 65 |     encoding_list = detected_list.map {|d| d[:encoding]}.sort
 66 |     assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
 67 |   end
 68 | 
 69 |   def test_strip_tags_flag
 70 |     detector = CharlockHolmes::EncodingDetector.new
 71 |     detector.strip_tags = true
 72 |     assert detector.strip_tags
 73 | 
 74 |     detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
 75 |     assert_equal 'UTF-8', detection[:encoding]
 76 | 
 77 |     detector.strip_tags = false
 78 |     assert !detector.strip_tags
 79 | 
 80 |     detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
 81 |     assert_equal 'UTF-8', detection[:encoding]
 82 |   end
 83 | 
 84 |   def test_has_list_of_supported_encodings
 85 |     CharlockHolmes::EncodingDetector.respond_to? :supported_encodings
 86 |     supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings
 87 | 
 88 |     assert supported_encodings.is_a?(Array)
 89 |     assert supported_encodings.include? 'UTF-8'
 90 |     assert supported_encodings.include? 'windows-1250'
 91 |     assert supported_encodings.include? 'windows-1252'
 92 |     assert supported_encodings.include? 'windows-1253'
 93 |     assert supported_encodings.include? 'windows-1254'
 94 |     assert supported_encodings.include? 'windows-1255'
 95 |   end
 96 | 
 97 |   def test_returns_a_ruby_compatible_encoding_name
 98 |     detected = @detector.detect 'test'
 99 |     assert_equal 'ISO-8859-1', detected[:encoding]
100 |     assert_equal 'ISO-8859-1', detected[:ruby_encoding]
101 | 
102 |     not_compat_txt = fixture("ISO-2022-KR.txt").read
103 |     detected = @detector.detect not_compat_txt
104 |     assert_equal 'ISO-2022-KR', detected[:encoding]
105 |     assert_equal 'binary', detected[:ruby_encoding]
106 |   end
107 | 
108 |   def test_is_binary
109 |     png = fixture('octocat.png').read
110 |     assert @detector.is_binary?(png)
111 | 
112 |     utf16 = fixture('AnsiGraph.psm1').read
113 |     refute @detector.is_binary?(utf16)
114 | 
115 |     utf8 = fixture('core.rkt').read
116 |     refute @detector.is_binary?(utf8)
117 |   end
118 | 
119 |   MAPPING = [
120 |     ['repl2.cljs',                'ISO-8859-1', :text],
121 |     ['cl-messagepack.lisp',       'ISO-8859-1', :text],
122 |     ['sierpinski.ps',             'ISO-8859-1', :text],
123 |     ['core.rkt',                  'UTF-8',      :text],
124 |     ['TwigExtensionsDate.es.yml', 'UTF-8',      :text],
125 |     ['laholator.py',              'UTF-8',      :text],
126 |     ['vimrc',                     'UTF-8',      :text],
127 |     ['AnsiGraph.psm1',            'UTF-16LE',   :text],
128 |     ['utf16be.html',              'UTF-16BE',   :text],
129 |     ['utf32le.html',              'UTF-32LE',   :text],
130 |     ['utf32be.html',              'UTF-32BE',   :text],
131 |     ['hello_world',               nil,          :binary],
132 |     ['octocat.png',               nil,          :binary],
133 |     ['octocat.jpg',               nil,          :binary],
134 |     ['octocat.psd',               nil,          :binary],
135 |     ['octocat.gif',               nil,          :binary],
136 |     ['octocat.ai',                nil,          :binary],
137 |     ['foo.pdf',                   nil,          :binary],
138 |   ]
139 | 
140 |   def test_detection_works_as_expected
141 |     MAPPING.each do |mapping|
142 |       file, encoding, type = mapping
143 | 
144 |       content = fixture(file).read
145 |       guessed = @detector.detect content
146 | 
147 |       assert_equal encoding, guessed[:encoding]
148 |       assert_equal type, guessed[:type]
149 | 
150 |       if content.respond_to?(:force_encoding) && guessed[:type] == :text
151 |         content.force_encoding guessed[:encoding]
152 |         assert content.valid_encoding?
153 |       end
154 |     end
155 |   end
156 | end
157 | 


--------------------------------------------------------------------------------
/test/fixtures/cl-messagepack.lisp:
--------------------------------------------------------------------------------
  1 | ;;;; cl-messagepack.lisp
  2 | 
  3 | (in-package #:messagepack)
  4 | 
  5 | (declaim (optimize (debug 3)))
  6 | 
  7 | (eval-when (:compile-toplevel :load-toplevel :execute)
  8 |   (defun mkstr (&rest parts) ; the ~a renderings of PARTS joined into one string
  9 |     (format nil "~{~a~}" parts))
 10 |   (defun mksymb (&rest parts) ; the MKSTR of PARTS, interned as a symbol
 11 |     (intern (apply #'mkstr parts))))
 12 | 
 13 | (defmacro signed-unsigned-convertors (size) ; expands to two DEFUNs named via MKSYMB from sb/ub, SIZE and ->
 14 |   (let ((speed (if (< size 32) 3 0))) ; SPEED 3 when SIZE is below 32, otherwise 0
 15 |     `(progn
 16 |        (defun ,(mksymb 'sb size '-> 'ub size) (sb) ; signed -> unsigned
 17 |          (declare (optimize (debug 0) (safety 0) (speed ,speed))
 18 |                   (type (integer ,(- (expt 2 (1- size))) ,(1- (expt 2 (1- size)))) sb))
 19 |          (if (< sb 0)
 20 |              (ldb (byte ,size 0) sb) ; low SIZE bits of the negative value
 21 |              sb))
 22 |        (defun ,(mksymb 'ub size '-> 'sb size) (sb) ; unsigned -> signed
 23 |          (declare (optimize (debug 0) (safety 0) (speed ,speed))
 24 |                   (type (mod ,(expt 2 size)) sb))
 25 |          (if (logbitp (1- ,size) sb) ; bit SIZE-1 set: result is negative
 26 |              (- (1+ (logxor (1- (expt 2 ,size)) sb)))
 27 |              sb)))))
 28 | 
 29 | (signed-unsigned-convertors 8) ; converter pairs for sizes 8, 16, 32 and 64
 30 | (signed-unsigned-convertors 16)
 31 | (signed-unsigned-convertors 32)
 32 | (signed-unsigned-convertors 64)
 33 | 
 34 | (defun write-hex (data)
 35 |   "Print the elements of DATA to standard output as two-digit hex, 16 per line."
 36 |   (flet ((emit (row) (format t "~{~2,'0x ~}~%" (reverse row))))
 37 |     (let ((row '()))
 38 |       (dotimes (i (length data))
 39 |         (push (elt data i) row)
 40 |         (when (= (length row) 16)
 41 |           (emit row)
 42 |           (setf row '())))
 43 |       (when row
 44 |         (emit row)))))
 45 | 
 46 | (defun encode (data) ; runs ENCODE-STREAM on DATA inside flexi-streams:with-output-to-sequence
 47 |   (flexi-streams:with-output-to-sequence (out)
 48 |     (encode-stream data out)))
 49 | 
 50 | (defun make-hash (data)
 51 |   "Build a hash table from DATA, a list of (key value) lists or (key . value) conses."
 52 |   (let ((table (make-hash-table)))
 53 |     (dolist (entry data table)
 54 |       (let ((key (car entry))
 55 |             (tail (cdr entry)))
 56 |         (setf (gethash key table)
 57 |               (if (consp tail) (car tail) tail))))))
 58 | 
 59 | (defun is-byte-array (data-type) ; true for vectors whose element type is (unsigned-byte 8)
 60 |   (when (vectorp data-type)
 61 |     (equal (array-element-type data-type) '(unsigned-byte 8))))
 62 | 
 63 | (defun encode-stream (data stream)
 64 |   "Write DATA to STREAM, dispatching on its type. The first matching clause
 65 | wins, so clause order matters (e.g. STRING is tested before VECTOR).
 66 | Signals an error when no clause applies."
 67 |   (typecase data
 68 |     (float (encode-float data stream))
 69 |     (number (encode-integer data stream))
 70 |     (null (write-byte #xc0 stream))
 71 |     ((eql t) (write-byte #xc3 stream))
 72 |     (string (encode-string data stream))
 73 |     ;; byte vectors are written raw; any other vector is handled as an array
 74 |     ((satisfies is-byte-array) (encode-raw-bytes data stream))
 75 |     ((or cons vector) (encode-array data stream))
 76 |     (hash-table (encode-hash data stream))
 77 |     (symbol (encode-string (symbol-name data) stream))
 78 |     (t (error "Cannot encode data."))))
 79 | 
 80 | (defun encode-string (data stream) ; DATA is written as the octets babel:string-to-octets returns
 81 |   (let ((octets (babel:string-to-octets data))) (encode-raw-bytes octets stream)))
 82 | 
 83 | #+sbcl (defun sbcl-encode-float (data stream)
 84 |          ;; Writes #xca or #xcb followed by the float's bits; always returns T.
 85 |          (case (type-of data)
 86 |            (single-float (write-byte #xca stream)
 87 |                          (store-big-endian (sb-kernel:single-float-bits data) stream 4))
 88 |            (double-float (write-byte #xcb stream)
 89 |                          (store-big-endian (sb-kernel:double-float-high-bits data) stream 4)
 90 |                          (store-big-endian (sb-kernel:double-float-low-bits data) stream 4)))
 91 |          t)
 92 | 
 93 | (defun encode-float (data stream)
 94 |   (or #+sbcl (sbcl-encode-float data stream)
 95 |       #-(or sbcl) (error "No floating point support yet.")))
 96 | 
 97 | (defun encode-each (data stream &optional (encoder #'encode-stream))
 98 |   (cond ((hash-table-p data)
 99 |          (maphash (lambda (key value)
100 |                     (funcall encoder key stream)
101 |                     (funcall encoder value stream))
102 |                   data))
103 |         ((or (vectorp data) (consp data))
104 |          (mapc (lambda (subdata)
105 |                  (funcall encoder subdata stream))
106 |                (coerce data 'list)))
107 |         (t (error "Not sequence or hash table."))))
108 | 
109 | (defun encode-sequence (data stream
110 |                         short-prefix short-length
111 |                         typecode-16 typecode-32
112 |                         &optional (encoder #'encode-stream))
113 |   (let ((len (if (hash-table-p data)
114 |                  (hash-table-count data)
115 |                  (length data))))
116 |     (cond ((<= 0 len short-length)
117 |            (write-byte (+ short-prefix len) stream)
118 |            (encode-each data stream encoder))
119 |           ((<= 0 len 65535)
120 |            (write-byte typecode-16 stream)
121 |            (store-big-endian len stream 2)
122 |            (encode-each data stream encoder))
123 |           ((<= 0 len (1- (expt 2 32)))
124 |            (write-byte typecode-32 stream)
125 |            (store-big-endian len stream 4)
126 |            (encode-each data stream encoder)))))
127 | 
128 | (defun encode-hash (data stream)
129 |   (encode-sequence data stream #x80 15 #xdc #xdd))
130 | 
131 | (defun encode-array (data stream)
132 |   (encode-sequence data stream #x90 15 #xdc #xdd))
133 | 
134 | (defun encode-raw-bytes (data stream)
135 |   (encode-sequence data stream #xa0 31 #xda #xdb #'write-byte))
136 | 
137 | (defun encode-integer (data stream)
138 |   (cond ((<= 0 data 127) (write-byte data stream))
139 |         ((<= -32 data -1) (write-byte (sb8->ub8 data) stream))
140 |         ((<= 0 data 255)
141 |          (write-byte #xcc stream)
142 |          (write-byte data stream))
143 |         ((<= 0 data 65535)
144 |          (write-byte #xcd stream)
145 |          (store-big-endian data stream 2))
146 |         ((<= 0 data (1- (expt 2 32)))
147 |          (write-byte #xce stream)
148 |          (store-big-endian data stream 4))
149 |         ((<= 0 data (1- (expt 2 64)))
150 |          (write-byte #xcf stream)
151 |          (store-big-endian data stream 8))
152 |         ((<= -128 data 127)
153 |          (write-byte #xd0 stream)
154 |          (write-byte (sb8->ub8 data) stream))
155 |         ((<= -32768 data 32767)
156 |          (write-byte #xd1 stream)
157 |          (write-byte (sb16->ub16 data) stream))
158 |         ((<= (- (expt 2 31)) data (1- (expt 2 31)))
159 |          (write-byte #xd2 stream)
160 |          (write-byte (sb32->ub32 data) stream))
161 |         ((<= (- (expt 2 63)) data (1- (expt 2 63)))
162 |          (write-byte #xd3 stream)
163 |          (write-byte (sb64->ub64 data) stream))
164 |         (t (error "Integer too large or too small."))))
165 | 
166 | (defun store-big-endian (number stream byte-count)
167 |   (let (byte-list)
168 |     (loop
169 |        while (> number 0)
170 |        do
171 |          (push (rem number 256)
172 |                byte-list)
173 |          (setf number (ash number -8)))
174 |     (loop
175 |        while (< (length byte-list) byte-count)
176 |        do (push 0 byte-list))
177 |     (when (> (length byte-list) byte-count)
178 |       (error "Number too large."))
179 |     (write-sequence byte-list stream)))
180 | 
181 | (defun decode (byte-array)
182 |   (flexi-streams:with-input-from-sequence (stream byte-array)
183 |     (decode-stream stream)))
184 | 
185 | (defun decode-stream (stream)
186 |   (let ((byte (read-byte stream)))
187 |     (cond ((= 0 (ldb (byte 1 7) byte))
188 |            byte)
189 |           ((= 7 (ldb (byte 3 5) byte))
190 |            (ub8->sb8 byte))
191 |           ((= #xcc byte)
192 |            (read-byte stream))
193 |           ((= #xcd byte)
194 |            (load-big-endian stream 2))
195 |           ((= #xce byte)
196 |            (load-big-endian stream 4))
197 |           ((= #xcf byte)
198 |            (load-big-endian stream 8))
199 |           ((= #xd0 byte)
200 |            (ub8->sb8 (read-byte stream)))
201 |           ((= #xd1 byte)
202 |            (ub16->sb16 (load-big-endian stream 2)))
203 |           ((= #xd2 byte)
204 |            (ub32->sb32 (load-big-endian stream 4)))
205 |           ((= #xd3 byte)
206 |            (ub64->sb64 (load-big-endian stream 8)))
207 |           ((= #xc0 byte)
208 |            nil)
209 |           ((= #xc3 byte)
210 |            t)
211 |           ((= #xc2 byte)
212 |            nil)
213 |           ((= #xca byte)
214 |            (or #+sbcl (sb-kernel:make-single-float (load-big-endian stream 4))
215 |                #-(or sbcl) (error "No floating point support yet.")))
216 |           ((= #xcb byte)
217 |            (or #+sbcl (sb-kernel:make-double-float (load-big-endian stream 4)
218 |                                                    (load-big-endian stream 4))
219 |                #-(or sbcl) (error "No floating point support yet.")))
220 |           ((= 5 (ldb (byte 3 5) byte))
221 |            (decode-raw-sequence (ldb (byte 5 0) byte) stream))
222 |           ((= #xda byte)
223 |            (decode-raw-sequence (load-big-endian stream 2) stream))
224 |           ((= #xdb byte)
225 |            (decode-raw-sequence (load-big-endian stream 4) stream))
226 |           ((= 9 (ldb (byte 4 4) byte))
227 |            (decode-array (- byte #x90) stream))
228 |           ((= #xdc byte)
229 |            (decode-array (load-big-endian stream 2) stream))
230 |           ((= #xdd byte)
231 |            (decode-array (load-big-endian stream 4) stream))
232 |           ((= 8 (ldb (byte 4 4) byte))
233 |            (decode-map (- byte #x80) stream))
234 |           ((= #xde byte)
235 |            (decode-map (load-big-endian stream 2) stream))
236 |           ((= #xdf byte)
237 |            (decode-map (load-big-endian stream 4) stream)))))
238 | 
239 | (defun decode-map (length stream)
240 |   (let ((hash-table (make-hash-table :test #'equal)))
241 |     (loop repeat length
242 |        do (let ((key (decode-stream stream))
243 |                 (value (decode-stream stream)))
244 |             (setf (gethash key hash-table) value)))
245 |     hash-table))
246 | 
247 | (defun decode-array (length stream)
248 |   (let ((array (make-array length)))
249 |     (dotimes (i length)
250 |       (setf (aref array i) (decode-stream stream)))
251 |     array))
252 | 
253 | (defun decode-raw-sequence (length stream)
254 |   (let ((seq (make-array length :element-type '(mod 256))))
255 |     (read-sequence seq stream)
256 |     (babel:octets-to-string seq)))
257 | 
258 | (defun load-big-endian (stream byte-count)
259 |   (let ((result 0))
260 |     (loop
261 |        repeat byte-count
262 |        do (setf result (+ (ash result 8)
263 |                           (read-byte stream))))
264 |     result))
265 | 


--------------------------------------------------------------------------------
/ext/charlock_holmes/encoding_detector.c:
--------------------------------------------------------------------------------
  1 | #include "unicode/ucsdet.h"
  2 | #include "common.h"
  3 | 
  4 | extern VALUE rb_mCharlockHolmes;
  5 | static VALUE rb_cEncodingDetector;
  6 | 
   7 | typedef struct { /* native state wrapped inside each EncodingDetector object */
   8 | 	UCharsetDetector *csd; /* ICU detector handle: opened in rb_encdec__alloc, closed in rb_encdec__free */
   9 | } charlock_detector_t;
 10 | 
 11 | static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
 12 | {
 13 | 	UErrorCode status = U_ZERO_ERROR;
 14 | 	const char *mname;
 15 | 	const char *mlang;
 16 | 	int mconfidence;
 17 | 	VALUE rb_match;
 18 | 	VALUE enc_tbl;
 19 | 	VALUE enc_name;
 20 | 	VALUE compat_enc;
 21 | 
 22 | 	if (!match)
 23 | 		return Qnil;
 24 | 
 25 | 	mname = ucsdet_getName(match, &status);
 26 | 	mlang = ucsdet_getLanguage(match, &status);
 27 | 	mconfidence = ucsdet_getConfidence(match, &status);
 28 | 
 29 | 	rb_match = rb_hash_new();
 30 | 
 31 | 	rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
 32 | 
 33 | 	enc_name = charlock_new_str2(mname);
 34 | 	rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), enc_name);
 35 | 
 36 | 	enc_tbl = rb_iv_get(rb_cEncodingDetector, "@encoding_table");
 37 | 	compat_enc = rb_hash_aref(enc_tbl, enc_name);
 38 | 	if (!NIL_P(compat_enc)) {
 39 | 		rb_hash_aset(rb_match, ID2SYM(rb_intern("ruby_encoding")), compat_enc);
 40 | 	}
 41 | 
 42 | 	rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
 43 | 
 44 | 	if (mlang && mlang[0])
 45 | 		rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
 46 | 
 47 | 	return rb_match;
 48 | }
 49 | 
 50 | static VALUE rb_encdec_binarymatch() {
 51 | 	VALUE rb_match;
 52 | 
 53 | 	rb_match = rb_hash_new();
 54 | 
 55 | 	rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("binary")));
 56 | 	rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(100));
 57 | 
 58 | 	return rb_match;
 59 | }
 60 | 
 61 | static int detect_binary_content(VALUE self, VALUE rb_str) {
 62 | 	size_t buf_len, scan_len;
 63 | 	const char *buf;
 64 | 
 65 | 	buf = RSTRING_PTR(rb_str);
 66 | 	buf_len = RSTRING_LEN(rb_str);
 67 | 	scan_len = NUM2ULL(rb_iv_get(self, "@binary_scan_length"));
 68 | 
 69 | 	if (buf_len > 10) {
 70 | 		// application/postscript
 71 | 		if (!memcmp(buf, "%!PS-Adobe-", 11))
 72 | 			return 0;
 73 | 	}
 74 | 
 75 | 	if (buf_len > 7) {
 76 | 		// image/png
 77 | 		if (!memcmp(buf, "\x89PNG\x0D\x0A\x1A\x0A", 8))
 78 | 			return 1;
 79 | 	}
 80 | 
 81 | 	if (buf_len > 5) {
 82 | 		// image/gif
 83 | 		if (!memcmp(buf, "GIF87a", 6))
 84 | 			return 1;
 85 | 
 86 | 		// image/gif
 87 | 		if (!memcmp(buf, "GIF89a", 6))
 88 | 			return 1;
 89 | 	}
 90 | 
 91 | 	if (buf_len > 4) {
 92 | 		// application/pdf
 93 | 		if (!memcmp(buf, "%PDF-", 5))
 94 | 			return 1;
 95 | 	}
 96 | 
 97 | 	if (buf_len > 3) {
 98 | 		// UTF-32BE
 99 | 		if (!memcmp(buf, "\0\0\xfe\xff", 4))
100 | 			return 0;
101 | 
102 | 		// UTF-32LE
103 | 		if (!memcmp(buf, "\xff\xfe\0\0", 4))
104 | 			return 0;
105 | 	}
106 | 
107 | 	if (buf_len > 2) {
108 | 		// image/jpeg
109 | 		if (!memcmp(buf, "\xFF\xD8\xFF", 3))
110 | 			return 1;
111 | 	}
112 | 
113 | 	if (buf_len > 1) {
114 | 		// UTF-16BE
115 | 		if (!memcmp(buf, "\xfe\xff", 2))
116 | 			return 0;
117 | 
118 | 		// UTF-16LE
119 | 		if (!memcmp(buf, "\xff\xfe", 2))
120 | 			return 0;
121 | 	}
122 | 
123 | 	/*
124 | 	 * If we got this far, any NULL bytes within the `scan_len`
125 | 	 * range will likely mean the contents are binary.
126 | 	 */
127 | 	if (scan_len < buf_len)
128 | 		buf_len = scan_len;
129 | 	return !!memchr(buf, 0, buf_len);
130 | }
131 | 
132 | /*
133 |  * call-seq: true/false = EncodingDetector.is_binary? str
134 |  *
135 |  * Attempt to detect if a string is binary or text
136 |  *
137 |  * str      - a String, what you want to perform the binary check on
138 |  *
139 |  * Returns: true or false
140 |  */
141 | static VALUE rb_encdec_is_binary(VALUE self, VALUE str)
142 | {
143 | 	if (detect_binary_content(self, str))
144 | 		return Qtrue;
145 | 	else
146 | 		return Qfalse;
147 | }
148 | 
149 | /*
150 |  * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
151 |  *
152 |  * Attempt to detect the encoding of this string
153 |  *
154 |  * str      - a String, what you want to detect the encoding of
155 |  * hint_enc - an optional String (like "UTF-8"), the encoding name which will
156 |  *            be used as an additional hint to the charset detector
157 |  *
158 |  * Returns: a Hash with :encoding, :language, :type and :confidence
159 |  */
160 | static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
161 | {
162 | 	UErrorCode status = U_ZERO_ERROR;
163 | 	charlock_detector_t *detector;
164 | 	VALUE rb_str;
165 | 	VALUE rb_enc_hint;
166 | 
167 | 	rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
168 | 
169 | 	Check_Type(rb_str, T_STRING);
170 | 	Data_Get_Struct(self, charlock_detector_t, detector);
171 | 
172 | 	// first lets see if this is binary content
173 | 	if (detect_binary_content(self, rb_str)) {
174 | 		return rb_encdec_binarymatch();
175 | 	}
176 | 
177 | 	// if we got here - the data doesn't look like binary
178 | 	// lets try to figure out what encoding the text is in
179 | 	ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
180 | 
181 | 	if (!NIL_P(rb_enc_hint)) {
182 | 		Check_Type(rb_enc_hint, T_STRING);
183 | 		ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
184 | 	}
185 | 
186 | 	return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status));
187 | }
188 | 
189 | 
190 | /*
191 |  * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
192 |  *
193 |  * Attempt to detect the encoding of this string, and return
194 |  * a list with all the possible encodings that match it.
195 |  *
196 |  *
197 |  * str      - a String, what you want to detect the encoding of
198 |  * hint_enc - an optional String (like "UTF-8"), the encoding name which will
199 |  *            be used as an additional hint to the charset detector
200 |  *
201 |  * Returns: an Array with zero or more Hashes,
202 |  *          each one of them with with :encoding, :language, :type and :confidence
203 |  */
204 | static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
205 | {
206 | 	UErrorCode status = U_ZERO_ERROR;
207 | 	charlock_detector_t *detector;
208 | 	const UCharsetMatch **csm;
209 | 	VALUE rb_ret;
210 | 	int i, match_count;
211 | 	VALUE rb_str;
212 | 	VALUE rb_enc_hint;
213 | 	VALUE binary_match;
214 | 
215 | 	rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
216 | 
217 | 	Check_Type(rb_str, T_STRING);
218 | 	Data_Get_Struct(self, charlock_detector_t, detector);
219 | 
220 | 	rb_ret = rb_ary_new();
221 | 
222 | 	// first lets see if this is binary content
223 | 	binary_match = Qnil;
224 | 	if (detect_binary_content(self, rb_str)) {
225 | 		binary_match = rb_encdec_binarymatch();
226 | 	}
227 | 
228 | 	ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
229 | 
230 | 	if (!NIL_P(rb_enc_hint)) {
231 | 		Check_Type(rb_enc_hint, T_STRING);
232 | 		ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
233 | 	}
234 | 
235 | 	csm = ucsdet_detectAll(detector->csd, &match_count, &status);
236 | 
237 | 	for (i = 0; i < match_count; ++i) {
238 | 		rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
239 | 	}
240 | 
241 | 	if (!NIL_P(binary_match))
242 | 		rb_ary_unshift(rb_ret, binary_match);
243 | 
244 | 	return rb_ret;
245 | }
246 | 
247 | /*
248 |  * call-seq: EncodingDetector#strip_tags?
249 |  *
250 |  * Returns whether or not the strip_tags flag is set on this detector
251 |  *
252 |  * Returns: Boolean
253 |  */
254 | static VALUE rb_get_strip_tags(VALUE self)
255 | {
256 | 	charlock_detector_t *detector;
257 | 	UBool val;
258 | 	VALUE rb_val;
259 | 
260 | 	Data_Get_Struct(self, charlock_detector_t, detector);
261 | 
262 | 	val = ucsdet_isInputFilterEnabled(detector->csd);
263 | 
264 | 	rb_val = val == 1 ? Qtrue : Qfalse;
265 | 
266 | 	return rb_val;
267 | }
268 | 
269 | /*
270 |  * call-seq: EncodingDetector#strip_tags = true
271 |  *
272 |  * Enable or disable the stripping of HTML/XML tags from the input before
273 |  * attempting any detection
274 |  *
275 |  * Returns: Boolean, the value passed
276 |  */
277 | static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
278 | {
279 | 	charlock_detector_t *detector;
280 | 	UBool val;
281 | 
282 | 	Data_Get_Struct(self, charlock_detector_t, detector);
283 | 
284 | 	val = rb_val == Qtrue ? 1 : 0;
285 | 
286 | 	ucsdet_enableInputFilter(detector->csd, val);
287 | 
288 | 	return rb_val;
289 | }
290 | 
291 | /*
292 |  * call-seq: detectable_encodings = EncodingDetector.supported_encodings
293 |  *
294 |  * The list of detectable encodings supported by this library
295 |  *
296 |  * Returns: an Array of Strings
297 |  */
298 | static VALUE rb_get_supported_encodings(VALUE klass)
299 | {
300 | 	UCharsetDetector *csd;
301 | 	UErrorCode status = U_ZERO_ERROR;
302 | 	UEnumeration *encoding_list;
303 | 	VALUE rb_encoding_list;
304 | 	int32_t enc_count;
305 | 	int32_t i;
306 | 	const char *enc_name;
307 | 	int32_t enc_name_len;
308 | 
309 | 	rb_encoding_list = rb_iv_get(klass, "encoding_list");
310 | 
311 | 	// lazily populate the list
312 | 	if (NIL_P(rb_encoding_list)) {
313 | 		csd = ucsdet_open(&status);
314 | 
315 | 		encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
316 | 		rb_encoding_list = rb_ary_new();
317 | 		enc_count = uenum_count(encoding_list, &status);
318 | 
319 | 		rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1250"));
320 | 		rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1252"));
321 | 		rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1253"));
322 | 		rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1254"));
323 | 		rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1255"));
324 | 
325 | 		for(i=0; i < enc_count; i++) {
326 | 			enc_name = uenum_next(encoding_list, &enc_name_len, &status);
327 | 			rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
328 | 		}
329 | 
330 | 		rb_iv_set(klass, "encoding_list", rb_encoding_list);
331 | 		ucsdet_close(csd);
332 | 	}
333 | 
334 | 	return rb_encoding_list;
335 | }
336 | 
337 | static void rb_encdec__free(void *obj)
338 | {
339 | 	charlock_detector_t *detector;
340 | 
341 | 	detector = (charlock_detector_t *)obj;
342 | 
343 | 	if (detector->csd)
344 | 		ucsdet_close(detector->csd);
345 | 
346 | 	free(detector);
347 | }
348 | 
349 | static VALUE rb_encdec__alloc(VALUE klass)
350 | {
351 | 	charlock_detector_t *detector;
352 | 	UErrorCode status = U_ZERO_ERROR;
353 | 	VALUE obj;
354 | 
355 | 	detector = calloc(1, sizeof(charlock_detector_t));
356 | 	obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector);
357 | 
358 | 	detector->csd = ucsdet_open(&status);
359 | 	if (U_FAILURE(status)) {
360 | 		rb_raise(rb_eStandardError, "%s", u_errorName(status));
361 | 	}
362 | 
363 | 	return obj;
364 | }
365 | 
366 | void _init_charlock_encoding_detector()
367 | {
368 | 	rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
369 | 	rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
370 | 	rb_define_method(rb_cEncodingDetector, "is_binary?", rb_encdec_is_binary, 1);
371 | 	rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
372 | 	rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
373 | 	rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
374 | 	rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
375 | 
376 | 	rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
377 | }
378 | 


--------------------------------------------------------------------------------
/test/fixtures/core.rkt:
--------------------------------------------------------------------------------
  1 | #lang racket/base
  2 | 
  3 | (require (for-syntax syntax/parse racket/syntax racket)
  4 |          ffi/unsafe racket/function racket/string
  5 |          "start.rkt" "c.rkt")
  6 | 
   7 | (struct jtype (signature tag predicate ctype racket->java java->racket)) ; base type descriptor; the two converters may be #f
   8 | (struct jtype/object jtype (class)) ; jtype plus a class field
   9 | (struct jtype/vector jtype/object (element)) ; jtype/object plus the element jtype
  10 | (struct jvector (cpointer type length))
  11 | (struct jprocedure (args return proc)) ; argument jtypes, return jtype, and the procedure built by _jmethod/_jprocedure
 12 | 
 13 | (define ((single-compose f1 f2) e) (f1 (f2 e)))
 14 | 
 15 | (define (make-jtype obj racket->java java->racket)
 16 |   (let ([composed-racket->java (single-compose (jtype-racket->java obj) racket->java)]
 17 |         [composed-java->racket (single-compose java->racket (jtype-java->racket obj))])
 18 |     ; due to limitation in racket's struct-copy
 19 |     (cond
 20 |       [(jtype/vector? obj)
 21 |        (struct-copy jtype/vector obj
 22 |          [racket->java #:parent jtype composed-racket->java]
 23 |          [java->racket #:parent jtype composed-java->racket])]
 24 |       [(jtype/object? obj)
 25 |        (struct-copy jtype/object obj
 26 |          [racket->java #:parent jtype composed-racket->java]
 27 |          [java->racket #:parent jtype composed-java->racket])]
 28 |       [else
 29 |        (struct-copy jtype obj
 30 |          [racket->java                composed-racket->java]
 31 |          [java->racket                composed-java->racket])])))
 32 | 
 33 | (define (jtype->ctype obj)
 34 |   (make-ctype (jtype-ctype obj) (jtype-racket->java obj) (jtype-java->racket obj)))
 35 | 
 36 | ; --- signature makers ---
 37 | (define (make-class-signature c)  (string-append "L" c ";"))
 38 | (define (make-vector-signature s) (string-append "[" s))
 39 | (define (make-signature args return)
 40 |   (let ([args-signature (string-append* (map jtype-signature args))]
 41 |         [return-signature (jtype-signature return)])
 42 |     (string-append "(" args-signature ")" return-signature)))
 43 | 
 44 | ; --- predicates for java types on racket ---
 45 | (require (only-in web-server/dispatch/extend make-coerce-safe?) srfi/26/cut)
 46 | 
 47 | (define jboolean?   boolean?)
 48 | (define jbyte?      byte?)
 49 | (define jchar?      char?)
 50 | (define jshort?     (make-coerce-safe? (cut < -32768 <> 32767)))
 51 | (define jint?       (make-coerce-safe? (cut < -2147483648 <> 2147483647)))
 52 | (define jlong?      (make-coerce-safe? (cut < -9223372036854775808 <> 9223372036854775807)))
 53 | (define jfloat?     single-flonum?)
 54 | (define jdouble?    flonum?)
 55 | (define jstring?    string?)
 56 | (define ((make-jobject-predicate clss) o) (instance-of? o clss))
 57 | (define ((make-jlist-predicate element?) o) (andmap element? o))
 58 | 
 59 | ; --- java types ---
  60 | (define _jboolean (jtype "Z" 'boolean jboolean? __jboolean #f            #f)) ; fields: signature tag predicate ctype racket->java java->racket
  61 | (define _jbyte    (jtype "B" 'byte    jbyte?    __jbyte    #f            #f))
  62 | (define _jchar    (jtype "C" 'char    jchar?    __jchar    char->integer integer->char)) ; the only primitive here with converters
  63 | (define _jshort   (jtype "S" 'short   jshort?   __jshort   #f            #f))
  64 | (define _jint     (jtype "I" 'int     jint?     __jint     #f            #f))
  65 | (define _jlong    (jtype "J" 'long    jlong?    __jlong    #f            #f))
  66 | (define _jfloat   (jtype "F" 'float   jfloat?   __jfloat   #f            #f))
  67 | (define _jdouble  (jtype "D" 'double  jdouble?  __jdouble  #f            #f))
  68 | (define _jvoid    (jtype "V" 'void    #f        __jvoid    #f            #f)) ; no predicate for void
  69 | ; hack: _jobject and _jlist are struct instances with prop:procedure, so each doubles as a jtype value and as a function
  70 | (define _jobject
  71 |   ((λ () ; immediately-applied thunk keeps the local struct definition out of module scope
  72 |      (struct _jobject jtype/object ()
  73 |        #:property prop:procedure 
  74 |        (λ (self class-name [racket->java #f] [java->racket #f] [predicate #f]) ; applying the instance builds a jtype/object for class-name
  75 |          (let ([class-id (find-class class-name)])
  76 |            (struct-copy jtype/object self
  77 |              [signature    #:parent jtype (make-class-signature class-name)]
  78 |              [predicate    #:parent jtype (or predicate (make-jobject-predicate class-id))] ; default predicate tests instance-of? against class-id
  79 |              [racket->java #:parent jtype racket->java]
  80 |              [java->racket #:parent jtype java->racket]
  81 |              [class                 class-id]))))
  82 |      (let ([class-id (find-class "Ljava/lang/Object;")])
  83 |        (_jobject "Ljava/lang/Object;" 'object (make-jobject-predicate class-id) ; the base instance that the module-level _jobject is bound to
  84 |                  __jobject #f #f class-id)))))
  85 | (define _jstring  (_jobject "java/lang/String" new-string get-string jstring?)) ; applies _jobject: new-string / get-string are the converters, jstring? the predicate
  86 | (define _jlist    ; a jtype/vector instance that is also a function from an element jtype to a list jtype
  87 |   ((λ ()
  88 |      (struct _jlist jtype/vector ()
  89 |        #:property prop:procedure
  90 |        (λ (self element)
  91 |          (define-values (make-array array-ref array-set!) (tag->array-info (jtype-tag element))) ; array operations chosen by the element's tag
  92 |          (when (jtype/object? element)
  93 |            (let ([clss (jtype/object-class element)])
  94 |              (set! make-array (λ (n) (new-object-array n clss #f))))) ; object elements need the element class to build the array
  95 |          (let* ([signature (make-vector-signature (jtype-signature element))]
  96 |                 [element-racket->java (or (jtype-racket->java element) identity)] ; a missing converter (#f) falls back to identity
  97 |                 [element-java->racket (or (jtype-java->racket element) identity)]
  98 |                 [element? (or (jtype-predicate element) (λ (_) #t))]) ; a missing predicate accepts everything
  99 |            (struct-copy jtype/vector self
 100 |              [signature    #:parent jtype signature]
 101 |              [predicate    #:parent jtype (make-jlist-predicate element?)]
 102 |              [ctype        #:parent jtype __jobject]
 103 |              [racket->java #:parent jtype
 104 |               (λ (c) ; list -> array: allocate, then convert and store each element
 105 |                 (let ([array (make-array (length c))])
 106 |                   (for ([e (in-list c)] [i (in-naturals)])
 107 |                     (array-set! array i (element-racket->java e)))
 108 |                   array))]
 109 |              [java->racket #:parent jtype
 110 |               (λ (c) ; array -> list: read and convert each element in index order
 111 |                 (for/list ([i (in-range (get-array-length c))])
 112 |                   (element-java->racket (array-ref c i))))]
 113 |              [class        #:parent jtype/object (find-class signature)]
 114 |              [element               element]))))
 115 |      (let ([class-id (find-class "[Ljava/lang/Object;")]
 116 |            [element-class-id (jtype/object-class _jobject)])
 117 |        (_jlist "[Ljava/lang/Object;" 'object (make-jobject-predicate element-class-id) __jobject ; the base instance that the module-level _jlist is bound to
 118 |                (λ (c)
 119 |                  (let ([array (new-object-array (length c) element-class-id #f)])
 120 |                    (for ([e (in-list c)]
 121 |                          [i (in-naturals)])
 122 |                      (set-object-array-element array i e)) ; elements are stored without conversion
 123 |                    array))
 124 |                (λ (c)
 125 |                  (for/list ([i (in-range (get-array-length c))])
 126 |                    (get-object-array-element c i)))
 127 |                class-id
 128 |                _jobject)))))
 129 | (define-syntax (_jmethod stx) ; form: (_jmethod arg ... [farg ...] [-> return]); expands to a jprocedure
 130 |   (define-syntax-class type #:literals (->)
 131 |     (pattern (~and x (~not (~or (~literal ...) ->))))) ; a type is any term other than a literal ... or ->
 132 |   (syntax-parse stx #:literals (->)
 133 |     [(_ arg:type ... (~optional (~seq farg:type (~literal ...))) (~optional (~seq -> return*)))
 134 |      (with-syntax* ([(arg* ...) (generate-temporaries #'(arg ...))] ; one fresh parameter name per fixed argument
 135 |                     [(larg ... . marg) #`(arg* ... #,@(if (attribute farg) #'arg-rest #`()))] ; lambda formals; rest parameter arg-rest only when farg is given
 136 |                     [(aarg ...) #`(arg* ... #,@(if (attribute farg) #'(arg-rest) #`()))] ; call arguments; the rest list is passed as a single argument
 137 |                     [return (if (attribute return*) #'return* #'_jvoid)]) ; the return type defaults to _jvoid
 138 |        #`(let* ([args  (list arg ... #,@(if (attribute farg) #`((_jlist farg)) #`()))]) ; a repeated final type becomes a (_jlist farg) argument
 139 |            (jprocedure args return
 140 |             (λ (type jnienv clss method func)
 141 |               (case type
 142 |                 [(constructor) (λ (larg ... . marg) (func jnienv clss method aarg ...))]
 143 |                 [(static-method) (λ (larg ... . marg) (func jnienv clss method aarg ...))]
 144 |                 [(method) (λ (o larg ... . marg) (func jnienv o method aarg ...))] ; instance methods take the receiver o in place of clss
 145 |                 [else (error '_jmethod "invalid type provided")])))))]))
146 | ; dynamic and slower version of _jmethod
147 | (define (_jprocedure args return #:repeat-last-arg? [repeat-last-arg? #f])
148 |   (define (nest-at lst i)
149 |     (if (null? lst) (list null)
150 |         (let loop ([lst lst] [i i])
151 |           (cond [(null? lst) null]
152 |                 [(zero? i) (list lst)]
153 |                 [else (cons (car lst) (loop (cdr lst) (sub1 i)))]))))
154 |  (jprocedure args return
155 |     (if repeat-last-arg?
156 |         (let ([repeat-position (sub1 (length args))])
157 |           (λ (type jnienv clss method func)
158 |             (case type
159 |               [(constructor) (λ larg  (apply func jnienv clss method (nest-at larg repeat-position)))]
160 |               [(static-method) (λ larg (apply func jnienv clss method (nest-at larg repeat-position)))]
161 |               [(method) (λ (o . larg) (apply func jnienv o method (nest-at larg repeat-position)))])))
162 |         (λ (type jnienv clss method func)
163 |           (case type
164 |             [(constructor) (λ larg (apply func jnienv clss method larg))]
165 |             [(static-method) (λ larg (apply func jnienv clss method larg))]
166 |             [(method) (λ (o . larg) (apply func jnienv o method larg))]
167 |             [else (error '_jprocedure "invalid type provided")])))))
 168 | ; get-jmethod/get-jconstructor pass the arguments (type jnienv class method func)
 169 | ; to the procedure created by _jmethod or _jprocedure.
 170 | ; Depending on the type, that procedure returns one of the following functions:
171 | ; | constructor   (λ (args ...) ; doesn't need to take in an object and the class is static
172 | ; | static-method (λ (args ...) ; same reasoning as above
173 | ; | method        (λ (object args ...)
174 | 
175 | 
176 | ; --- interfacing with java methods ---
177 | (define (get-jconstructor class-id type)
178 |   (let* ([args      (jprocedure-args type)]
179 |          [return    (jprocedure-return type)]
180 |          [proc      (jprocedure-proc type)]
181 |          [signature (make-signature args return)]
182 |          [method-id (get-method-id class-id "<init>" signature)]
183 |          [ffi-func  (get-jrffi-obj "new-object"
184 |                       (_cprocedure (list* __jnienv __jclass __jmethodID (map jtype->ctype args))
185 |                                    __jobject))])
186 |     (proc 'constructor current-jnienv class-id method-id ffi-func)))
187 | 
188 | (define (get-jmethod class-id method-name type #:static? [static? #f])
189 |   (let* ([args      (jprocedure-args type)]
190 |          [return    (jprocedure-return type)]
191 |          [proc      (jprocedure-proc type)]
192 |          [signature (make-signature args return)]
193 |          [method-id (get-method-id class-id method-name signature #:static? static?)]
194 |          [type      (if static? 'static-method 'method)]
195 |          [ffi-func  (get-jrffi-obj 
196 |                      (format "call-~a~a-method" (if static? "static-" "") (jtype-tag return))       
197 |                      (_cprocedure (append (list __jnienv (if static? __jclass __jobject)
198 |                                                 __jmethodID) (map jtype->ctype args))
199 |                                   (jtype->ctype return)))])
200 |     (proc type current-jnienv class-id method-id ffi-func)))
201 | 
202 | 
203 | ; --- interfacing with java fields ---
204 | (define (get-jaccessor class-id field-name type #:static? [static? #f]) ; Build a getter procedure for the Java field `field-name`.
205 |   (let* ([signature (jtype-signature class-id field-name (jtype-signature type))] ; NOTE(review): jtype-signature is applied to 3 args here but to 1 arg in the inner call -- confirm the intended arity
206 |          [field-id  (get-field-id class-id field-name signature #:static? static?)]
207 |          [ffi-func  (get-jrffi-obj
208 |                      (format "get-~a~a-field" (if static? "static-" "") (jtype-tag type)) ; name looked up: "get-[static-]<tag>-field"
209 |                      (_cprocedure (list __jnienv (if static? __jclass __jobject) __jfieldID) type))]) ; NOTE(review): get-jmethod passes (jtype->ctype ...) as the result type; here `type` is passed as-is -- confirm
210 |     (if static? (λ () (ffi-func current-jnienv class-id field-id)) ; static: the getter takes no arguments and reads through class-id
211 |         (λ (obj) (ffi-func current-jnienv obj field-id))))) ; instance: the getter takes the object to read from
212 | 
213 | (define (get-jmutator class-id field-name type #:static? [static? #f]) ; Build a setter procedure for the Java field `field-name`.
214 |   (let* ([signature (jtype-signature class-id field-name (jtype-signature type))] ; NOTE(review): jtype-signature is applied to 3 args here but to 1 arg in the inner call -- confirm the intended arity
215 |          [field-id  (get-field-id class-id field-name signature #:static? static?)]
216 |          [ffi-func (get-jrffi-obj 
217 |                     (format "set-~a~a-field" (if static? "static-" "") (jtype-tag type)) ; name looked up: "set-[static-]<tag>-field"
218 |                     (_cprocedure (list __jnienv (if static? __jclass __jobject) __jfieldID type) type))]) ; NOTE(review): `type` is used both as the value parameter and as the result type -- confirm a setter should return `type`
219 |     (if static? (λ (new-value) (ffi-func current-jnienv class-id field-id new-value)) ; static: the setter takes only the new value
220 |         (λ (obj new-value) (ffi-func current-jnienv obj field-id new-value))))) ; instance: the setter takes the object, then the new value
221 | 
222 | (define (get-jparameter class-id field-name type #:static? [static? #f])
223 |   ; Combined getter/setter: the number of arguments selects between reading and writing the field.
224 |   (define read-field  (get-jaccessor class-id field-name type #:static? static?))
225 |   (define write-field (get-jmutator class-id field-name type #:static? static?))
226 |   (cond [static? (case-lambda
227 |                    [() (read-field)]
228 |                    [(value) (write-field value)])]
229 |         [else    (case-lambda
230 |                    [(target) (read-field target)]
231 |                    [(target value) (write-field target value)])]))
232 | 
233 | 
234 | 
235 | (provide _jboolean _jbyte _jchar _jshort _jint _jlong _jfloat _jdouble _jvoid
236 |          _jobject _jstring _jlist)
237 | 
238 | (provide get-jconstructor get-jmethod get-jparameter get-jmutator get-jaccessor)
239 | 
240 | ;(provide instance-of? (rename-out [find-class find-class]) get-method-id get-field-id)
241 | 
242 | 
243 | (provide (all-defined-out)  : -> current-jnienv)
244 | 
245 | 
246 | 
247 | 
248 | 
249 | 
250 | 
251 | 
252 | 
253 | 
254 | 
255 | 


--------------------------------------------------------------------------------
/test/fixtures/utf8.html:
--------------------------------------------------------------------------------
  1 | 
  2 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  3 | <html xmlns="http://www.w3.org/1999/xhtml" lang="zh" dir="ltr">
  4 | <head>
  5 | <title>XML - 维基百科，自由的百科全书</title>
  6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  7 | <meta http-equiv="Content-Style-Type" content="text/css" />
  8 | <meta name="generator" content="MediaWiki 1.16wmf4" />
  9 | <link rel="alternate" type="application/x-wiki" title="编辑本页" href="/w/index.php?title=XML&amp;action=edit" />
 10 | <link rel="edit" title="编辑本页" href="/w/index.php?title=XML&amp;action=edit" />
 11 | <link rel="apple-touch-icon" href="http://zh.wikipedia.org/apple-touch-icon.png" />
 12 | <link rel="shortcut icon" href="/favicon.ico" />
 13 | <link rel="search" type="application/opensearchdescription+xml" href="/w/opensearch_desc.php" title="Wikipedia (zh)" />
 14 | <link rel="copyright" href="http://creativecommons.org/licenses/by-sa/3.0/" />
 15 | <link rel="alternate" type="application/atom+xml" title="Wikipedia的Atom订阅" href="/w/index.php?title=Special:%E6%9C%80%E8%BF%91%E6%9B%B4%E6%94%B9&amp;feed=atom" />
 16 | <link rel="stylesheet" href="http://bits.wikimedia.org/skins-1.5/vector/main-ltr.css?283-19" type="text/css" media="screen" />
 17 | <link rel="stylesheet" href="http://bits.wikimedia.org/skins-1.5/common/shared.css?283-19" type="text/css" media="screen" />
 18 | <link rel="stylesheet" href="http://bits.wikimedia.org/skins-1.5/common/commonPrint.css?283-19" type="text/css" media="print" />
 19 | <link rel="stylesheet" href="http://bits.wikimedia.org/w/extensions/UsabilityInitiative/css/combined.min.css?117" type="text/css" media="all" />
 20 | <link rel="stylesheet" href="http://bits.wikimedia.org/w/extensions/UsabilityInitiative/css/vector/jquery-ui-1.7.2.css?1.7.2y" type="text/css" media="all" />
 21 | <link rel="stylesheet" href="/w/index.php?title=MediaWiki:Common.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=2678400&amp;action=raw&amp;maxage=2678400" type="text/css" media="all" />
 22 | <link rel="stylesheet" href="/w/index.php?title=MediaWiki:Print.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=2678400&amp;action=raw&amp;maxage=2678400" type="text/css" media="print" />
 23 | <link rel="stylesheet" href="/w/index.php?title=MediaWiki:Handheld.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=2678400&amp;action=raw&amp;maxage=2678400" type="text/css" media="handheld" />
 24 | <link rel="stylesheet" href="/w/index.php?title=MediaWiki:Vector.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=2678400&amp;action=raw&amp;maxage=2678400" type="text/css" media="all" />
 25 | <link rel="stylesheet" href="/w/index.php?title=-&amp;action=raw&amp;maxage=2678400&amp;gen=css" type="text/css" media="all" />
 26 | <script type="text/javascript">
 27 | var skin="vector",
 28 | stylepath="http://bits.wikimedia.org/skins-1.5",
 29 | wgUrlProtocols="http\\:\\/\\/|https\\:\\/\\/|ftp\\:\\/\\/|irc\\:\\/\\/|gopher\\:\\/\\/|telnet\\:\\/\\/|nntp\\:\\/\\/|worldwind\\:\\/\\/|mailto\\:|news\\:|svn\\:\\/\\/",
 30 | wgArticlePath="/wiki/$1",
 31 | wgScriptPath="/w",
 32 | wgScriptExtension=".php",
 33 | wgScript="/w/index.php",
 34 | wgVariantArticlePath="/$2/$1",
 35 | wgActionPaths={},
 36 | wgServer="http://zh.wikipedia.org",
 37 | wgCanonicalNamespace="",
 38 | wgCanonicalSpecialPageName=false,
 39 | wgNamespaceNumber=0,
 40 | wgPageName="XML",
 41 | wgTitle="XML",
 42 | wgAction="view",
 43 | wgArticleId=3632,
 44 | wgIsArticle=true,
 45 | wgUserName=null,
 46 | wgUserGroups=null,
 47 | wgUserLanguage="zh",
 48 | wgContentLanguage="zh",
 49 | wgBreakFrames=false,
 50 | wgCurRevisionId=15329713,
 51 | wgVersion="1.16wmf4",
 52 | wgEnableAPI=true,
 53 | wgEnableWriteAPI=true,
 54 | wgSeparatorTransformTable=["", ""],
 55 | wgDigitTransformTable=["", ""],
 56 | wgMainPageTitle="Wikipedia:首页",
 57 | wgFormattedNamespaces={"-2": "Media", "-1": "Special", "0": "", "1": "Talk", "2": "User", "3": "User talk", "4": "Wikipedia", "5": "Wikipedia talk", "6": "File", "7": "File talk", "8": "MediaWiki", "9": "MediaWiki talk", "10": "Template", "11": "Template talk", "12": "Help", "13": "Help talk", "14": "Category", "15": "Category talk", "100": "Portal", "101": "Portal talk"},
 58 | wgNamespaceIds={"media": -2, "special": -1, "": 0, "talk": 1, "user": 2, "user_talk": 3, "wikipedia": 4, "wikipedia_talk": 5, "file": 6, "file_talk": 7, "mediawiki": 8, "mediawiki_talk": 9, "template": 10, "template_talk": 11, "help": 12, "help_talk": 13, "category": 14, "category_talk": 15, "portal": 100, "portal_talk": 101, "媒体": -2, "媒體": -2, "特殊": -1, "对话": 1, "對話": 1, "讨论": 1, "討論": 1, "用户": 2, "用戶": 2, "用户对话": 3, "用戶對話": 3, "用户讨论": 3, "用戶討論": 3, "图像": 6, "圖像": 6, "档案": 6, "檔案": 6, "文件": 6, "图像对话": 7, "圖像對話": 7, "图像讨论": 7, "圖像討論": 7, "档案对话": 7, "檔案對話": 7, "档案讨论": 7, "檔案討論": 7, "文件对话": 7, "文件對話": 7, "文件讨论": 7, "文件討論": 7, "模板": 10, "样板": 10, "樣板": 10, "模板对话": 11, "模板對話": 11, "模板讨论": 11, "模板討論": 11, "样板对话": 11, "樣板對話": 11, "样板讨论": 11, "樣板討論": 11, "帮助": 12, "幫助": 12, "帮助对话": 13, "幫助對話": 13, "帮助讨论": 13, "幫助討論": 13, "分类": 14, "分類": 14, "分类对话": 15, "分類對話": 15, "分类讨论": 15, "分類討論": 15, "维基百科": 4, "維基百科": 4, "wp": 4, "维基百科讨论": 5, "维基百科对话": 5, "維基百科討論": 5, "維基百科對話": 5, "t": 10, "wt": 5, "cat": 14, "h": 12, "p": 100, "image": 6, "image_talk": 7},
 59 | wgSiteName="Wikipedia",
 60 | wgCategories=["含有英語的條目", "網頁技術", "W3C标准", "文件格式", "置标语言", "XML", "数据序列化格式"],
 61 | wgDBname="zhwiki",
 62 | wgUserVariant="zh",
 63 | wgMWSuggestTemplate="http://zh.wikipedia.org/w/api.php?action=opensearch\x26search={searchTerms}\x26namespace={namespaces}\x26suggest",
 64 | wgSearchNamespaces=[0],
 65 | wgMWSuggestMessages=["有建议", "无建议"],
 66 | wgRestrictionEdit=[],
 67 | wgRestrictionMove=[],
 68 | wgCollapsibleNavBucketTest=false,
 69 | wgCollapsibleNavForceNewVersion=false,
 70 | wgVectorPreferences={"collapsiblenav": {"enable": 1}, "editwarning": {"enable": 1}, "simplesearch": {"enable": 1, "disablesuggest": 0}},
 71 | wgVectorEnabledModules={"collapsiblenav": true, "collapsibletabs": true, "editwarning": true, "expandablesearch": false, "footercleanup": false, "simplesearch": true},
 72 | Geo={"city": "", "country": ""},
 73 | wgNoticeProject="wikipedia";
 74 | </script><script src="http://bits.wikimedia.org/skins-1.5/common/wikibits.js?283-19" type="text/javascript"></script>
 75 | <script type="text/javascript" src="http://bits.wikimedia.org/skins-1.5/common/jquery.min.js?283-19"></script>
 76 | <script src="http://bits.wikimedia.org/skins-1.5/common/ajax.js?283-19" type="text/javascript"></script>
 77 | <script src="http://bits.wikimedia.org/skins-1.5/common/mwsuggest.js?283-19" type="text/javascript"></script>
 78 | <script src="http://bits.wikimedia.org/w/extensions/UsabilityInitiative/js/plugins.combined.min.js?283-19" type="text/javascript"></script>
 79 | <script src="http://bits.wikimedia.org/w/extensions/UsabilityInitiative/Vector/Vector.combined.min.js?283-19" type="text/javascript"></script>
 80 | <script type="text/javascript">mw.usability.addMessages({'vector-collapsiblenav-more':'更多语言','vector-editwarning-warning':'离开这个页面可能会令您失去之前的所有更改。\n若您已经登入，您可在您参数设置的“编辑”节中关闭此警告。','vector-simplesearch-search':'搜索','vector-simplesearch-containing':'含有...'});</script>
 81 | <script src="/w/index.php?title=Special:BannerController&amp;cache=/cn.js&amp;283-19" type="text/javascript"></script>
 82 | <!--[if lt IE 7]><style type="text/css">body{behavior:url("/w/skins-1.5/vector/csshover.htc")}</style><![endif]-->
 83 | <script src="/w/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=vector&amp;283-19" type="text/javascript"></script>
 84 | 
 85 | <style type="text/css">/*<![CDATA[*/
 86 | .source-xml {line-height: normal;}
 87 | .source-xml li, .source-xml pre {
 88 |   line-height: normal; border: 0px none white;
 89 | }
 90 | /**
 91 |  * GeSHi Dynamically Generated Stylesheet
 92 |  * --------------------------------------
 93 |  * Dynamically generated stylesheet for xml
 94 |  * CSS class: source-xml, CSS id:
 95 |  * GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann
 96 |  * (http://qbnz.com/highlighter/ and http://geshi.org/)
 97 |  * --------------------------------------
 98 |  */
 99 | .xml.source-xml .de1, .xml.source-xml .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;}
100 | .xml.source-xml  {font-family:monospace;}
101 | .xml.source-xml .imp {font-weight: bold; color: red;}
102 | .xml.source-xml li, .xml.source-xml .li1 {font-weight: normal; vertical-align:top;}
103 | .xml.source-xml .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;}
104 | .xml.source-xml .li2 {font-weight: bold; vertical-align:top;}
105 | .xml.source-xml .es0 {color: #000099; font-weight: bold;}
106 | .xml.source-xml .br0 {color: #66cc66;}
107 | .xml.source-xml .sy0 {color: #66cc66;}
108 | .xml.source-xml .st0 {color: #ff0000;}
109 | .xml.source-xml .nu0 {color: #cc66cc;}
110 | .xml.source-xml .sc-1 {color: #808080; font-style: italic;}
111 | .xml.source-xml .sc0 {color: #00bbdd;}
112 | .xml.source-xml .sc1 {color: #ddbb00;}
113 | .xml.source-xml .sc2 {color: #339933;}
114 | .xml.source-xml .sc3 {color: #009900;}
115 | .xml.source-xml .re0 {color: #000066;}
116 | .xml.source-xml .re1 {color: #000000; font-weight: bold;}
117 | .xml.source-xml .re2 {color: #000000; font-weight: bold;}
118 | .xml.source-xml .ln-xtra, .xml.source-xml li.ln-xtra, .xml.source-xml div.ln-xtra {background-color: #ffc;}
119 | .xml.source-xml span.xtra { display:block; }
120 | 
121 | /*]]>*/
122 | </style>
123 | <style type="text/css">/*<![CDATA[*/
124 | @import "/w/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=2678400";
125 | /*]]>*/
126 | </style></head>
127 | <body class="mediawiki ltr ns-0 ns-subject page-XML skin-vector">
128 |     <div id="mw-page-base" class="noprint"></div>
129 |     <div id="mw-head-base" class="noprint"></div>
130 |     <!-- content -->
131 |     <div id="content">
132 |       <a id="top"></a>
133 |       <div id="mw-js-message" style="display:none;"></div>
134 |             <!-- sitenotice -->
135 |       <div id="siteNotice"><!-- centralNotice loads here --><script type="text/javascript" language="JavaScript">
136 | /* <![CDATA[ */
137 | document.writeln("\x3cdiv id=\"localNotice\"\x3e\x3c/div\x3e");
138 | /* ]]> */
139 | </script></div>
140 |       <!-- /sitenotice -->
141 |             <!-- firstHeading -->
142 |       <h1 id="firstHeading" class="firstHeading">XML</h1>
143 |       <!-- /firstHeading -->
144 |       <!-- bodyContent -->
145 |       <div id="bodyContent">
146 |         <!-- tagline -->
147 |         <div id="siteSub">维基百科，自由的百科全书</div>
148 |         <!-- /tagline -->
149 |         <!-- subtitle -->
150 |         <div id="contentSub"></div>
151 |         <!-- /subtitle -->
152 |                                 <!-- jumpto -->
153 |         <div id="jump-to-nav">
154 |           跳转到： <a href="#mw-head">导航</a>,
155 |           <a href="#p-search">搜索</a>
156 |         </div>
157 |         <!-- /jumpto -->
158 |                 <!-- bodytext -->
159 |         <p><span style="display: none;"><a href="#_skip_noteTA">跳过字词转换说明</a></span></p>
160 | <div class="NavFrame collapsed noprint nohandheld metadata" style="z-index: 1; margin: 0 0.5em 0.5em auto; top: -1.2em; position: relative; background-color: transparent; border: none;">
161 | <div class="uncollapse toggleHotspot" style="position: absolute; right:0; background-color: transparent; padding: 0; width: 4em;" title="本頁使用了標題或全文手工轉換，單擊檢視"><span style="font-family:微软雅黑,Arial Unicode MS,黑体;"><span style="padding:1px 3px; background: slategray; color:white;">汉</span><span style="padding:1px 3px; background: sienna; color:white;">漢</span></span><span class="toggleShow" style="color:black;">▼</span><span class="toggleHide" style="color:black;">▲</span></div>
162 | <div class="NavContent" style="position: absolute; display: none; right: 0; top: 1.5em; border: 1px gray solid; background-color: lightyellow; padding: 0.3em; z-index: 100; width: 650px; font-size: 90%; background-color: #f0f2f0; color: black;">
163 | <div style="background: #FFEA88; background-color: #4d4d4d; color: #ffffff;">为了阅读方便，本文使用<a href="/wiki/Help:%E4%B8%AD%E6%96%87%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91%E7%9A%84%E7%B9%81%E7%AE%80%E5%A4%84%E7%90%86#.E6.8E.A7.E5.88.B6.E8.87.AA.E5.8A.A8.E8.BD.AC.E6.8D.A2.E7.9A.84.E4.BB.A3.E7.A2.BC" title="Help:中文维基百科的繁简处理" class="mw-redirect"><span style="color:#ffffff; font-weight:bold;">全文手工轉換</span></a>。转换内容：</div>
164 | <div style="padding: 3px">
165 | <p>本文采用<a href="/wiki/Portal:%E7%94%B5%E8%84%91%E5%92%8C%E4%BF%A1%E6%81%AF%E6%8A%80%E6%9C%AF" title="Portal:电脑和信息技术" class="mw-redirect"><span style="color: #333333; font-weight: bold;">电脑和信息技术</span></a>组全文转换 <span class="editlink noprint plainlinksneverexpand">[<a href="/wiki/Template:CGroup/IT/temp" title="Template:CGroup/IT/temp">查看</a>] • [<a href="http://zh.wikipedia.org/w/index.php?title=Template:CGroup/IT/temp&amp;action=edit" class="external text" rel="nofollow">编辑</a>] • [<a href="http://philip-bot.appspot.com/autoconvert?group=IT" class="external text" rel="nofollow">强制刷新</a>]</span></p>
166 | <p><br /></p>
167 | </div>
168 | <div class="NavFrame collapsed" style="background-color: transparent; border: none; margin:0; padding:0;">
169 | <div class="NavHead" style="background-color: #FFEA88; font-weight: normal; height: 1.5em; text-align: left; background-color: #4d4d4d; color: #ffffff;"><a href="/wiki/Help:%E4%B8%AD%E6%96%87%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91%E7%9A%84%E7%B9%81%E7%AE%80%E3%80%81%E5%9C%B0%E5%8C%BA%E8%AF%8D%E5%A4%84%E7%90%86" title="Help:中文维基百科的繁简、地区词处理"><span style="color: #333333; font-weight: bold;">字詞轉換</span></a>说明<span class="NavToggle"><span class="toggleShow">顯示↓</span><span class="toggleHide">關閉↑</span></span></div>
170 | <div class="NavContent">
171 | <p>字詞轉換是中文维基的一項自動轉換，目的是通過计算机程序自動消除繁简、地区词等不同<b>用字模式</b>的差異，以達到閱讀方便。字詞轉換包括全局轉換和手動轉換，本說明所使用的标题转换和全文转换技術，都屬於手動轉換。</p>
172 | <p>如果您想对我们的字词转换系统提出一些改进建议，或者提交应用面更广的转换（<a href="/wiki/%E4%B8%AD%E6%96%87%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91" title="中文维基百科">中文维基百科</a>全站乃至<a href="/wiki/MediaWiki" title="MediaWiki">MediaWiki</a>软件），或者报告转换系统的错误，请前往<a href="/wiki/Wikipedia:%E5%AD%97%E8%AF%8D%E8%BD%AC%E6%8D%A2%E8%AF%B7%E6%B1%82%E6%88%96%E5%80%99%E9%80%89" title="Wikipedia:字词转换请求或候选">Wikipedia:字词转换请求或候选</a>发表您的意见。</p>
173 | </div>
174 | </div>
175 | </div>
176 | </div>
177 | <p><span id="_skip_noteTA"></span></p>
178 | <div class="thumb tright">
179 | <div class="thumbinner" style="width:427px;"><a href="/wiki/File:RecipeBook_XML_Example.svg" class="image"><img alt="" src="http://upload.wikimedia.org/wikipedia/commons/thumb/7/73/RecipeBook_XML_Example.svg/425px-RecipeBook_XML_Example.svg.png" width="425" height="234" class="thumbimage" /></a>
180 | <div class="thumbcaption">RecipeBook的例子，一種基於XML語法上的烹飪技術書刊。此標籤可轉換為：<a href="/wiki/HTML" title="HTML">HTML</a>, <a href="/wiki/Portable_Document_Format" title="Portable Document Format" class="mw-redirect">PDF</a>以及<a href="/wiki/Rich_Text_Format" title="Rich Text Format" class="mw-redirect">Rich Text Format</a>並使用<a href="/wiki/%E7%A8%8B%E5%BC%8F%E8%AA%9E%E8%A8%80" title="程式語言" class="mw-redirect">程式語言</a>或<a href="/w/index.php?title=%E5%8F%AF%E6%93%B4%E5%85%85%E5%A5%97%E4%BB%B6%E6%A8%A3%E5%BC%8F%E8%AA%9E%E8%A8%80&amp;action=edit&amp;redlink=1" class="new" title="可擴充套件樣式語言">XSL</a>。</div>
181 | </div>
182 | </div>
183 | <p><b>可扩展置标语言</b>（<a href="/wiki/%E8%8B%B1%E8%AF%AD" title="英语">英语</a>：<span lang="en" xml:lang="en">e<b>X</b>tensible <b>M</b>arkup <b>L</b>anguage</span>，简称:<span lang="en" xml:lang="en"><b>XML</b></span>），又称<b>可扩展标记语言</b>，是一种<a href="/wiki/%E7%BD%AE%E6%A0%87%E8%AF%AD%E8%A8%80" title="置标语言">置标语言</a>。置标指<a href="/wiki/%E8%AE%A1%E7%AE%97%E6%9C%BA" title="计算机" class="mw-redirect">计算机</a>所能理解的信息符号，通过此种标记，计算机之间可以处理包含各种信息的文章等。如何定义这些标记，既可以选择国际通用的标记语言，比如<a href="/wiki/HTML" title="HTML">HTML</a>，也可以使用像XML这样由相关人士自由决定的标记语言，这就是语言的可扩展性。XML是从<a href="/wiki/%E6%A0%87%E5%87%86%E9%80%9A%E7%94%A8%E7%BD%AE%E6%A0%87%E8%AF%AD%E8%A8%80" title="标准通用置标语言" class="mw-redirect">标准通用置标语言</a>（SGML）中简化修改出来的。它主要用到的有可扩展置标语言、<a href="/wiki/%E5%8F%AF%E6%89%A9%E5%B1%95%E6%A0%B7%E5%BC%8F%E8%AF%AD%E8%A8%80" title="可扩展样式语言">可扩展样式语言</a>（XSL）、<a href="/wiki/XBRL" title="XBRL">XBRL</a>和<a href="/wiki/XPath" title="XPath">XPath</a>等。</p>
184 | <table id="toc" class="toc">
185 | <tr>
186 | <td>
187 | <div id="toctitle">
188 | <h2>目录</h2>
189 | </div>
190 | <ul>
191 | <li class="toclevel-1 tocsection-1"><a href="#.E6.AD.B7.E5.8F.B2"><span class="tocnumber">1</span> <span class="toctext">歷史</span></a></li>
192 | <li class="toclevel-1 tocsection-2"><a href="#.E7.94.A8.E9.80.94"><span class="tocnumber">2</span> <span class="toctext">用途</span></a></li>
193 | <li class="toclevel-1 tocsection-3"><a href="#.E4.BE.8B"><span class="tocnumber">3</span> <span class="toctext">例</span></a></li>
194 | <li class="toclevel-1 tocsection-4"><a href="#.E7.BB.93.E6.9E.84"><span class="tocnumber">4</span> <span class="toctext">结构</span></a></li>
195 | <li class="toclevel-1 tocsection-5"><a href="#.E5.8F.82.E8.A7.81"><span class="tocnumber">5</span> <span class="toctext">参见</span></a></li>
196 | <li class="toclevel-1 tocsection-6"><a href="#.E5.A4.96.E9.83.A8.E9.93.BE.E6.8E.A5"><span class="tocnumber">6</span> <span class="toctext">外部链接</span></a></li>
197 | </ul>
198 | </td>
199 | </tr>
200 | </table>
201 | <script type="text/javascript">
202 | //<![CDATA[
203 | if (window.showTocToggle) { var tocShowText = "显示"; var tocHideText = "隐藏"; showTocToggle(); }
204 | //]]>
205 | </script>
206 | <h2><span class="editsection">[<a href="/w/index.php?title=XML&amp;action=edit&amp;section=1" title="编辑段落：歷史">编辑</a>]</span> <span class="mw-headline" id=".E6.AD.B7.E5.8F.B2">歷史</span></h2>
207 | <p>XML是從1995年開始有其雛形，並向<a href="/wiki/W3C" title="W3C" class="mw-redirect">W3C</a>（<a href="/wiki/%E4%B8%87%E7%BB%B4%E7%BD%91%E8%81%94%E7%9B%9F" title="万维网联盟">全球資訊網聯盟</a>）提案，而在1998二月發佈為W3C的標準（XML1.0）。XML的前身是<b>SGML</b>（<span lang="en" xml:lang="en">The <b>S</b>tandard <b>G</b>eneralized <b>M</b>arkup <b>L</b>anguage</span>），是自IBM從1960年代就開始發展的<b><a href="/w/index.php?title=%E9%80%9A%E7%94%A8%E7%BD%AE%E6%A0%87%E8%AF%AD%E8%A8%80&amp;action=edit&amp;redlink=1" class="new" title="通用置标语言">GML</a></b>（<span lang="en" xml:lang="en"><b>G</b>eneralized <b>M</b>arkup <b>L</b>anguage</span>）標準化後的名稱。</p>
208 | <p>GML的重要概念：</p>
209 | <ul>
210 | <li>文件中能夠明確的將標示與內容分開</li>
211 | <li>所有文件的標示使用方法均一致</li>
212 | </ul>
213 | <p>1978年，<a href="/wiki/ANSI" title="ANSI" class="mw-redirect">ANSI</a>將GML加以整理規範，發佈成為SGML，1986年起為<a href="/wiki/ISO" title="ISO" class="mw-redirect">ISO</a>所採用（ISO 8879），並且被廣泛地運用在各種大型的文件計劃中，但是SGML是一種非常嚴謹的文件描述法，導致過於龐大複雜（標準手冊就有500多頁），難以理解和學習，進而影響其推廣與應用。</p>
214 | <p>同時W3C也發現到HTML的問題：</p>
215 | <ul>
216 | <li>不能解決所有解釋資料的問題 - 像是影音檔或化學公式、音樂符號等其他形態的內容。</li>
217 | <li>效能問題 - 需要下載整份文件，才能開始對文件做搜尋。</li>
218 | <li>擴充性、彈性、易讀性均不佳。</li>
219 | </ul>
220 | <p>為了解決以上問題，專家們使用SGML精簡製作，並依照HTML的發展經驗，產生出一套使用上規則嚴謹，但是簡單的描述資料語言：XML。 XML是在一個這樣的背景下誕生的——为了有一個更中立的方式，讓消費端自行決定要如何消化、呈現從服務端所提供的資訊。</p>
221 | <p>XML被廣泛用來作為跨平台之間交互數據的形式，主要針對數據的內容，通過不同的格式化描述手段（XSLT，CSS等）可以完成最終的形式表達（生成對應的HTML，PDF或者其他的文件格式）。</p>
222 | <h2><span class="editsection">[<a href="/w/index.php?title=XML&amp;action=edit&amp;section=2" title="编辑段落：用途">编辑</a>]</span> <span class="mw-headline" id=".E7.94.A8.E9.80.94">用途</span></h2>
223 | <p>XML设计用来传送及携带数据信息，不用来表现或展示数据，<a href="/wiki/HTML" title="HTML">HTML</a>语言則用来表现数据，所以XML用途的焦点是它说明数据是什么，以及携带数据信息。</p>
224 | <ul>
225 | <li>丰富文件（Rich Documents）- 自定文件描述并使其更丰富
226 | <ul>
227 | <li>属于文件为主的XML技术应用</li>
228 | <li>标记是用来定义一份资料应该如何呈现</li>
229 | </ul>
230 | </li>
231 | <li>元数据（Metadata）- 描述其它文件或网络资讯
232 | <ul>
233 | <li>属于资料为主的XML技术应用</li>
234 | <li>标记是用来说明一份资料的意义</li>
235 | </ul>
236 | </li>
237 | <li>設定档案（Configuration Files）- 描述软件設定的参数</li>
238 | </ul>
239 | </body>
240 | </html>
241 | 


--------------------------------------------------------------------------------
/test/fixtures/vimrc:
--------------------------------------------------------------------------------
  1 | " vim:foldmethod=marker:foldlevel=0:textwidth=79
  2 | "
  3 | " I promise to clean up my vimrc one day !
  4 | " Keeping up the promise made in 9a59c443260aeb6ac64b7766fbe5cc4ad76f860a
  5 | " vimrc
  6 | " Author: Pratheek
  7 | " My Vim settings.
  8 | " _____________________
  9 | "  Here be my Secrets
 10 | " ---------------------
 11 | "       o                    / \  //\
 12 | "        o    |\___/|      /   \//  \\
 13 | "             /0  0  \__  /    //  | \ \
 14 | "            /     /  \/_/    //   |  \  \
 15 | "            @_^_@'/   \/_   //    |   \   \
 16 | "            //_^_/     \/_ //     |    \    \
 17 | "         ( //) |        \///      |     \     \
 18 | "       ( / /) _|_ /   )  //       |      \     _\
 19 | "     ( // /) '/,_ _ _/  ( ; -.    |    _ _\.-~        .-~~~^-.
 20 | "   (( / / )) ,-{        _      `-.|.-~-.           .~         `.
 21 | "  (( // / ))  '/\      /                 ~-. _ .-~      .-~^-.  \
 22 | "  (( /// ))      `.   {            }                   /      \  \
 23 | "   (( / ))     .----~-.\        \-'                 .~         \  `. \^-.
 24 | "              ///.----..>        \             _ -~             `.  ^-`  ^-_
 25 | "                ///-._ _ _ _ _ _ _}^ - - - - ~                     ~-- ,.-~
 26 | "
 27 | "
 28 | " Preamble ---------------------------------------------------------------- {{{
 29 | 
 30 | " No explanation needed
 31 | set nocompatible
 32 | 
 33 | " Load pathogen, and automatically call pathogen#helptags()
 34 | execute pathogen#infect()
 35 | call pathogen#helptags()
 36 | set shell=/bin/bash
 37 | "}}}
 38 | " Basic settings----------------------------------------------------------- {{{
 39 | "
 40 | " Better than just /<search term>
 41 | " Also, `\v` enables use of Perl compatible regexes
 42 | nmap / /\v
 43 | syntax on
 44 | filetype indent on
 45 | filetype plugin on
 46 | syntax enable
 47 | set background=light
 48 | 
 49 | " Have a different colorscheme for GUI and console version of Vim
 50 | if has('gui_running')
 51 |     colorscheme sol
 52 | else
 53 |     colorscheme luna-term
 54 | endif
 55 | 
 56 | " Set stuff (set <whatever>) {{{
 57 | " I'm going to try to put a comment above most of these `set` options,
 58 | " but if it's not present, just do a `:h` for that
 59 | 
 60 | " Always have a status bar
 61 | set laststatus=2
 62 | set encoding=utf-8
 63 | set t_Co=256
 64 | set relativenumber
 65 | set autoindent
 66 | set smartindent
 67 | set incsearch
 68 | set listchars=tab:›∙,eol:¬
 69 | 
 70 | " Can *NEVER* settle with one font ! :D
 71 | " set guifont=Menlo\ for\ Powerline\ 13
 72 | set guifont=Monaco\ for\ Powerline\ 13
 73 | 
 74 | set hlsearch
 75 | set wildmenu
 76 | set colorcolumn=80
 77 | set cursorline
 78 | set tabstop=8
 79 | set expandtab
 80 | set softtabstop=4
 81 | set shiftwidth=4
 82 | set foldlevel=99
 83 | set foldmethod=indent
 84 | 
 85 | " Remember more previously used stuff
 86 | set history=1000
 87 | 
 88 | " Better term setting (read :h ttyfast for better understanding)
 89 | set ttyfast
 90 | 
 91 | " Stops screen from redrawing when macros are being executed
 92 | set lazyredraw
 93 | 
 94 | " NO WRAP ! EVER !
 95 | " This is especially useful for `:vertical h <topic>`
 96 | set nowrap
 97 | 
 98 | " Amount of time taken to show the matching paren
 99 | " (This is in tenths of a sec)
100 | set matchtime=3
101 | 
102 | " Better completion
103 | " do a `:h 'complete'`
104 | set complete=.,w,b,u,t
105 | " do a `:h 'completeopt'`
106 | set completeopt=longest,menuone,preview,menu
107 | 
108 | " I think all these guioptions should be in gvimrc,
109 | " but since vimrc loads faster, I've added them here
110 | set guioptions-=T "removes toolbar
111 | set guioptions-=r "removes Right-hand scrollbar
112 | set guioptions-=R "removes Right-hand scrollbar (Which is present in :vsp)
113 | set guioptions-=l "removes Left-hand scrollbar
114 | set guioptions-=L "removes Left-hand scrollbar (which is present in :vsp)
115 | set guioptions-=m "removes menubar
116 | 
117 | " Default Status line
118 | " Commented out all these, in favour of vim-airline
119 | "
120 | " set statusline=
121 | " set statusline+=(%t)
122 | " set statusline+=%m\
123 | " set statusline+=%=\
124 | " set statusline+=%r\
125 | " set statusline+=%{fugitive#statusline()}
126 | " set statusline+=%=\
127 | " set statusline+=%Y\
128 | " set statusline+=(%l/%L)\
129 | " set statusline+=(%p%%)\
130 | 
131 | " Use the custom fold function
132 | set foldtext=MyFoldText()
133 | "}}}
134 | " Breaking Habit {{{
135 | nnoremap <up> <nop>
136 | nnoremap <down> <nop>
137 | nnoremap <left> <nop>
138 | nnoremap <right> <nop>
139 | inoremap <up> <nop>
140 | inoremap <down> <nop>
141 | inoremap <left> <nop>
142 | inoremap <right> <nop>
143 | vnoremap <up> <nop>
144 | vnoremap <down> <nop>
145 | vnoremap <left> <nop>
146 | vnoremap <right> <nop>
147 | "}}}
148 | " Mind Hacks {{{
149 | " ---------------------------------------------------------------------------
150 | " Better <C-^> hack !
151 | " :nnoremap <C-^> :buffers<CR>:b<Space>
152 | " using tabline (built-in with airline)
153 | nnoremap <C-Tab> :tabnext<CR>
154 | " ---------------------------------------------------------------------------
155 | " with the default `:e` I'll have to remember
156 | " the path of the file (which is hard)
157 | " So, remap `:e` to run `:CtrlPMRUFiles`
158 | " Pros : No need to remember the path, CtrlP will find it for you.
159 | " Cons : Requires CtrlP (dependency) and
160 | "           commands that start with e will be hard to type.
161 | "           (You'll have to hit `:` and wait about a half a sec or so
162 | "           to start typing the command, starting with 'e')
163 | nnoremap :e :CtrlPMRUFiles<CR>
164 | " ---------------------------------------------------------------------------
165 | " clear searched stuff
166 | nnoremap <C-S-c> :let @/=""<CR>
167 | " ---------------------------------------------------------------------------
168 | " Show syntax highlighting groups for word under cursor
169 | " Thanks to Drew Neil from vimcasts
170 | nmap <A-S-P> :call <SID>SynStack()<CR>
171 | function! <SID>SynStack()
172 |   " Echo the syntax group names at the cursor; do nothing if synstack() is unavailable
173 |   if exists("*synstack")
174 |     echo map(synstack(line('.'), col('.')), 'synIDattr(v:val, "name")')
175 |   endif
176 | endfunc
177 | " ---------------------------------------------------------------------------
178 | " When using (g)vimdiff (or running `:Gdiff` in fugitive)
179 | " Disable relativenumber and use normal numbers
180 | if &diff
181 |     set norelativenumber
182 |     set number
183 | endif
184 | " ---------------------------------------------------------------------------
185 | " Returns cursor to last position before quitting
186 | augroup line_return
187 |     au!
188 |     au BufReadPost *
189 |         \ if line("'\"") > 0 && line("'\"") <= line("$") |
190 |         \ execute 'normal! g`"zvzz' |
191 |         \ endif
192 | augroup END
193 | " ---------------------------------------------------------------------------
194 | " Key mapping for tab switching
195 | nnoremap <C-t> :tabnew<CR>
196 | " :map <C-x> :tabclose<CR>
197 | " :map <C-h> :tabprevious<CR>
198 | nnoremap <C-Tab> :tabnext<CR>
199 | " ---------------------------------------------------------------------------
200 | " Thanks to Steve Losh
201 | " "Uppercase word" mapping.
202 | "
203 | " This mapping allows you to press <c-u> in insert mode to convert the current
204 | " word to uppercase.  It's handy when you're writing names of constants and
205 | " don't want to use Capslock.
206 | "
207 | " To use it you type the name of the constant in lowercase.  While your
208 | " cursor is at the end of the word, press <c-u> to uppercase it, and then
209 | " continue happily on your way:
210 | "
211 | "                            cursor
212 | "                            v
213 | "     max_connections_allowed|
214 | "     <c-u>
215 | "     MAX_CONNECTIONS_ALLOWED|
216 | "                            ^
217 | "                            cursor
218 | "
219 | " It works by exiting out of insert mode, recording the current cursor location
220 | " in the z mark, using gUiw to uppercase inside the current word, moving
221 | " back to the z mark, and entering insert mode again.
222 | "
223 | " Note that this will overwrite the contents of the z mark.  I never use it,
224 | " but if you do you'll probably want to use another mark.
225 | inoremap <C-u> <esc>mzgUiw`za
226 | " ---------------------------------------------------------------------------
227 | "}}}
228 | " <F5> FileType Runners and Builders {{{
229 | " ---------------------------------------------------------------------------
230 | " Python {{{
231 | augroup ft_python
232 |     au!
233 |     " Run the code in `%` (path/to/file) in python; <buffer> keeps <F5> local to Python buffers
234 |     " Damn the <leader>r in python-mode for python3
235 |     au Filetype python nnoremap <buffer> <F5> :<C-u> ! python %<CR>
236 | augroup END
237 | "}}}
238 | " Lua {{{
239 | augroup ft_lua
240 |     au!
241 |     " Run the code in `%` (path/to/file) in lua; <buffer> keeps <F5> local to Lua buffers
242 |     au Filetype lua nnoremap <buffer> <F5> :<C-u> ! lua %<CR>
243 | augroup END
244 | " }}}
245 | " C {{{
246 | augroup ft_c
247 |     au!
248 |     " Build the given .c file
249 |     " by giving `%` as the arg (which is path/to/file)
250 |     " and store the built binary file in the same path
251 |     "
252 |     " Best explained with example:
253 |     " Let's say file name is foo_bar.c
254 |     " Then, this makes it `:! gcc -Wall foo_bar.c -o foo_bar.c.o`
255 |     " (Slightly bad in naming the bin file ! ;) )
256 |     "
257 |     " This also works in this case.
258 |     " $ gvim ~/foo/bar/baz/dumb.c
259 |     " Then, this runs as:
260 |     " `:!gcc -Wall ~/foo/bar/baz/dumb.c -o ~/foo/bar/baz/dumb.c.o`
261 |     " <buffer> keeps <F5> local to C buffers instead of remapping it globally
262 |     au Filetype c nnoremap <buffer> <F5> :<C-u> ! gcc -Wall % -o %.o<CR>
263 | augroup END
264 | " }}}
265 | " C++ {{{
266 | augroup ft_cpp
267 |     au!
268 |     " Build the given .cc file
269 |     " by giving `%` as the arg (which is path/to/file)
270 |     " and store the built binary file in the same path
271 |     "
272 |     " Best explained with example:
273 |     " Let's say file name is foo_bar.cc
274 |     " Then, this makes it `:! g++ -Wall foo_bar.cc -o foo_bar.cc.oo`
275 |     " Slightly bad in naming the bin file
276 |     "
277 |     " This also works in this case.
278 |     " $ gvim ~/foo/bar/baz/dumb.cc
279 |     " Then, this runs as:
280 |     " `:!g++ -Wall ~/foo/bar/baz/dumb.cc -o ~/foo/bar/baz/dumb.cc.oo`
281 |     " <buffer> keeps this <F5> mapping local to C++ buffers.
282 |     au Filetype cpp nnoremap <buffer> <F5> :<C-u> ! g++ -Wall % -o %.oo<CR>
283 | augroup END
284 | " }}}
285 | " }}}
286 | " ---------------------------------------------------------------------------
287 | " }}}
288 | " Wild Ignore ------------------------------------------------------------- {{{
289 | set wildignore+=.hg,.git,.svn                    " Version control directories
290 | set wildignore+=*.aux,*.out,*.toc                " LaTeX intermediate files
291 | set wildignore+=*.jpg,*.bmp,*.gif,*.png,*.jpeg   " Binary images
292 | set wildignore+=*.o,*.oo,*.obj,*.exe,*.dll       " Compiled objects/binaries (incl. the *.o / *.oo the <F5> builds write)
293 | set wildignore+=*.spl                            " Compiled spelling word lists
294 | set wildignore+=*.sw?                            " Vim swap files
295 | set wildignore+=*.luac                           " Lua byte code
296 | set wildignore+=*.pyc                            " Python byte code
297 | set wildignore+=*.orig                           " Merge resolution files
298 | set wildignore+=$VIMRUNTIME/doc/*.txt            " Individual helpfiles
299 | set wildignore+=$VIMRUNTIME/doc/*.tar.gz         " Compressed helpfiles
300 | " Any doc/*.txt at any depth; for the `**` meaning, see `:h starstar-wildcard`
301 | set wildignore+=**/doc/*.txt
302 | " Clojure/Leiningen (NOTE(review): bare names, presumably ignore anything called classes or lib)
303 | set wildignore+=classes
304 | set wildignore+=lib
305 | "}}}
306 | " Auto Commands ----------------------------------------------------------- {{{
307 | " General filetype {{{
308 | " ---------------------------------------------------------------------------
309 | " Resize splits when the window is resized
310 | au VimResized * :wincmd =
311 | " The mapping below is thanks to Steve Losh. It types `{}`, splits the pair with
312 | " <cr>, adds an indented placeholder `.`, then moves back up and deletes the dot.
313 | " Make {<cr> insert a pair of brackets in such a way that the cursor is correctly
314 | " positioned inside of them AND the following code doesn't get unfolded.
315 | au Filetype * inoremap <buffer> {<cr> {}<left><cr><space><space><space><space>.<cr><esc>kA<bs>
316 | " ---------------------------------------------------------------------------
317 | " }}}
318 | " Clojure {{{
319 | " ---------------------------------------------------------------------------
320 | au FileType clojure RainbowParenthesesActivate
321 | au syntax clojure RainbowParenthesesLoadRound
322 | au syntax clojure RainbowParenthesesLoadSquare
323 | au syntax clojure RainbowParenthesesLoadBraces
324 | " ---------------------------------------------------------------------------
325 | " }}}
326 | " Python {{{
327 | " ---------------------------------------------------------------------------
328 | " Strip trailing whitespace on save (view saved/restored so the cursor stays put); options are buffer-local
329 | autocmd BufWritePre *.py let b:strip_ws_view = winsaveview() | %s/\s\+$//e | call winrestview(b:strip_ws_view)
330 | autocmd BufRead *.py setlocal smartindent cinwords=if,elif,else,for,while,try,except,finally,def,class
331 | au FileType python setlocal omnifunc=pythoncomplete#Complete | syn keyword pythonDecorator True None False self
332 | " ---------------------------------------------------------------------------
333 | " }}}
334 | " }}}
335 | " Custom Functions -------------------------------------------------------- {{{
336 | " ---------------------------------------------------------------------------
337 | function! MyFoldText() " {{{
338 |     " Text for a closed fold: its first line, `(v:foldend - v:foldstart)`, an ellipsis and padding.
339 |     let line = getline(v:foldstart)
340 |     let nucolwidth = &fdc + &number * &numberwidth
341 |     let windowwidth = winwidth(0) - nucolwidth - 3
342 |     let foldedlinecount = v:foldend - v:foldstart
343 | 
344 |     " expand tabs into spaces (repeat() honours any &tabstop, not just values up to 10)
345 |     let onetab = repeat(' ', &tabstop)
346 |     let line = substitute(line, '\t', onetab, 'g')
347 |     " cut the line so the count still fits, then pad with spaces up to the window width
348 |     let line = strpart(line, 0, windowwidth - 2 -len(foldedlinecount))
349 |     let fillcharcount = windowwidth - len(line) - len(foldedlinecount)
350 |     return line . ' (' . foldedlinecount . ') ' . '…' . repeat(" ",fillcharcount) .  ' '
351 | endfunction " }}}
352 | " Nyan cat
353 | command! NyanMe call NyanMe()
354 | " ---------------------------------------------------------------------------
355 | function! NyanMe() " {{{
356 |     hi NyanFur guifg=#BBBBBB
357 |     hi NyanPoptartEdge guifg=#ffd0ac
358 |     hi NyanPoptartFrosting guifg=#fd3699 guibg=#fe98ff
359 |     hi NyanRainbow1 guifg=#6831f8
360 |     hi NyanRainbow2 guifg=#0099fc
361 |     hi NyanRainbow3 guifg=#3cfa04
362 |     hi NyanRainbow4 guifg=#fdfe00
363 |     hi NyanRainbow5 guifg=#fc9d00
364 |     hi NyanRainbow6 guifg=#fe0000
365 |     " The groups above only set gui colours. Below, three rows are echoed glyph by glyph.
366 |     " Row 1: twelve alternating trail glyphs, cycling twice through NyanRainbow1-6.
367 |     echohl NyanRainbow1
368 |     echon "≈"
369 |     echohl NyanRainbow2
370 |     echon "≋"
371 |     echohl NyanRainbow3
372 |     echon "≈"
373 |     echohl NyanRainbow4
374 |     echon "≋"
375 |     echohl NyanRainbow5
376 |     echon "≈"
377 |     echohl NyanRainbow6
378 |     echon "≋"
379 |     echohl NyanRainbow1
380 |     echon "≈"
381 |     echohl NyanRainbow2
382 |     echon "≋"
383 |     echohl NyanRainbow3
384 |     echon "≈"
385 |     echohl NyanRainbow4
386 |     echon "≋"
387 |     echohl NyanRainbow5
388 |     echon "≈"
389 |     echohl NyanRainbow6
390 |     echon "≋"
391 |     echohl None
392 |     echo ""
393 |     " Row 2: the same trail, followed by glyphs in the NyanFur / NyanPoptart* groups.
394 |     echohl NyanRainbow1
395 |     echon "≈"
396 |     echohl NyanRainbow2
397 |     echon "≋"
398 |     echohl NyanRainbow3
399 |     echon "≈"
400 |     echohl NyanRainbow4
401 |     echon "≋"
402 |     echohl NyanRainbow5
403 |     echon "≈"
404 |     echohl NyanRainbow6
405 |     echon "≋"
406 |     echohl NyanRainbow1
407 |     echon "≈"
408 |     echohl NyanRainbow2
409 |     echon "≋"
410 |     echohl NyanRainbow3
411 |     echon "≈"
412 |     echohl NyanRainbow4
413 |     echon "≋"
414 |     echohl NyanRainbow5
415 |     echon "≈"
416 |     echohl NyanRainbow6
417 |     echon "≋"
418 |     echohl NyanFur
419 |     echon "╰"
420 |     echohl NyanPoptartEdge
421 |     echon "⟨"
422 |     echohl NyanPoptartFrosting
423 |     echon "⣮⣯⡿"
424 |     echohl NyanPoptartEdge
425 |     echon "⟩"
426 |     echohl NyanFur
427 |     echon "⩾^ω^⩽"
428 |     echohl None
429 |     echo ""
430 |     " Row 3: the same trail, an unhighlighted space, then two quote glyphs in NyanFur.
431 |     echohl NyanRainbow1
432 |     echon "≈"
433 |     echohl NyanRainbow2
434 |     echon "≋"
435 |     echohl NyanRainbow3
436 |     echon "≈"
437 |     echohl NyanRainbow4
438 |     echon "≋"
439 |     echohl NyanRainbow5
440 |     echon "≈"
441 |     echohl NyanRainbow6
442 |     echon "≋"
443 |     echohl NyanRainbow1
444 |     echon "≈"
445 |     echohl NyanRainbow2
446 |     echon "≋"
447 |     echohl NyanRainbow3
448 |     echon "≈"
449 |     echohl NyanRainbow4
450 |     echon "≋"
451 |     echohl NyanRainbow5
452 |     echon "≈"
453 |     echohl NyanRainbow6
454 |     echon "≋"
455 |     echohl None
456 |     echon " "
457 |     echohl NyanFur
458 |     echon "” ‟"
459 |     echohl None
460 |     " Pause one second, redraw, then echo two blank lines and "Noms?" before a final redraw.
461 |     sleep 1
462 |     redraw
463 |     echo " "
464 |     echo " "
465 |     echo "Noms?"
466 |     redraw
467 | endfunction " }}}
468 | " }}}
469 | " Plugin Settings --------------------------------------------------------- {{{
470 | " Airline {{{
471 | let g:airline_theme = 'sol'
472 | let g:airline_left_sep=''
473 | let g:airline_left_alt_sep='|'
474 | let g:airline_right_sep=''
475 | let g:airline_right_alt_sep='|'
476 | let g:airline_powerline_fonts = 1
477 | let g:airline#extensions#tabline#enabled = 1
478 | let g:airline#extensions#tabline#fnamemod = ':~:.'
479 | " let g:airline#extensions#tabline#fnamemod = ':t:~'
480 | " let g:airline#extensions#tabline#fnamemod = ':p:~'
481 | 
482 | " Get rid of the arrows in tabline
483 | let g:airline#extensions#tabline#left_sep = ' '
484 | let g:airline#extensions#tabline#left_alt_sep = '|'
485 | 
486 | " Enable Fugitive support in airline
487 | let g:airline#extensions#branch#enabled = 1
488 | 
489 | " Show hunks, only if there is change in file,
490 | " with respect to the version last committed into git
491 | let g:airline#extensions#hunks#non_zero_only = 1
492 | 
493 | " Display buffers in a single tab
494 | " (also this is required for the next setting)
495 | let g:airline#extensions#tabline#show_buffers = 1
496 | 
497 | " Show tabline only if at least 2 buffers exist
498 | let g:airline#extensions#tabline#buffer_min_count = 2
499 | 
500 | " unicode symbols
501 | let g:airline_branch_prefix = 'Br:'
502 | let g:airline_paste_symbol = 'ρ'
503 | 
504 | " Reduce mode strings to single chars; "\<C-v>" / "\<C-s>" key the blockwise modes
505 | let g:airline_mode_map = {
506 |     \ '__' : '-',
507 |     \ 'n'  : 'N',
508 |     \ 'i'  : 'I',
509 |     \ 'R'  : 'R',
510 |     \ 'c'  : 'C',
511 |     \ 'v'  : 'V',
512 |     \ 'V'  : 'V',
513 |     \ "\<C-v>" : 'V',
514 |     \ 's'  : 'S',
515 |     \ 'S'  : 'S',
516 |     \ "\<C-s>" : 'S',
517 |     \ }
518 | 
519 | " Custom airline layout, applied once on VimEnter:
520 | " section a = mode, b = hunks, c = the file name (%f),
521 | " y = branch and ffenc, so branch info sits on the right rather than next to the hunks
522 | function! AirlineInit()
523 |     let g:airline_section_a = airline#section#create(['mode'])
524 |     let g:airline_section_b = airline#section#create_left(['hunks'])
525 |     let g:airline_section_c = airline#section#create(['%f'])
526 |     let g:airline_section_y = airline#section#create_right(['branch', 'ffenc'])
527 | endfunction
528 | autocmd VimEnter * call AirlineInit()
529 | " }}}
530 | " Startify {{{
531 | " Startify: bookmarked files, number of listed files, directory switching and skip patterns
532 | let g:startify_bookmarks = [
533 |             \ '~/.vim/vimrc',
534 |             \ ]
535 | let g:startify_files_number = 8
536 | let g:startify_change_to_dir = 0
537 | let g:startify_change_to_vcs_root = 1
538 | let g:startify_skiplist = [
539 |                 \ '.git/COMMIT_EDITMSG',
540 |                 \ '.gtkrc-2.0',
541 |                 \ '/usr/share/vim/vim74/doc',
542 |                 \ '/etc/*',
543 |                 \ $VIMRUNTIME . '*/doc',
544 |                 \ 'bundle/.*/doc'
545 |                 \ ]
546 | " Section order; each bracketed string is presumably the heading for the list named after it
547 | let g:startify_list_order = [
548 |       \ ['   MRU:'],
549 |       \ 'files',
550 |       \ ['   Sess:'],
551 |       \ 'sessions',
552 |       \ ['   Markers:'],
553 |       \ 'bookmarks'
554 |       \ ]
555 | " Startify Custom Header
556 | "let g:startify_custom_header = [
557 |             "\ '  .____---^^     ^^---____.                                                      ',
558 |             "\ '  TI      *       *      IT  Three Rings for the Elvin-Kings under the sky.      ',
559 |             "\ '  !I          *          I!  Seven for the DwarfLords in their halls of stone.   ',
560 |             "\ '   X                     X       Nine for the Mortal Men doomed to die.          ',
561 |             "\ '   XL                   JX       One for the Dark Lord on his dark throne.       ',
562 |             "\ '   II        / \        II   In the Land of Mordor where the Shadow Lies.        ',
563 |             "\ '   II   / \ /   \ / \   II                                                       ',
564 |             "\ '    X  /   v     v   \  X       One Ring to rule them all,One Ring to find them, ',
565 |             "\ '    ``/    _     _    \''     One Ring to bring them all and in the Darkness     ',
566 |             "\ '     \\- _-_ -_- _-_ -//         Bind Them                                       ',
567 |             "\ '       \\_-  -_-  -_//          In the Land of Mordor where the Shadows Lie.     ',
568 |             "\ '         ``       ''                                                             ',
569 |             "\ '           ``-_-''                                                               ',
570 |             "\ '                                                    "Lord Of THe Rings"          ',
571 |             "\ '                                                          by J.R.R. Tolkien      ',
572 |             "\ '',
573 |             "\ ]
574 | " }}}
575 | " CtrlP {{{
576 | " Keep *.tar.gz files out of CtrlP's MRUFiles cache. The option is a Vim regexp,
577 | " so a trailing empty `\|` branch would match (and therefore exclude) every file.
578 | let g:ctrlp_mruf_exclude = '\.tar\.gz$'
579 | let g:ctrlp_clear_cache_on_exit = 1
580 | let g:ctrlp_cache_dir = $HOME.'/.cache/ctrlp'
581 | " }}}
582 | " SuperTab {{{
583 | " Default completion type is <c-n>; the <c-p> alternative is kept commented out
584 | " let g:SuperTabDefaultCompletionType = \"<c-p>"
585 | let g:SuperTabDefaultCompletionType = "<c-n>"
586 | let g:SuperTabLongestHighlight = 1
587 | let g:SuperTabCrMapping = 1
588 | " }}}
589 | " Indent Guides {{{
590 | " Guide size is set to 1; enabling the guides on Vim startup is left commented out
591 | let g:indent_guides_guide_size = 1
592 | " let g:indent_guides_enable_on_vim_startup = 1
593 | " }}}
594 | " ---------------------------------------------------------------------------
595 | "}}}
596 | "
597 | 


--------------------------------------------------------------------------------
/benchmark/test.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 
  3 | <html xmlns="http://www.w3.org/1999/xhtml" lang="zh" dir="ltr"> 
  4 | <head> 
  5 | <title>XML - 维基百科，自由的百科全书</title> 
  6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 
  7 | <meta http-equiv="Content-Style-Type" content="text/css" /> 
  8 | <meta name="generator" content="MediaWiki 1.16wmf4" /> 
  9 | <link rel="alternate" type="application/x-wiki" title="编辑本页" href="/w/index.php?title=XML&amp;action=edit" /> 
 10 | <link rel="edit" title="编辑本页" href="/w/index.php?title=XML&amp;action=edit" /> 
 11 | <link rel="apple-touch-icon" href="http://zh.wikipedia.org/apple-touch-icon.png" /> 
 12 | <link rel="shortcut icon" href="/favicon.ico" /> 
 13 | <link rel="search" type="application/opensearchdescription+xml" href="/w/opensearch_desc.php" title="Wikipedia (zh)" /> 
 14 | <link rel="copyright" href="http://creativecommons.org/licenses/by-sa/3.0/" /> 
 15 | <link rel="alternate" type="application/atom+xml" title="Wikipedia的Atom订阅" href="/w/index.php?title=Special:%E6%9C%80%E8%BF%91%E6%9B%B4%E6%94%B9&amp;feed=atom" /> 
 16 | <link rel="stylesheet" href="http://bits.wikimedia.org/skins-1.5/vector/main-ltr.css?283-19" type="text/css" media="screen" /> 
 17 | <link rel="stylesheet" href="http://bits.wikimedia.org/skins-1.5/common/shared.css?283-19" type="text/css" media="screen" /> 
 18 | <link rel="stylesheet" href="http://bits.wikimedia.org/skins-1.5/common/commonPrint.css?283-19" type="text/css" media="print" /> 
 19 | <link rel="stylesheet" href="http://bits.wikimedia.org/w/extensions/UsabilityInitiative/css/combined.min.css?117" type="text/css" media="all" /> 
 20 | <link rel="stylesheet" href="http://bits.wikimedia.org/w/extensions/UsabilityInitiative/css/vector/jquery-ui-1.7.2.css?1.7.2y" type="text/css" media="all" /> 
 21 | <link rel="stylesheet" href="/w/index.php?title=MediaWiki:Common.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=2678400&amp;action=raw&amp;maxage=2678400" type="text/css" media="all" /> 
 22 | <link rel="stylesheet" href="/w/index.php?title=MediaWiki:Print.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=2678400&amp;action=raw&amp;maxage=2678400" type="text/css" media="print" /> 
 23 | <link rel="stylesheet" href="/w/index.php?title=MediaWiki:Handheld.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=2678400&amp;action=raw&amp;maxage=2678400" type="text/css" media="handheld" /> 
 24 | <link rel="stylesheet" href="/w/index.php?title=MediaWiki:Vector.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=2678400&amp;action=raw&amp;maxage=2678400" type="text/css" media="all" /> 
 25 | <link rel="stylesheet" href="/w/index.php?title=-&amp;action=raw&amp;maxage=2678400&amp;gen=css" type="text/css" media="all" /> 
 26 | <script type="text/javascript"> 
 27 | var skin="vector",
 28 | stylepath="http://bits.wikimedia.org/skins-1.5",
 29 | wgUrlProtocols="http\\:\\/\\/|https\\:\\/\\/|ftp\\:\\/\\/|irc\\:\\/\\/|gopher\\:\\/\\/|telnet\\:\\/\\/|nntp\\:\\/\\/|worldwind\\:\\/\\/|mailto\\:|news\\:|svn\\:\\/\\/",
 30 | wgArticlePath="/wiki/$1",
 31 | wgScriptPath="/w",
 32 | wgScriptExtension=".php",
 33 | wgScript="/w/index.php",
 34 | wgVariantArticlePath="/$2/$1",
 35 | wgActionPaths={},
 36 | wgServer="http://zh.wikipedia.org",
 37 | wgCanonicalNamespace="",
 38 | wgCanonicalSpecialPageName=false,
 39 | wgNamespaceNumber=0,
 40 | wgPageName="XML",
 41 | wgTitle="XML",
 42 | wgAction="view",
 43 | wgArticleId=3632,
 44 | wgIsArticle=true,
 45 | wgUserName=null,
 46 | wgUserGroups=null,
 47 | wgUserLanguage="zh",
 48 | wgContentLanguage="zh",
 49 | wgBreakFrames=false,
 50 | wgCurRevisionId=15329713,
 51 | wgVersion="1.16wmf4",
 52 | wgEnableAPI=true,
 53 | wgEnableWriteAPI=true,
 54 | wgSeparatorTransformTable=["", ""],
 55 | wgDigitTransformTable=["", ""],
 56 | wgMainPageTitle="Wikipedia:首页",
 57 | wgFormattedNamespaces={"-2": "Media", "-1": "Special", "0": "", "1": "Talk", "2": "User", "3": "User talk", "4": "Wikipedia", "5": "Wikipedia talk", "6": "File", "7": "File talk", "8": "MediaWiki", "9": "MediaWiki talk", "10": "Template", "11": "Template talk", "12": "Help", "13": "Help talk", "14": "Category", "15": "Category talk", "100": "Portal", "101": "Portal talk"},
 58 | wgNamespaceIds={"media": -2, "special": -1, "": 0, "talk": 1, "user": 2, "user_talk": 3, "wikipedia": 4, "wikipedia_talk": 5, "file": 6, "file_talk": 7, "mediawiki": 8, "mediawiki_talk": 9, "template": 10, "template_talk": 11, "help": 12, "help_talk": 13, "category": 14, "category_talk": 15, "portal": 100, "portal_talk": 101, "媒体": -2, "媒體": -2, "特殊": -1, "对话": 1, "對話": 1, "讨论": 1, "討論": 1, "用户": 2, "用戶": 2, "用户对话": 3, "用戶對話": 3, "用户讨论": 3, "用戶討論": 3, "图像": 6, "圖像": 6, "档案": 6, "檔案": 6, "文件": 6, "图像对话": 7, "圖像對話": 7, "图像讨论": 7, "圖像討論": 7, "档案对话": 7, "檔案對話": 7, "档案讨论": 7, "檔案討論": 7, "文件对话": 7, "文件對話": 7, "文件讨论": 7, "文件討論": 7, "模板": 10, "样板": 10, "樣板": 10, "模板对话": 11, "模板對話": 11, "模板讨论": 11, "模板討論": 11, "样板对话": 11, "樣板對話": 11, "样板讨论": 11, "樣板討論": 11, "帮助": 12, "幫助": 12, "帮助对话": 13, "幫助對話": 13, "帮助讨论": 13, "幫助討論": 13, "分类": 14, "分類": 14, "分类对话": 15, "分類對話": 15, "分类讨论": 15, "分類討論": 15, "维基百科": 4, "維基百科": 4, "wp": 4, "维基百科讨论": 5, "维基百科对话": 5, "維基百科討論": 5, "維基百科對話": 5, "t": 10, "wt": 5, "cat": 14, "h": 12, "p": 100, "image": 6, "image_talk": 7},
 59 | wgSiteName="Wikipedia",
 60 | wgCategories=["含有英語的條目", "網頁技術", "W3C标准", "文件格式", "置标语言", "XML", "数据序列化格式"],
 61 | wgDBname="zhwiki",
 62 | wgUserVariant="zh",
 63 | wgMWSuggestTemplate="http://zh.wikipedia.org/w/api.php?action=opensearch\x26search={searchTerms}\x26namespace={namespaces}\x26suggest",
 64 | wgSearchNamespaces=[0],
 65 | wgMWSuggestMessages=["有建议", "无建议"],
 66 | wgRestrictionEdit=[],
 67 | wgRestrictionMove=[],
 68 | wgCollapsibleNavBucketTest=false,
 69 | wgCollapsibleNavForceNewVersion=false,
 70 | wgVectorPreferences={"collapsiblenav": {"enable": 1}, "editwarning": {"enable": 1}, "simplesearch": {"enable": 1, "disablesuggest": 0}},
 71 | wgVectorEnabledModules={"collapsiblenav": true, "collapsibletabs": true, "editwarning": true, "expandablesearch": false, "footercleanup": false, "simplesearch": true},
 72 | Geo={"city": "", "country": ""},
 73 | wgNoticeProject="wikipedia";
 74 | </script><script src="http://bits.wikimedia.org/skins-1.5/common/wikibits.js?283-19" type="text/javascript"></script> 
 75 | <script type="text/javascript" src="http://bits.wikimedia.org/skins-1.5/common/jquery.min.js?283-19"></script> 
 76 | <script src="http://bits.wikimedia.org/skins-1.5/common/ajax.js?283-19" type="text/javascript"></script> 
 77 | <script src="http://bits.wikimedia.org/skins-1.5/common/mwsuggest.js?283-19" type="text/javascript"></script> 
 78 | <script src="http://bits.wikimedia.org/w/extensions/UsabilityInitiative/js/plugins.combined.min.js?283-19" type="text/javascript"></script> 
 79 | <script src="http://bits.wikimedia.org/w/extensions/UsabilityInitiative/Vector/Vector.combined.min.js?283-19" type="text/javascript"></script> 
 80 | <script type="text/javascript">mw.usability.addMessages({'vector-collapsiblenav-more':'更多语言','vector-editwarning-warning':'离开这个页面可能会令您失去之前的所有更改。\n若您已经登入，您可在您参数设置的“编辑”节中关闭此警告。','vector-simplesearch-search':'搜索','vector-simplesearch-containing':'含有...'});</script> 
 81 | <script src="/w/index.php?title=Special:BannerController&amp;cache=/cn.js&amp;283-19" type="text/javascript"></script> 
 82 | <!--[if lt IE 7]><style type="text/css">body{behavior:url("/w/skins-1.5/vector/csshover.htc")}</style><![endif]--> 
 83 | <script src="/w/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=vector&amp;283-19" type="text/javascript"></script> 
 84 |  
 85 | <style type="text/css">/*<![CDATA[*/
 86 | .source-xml {line-height: normal;}
 87 | .source-xml li, .source-xml pre {
 88 | 	line-height: normal; border: 0px none white;
 89 | }
 90 | /**
 91 |  * GeSHi Dynamically Generated Stylesheet
 92 |  * --------------------------------------
 93 |  * Dynamically generated stylesheet for xml
 94 |  * CSS class: source-xml, CSS id: 
 95 |  * GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann
 96 |  * (http://qbnz.com/highlighter/ and http://geshi.org/)
 97 |  * --------------------------------------
 98 |  */
 99 | .xml.source-xml .de1, .xml.source-xml .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;}
100 | .xml.source-xml  {font-family:monospace;}
101 | .xml.source-xml .imp {font-weight: bold; color: red;}
102 | .xml.source-xml li, .xml.source-xml .li1 {font-weight: normal; vertical-align:top;}
103 | .xml.source-xml .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;}
104 | .xml.source-xml .li2 {font-weight: bold; vertical-align:top;}
105 | .xml.source-xml .es0 {color: #000099; font-weight: bold;}
106 | .xml.source-xml .br0 {color: #66cc66;}
107 | .xml.source-xml .sy0 {color: #66cc66;}
108 | .xml.source-xml .st0 {color: #ff0000;}
109 | .xml.source-xml .nu0 {color: #cc66cc;}
110 | .xml.source-xml .sc-1 {color: #808080; font-style: italic;}
111 | .xml.source-xml .sc0 {color: #00bbdd;}
112 | .xml.source-xml .sc1 {color: #ddbb00;}
113 | .xml.source-xml .sc2 {color: #339933;}
114 | .xml.source-xml .sc3 {color: #009900;}
115 | .xml.source-xml .re0 {color: #000066;}
116 | .xml.source-xml .re1 {color: #000000; font-weight: bold;}
117 | .xml.source-xml .re2 {color: #000000; font-weight: bold;}
118 | .xml.source-xml .ln-xtra, .xml.source-xml li.ln-xtra, .xml.source-xml div.ln-xtra {background-color: #ffc;}
119 | .xml.source-xml span.xtra { display:block; }
120 |  
121 | /*]]>*/
122 | </style> 
123 | <style type="text/css">/*<![CDATA[*/
124 | @import "/w/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=2678400";
125 | /*]]>*/
126 | </style></head> 
127 | <body class="mediawiki ltr ns-0 ns-subject page-XML skin-vector"> 
128 | 		<div id="mw-page-base" class="noprint"></div> 
129 | 		<div id="mw-head-base" class="noprint"></div> 
130 | 		<!-- content --> 
131 | 		<div id="content"> 
132 | 			<a id="top"></a> 
133 | 			<div id="mw-js-message" style="display:none;"></div> 
134 | 						<!-- sitenotice --> 
135 | 			<div id="siteNotice"><!-- centralNotice loads here --><script type="text/javascript" language="JavaScript"> 
136 | /* <![CDATA[ */
137 | document.writeln("\x3cdiv id=\"localNotice\"\x3e\x3c/div\x3e");
138 | /* ]]> */
139 | </script></div> 
140 | 			<!-- /sitenotice --> 
141 | 						<!-- firstHeading --> 
142 | 			<h1 id="firstHeading" class="firstHeading">XML</h1> 
143 | 			<!-- /firstHeading --> 
144 | 			<!-- bodyContent --> 
145 | 			<div id="bodyContent"> 
146 | 				<!-- tagline --> 
147 | 				<div id="siteSub">维基百科，自由的百科全书</div> 
148 | 				<!-- /tagline --> 
149 | 				<!-- subtitle --> 
150 | 				<div id="contentSub"></div> 
151 | 				<!-- /subtitle --> 
152 | 																<!-- jumpto --> 
153 | 				<div id="jump-to-nav"> 
154 | 					跳转到： <a href="#mw-head">导航</a>,
155 | 					<a href="#p-search">搜索</a> 
156 | 				</div> 
157 | 				<!-- /jumpto --> 
158 | 								<!-- bodytext --> 
159 | 				<p><span style="display: none;"><a href="#_skip_noteTA">跳过字词转换说明</a></span></p> 
160 | <div class="NavFrame collapsed noprint nohandheld metadata" style="z-index: 1; margin: 0 0.5em 0.5em auto; top: -1.2em; position: relative; background-color: transparent; border: none;"> 
161 | <div class="uncollapse toggleHotspot" style="position: absolute; right:0; background-color: transparent; padding: 0; width: 4em;" title="本頁使用了標題或全文手工轉換，單擊檢視"><span style="font-family:微软雅黑,Arial Unicode MS,黑体;"><span style="padding:1px 3px; background: slategray; color:white;">汉</span><span style="padding:1px 3px; background: sienna; color:white;">漢</span></span><span class="toggleShow" style="color:black;">▼</span><span class="toggleHide" style="color:black;">▲</span></div> 
162 | <div class="NavContent" style="position: absolute; display: none; right: 0; top: 1.5em; border: 1px gray solid; background-color: lightyellow; padding: 0.3em; z-index: 100; width: 650px; font-size: 90%; background-color: #f0f2f0; color: black;"> 
163 | <div style="background: #FFEA88; background-color: #4d4d4d; color: #ffffff;">为了阅读方便，本文使用<a href="/wiki/Help:%E4%B8%AD%E6%96%87%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91%E7%9A%84%E7%B9%81%E7%AE%80%E5%A4%84%E7%90%86#.E6.8E.A7.E5.88.B6.E8.87.AA.E5.8A.A8.E8.BD.AC.E6.8D.A2.E7.9A.84.E4.BB.A3.E7.A2.BC" title="Help:中文维基百科的繁简处理" class="mw-redirect"><span style="color:#ffffff; font-weight:bold;">全文手工轉換</span></a>。转换内容：</div> 
164 | <div style="padding: 3px"> 
165 | <p>本文采用<a href="/wiki/Portal:%E7%94%B5%E8%84%91%E5%92%8C%E4%BF%A1%E6%81%AF%E6%8A%80%E6%9C%AF" title="Portal:电脑和信息技术" class="mw-redirect"><span style="color: #333333; font-weight: bold;">电脑和信息技术</span></a>组全文转换 <span class="editlink noprint plainlinksneverexpand">[<a href="/wiki/Template:CGroup/IT/temp" title="Template:CGroup/IT/temp">查看</a>] • [<a href="http://zh.wikipedia.org/w/index.php?title=Template:CGroup/IT/temp&amp;action=edit" class="external text" rel="nofollow">编辑</a>] • [<a href="http://philip-bot.appspot.com/autoconvert?group=IT" class="external text" rel="nofollow">强制刷新</a>]</span></p> 
166 | <p><br /></p> 
167 | </div> 
168 | <div class="NavFrame collapsed" style="background-color: transparent; border: none; margin:0; padding:0;"> 
169 | <div class="NavHead" style="background-color: #FFEA88; font-weight: normal; height: 1.5em; text-align: left; background-color: #4d4d4d; color: #ffffff;"><a href="/wiki/Help:%E4%B8%AD%E6%96%87%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91%E7%9A%84%E7%B9%81%E7%AE%80%E3%80%81%E5%9C%B0%E5%8C%BA%E8%AF%8D%E5%A4%84%E7%90%86" title="Help:中文维基百科的繁简、地区词处理"><span style="color: #333333; font-weight: bold;">字詞轉換</span></a>说明<span class="NavToggle"><span class="toggleShow">顯示↓</span><span class="toggleHide">關閉↑</span></span></div> 
170 | <div class="NavContent"> 
171 | <p>字詞轉換是中文维基的一項自動轉換，目的是通過计算机程序自動消除繁简、地区词等不同<b>用字模式</b>的差異，以達到閱讀方便。字詞轉換包括全局轉換和手動轉換，本說明所使用的标题转换和全文转换技術，都屬於手動轉換。</p> 
172 | <p>如果您想对我们的字词转换系统提出一些改进建议，或者提交应用面更广的转换（<a href="/wiki/%E4%B8%AD%E6%96%87%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91" title="中文维基百科">中文维基百科</a>全站乃至<a href="/wiki/MediaWiki" title="MediaWiki">MediaWiki</a>软件），或者报告转换系统的错误，请前往<a href="/wiki/Wikipedia:%E5%AD%97%E8%AF%8D%E8%BD%AC%E6%8D%A2%E8%AF%B7%E6%B1%82%E6%88%96%E5%80%99%E9%80%89" title="Wikipedia:字词转换请求或候选">Wikipedia:字词转换请求或候选</a>发表您的意见。</p> 
173 | </div> 
174 | </div> 
175 | </div> 
176 | </div> 
177 | <p><span id="_skip_noteTA"></span></p> 
178 | <div class="thumb tright"> 
179 | <div class="thumbinner" style="width:427px;"><a href="/wiki/File:RecipeBook_XML_Example.svg" class="image"><img alt="" src="http://upload.wikimedia.org/wikipedia/commons/thumb/7/73/RecipeBook_XML_Example.svg/425px-RecipeBook_XML_Example.svg.png" width="425" height="234" class="thumbimage" /></a> 
180 | <div class="thumbcaption">RecipeBook的例子，一種基於XML語法上的烹飪技術書刊。此標籤可轉換為：<a href="/wiki/HTML" title="HTML">HTML</a>, <a href="/wiki/Portable_Document_Format" title="Portable Document Format" class="mw-redirect">PDF</a>以及<a href="/wiki/Rich_Text_Format" title="Rich Text Format" class="mw-redirect">Rich Text Format</a>並使用<a href="/wiki/%E7%A8%8B%E5%BC%8F%E8%AA%9E%E8%A8%80" title="程式語言" class="mw-redirect">程式語言</a>或<a href="/w/index.php?title=%E5%8F%AF%E6%93%B4%E5%85%85%E5%A5%97%E4%BB%B6%E6%A8%A3%E5%BC%8F%E8%AA%9E%E8%A8%80&amp;action=edit&amp;redlink=1" class="new" title="可擴充套件樣式語言">XSL</a>。</div> 
181 | </div> 
182 | </div> 
183 | <p><b>可扩展置标语言</b>（<a href="/wiki/%E8%8B%B1%E8%AF%AD" title="英语">英语</a>：<span lang="en" xml:lang="en">e<b>X</b>tensible <b>M</b>arkup <b>L</b>anguage</span>，简称:<span lang="en" xml:lang="en"><b>XML</b></span>），又称<b>可扩展标记语言</b>，是一种<a href="/wiki/%E7%BD%AE%E6%A0%87%E8%AF%AD%E8%A8%80" title="置标语言">置标语言</a>。置标指<a href="/wiki/%E8%AE%A1%E7%AE%97%E6%9C%BA" title="计算机" class="mw-redirect">计算机</a>所能理解的信息符号，通过此种标记，计算机之间可以处理包含各种信息的文章等。如何定义这些标记，既可以选择国际通用的标记语言，比如<a href="/wiki/HTML" title="HTML">HTML</a>，也可以使用像XML这样由相关人士自由决定的标记语言，这就是语言的可扩展性。XML是从<a href="/wiki/%E6%A0%87%E5%87%86%E9%80%9A%E7%94%A8%E7%BD%AE%E6%A0%87%E8%AF%AD%E8%A8%80" title="标准通用置标语言" class="mw-redirect">标准通用置标语言</a>（SGML）中简化修改出来的。它主要用到的有可扩展置标语言、<a href="/wiki/%E5%8F%AF%E6%89%A9%E5%B1%95%E6%A0%B7%E5%BC%8F%E8%AF%AD%E8%A8%80" title="可扩展样式语言">可扩展样式语言</a>（XSL）、<a href="/wiki/XBRL" title="XBRL">XBRL</a>和<a href="/wiki/XPath" title="XPath">XPath</a>等。</p> 
184 | <table id="toc" class="toc"> 
185 | <tr> 
186 | <td> 
187 | <div id="toctitle"> 
188 | <h2>目录</h2> 
189 | </div> 
190 | <ul> 
191 | <li class="toclevel-1 tocsection-1"><a href="#.E6.AD.B7.E5.8F.B2"><span class="tocnumber">1</span> <span class="toctext">歷史</span></a></li> 
192 | <li class="toclevel-1 tocsection-2"><a href="#.E7.94.A8.E9.80.94"><span class="tocnumber">2</span> <span class="toctext">用途</span></a></li> 
193 | <li class="toclevel-1 tocsection-3"><a href="#.E4.BE.8B"><span class="tocnumber">3</span> <span class="toctext">例</span></a></li> 
194 | <li class="toclevel-1 tocsection-4"><a href="#.E7.BB.93.E6.9E.84"><span class="tocnumber">4</span> <span class="toctext">结构</span></a></li> 
195 | <li class="toclevel-1 tocsection-5"><a href="#.E5.8F.82.E8.A7.81"><span class="tocnumber">5</span> <span class="toctext">参见</span></a></li> 
196 | <li class="toclevel-1 tocsection-6"><a href="#.E5.A4.96.E9.83.A8.E9.93.BE.E6.8E.A5"><span class="tocnumber">6</span> <span class="toctext">外部链接</span></a></li> 
197 | </ul> 
198 | </td> 
199 | </tr> 
200 | </table> 
201 | <script type="text/javascript"> 
202 | //<![CDATA[
203 | if (window.showTocToggle) { var tocShowText = "显示"; var tocHideText = "隐藏"; showTocToggle(); } 
204 | //]]>
205 | </script> 
206 | <h2><span class="editsection">[<a href="/w/index.php?title=XML&amp;action=edit&amp;section=1" title="编辑段落：歷史">编辑</a>]</span> <span class="mw-headline" id=".E6.AD.B7.E5.8F.B2">歷史</span></h2> 
207 | <p>XML是從1995年開始有其雛形，並向<a href="/wiki/W3C" title="W3C" class="mw-redirect">W3C</a>（<a href="/wiki/%E4%B8%87%E7%BB%B4%E7%BD%91%E8%81%94%E7%9B%9F" title="万维网联盟">全球資訊網聯盟</a>）提案，而在1998二月發佈為W3C的標準（XML1.0）。XML的前身是<b>SGML</b>（<span lang="en" xml:lang="en">The <b>S</b>tandard <b>G</b>eneralized <b>M</b>arkup <b>L</b>anguage</span>），是自IBM從1960年代就開始發展的<b><a href="/w/index.php?title=%E9%80%9A%E7%94%A8%E7%BD%AE%E6%A0%87%E8%AF%AD%E8%A8%80&amp;action=edit&amp;redlink=1" class="new" title="通用置标语言">GML</a></b>（<span lang="en" xml:lang="en"><b>G</b>eneralized <b>M</b>arkup <b>L</b>anguage</span>）標準化後的名稱。</p> 
208 | <p>GML的重要概念：</p> 
209 | <ul> 
210 | <li>文件中能夠明確的將標示與內容分開</li> 
211 | <li>所有文件的標示使用方法均一致</li> 
212 | </ul> 
213 | <p>1978年，<a href="/wiki/ANSI" title="ANSI" class="mw-redirect">ANSI</a>將GML加以整理規範，發佈成為SGML，1986年起為<a href="/wiki/ISO" title="ISO" class="mw-redirect">ISO</a>所採用（ISO 8879），並且被廣泛地運用在各種大型的文件計劃中，但是SGML是一種非常嚴謹的文件描述法，導致過於龐大複雜（標準手冊就有500多頁），難以理解和學習，進而影響其推廣與應用。</p> 
214 | <p>同時W3C也發現到HTML的問題：</p> 
215 | <ul> 
216 | <li>不能解決所有解釋資料的問題 - 像是影音檔或化學公式、音樂符號等其他形態的內容。</li> 
217 | <li>效能問題 - 需要下載整份文件，才能開始對文件做搜尋。</li> 
218 | <li>擴充性、彈性、易讀性均不佳。</li> 
219 | </ul> 
220 | <p>為了解決以上問題，專家們使用SGML精簡製作，並依照HTML的發展經驗，產生出一套使用上規則嚴謹，但是簡單的描述資料語言：XML。 XML是在一個這樣的背景下誕生的——为了有一個更中立的方式，讓消費端自行決定要如何消化、呈現從服務端所提供的資訊。</p> 
221 | <p>XML被廣泛用來作為跨平台之間交互數據的形式，主要針對數據的內容，通過不同的格式化描述手段（XSLT，CSS等）可以完成最終的形式表達（生成對應的HTML，PDF或者其他的文件格式）。</p> 
222 | <h2><span class="editsection">[<a href="/w/index.php?title=XML&amp;action=edit&amp;section=2" title="编辑段落：用途">编辑</a>]</span> <span class="mw-headline" id=".E7.94.A8.E9.80.94">用途</span></h2> 
223 | <p>XML设计用来传送及携带数据信息，不用来表现或展示数据，<a href="/wiki/HTML" title="HTML">HTML</a>语言則用来表现数据，所以XML用途的焦点是它说明数据是什么，以及携带数据信息。</p> 
224 | <ul> 
225 | <li>丰富文件（Rich Documents）- 自定文件描述并使其更丰富
226 | <ul> 
227 | <li>属于文件为主的XML技术应用</li> 
228 | <li>标记是用来定义一份资料应该如何呈现</li> 
229 | </ul> 
230 | </li> 
231 | <li>元数据（Metadata）- 描述其它文件或网络资讯
232 | <ul> 
233 | <li>属于资料为主的XML技术应用</li> 
234 | <li>标记是用来说明一份资料的意义</li> 
235 | </ul> 
236 | </li> 
237 | <li>設定档案（Configuration Files）- 描述软件設定的参数</li> 
238 | </ul> 
239 | <h2><span class="editsection">[<a href="/w/index.php?title=XML&amp;action=edit&amp;section=3" title="编辑段落：例">编辑</a>]</span> <span class="mw-headline" id=".E4.BE.8B">例</span></h2> 
240 | <p>XML定义结构、存储信息、传送信息。下例為<u>張旭</u>发送给<u>陳貞伶</u>的便条，存储为XML。</p> 
241 | <div dir="ltr" class="mw-geshi" style="text-align: left;"> 
242 | <div class="xml source-xml" style="font-family:monospace;"> 
243 | <pre class="de1"> 
244 | <span class="sc3">&lt;小纸条<span class="re2">&gt;</span></span> 
245 |     <span class="sc3">&lt;收件人<span class="re2">&gt;</span></span>陳貞伶<span class="sc3">&lt;/收件人<span class="re2">&gt;</span></span> 
246 |     <span class="sc3">&lt;发件人<span class="re2">&gt;</span></span>張旭<span class="sc3">&lt;/发件人<span class="re2">&gt;</span></span> 
247 |     <span class="sc3">&lt;主题<span class="re2">&gt;</span></span>問候<span class="sc3">&lt;/主题<span class="re2">&gt;</span></span> 
248 |     <span class="sc3">&lt;具体内容<span class="re2">&gt;</span></span>最近可好？<span class="sc3">&lt;/具体内容<span class="re2">&gt;</span></span> 
249 | <span class="sc3">&lt;/小纸条<span class="re2">&gt;</span></span> 
250 | </pre></div> 
251 | </div> 
252 | <p>这XML文档仅是纯粹的信息标签，这些标签意义的展开依赖于应用它的程序。</p> 
253 | <h2><span class="editsection">[<a href="/w/index.php?title=XML&amp;action=edit&amp;section=4" title="编辑段落：结构">编辑</a>]</span> <span class="mw-headline" id=".E7.BB.93.E6.9E.84">结构</span></h2> 
254 | <p>每个XML文档都由XML序言开始，在前面的代码中的第一行便是XML序言，&lt;?xml version="1.0"?&gt;。这一行代码会告诉解析器和浏览器，这个文件应该按照前面讨论过的XML规则进行解析。第二行代码，&lt;books&gt;，则是文档元素（document element），它是文件中最外面的标签（我们认为元素（element）是起始标签和结束标签之间的内容）。所有其他的标签必须包含在这个标签之内来组成一个有效的XML文件。XML文件的第二行并不一定要包含文档元素；如果有注释或者其他内容，文档元素可以迟些出现。</p> 
255 | <p>范例文件中的第三行代码是注释，你会发现它与HTML中使用的注释风格是一样的。这是XML从SGML中继承的语法元素之一。</p> 
256 | <p>页面再往下的一些地方，可以发现&lt;desc&gt;标签裡有一些特殊的语法。&lt;![CDATA[ ]]&gt;代码用于表示无需进行解析的文本，允许诸如大于号和小于号之类的特殊字符包含在文本中，而无需担心破坏XML的语法。文本必须出现在&lt;![CDATA[和]]&gt;之间才能合适地避免被解析。这样的文本称为Character Data Section，简称CData Section。</p> 
257 | <p>下面的一行就是在第二本书的定义之前的：</p> 
258 | <p>&lt;?page render multiple authors&#160;?&gt;</p> 
259 | <p>虽然它看上去很像XML序言，但实际上是一种称为处理指令（processing instruction）的不同类型的语法。处理指令（以下简称PI）的目的是为了给处理页面的程序（例如XML解析器）提供额外的信息。PI通常情况下是没有固定格式的，唯一的要求是紧随第一个问号必须至少有一个字母。在此之后，PI可以包含除了小于号和大于号之外的任何字符串序列。</p> 
260 | <p>最常见的PI是用来指定XML文件的样式表：</p> 
261 | <p>这个PI一般会直接放在XML序言之后，通常由Web浏览器使用，来将XML数据以特殊的样式显示出来。</p> 
262 | <p>XML的结构有一个缺陷，那就是不支持分帧（framing）。当多条XML消息在TCP上传输的时候，无法基于XML协议来确定一条XML消息是否已经结束。</p> 
263 | <h2><span class="editsection">[<a href="/w/index.php?title=XML&amp;action=edit&amp;section=5" title="编辑段落：参见">编辑</a>]</span> <span class="mw-headline" id=".E5.8F.82.E8.A7.81">参见</span></h2> 
264 | <ul> 
265 | <li><a href="/wiki/XHTML" title="XHTML">XHTML</a></li> 
266 | <li><a href="/wiki/DTD" title="DTD" class="mw-redirect">DTD</a>（<a href="/wiki/%E6%96%87%E4%BB%B6%E7%B1%BB%E5%9E%8B%E6%8F%8F%E8%BF%B0" title="文件类型描述">文件类型描述</a>）</li> 
267 | <li><a href="/wiki/XML_Schema" title="XML Schema">XML Schema</a></li> 
268 | <li><a href="/w/index.php?title=XLink&amp;action=edit&amp;redlink=1" class="new" title="XLink">XLink</a></li> 
269 | <li><a href="/wiki/SVG" title="SVG">SVG</a></li> 
270 | <li><a href="/wiki/XSLT" title="XSLT">XSLT</a></li> 
271 | <li><a href="/wiki/X3D" title="X3D">X3D</a></li> 
272 | <li><a href="/wiki/HTML" title="HTML">HTML</a></li> 
273 | <li><a href="/wiki/CSS" title="CSS">CSS</a></li> 
274 | <li><a href="/wiki/RDF" title="RDF" class="mw-redirect">RDF</a></li> 
275 | <li><a href="/wiki/RSS" title="RSS">RSS</a> 
276 | <ul> 
277 | <li><a href="/wiki/Unicode" title="Unicode">Unicode</a></li> 
278 | </ul> 
279 | </li> 
280 | </ul> 
281 | <h2><span class="editsection">[<a href="/w/index.php?title=XML&amp;action=edit&amp;section=6" title="编辑段落：外部链接">编辑</a>]</span> <span class="mw-headline" id=".E5.A4.96.E9.83.A8.E9.93.BE.E6.8E.A5">外部链接</span></h2> 
282 | <ul> 
283 | <li><a href="http://xml.ascc.net/zh/utf-8/gloss.html" class="external text" rel="nofollow">XML及SGML名词英汉翻译表 (台湾)</a></li> 
284 | </ul> 
285 | <p><br /></p> 
286 | <table class="navbox" cellspacing="0" style=";"> 
287 | <tr> 
288 | <td style="padding:2px;"> 
289 | <table cellspacing="0" class="nowraplinks collapsible autocollapse" style="width:100%;background:transparent;color:inherit;;"> 
290 | <tr> 
291 | <th style=";" colspan="2" class="navbox-title"> 
292 | <div style="float:left; width:6em;text-align:left;"> 
293 | <div class="noprint plainlinksneverexpand" style="background-color:transparent; padding:0; font-weight:normal; font-size:80%; color:#000000; white-space:nowrap;"><a href="/wiki/Template:W3C%E8%A7%84%E8%8C%83%E5%92%8C%E6%A0%87%E5%87%86" title="Template:W3C规范和标准"><span style=";;border:none;" title="查看这个模板">查</span></a>&#160;<span style="font-size:80%;">•</span>&#160;<a href="/w/index.php?title=Template_talk:W3C%E8%A7%84%E8%8C%83%E5%92%8C%E6%A0%87%E5%87%86&amp;action=edit&amp;redlink=1" class="new" title="Template talk:W3C规范和标准"><span style=";;border:none;" title="关于这个模板的讨论页面">論</span></a>&#160;<span style="font-size:80%;">•</span>&#160;<a href="http://zh.wikipedia.org/w/index.php?title=Template:W3C%E8%A7%84%E8%8C%83%E5%92%8C%E6%A0%87%E5%87%86&amp;action=edit" class="external text" rel="nofollow"><span style=";;border:none;" title="您可以编辑这个模板。请在储存变更之前先预览">編</span></a></div> 
294 | </div> 
295 | <span style="font-size:110%;"><a href="/wiki/%E4%B8%87%E7%BB%B4%E7%BD%91%E8%81%94%E7%9B%9F" title="万维网联盟">W3C</a>规范和标准</span></th> 
296 | </tr> 
297 | <tr style="height:2px;"> 
298 | <td></td> 
299 | </tr> 
300 | <tr> 
301 | <td class="navbox-group" style=";background-color:#e6e6ff;;"><a href="/wiki/W3C%E6%8E%A8%E8%8D%90%E6%A0%87%E5%87%86" title="W3C推荐标准">推薦標準</a></td> 
302 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;line-height:1.4em;;;" class="navbox-list navbox-odd"> 
303 | <div style="padding:0em 0.25em"><span style="white-space:nowrap"><a href="/wiki/Canonical_XML" title="Canonical XML">Canonical XML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=Compound_Document_Format&amp;action=edit&amp;redlink=1" class="new" title="Compound Document Format">CDF</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/CSS" title="CSS">CSS</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/%E6%96%87%E6%A1%A3%E5%AF%B9%E8%B1%A1%E6%A8%A1%E5%9E%8B" title="文档对象模型">DOM</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/HTML" title="HTML">HTML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/%E6%95%B0%E5%AD%A6%E7%BD%AE%E6%A0%87%E8%AF%AD%E8%A8%80" title="数学置标语言">MathML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/%E7%BD%91%E7%BB%9C%E6%9C%AC%E4%BD%93%E8%AF%AD%E8%A8%80" title="网络本体语言">OWL</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=P3P&amp;action=edit&amp;redlink=1" class="new" title="P3P">P3P</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=Pronunciation_Lexicon_Specification&amp;action=edit&amp;redlink=1" class="new" title="Pronunciation Lexicon Specification">PLS</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/RDF" title="RDF" class="mw-redirect">RDF</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=RDF_Schema&amp;action=edit&amp;redlink=1" class="new" title="RDF Schema">RDF Schema</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=Semantic_Interpretation_for_Speech_Recognition&amp;action=edit&amp;redlink=1" class="new" title="Semantic Interpretation for Speech Recognition">SISR</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a 
href="/wiki/%E5%90%8C%E6%AD%A5%E5%A4%9A%E5%AA%92%E4%BD%93%E9%9B%86%E6%88%90%E8%AF%AD%E8%A8%80" title="同步多媒体集成语言">SMIL</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/%E7%B0%A1%E5%96%AE%E7%89%A9%E4%BB%B6%E5%AD%98%E5%8F%96%E5%8D%94%E5%AE%9A" title="簡單物件存取協定" class="mw-redirect">SOAP</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=Speech_Recognition_Grammar_Specification&amp;action=edit&amp;redlink=1" class="new" title="Speech Recognition Grammar Specification">SRGS</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=Speech_Synthesis_Markup_Language&amp;action=edit&amp;redlink=1" class="new" title="Speech Synthesis Markup Language">SSML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/%E5%8F%AF%E7%BC%A9%E6%94%BE%E7%9F%A2%E9%87%8F%E5%9B%BE%E5%BD%A2" title="可缩放矢量图形" class="mw-redirect">SVG</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/SPARQL" title="SPARQL">SPARQL</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=Timed_Text&amp;action=edit&amp;redlink=1" class="new" title="Timed Text">Timed Text</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=VoiceXML&amp;action=edit&amp;redlink=1" class="new" title="VoiceXML">VoiceXML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/WSDL" title="WSDL">WSDL</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XForms" title="XForms">XForms</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XHTML" title="XHTML">XHTML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=XLink&amp;action=edit&amp;redlink=1" class="new" title="XLink">XLink</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><strong class="selflink">XML</strong>&#160;<b>·</b></span> <span style="white-space:nowrap"><a 
href="/w/index.php?title=XML_Base&amp;action=edit&amp;redlink=1" class="new" title="XML Base">XML Base</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XML_Encryption" title="XML Encryption">XML Encryption</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=XML_Events&amp;action=edit&amp;redlink=1" class="new" title="XML Events">XML Events</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XML%E4%BF%A1%E6%81%AF%E9%9B%86" title="XML信息集">XML信息集</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XML%E5%91%BD%E5%90%8D%E7%A9%BA%E9%97%B4" title="XML命名空间">XML命名空间</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XML_Schema" title="XML Schema">XML Schema</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XML_Signature" title="XML Signature">XML Signature</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XPath" title="XPath">XPath</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=XPointer&amp;action=edit&amp;redlink=1" class="new" title="XPointer">XPointer</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=XQuery&amp;action=edit&amp;redlink=1" class="new" title="XQuery">XQuery</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/%E5%8F%AF%E6%89%A9%E5%B1%95%E6%A0%B7%E5%BC%8F%E8%AF%AD%E8%A8%80" title="可扩展样式语言">XSL</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XSL-FO" title="XSL-FO">XSL-FO</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XSL_Transformations" title="XSL Transformations" class="mw-redirect">XSLT</a></span></div> 
304 | </td> 
305 | </tr> 
306 | <tr style="height:2px"> 
307 | <td></td> 
308 | </tr> 
309 | <tr> 
310 | <td class="navbox-group" style=";background-color:#e6e6ff;;">Notes</td> 
311 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;line-height:1.4em;;;" class="navbox-list navbox-even"> 
312 | <div style="padding:0em 0.25em"><span style="white-space:nowrap"><a href="/w/index.php?title=XAdES&amp;action=edit&amp;redlink=1" class="new" title="XAdES">XAdES</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=XHTML%2BSMIL&amp;action=edit&amp;redlink=1" class="new" title="XHTML+SMIL">XHTML+SMIL</a></span></div> 
313 | </td> 
314 | </tr> 
315 | <tr style="height:2px"> 
316 | <td></td> 
317 | </tr> 
318 | <tr> 
319 | <td class="navbox-group" style=";background-color:#e6e6ff;;">工作草案</td> 
320 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;line-height:1.4em;;;" class="navbox-list navbox-odd"> 
321 | <div style="padding:0em 0.25em"><span style="white-space:nowrap"><a href="/w/index.php?title=Call_Control_eXtensible_Markup_Language&amp;action=edit&amp;redlink=1" class="new" title="Call Control eXtensible Markup Language">CCXML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=CURIE&amp;action=edit&amp;redlink=1" class="new" title="CURIE">CURIE</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/HTML_5" title="HTML 5">HTML 5</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=InkML&amp;action=edit&amp;redlink=1" class="new" title="InkML">InkML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=Web_Integration_Compound_Document&amp;action=edit&amp;redlink=1" class="new" title="Web Integration Compound Document">WICD</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=Extensible_Forms_Description_Language&amp;action=edit&amp;redlink=1" class="new" title="Extensible Forms Description Language">XFDL</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=XFrames&amp;action=edit&amp;redlink=1" class="new" title="XFrames">XFrames</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XBL" title="XBL">XBL</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=XHTML%2BMathML%2BSVG&amp;action=edit&amp;redlink=1" class="new" title="XHTML+MathML+SVG">XHTML+MathML+SVG</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/XMLHttpRequest" title="XMLHttpRequest" class="mw-redirect">XMLHttpRequest</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=XProc&amp;action=edit&amp;redlink=1" class="new" title="XProc">XProc</a></span></div> 
322 | </td> 
323 | </tr> 
324 | <tr style="height:2px"> 
325 | <td></td> 
326 | </tr> 
327 | <tr> 
328 | <td class="navbox-group" style=";background-color:#e6e6ff;;">檢測</td> 
329 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;line-height:1.4em;;;" class="navbox-list navbox-even"> 
330 | <div style="padding:0em 0.25em"><a href="/w/index.php?title=%E7%84%A1%E9%9A%9C%E7%A4%99%E7%B6%B2%E9%A0%81%E6%AA%A2%E6%B8%AC&amp;action=edit&amp;redlink=1" class="new" title="無障礙網頁檢測">無障礙網頁檢測</a></div> 
331 | </td> 
332 | </tr> 
333 | <tr style="height:2px"> 
334 | <td></td> 
335 | </tr> 
336 | <tr> 
337 | <td class="navbox-group" style=";background-color:#e6e6ff;;">前标准</td> 
338 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;line-height:1.4em;;;" class="navbox-list navbox-odd"> 
339 | <div style="padding:0em 0.25em"><span style="white-space:nowrap"><a href="/wiki/C-HTML" title="C-HTML">C-HTML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=Handheld_Device_Markup_Language&amp;action=edit&amp;redlink=1" class="new" title="Handheld Device Markup Language">HDML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=JavaScript_Style_Sheets&amp;action=edit&amp;redlink=1" class="new" title="JavaScript Style Sheets">JSSS</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/w/index.php?title=Precision_Graphics_Markup_Language&amp;action=edit&amp;redlink=1" class="new" title="Precision Graphics Markup Language">PGML</a>&#160;<b>·</b></span> <span style="white-space:nowrap"><a href="/wiki/Vector_Markup_Language" title="Vector Markup Language" class="mw-redirect">VML</a></span></div> 
340 | </td> 
341 | </tr> 
342 | </table> 
343 | </td> 
344 | </tr> 
345 | </table> 
346 | <table class="navbox" cellspacing="0" style=";"> 
347 | <tr> 
348 | <td style="padding:2px;"> 
349 | <table cellspacing="0" class="nowraplinks collapsible autocollapse" style="width:100%;background:transparent;color:inherit;;"> 
350 | <tr> 
351 | <th style=";" colspan="2" class="navbox-title"> 
352 | <div style="float:left; width:6em;text-align:left;"> 
353 | <div class="noprint plainlinksneverexpand" style="background-color:transparent; padding:0; font-weight:normal; font-size:80%; color:#000000; white-space:nowrap;"><a href="/wiki/Template:%E7%B6%B2%E9%A0%81%E6%8A%80%E8%A1%93%E8%88%87%E6%A8%99%E6%BA%96" title="Template:網頁技術與標準"><span style=";;border:none;" title="查看这个模板">查</span></a>&#160;<span style="font-size:80%;">•</span>&#160;<a href="/wiki/Template_talk:%E7%B6%B2%E9%A0%81%E6%8A%80%E8%A1%93%E8%88%87%E6%A8%99%E6%BA%96" title="Template talk:網頁技術與標準"><span style=";;border:none;" title="关于这个模板的讨论页面">論</span></a>&#160;<span style="font-size:80%;">•</span>&#160;<a href="http://zh.wikipedia.org/w/index.php?title=Template:%E7%B6%B2%E9%A0%81%E6%8A%80%E8%A1%93%E8%88%87%E6%A8%99%E6%BA%96&amp;action=edit" class="external text" rel="nofollow"><span style=";;border:none;" title="您可以编辑这个模板。请在储存变更之前先预览">編</span></a></div> 
354 | </div> 
355 | <span style="font-size:110%;">網頁技術與標準</span></th> 
356 | </tr> 
357 | <tr style="height:2px;"> 
358 | <td></td> 
359 | </tr> 
360 | <tr> 
361 | <td class="navbox-group" style=";;">文档呈现语言</td> 
362 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;;;" class="navbox-list navbox-odd"> 
363 | <div style="padding:0em 0.25em"><a href="/wiki/HTML" title="HTML">HTML</a>*（<a href="/wiki/HTML5" title="HTML5" class="mw-redirect">HTML5</a>*）<span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/XHTML" title="XHTML">XHTML</a>*<span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <strong class="selflink">XML</strong>*<span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/XForms" title="XForms">XForms</a>*<span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/DHTML" title="DHTML">DHTML</a>*</div> 
364 | </td> 
365 | </tr> 
366 | <tr style="height:2px"> 
367 | <td></td> 
368 | </tr> 
369 | <tr> 
370 | <td class="navbox-group" style=";;">样式格式描述语言</td> 
371 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;;;" class="navbox-list navbox-even"> 
372 | <div style="padding:0em 0.25em"><a href="/wiki/CSS" title="CSS">CSS</a>*<span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/XSL" title="XSL" class="mw-redirect">XSL</a>*</div> 
373 | </td> 
374 | </tr> 
375 | <tr style="height:2px"> 
376 | <td></td> 
377 | </tr> 
378 | <tr> 
379 | <td class="navbox-group" style=";;">动态网页技术</td> 
380 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;;;" class="navbox-list navbox-odd"> 
381 | <div style="padding:0em 0.25em"><a href="/wiki/%E9%80%9A%E7%94%A8%E7%BD%91%E5%85%B3%E6%8E%A5%E5%8F%A3" title="通用网关接口">CGI</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/Active_Server_Pages" title="Active Server Pages">ASP</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/ASP.NET" title="ASP.NET">ASP.NET</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/ColdFusion" title="ColdFusion" class="mw-redirect">ColdFusion</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/JSP" title="JSP">JSP</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/PHP" title="PHP">PHP</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/Ruby_on_Rails" title="Ruby on Rails">Ruby on Rails</a></div> 
382 | </td> 
383 | </tr> 
384 | <tr style="height:2px"> 
385 | <td></td> 
386 | </tr> 
387 | <tr> 
388 | <td class="navbox-group" style=";;">客户端交互技术</td> 
389 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;;;" class="navbox-list navbox-even"> 
390 | <div style="padding:0em 0.25em"><a href="/wiki/ActiveX" title="ActiveX">ActiveX</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/Java_Applet" title="Java Applet" class="mw-redirect">Java Applet</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/JavaFX" title="JavaFX">JavaFX</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/AJAX" title="AJAX">AJAX</a>（<a href="/wiki/XMLHTTP" title="XMLHTTP">XMLHTTP</a>*）<span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/Microsoft_Silverlight" title="Microsoft Silverlight">Silverlight</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/ActionScript" title="ActionScript">ActionScript</a>（<a href="/wiki/Adobe_Flash" title="Adobe Flash">Flash</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/Adobe_Flex" title="Adobe Flex">Flex</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/Adobe_Integrated_Runtime" title="Adobe Integrated Runtime" class="mw-redirect">AIR</a>）</div> 
391 | </td> 
392 | </tr> 
393 | <tr style="height:2px"> 
394 | <td></td> 
395 | </tr> 
396 | <tr> 
397 | <td class="navbox-group" style=";;">客户端脚本语言</td> 
398 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;;;" class="navbox-list navbox-odd"> 
399 | <div style="padding:0em 0.25em"><a href="/wiki/JavaScript" title="JavaScript">JavaScript</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/JScript" title="JScript">JScript</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/VBScript" title="VBScript">VBScript</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/ECMAScript" title="ECMAScript">ECMAScript</a></div> 
400 | </td> 
401 | </tr> 
402 | <tr style="height:2px"> 
403 | <td></td> 
404 | </tr> 
405 | <tr> 
406 | <td class="navbox-group" style=";;">标识定位语言</td> 
407 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;;;" class="navbox-list navbox-even"> 
408 | <div style="padding:0em 0.25em"><a href="/wiki/%E7%BB%9F%E4%B8%80%E8%B5%84%E6%BA%90%E5%AE%9A%E4%BD%8D%E7%AC%A6" title="统一资源定位符">URL</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/%E7%BB%9F%E4%B8%80%E8%B5%84%E6%BA%90%E6%A0%87%E5%BF%97%E7%AC%A6" title="统一资源标志符">URI</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/XPath" title="XPath">XPath</a><span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/URL%E9%87%8D%E5%AF%AB" title="URL重寫">URL重寫</a></div> 
409 | </td> 
410 | </tr> 
411 | <tr style="height:2px"> 
412 | <td></td> 
413 | </tr> 
414 | <tr> 
415 | <td class="navbox-group" style=";;">文档纲要语言</td> 
416 | <td style="text-align:left;border-left:2px solid #fdfdfd;width:100%;padding:0px;;;" class="navbox-list navbox-odd"> 
417 | <div style="padding:0em 0.25em"><a href="/wiki/%E6%96%87%E4%BB%B6%E7%B1%BB%E5%9E%8B%E6%8F%8F%E8%BF%B0" title="文件类型描述">DTD</a>*<span style="white-space:nowrap; font-weight:bold;">&#160;·</span> <a href="/wiki/XML_Schema" title="XML Schema">XML Schema</a>*</div> 
418 | </td> 
419 | </tr> 
420 | </table> 
421 | </td> 
422 | </tr> 
423 | </table> 
424 |  
425 |  
426 | <!-- 
427 | NewPP limit report
428 | Preprocessor node count: 2032/1000000
429 | Post-expand include size: 837777/2048000 bytes
430 | Template argument size: 22446/2048000 bytes
431 | Expensive parser function count: 11/500
432 | --> 
433 |  
434 | <!-- Saved in parser cache with key zhwiki:pcache:idhash:3632-0!1!0!!zh!4!zh and timestamp 20110110032035 --> 
435 | <div class="printfooter"> 
436 | 来自“<a href="http://zh.wikipedia.org/wiki/XML">http://zh.wikipedia.org/wiki/XML</a>”</div> 
437 | 				<!-- /bodytext --> 
438 | 								<!-- catlinks --> 
439 | 				<div id='catlinks' class='catlinks'><div id="mw-normal-catlinks"><a href="/wiki/Special:%E9%A1%B5%E9%9D%A2%E5%88%86%E7%B1%BB" title="Special:页面分类">6个分类</a>: <span dir='ltr'><a href="/wiki/Category:%E7%B6%B2%E9%A0%81%E6%8A%80%E8%A1%93" title="Category:網頁技術">網頁技術</a></span> | <span dir='ltr'><a href="/wiki/Category:W3C%E6%A0%87%E5%87%86" title="Category:W3C标准">W3C标准</a></span> | <span dir='ltr'><a href="/wiki/Category:%E6%96%87%E4%BB%B6%E6%A0%BC%E5%BC%8F" title="Category:文件格式">文件格式</a></span> | <span dir='ltr'><a href="/wiki/Category:%E7%BD%AE%E6%A0%87%E8%AF%AD%E8%A8%80" title="Category:置标语言">置标语言</a></span> | <span dir='ltr'><a href="/wiki/Category:XML" title="Category:XML">XML</a></span> | <span dir='ltr'><a href="/wiki/Category:%E6%95%B0%E6%8D%AE%E5%BA%8F%E5%88%97%E5%8C%96%E6%A0%BC%E5%BC%8F" title="Category:数据序列化格式">数据序列化格式</a></span></div><div id="mw-hidden-catlinks" class="mw-hidden-cats-hidden">1个隐藏分类: <span dir='ltr'><a href="/wiki/Category:%E5%90%AB%E6%9C%89%E8%8B%B1%E8%AA%9E%E7%9A%84%E6%A2%9D%E7%9B%AE" title="Category:含有英語的條目">含有英語的條目</a></span></div></div>				<!-- /catlinks --> 
440 | 												<div class="visualClear"></div> 
441 | 			</div> 
442 | 			<!-- /bodyContent --> 
443 | 		</div> 
444 | 		<!-- /content --> 
445 | 		<!-- header --> 
446 | 		<div id="mw-head" class="noprint"> 
447 | 			
448 | <!-- 0 --> 
449 | <div id="p-personal" class=""> 
450 | 	<h5>个人工具</h5> 
451 | 	<ul> 
452 | 					<li  id="pt-login"><a href="/w/index.php?title=Special:%E7%94%A8%E6%88%B7%E7%99%BB%E5%BD%95&amp;returnto=XML&amp;returntoquery=variant%3Dzh" title="建议你登录，尽管并非必须。 [o]" accesskey="o">登录／创建账户</a></li> 
453 | 			</ul> 
454 | </div> 
455 |  
456 | <!-- /0 --> 
457 | 			<div id="left-navigation"> 
458 | 				
459 | <!-- 0 --> 
460 | <div id="p-namespaces" class="vectorTabs"> 
461 | 	<h5>名字空间</h5> 
462 | 	<ul> 
463 | 					<li  id="ca-nstab-main" class="selected"><a href="/wiki/XML"  title="查看页面内容 [c]" accesskey="c"><span>条目</span></a></li> 
464 | 					<li  id="ca-talk"><a href="/wiki/Talk:XML"  title="关于条目正文的讨论 [t]" accesskey="t"><span>讨论</span></a></li> 
465 | 			</ul> 
466 | </div> 
467 |  
468 | <!-- /0 --> 
469 |  
470 | <!-- 1 --> 
471 | <div id="p-variants" class="vectorMenu"> 
472 | 			<h4> 
473 | 									不转换																																					</h4> 
474 | 		<h5><span>变换</span><a href="#"></a></h5> 
475 | 	<div class="menu"> 
476 | 		<ul> 
477 | 							<li id="ca-0" class="selected"><a href="/zh/XML" >不转换</a></li> 
478 | 							<li id="ca-1"><a href="/zh-hans/XML" >简体</a></li> 
479 | 							<li id="ca-2"><a href="/zh-hant/XML" >繁體</a></li> 
480 | 							<li id="ca-3"><a href="/zh-cn/XML" >大陆简体</a></li> 
481 | 							<li id="ca-4"><a href="/zh-hk/XML" >港澳繁體</a></li> 
482 | 							<li id="ca-5"><a href="/zh-sg/XML" >马新简体</a></li> 
483 | 							<li id="ca-6"><a href="/zh-tw/XML" >台灣正體</a></li> 
484 | 					</ul> 
485 | 	</div> 
486 | </div> 
487 |  
488 | <!-- /1 --> 
489 | 			</div> 
490 | 			<div id="right-navigation"> 
491 | 				
492 | <!-- 0 --> 
493 | <div id="p-views" class="vectorTabs"> 
494 | 	<h5>查看</h5> 
495 | 	<ul> 
496 | 					<li id="ca-view" class="selected"><a href="/wiki/XML" ><span>阅读</span></a></li> 
497 | 					<li id="ca-edit"><a href="/w/index.php?title=XML&amp;action=edit"  title="你可编辑此页，请在保存前先预览一下。 [e]" accesskey="e"><span>编辑</span></a></li> 
498 | 					<li id="ca-history" class="collapsible "><a href="/w/index.php?title=XML&amp;action=history"  title="本页面的早前版本。 [h]" accesskey="h"><span>查看历史</span></a></li> 
499 | 			</ul> 
500 | </div> 
501 |  
502 | <!-- /0 --> 
503 |  
504 | <!-- 1 --> 
505 | <div id="p-cactions" class="vectorMenu emptyPortlet"> 
506 | 	<h5><span>动作</span><a href="#"></a></h5> 
507 | 	<div class="menu"> 
508 | 		<ul> 
509 | 					</ul> 
510 | 	</div> 
511 | </div> 
512 |  
513 | <!-- /1 --> 
514 |  
515 | <!-- 2 --> 
516 | <div id="p-search"> 
517 | 	<h5><label for="searchInput">搜索</label></h5> 
518 | 	<form action="/w/index.php" id="searchform"> 
519 | 		<input type='hidden' name="title" value="Special:搜索"/> 
520 | 				<div id="simpleSearch"> 
521 | 			<input id="searchInput" name="search" type="text"  title="搜索维基百科 [f]" accesskey="f"  value="" /> 
522 | 			<button id="searchButton" type='submit' name='button'  title="搜索该文字的页面"><img src="http://bits.wikimedia.org/skins-1.5/vector/images/search-ltr.png?283-19" alt="搜索" /></button> 
523 | 		</div> 
524 | 			</form> 
525 | </div> 
526 |  
527 | <!-- /2 --> 
528 | 			</div> 
529 | 		</div> 
530 | 		<!-- /header --> 
531 | 		<!-- panel --> 
532 | 			<div id="mw-panel" class="noprint"> 
533 | 				<!-- logo --> 
534 | 					<div id="p-logo"><a style="background-image: url(http://upload.wikimedia.org/wikipedia/commons/0/0a/Wikipedia-logo-v2-zh.png);" href="/wiki/Wikipedia:%E9%A6%96%E9%A1%B5"  title="首页"></a></div> 
535 | 				<!-- /logo --> 
536 | 				
537 | <!-- SEARCH --> 
538 |  
539 | <!-- /SEARCH --> 
540 |  
541 | <!-- navigation --> 
542 | <div class="portal" id='p-navigation'> 
543 | 	<h5>导航</h5> 
544 | 	<div class="body"> 
545 | 				<ul> 
546 | 					<li id="n-mainpage-description"><a href="/wiki/Wikipedia:%E9%A6%96%E9%A1%B5" title="访问首页 [z]" accesskey="z">首页</a></li> 
547 | 					<li id="n-indexpage"><a href="/wiki/Wikipedia:%E5%88%86%E9%A1%9E%E7%B4%A2%E5%BC%95">分類索引</a></li> 
548 | 					<li id="n-Featured_content"><a href="/wiki/Portal:%E7%89%B9%E8%89%B2%E5%85%A7%E5%AE%B9">特色内容</a></li> 
549 | 					<li id="n-currentevents"><a href="/wiki/Portal:%E6%96%B0%E8%81%9E%E5%8B%95%E6%85%8B" title="提供当前新闻事件的背景资料">新闻动态</a></li> 
550 | 					<li id="n-recentchanges"><a href="/wiki/Special:%E6%9C%80%E8%BF%91%E6%9B%B4%E6%94%B9" title="列出维基百科中的最近修改 [r]" accesskey="r">最近更改</a></li> 
551 | 					<li id="n-randompage"><a href="/wiki/Special:%E9%9A%8F%E6%9C%BA%E9%A1%B5%E9%9D%A2" title="随机载入一个页面 [x]" accesskey="x">随机条目</a></li> 
552 | 				</ul> 
553 | 			</div> 
554 | </div> 
555 |  
556 | <!-- /navigation --> 
557 |  
558 | <!-- help --> 
559 | <div class="portal" id='p-help'> 
560 | 	<h5>帮助</h5> 
561 | 	<div class="body"> 
562 | 				<ul> 
563 | 					<li id="n-help"><a href="/wiki/Help:%E7%9B%AE%E5%BD%95" title="寻求帮助">帮助</a></li> 
564 | 					<li id="n-portal"><a href="/wiki/Wikipedia:%E7%A4%BE%E5%8C%BA%E4%B8%BB%E9%A1%B5" title="关于本计划、你可以做什么、应该如何做">社区入口</a></li> 
565 | 					<li id="n-policy"><a href="/wiki/Wikipedia:%E6%96%B9%E9%87%9D%E8%88%87%E6%8C%87%E5%BC%95">方针与指引</a></li> 
566 | 					<li id="n-villagepump"><a href="/wiki/Wikipedia:%E4%BA%92%E5%8A%A9%E5%AE%A2%E6%A0%88">互助客栈</a></li> 
567 | 					<li id="n-Information_desk"><a href="/wiki/Wikipedia:%E8%A9%A2%E5%95%8F%E8%99%95">询问处</a></li> 
568 | 					<li id="n-conversion"><a href="/wiki/Wikipedia:%E5%AD%97%E8%AF%8D%E8%BD%AC%E6%8D%A2%E8%AF%B7%E6%B1%82%E6%88%96%E5%80%99%E9%80%89">字词转换</a></li> 
569 | 					<li id="n-IRC"><a href="/wiki/Wikipedia:IRC%E8%81%8A%E5%A4%A9%E9%A2%91%E9%81%93">IRC即时聊天</a></li> 
570 | 					<li id="n-contact"><a href="/wiki/Wikipedia:%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC">联系我们</a></li> 
571 | 					<li id="n-about"><a href="/wiki/Wikipedia:%E5%85%B3%E4%BA%8E">关于维基百科</a></li> 
572 | 					<li id="n-sitesupport"><a href="http://wikimediafoundation.org/wiki/Special:Landingcheck?landing_page=WMFJA1&amp;language=zh&amp;utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=20101204SB001" title="如果您在維基百科受益良多，您可以考慮資助我們">资助维基百科</a></li> 
573 | 				</ul> 
574 | 			</div> 
575 | </div> 
576 |  
577 | <!-- /help --> 
578 |  
579 | <!-- TOOLBOX --> 
580 | <div class="portal" id="p-tb"> 
581 | 	<h5>工具</h5> 
582 | 	<div class="body"> 
583 | 		<ul> 
584 | 					<li id="t-whatlinkshere"><a href="/wiki/Special:%E9%93%BE%E5%85%A5%E9%A1%B5%E9%9D%A2/XML" title="列出所有与本页相链的页面 [j]" accesskey="j">链入页面</a></li> 
585 | 						<li id="t-recentchangeslinked"><a href="/wiki/Special:%E9%93%BE%E5%87%BA%E6%9B%B4%E6%94%B9/XML" title="页面链出所有页面的更改 [k]" accesskey="k">链出更改</a></li> 
586 | 																																					<li id="t-upload"><a href="/wiki/Project:%E4%B8%8A%E4%BC%A0" title="上传图像或多媒体文件 [u]" accesskey="u">上传文件</a></li> 
587 | 											<li id="t-specialpages"><a href="/wiki/Special:%E7%89%B9%E6%AE%8A%E9%A1%B5%E9%9D%A2" title="全部特殊页面的列表 [q]" accesskey="q">特殊页面</a></li> 
588 | 									<li id="t-print"><a href="/w/index.php?title=XML&amp;variant=zh&amp;printable=yes" rel="alternate" title="这个页面的可打印版本 [p]" accesskey="p">打印页面</a></li> 
589 | 						<li id="t-permalink"><a href="/w/index.php?title=XML&amp;oldid=15329713" title="这个页面修订版本的永久链接">永久链接</a></li> 
590 | 				<li id="t-cite"><a href="/w/index.php?title=Special:%E5%BC%95%E7%94%A8&amp;page=XML&amp;id=15329713" title="Information on how to cite this page">引用此文</a></li>		</ul> 
591 | 	</div> 
592 | </div> 
593 |  
594 | <!-- /TOOLBOX --> 
595 |  
596 | <!-- LANGUAGES --> 
597 | <div class="portal" id="p-lang"> 
598 | 	<h5>其他语言</h5> 
599 | 	<div class="body"> 
600 | 		<ul> 
601 | 					<li class="interwiki-af"><a href="http://af.wikipedia.org/wiki/XML" title="XML">Afrikaans</a></li> 
602 | 					<li class="interwiki-ar"><a href="http://ar.wikipedia.org/wiki/%D9%84%D8%BA%D8%A9_%D8%A7%D9%84%D8%B1%D9%82%D9%85_%D8%A7%D9%84%D9%82%D8%A7%D8%A8%D9%84%D8%A9_%D9%84%D9%84%D8%A7%D9%85%D8%AA%D8%AF%D8%A7%D8%AF" title="لغة الرقم القابلة للامتداد">العربية</a></li> 
603 | 					<li class="interwiki-bat-smg"><a href="http://bat-smg.wikipedia.org/wiki/XML" title="XML">Žemaitėška</a></li> 
604 | 					<li class="interwiki-bg"><a href="http://bg.wikipedia.org/wiki/XML" title="XML">Български</a></li> 
605 | 					<li class="interwiki-bn"><a href="http://bn.wikipedia.org/wiki/%E0%A6%8F%E0%A6%95%E0%A7%8D%E0%A6%B8%E0%A6%9F%E0%A7%87%E0%A6%A8%E0%A6%B8%E0%A6%BF%E0%A6%AD_%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A7%8D%E0%A6%95%E0%A6%86%E0%A6%AA_%E0%A6%B2%E0%A7%8D%E0%A6%AF%E0%A6%BE%E0%A6%82%E0%A6%97%E0%A7%81%E0%A6%AF%E0%A6%BC%E0%A7%87%E0%A6%9C" title="এক্সটেনসিভ মার্কআপ ল্যাংগুয়েজ">বাংলা</a></li> 
606 | 					<li class="interwiki-bs"><a href="http://bs.wikipedia.org/wiki/XML" title="XML">Bosanski</a></li> 
607 | 					<li class="interwiki-ca"><a href="http://ca.wikipedia.org/wiki/Extensible_Markup_Language" title="Extensible Markup Language">Català</a></li> 
608 | 					<li class="interwiki-ckb"><a href="http://ckb.wikipedia.org/wiki/%D8%A6%DB%8E%DA%A9%D8%B3_%D8%A6%DB%8E%D9%85_%D8%A6%DB%8E%DA%B5" title="ئێکس ئێم ئێڵ">Soranî / کوردی</a></li> 
609 | 					<li class="interwiki-cs"><a href="http://cs.wikipedia.org/wiki/Extensible_Markup_Language" title="Extensible Markup Language">Česky</a></li> 
610 | 					<li class="interwiki-da"><a href="http://da.wikipedia.org/wiki/XML" title="XML">Dansk</a></li> 
611 | 					<li class="interwiki-de"><a href="http://de.wikipedia.org/wiki/Extensible_Markup_Language" title="Extensible Markup Language">Deutsch</a></li> 
612 | 					<li class="interwiki-el"><a href="http://el.wikipedia.org/wiki/XML" title="XML">Ελληνικά</a></li> 
613 | 					<li class="interwiki-en"><a href="http://en.wikipedia.org/wiki/XML" title="XML">English</a></li> 
614 | 					<li class="interwiki-eo"><a href="http://eo.wikipedia.org/wiki/XML" title="XML">Esperanto</a></li> 
615 | 					<li class="interwiki-es"><a href="http://es.wikipedia.org/wiki/Extensible_Markup_Language" title="Extensible Markup Language">Español</a></li> 
616 | 					<li class="interwiki-et"><a href="http://et.wikipedia.org/wiki/XML" title="XML">Eesti</a></li> 
617 | 					<li class="interwiki-eu"><a href="http://eu.wikipedia.org/wiki/XML" title="XML">Euskara</a></li> 
618 | 					<li class="interwiki-fa"><a href="http://fa.wikipedia.org/wiki/%D8%A7%DB%8C%DA%A9%D8%B3%E2%80%8C%D8%A7%D9%85%E2%80%8C%D8%A7%D9%84" title="ایکس‌ام‌ال">فارسی</a></li> 
619 | 					<li class="interwiki-fi"><a href="http://fi.wikipedia.org/wiki/XML" title="XML">Suomi</a></li> 
620 | 					<li class="interwiki-fr"><a href="http://fr.wikipedia.org/wiki/Extensible_Markup_Language" title="Extensible Markup Language">Français</a></li> 
621 | 					<li class="interwiki-ga"><a href="http://ga.wikipedia.org/wiki/XML" title="XML">Gaeilge</a></li> 
622 | 					<li class="interwiki-gl"><a href="http://gl.wikipedia.org/wiki/XML" title="XML">Galego</a></li> 
623 | 					<li class="interwiki-he"><a href="http://he.wikipedia.org/wiki/XML" title="XML">עברית</a></li> 
624 | 					<li class="interwiki-hi"><a href="http://hi.wikipedia.org/wiki/%E0%A4%95%E0%A5%8D%E0%A4%B7%E0%A4%AE%E0%A4%B2" title="क्षमल">हिन्दी</a></li> 
625 | 					<li class="interwiki-hr"><a href="http://hr.wikipedia.org/wiki/XML" title="XML">Hrvatski</a></li> 
626 | 					<li class="interwiki-hu"><a href="http://hu.wikipedia.org/wiki/XML" title="XML">Magyar</a></li> 
627 | 					<li class="interwiki-ia"><a href="http://ia.wikipedia.org/wiki/XML" title="XML">Interlingua</a></li> 
628 | 					<li class="interwiki-id"><a href="http://id.wikipedia.org/wiki/Extensible_markup_language" title="Extensible markup language">Bahasa Indonesia</a></li> 
629 | 					<li class="interwiki-is"><a href="http://is.wikipedia.org/wiki/XML" title="XML">Íslenska</a></li> 
630 | 					<li class="interwiki-it"><a href="http://it.wikipedia.org/wiki/XML" title="XML">Italiano</a></li> 
631 | 					<li class="interwiki-ja"><a href="http://ja.wikipedia.org/wiki/Extensible_Markup_Language" title="Extensible Markup Language">日本語</a></li> 
632 | 					<li class="interwiki-ko"><a href="http://ko.wikipedia.org/wiki/XML" title="XML">한국어</a></li> 
633 | 					<li class="interwiki-lo"><a href="http://lo.wikipedia.org/wiki/XML" title="XML">ລາວ</a></li> 
634 | 					<li class="interwiki-lt"><a href="http://lt.wikipedia.org/wiki/XML" title="XML">Lietuvių</a></li> 
635 | 					<li class="interwiki-lv"><a href="http://lv.wikipedia.org/wiki/Valoda_XML" title="Valoda XML">Latviešu</a></li> 
636 | 					<li class="interwiki-ml"><a href="http://ml.wikipedia.org/wiki/%E0%B4%8E%E0%B4%95%E0%B5%8D%E0%B4%B8%E0%B5%8D.%E0%B4%8E%E0%B4%82.%E0%B4%8E%E0%B5%BD." title="എക്സ്.എം.എൽ.">മലയാളം</a></li> 
637 | 					<li class="interwiki-mn"><a href="http://mn.wikipedia.org/wiki/XML" title="XML">Монгол</a></li> 
638 | 					<li class="interwiki-ms"><a href="http://ms.wikipedia.org/wiki/XML" title="XML">Bahasa Melayu</a></li> 
639 | 					<li class="interwiki-nl"><a href="http://nl.wikipedia.org/wiki/Extensible_Markup_Language" title="Extensible Markup Language">Nederlands</a></li> 
640 | 					<li class="interwiki-nn"><a href="http://nn.wikipedia.org/wiki/XML" title="XML">‪Norsk (nynorsk)‬</a></li> 
641 | 					<li class="interwiki-no"><a href="http://no.wikipedia.org/wiki/XML" title="XML">‪Norsk (bokmål)‬</a></li> 
642 | 					<li class="interwiki-pl"><a href="http://pl.wikipedia.org/wiki/XML" title="XML">Polski</a></li> 
643 | 					<li class="interwiki-pt"><a href="http://pt.wikipedia.org/wiki/XML" title="XML">Português</a></li> 
644 | 					<li class="interwiki-ro"><a href="http://ro.wikipedia.org/wiki/XML" title="XML">Română</a></li> 
645 | 					<li class="interwiki-ru"><a href="http://ru.wikipedia.org/wiki/XML" title="XML">Русский</a></li> 
646 | 					<li class="interwiki-simple"><a href="http://simple.wikipedia.org/wiki/XML" title="XML">Simple English</a></li> 
647 | 					<li class="interwiki-sk"><a href="http://sk.wikipedia.org/wiki/XML" title="XML">Slovenčina</a></li> 
648 | 					<li class="interwiki-sl"><a href="http://sl.wikipedia.org/wiki/XML" title="XML">Slovenščina</a></li> 
649 | 					<li class="interwiki-sq"><a href="http://sq.wikipedia.org/wiki/XML" title="XML">Shqip</a></li> 
650 | 					<li class="interwiki-sr"><a href="http://sr.wikipedia.org/wiki/XML" title="XML">Српски / Srpski</a></li> 
651 | 					<li class="interwiki-sv"><a href="http://sv.wikipedia.org/wiki/XML" title="XML">Svenska</a></li> 
652 | 					<li class="interwiki-ta"><a href="http://ta.wikipedia.org/wiki/%E0%AE%8E%E0%AE%95%E0%AF%8D%E0%AE%B8%E0%AF%8D%E0%AE%8E%E0%AE%AE%E0%AF%8D%E0%AE%8E%E0%AE%B2%E0%AF%8D" title="எக்ஸ்எம்எல்">தமிழ்</a></li> 
653 | 					<li class="interwiki-te"><a href="http://te.wikipedia.org/wiki/XML" title="XML">తెలుగు</a></li> 
654 | 					<li class="interwiki-tg"><a href="http://tg.wikipedia.org/wiki/XML" title="XML">Тоҷикӣ</a></li> 
655 | 					<li class="interwiki-th"><a href="http://th.wikipedia.org/wiki/%E0%B9%80%E0%B8%AD%E0%B8%81%E0%B8%8B%E0%B9%8C%E0%B9%80%E0%B8%AD%E0%B9%87%E0%B8%A1%E0%B9%81%E0%B8%AD%E0%B8%A5" title="เอกซ์เอ็มแอล">ไทย</a></li> 
656 | 					<li class="interwiki-tk"><a href="http://tk.wikipedia.org/wiki/XML" title="XML">Türkmençe</a></li> 
657 | 					<li class="interwiki-tr"><a href="http://tr.wikipedia.org/wiki/Geni%C5%9Fletilebilir_i%C5%9Faretleme_dili" title="Genişletilebilir işaretleme dili">Türkçe</a></li> 
658 | 					<li class="interwiki-uk"><a href="http://uk.wikipedia.org/wiki/XML" title="XML">Українська</a></li> 
659 | 					<li class="interwiki-ur"><a href="http://ur.wikipedia.org/wiki/%D8%AA%D9%88%D8%B3%DB%8C%D8%B9%DB%8C_%D8%B2%D8%A8%D8%A7%D9%86_%D8%AA%D8%AF%D9%88%DB%8C%D9%86" title="توسیعی زبان تدوین">اردو</a></li> 
660 | 					<li class="interwiki-vi"><a href="http://vi.wikipedia.org/wiki/XML" title="XML">Tiếng Việt</a></li> 
661 | 				</ul> 
662 | 	</div> 
663 | </div> 
664 |  
665 | <!-- /LANGUAGES --> 
666 | 			</div> 
667 | 		<!-- /panel --> 
668 | 		<!-- footer --> 
669 | 		<div id="footer"> 
670 | 											<ul id="footer-info"> 
671 | 																	<li id="footer-info-lastmod"> 本页面最后修订于2011年1月1日 (星期六) 15:20。</li> 
672 | 																							<li id="footer-info-copyright">本站的全部文字在<a class="internal" href="/wiki/Wikipedia:CC-by-sa-3.0%E5%8D%8F%E8%AE%AE%E6%96%87%E6%9C%AC" title="Wikipedia:CC-by-sa-3.0协议文本">知识共享 署名-相同方式共享 3.0协议</a>之条款下提供，附加条款亦可能应用。（请参阅<a href="http://wikimediafoundation.org/wiki/Terms_of_Use">使用条款</a>）<br /> 
673 | Wikipedia&reg;和维基百科标志是<a href="http://wikimediafoundation.org">维基媒体基金会</a>的注册商标；维基&trade;是维基媒体基金会的商标。<br />维基媒体基金会是在美国佛罗里达州登记的501(c)(3)<a href="http://wikimediafoundation.org/wiki/%E8%B5%84%E5%8A%A9%E7%9A%84%E5%85%8D%E7%A8%8E%E6%94%BF%E7%AD%96">免税</a>、非营利、慈善机构。<br /></li> 
674 | 															</ul> 
675 | 															<ul id="footer-places"> 
676 | 																	<li id="footer-places-privacy"><a href="http://wikimediafoundation.org/wiki/%E9%9A%90%E7%A7%81%E6%94%BF%E7%AD%96" title="wikimedia:隐私政策">隐私政策</a></li> 
677 | 																							<li id="footer-places-about"><a href="/wiki/Wikipedia:%E5%85%B3%E4%BA%8E" title="Wikipedia:关于">关于维基百科</a></li> 
678 | 																							<li id="footer-places-disclaimer"><a href="/wiki/Wikipedia:%E5%85%8D%E8%B4%A3%E5%A3%B0%E6%98%8E" title="Wikipedia:免责声明">免责声明</a></li> 
679 | 															</ul> 
680 | 										<ul id="footer-icons" class="noprint"> 
681 | 								<li id="footer-icon-poweredby"><a href="http://www.mediawiki.org/"><img src="http://bits.wikimedia.org/skins-1.5/common/images/poweredby_mediawiki_88x31.png" height="31" width="88" alt="Powered by MediaWiki" /></a></li> 
682 | 												<li id="footer-icon-copyright"><a href="http://wikimediafoundation.org/"><img src="/images/wikimedia-button.png" width="88" height="31" alt="Wikimedia Foundation"/></a></li> 
683 | 							</ul> 
684 | 			<div style="clear:both"></div> 
685 | 		</div> 
686 | 		<!-- /footer --> 
687 | 		<!-- fixalpha --> 
688 | 		<script type="text/javascript"> if ( window.isMSIE55 ) fixalpha(); </script> 
689 | 		<!-- /fixalpha --> 
690 | 		
691 | <script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script> 
692 | <script type="text/javascript" src="http://geoiplookup.wikimedia.org/"></script>		<!-- Served by srv197 in 0.306 secs. -->			</body> 
693 | </html>


--------------------------------------------------------------------------------