6 | #import "HTMLOrderedDictionary.h"
7 | #import "HTMLParser.h"
8 | #import "HTMLTokenizerState.h"
9 |
/**
 * HTMLTokenizer is an NSEnumerator subclass that emits the tokens derived from a string of HTML.
 *
 * The tokenization algorithm is described at http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
 */
@interface HTMLTokenizer : NSEnumerator

/**
 * Initializes a tokenizer that reads from the given string. This is the designated initializer.
 *
 * @param string The HTML to tokenize.
 */
- (id)initWithString:(NSString *)string;

/**
 * The string the tokens are derived from.
 */
@property (nonatomic, copy, readonly) NSString *string;

/**
 * The tokenizer's current state. The parser occasionally needs to change it.
 */
@property (nonatomic, assign) HTMLTokenizerState state;

/**
 * The parser consuming this tokenizer's tokens (held weakly). The tokenizer occasionally needs to know the parser's state.
 */
@property (nonatomic, weak) HTMLParser *parser;

@end
38 |
/**
 * An HTMLDOCTYPEToken represents a `<!DOCTYPE>` tag.
 */
@interface HTMLDOCTYPEToken : NSObject

/**
 * The DOCTYPE's name; nil when it has none.
 */
@property (nonatomic, copy) NSString *name;

/**
 * The DOCTYPE's public identifier; nil when it has none.
 */
@property (nonatomic, copy) NSString *publicIdentifier;

/**
 * The DOCTYPE's system identifier; nil when it has none.
 */
@property (nonatomic, copy) NSString *systemIdentifier;

/**
 * YES when the parsed HTMLDocument's quirks mode should be set; NO when other indicators should decide.
 */
@property (nonatomic, assign) BOOL forceQuirks;

@end
65 |
/**
 * HTMLTagToken is the abstract representation shared by opening (`<p>`) and closing (`</p>`) HTML tags, with optional attributes.
 */
@interface HTMLTagToken : NSObject

/**
 * Initializes a tag token. This is the designated initializer.
 */
- (id)init;

/**
 * Initializes a tag token that carries the given tag name.
 *
 * @param tagName The name of the tag.
 */
- (id)initWithTagName:(NSString *)tagName;

/**
 * The tag's name.
 */
@property (nonatomic, copy) NSString *tagName;

/**
 * The tag's attributes: a dictionary mapping HTMLAttributeName keys to NSString values.
 */
@property (nonatomic, copy) HTMLOrderedDictionary *attributes;

/**
 * YES when the tag is self-closing (`<br/>`); NO otherwise (`<br>` or `</br>`).
 */
@property (nonatomic) BOOL selfClosingFlag;

@end
97 |
/**
 * An HTMLStartTagToken represents a start tag such as `<p>`.
 */
@interface HTMLStartTagToken : HTMLTagToken

/**
 * Makes an initialized copy of this start tag token that carries a different tag name.
 *
 * @param tagName The tag name to give the copy.
 */
- (id)copyWithTagName:(NSString *)tagName;

@end
111 |
/**
 * An HTMLEndTagToken represents an end tag such as `</p>`.
 */
@interface HTMLEndTagToken : HTMLTagToken
@end
118 |
/**
 * An HTMLCommentToken represents a comment, e.g. `<!-- like this -->`.
 */
@interface HTMLCommentToken : NSObject

/**
 * Initializes a comment token. This is the designated initializer.
 *
 * @param data The data of the comment.
 */
- (id)initWithData:(NSString *)data;

/**
 * The data of the comment.
 */
@property (nonatomic, copy, readonly) NSString *data;

@end
137 |
/**
 * An HTMLCharacterToken represents a run of code points that appear as text in an HTML document.
 */
@interface HTMLCharacterToken : NSObject

/**
 * Initializes a character token. This is the designated initializer.
 *
 * @param string The code points the token represents.
 */
- (id)initWithString:(NSString *)string;

/**
 * The code points this token represents.
 */
@property (nonatomic, copy, readonly) NSString *string;

/**
 * A token holding only the leading whitespace; nil when the token has no leading whitespace.
 */
- (instancetype)leadingWhitespaceToken;

/**
 * A token holding the characters that follow the leading whitespace; nil when the token is entirely whitespace.
 */
- (instancetype)afterLeadingWhitespaceToken;

@end
164 |
/**
 * An HTMLParseErrorToken represents a parse error encountered during tokenization.
 *
 * Parse errors are emitted as tokens so that they keep their context among the other tokens.
 */
@interface HTMLParseErrorToken : NSObject

/**
 * Initializes a parse error token. This is the designated initializer.
 *
 * @param error The reason the parse error occurred.
 */
- (id)initWithError:(NSString *)error;

/**
 * The reason the parse error occurred.
 */
@property (nonatomic, copy, readonly) NSString *error;

@end
185 |
/**
 * Exactly one HTMLEOFToken is emitted, once the end of the file has been parsed; no tokens follow it.
 */
@interface HTMLEOFToken : NSObject
@end
192 |
/**
 * Hooks that let tests drive the tokenizer.
 */
@interface HTMLTokenizer (Testing)

/**
 * Overrides the name of the last start tag, which certain steps of tokenization consult.
 *
 * @param tagName The name to pretend the last start tag had.
 */
- (void)setLastStartTag:(NSString *)tagName;

@end
203 |
--------------------------------------------------------------------------------
/TextBookParse/epubParse/htmlParse/HTMLTokenizerState.h:
--------------------------------------------------------------------------------
1 | // HTMLTokenizerState.h
2 | //
3 | // Public domain. https://github.com/nolanw/HTMLReader
4 |
/**
 * Every state an HTML tokenizer can be in.
 *
 * The states are defined at http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
 *
 * Values are assigned sequentially starting at zero, so the order of the enumerators is significant.
 */
typedef NS_ENUM(NSInteger, HTMLTokenizerState) {
    // Data, RCDATA, RAWTEXT, script data and PLAINTEXT.
    HTMLDataTokenizerState = 0,
    HTMLCharacterReferenceInDataTokenizerState,
    HTMLRCDATATokenizerState,
    HTMLCharacterReferenceInRCDATATokenizerState,
    HTMLRAWTEXTTokenizerState,
    HTMLScriptDataTokenizerState,
    HTMLPLAINTEXTTokenizerState,

    // Tags.
    HTMLTagOpenTokenizerState,
    HTMLEndTagOpenTokenizerState,
    HTMLTagNameTokenizerState,

    // RCDATA end tags.
    HTMLRCDATALessThanSignTokenizerState,
    HTMLRCDATAEndTagOpenTokenizerState,
    HTMLRCDATAEndTagNameTokenizerState,

    // RAWTEXT end tags.
    HTMLRAWTEXTLessThanSignTokenizerState,
    HTMLRAWTEXTEndTagOpenTokenizerState,
    HTMLRAWTEXTEndTagNameTokenizerState,

    // Script data, including its escaped and double-escaped variants.
    HTMLScriptDataLessThanSignTokenizerState,
    HTMLScriptDataEndTagOpenTokenizerState,
    HTMLScriptDataEndTagNameTokenizerState,
    HTMLScriptDataEscapeStartTokenizerState,
    HTMLScriptDataEscapeStartDashTokenizerState,
    HTMLScriptDataEscapedTokenizerState,
    HTMLScriptDataEscapedDashTokenizerState,
    HTMLScriptDataEscapedDashDashTokenizerState,
    HTMLScriptDataEscapedLessThanSignTokenizerState,
    HTMLScriptDataEscapedEndTagOpenTokenizerState,
    HTMLScriptDataEscapedEndTagNameTokenizerState,
    HTMLScriptDataDoubleEscapeStartTokenizerState,
    HTMLScriptDataDoubleEscapedTokenizerState,
    HTMLScriptDataDoubleEscapedDashTokenizerState,
    HTMLScriptDataDoubleEscapedDashDashTokenizerState,
    HTMLScriptDataDoubleEscapedLessThanSignTokenizerState,
    HTMLScriptDataDoubleEscapeEndTokenizerState,

    // Attributes.
    HTMLBeforeAttributeNameTokenizerState,
    HTMLAttributeNameTokenizerState,
    HTMLAfterAttributeNameTokenizerState,
    HTMLBeforeAttributeValueTokenizerState,
    HTMLAttributeValueDoubleQuotedTokenizerState,
    HTMLAttributeValueSingleQuotedTokenizerState,
    HTMLAttributeValueUnquotedTokenizerState,
    HTMLCharacterReferenceInAttributeValueTokenizerState,
    HTMLAfterAttributeValueQuotedTokenizerState,
    HTMLSelfClosingStartTagTokenizerState,

    // Comments and markup declarations.
    HTMLBogusCommentTokenizerState,
    HTMLMarkupDeclarationOpenTokenizerState,
    HTMLCommentStartTokenizerState,
    HTMLCommentStartDashTokenizerState,
    HTMLCommentTokenizerState,
    HTMLCommentEndDashTokenizerState,
    HTMLCommentEndTokenizerState,
    HTMLCommentEndBangTokenizerState,

    // DOCTYPE.
    HTMLDOCTYPETokenizerState,
    HTMLBeforeDOCTYPENameTokenizerState,
    HTMLDOCTYPENameTokenizerState,
    HTMLAfterDOCTYPENameTokenizerState,
    HTMLAfterDOCTYPEPublicKeywordTokenizerState,
    HTMLBeforeDOCTYPEPublicIdentifierTokenizerState,
    HTMLDOCTYPEPublicIdentifierDoubleQuotedTokenizerState,
    HTMLDOCTYPEPublicIdentifierSingleQuotedTokenizerState,
    HTMLAfterDOCTYPEPublicIdentifierTokenizerState,
    HTMLBetweenDOCTYPEPublicAndSystemIdentifiersTokenizerState,
    HTMLAfterDOCTYPESystemKeywordTokenizerState,
    HTMLBeforeDOCTYPESystemIdentifierTokenizerState,
    HTMLDOCTYPESystemIdentifierDoubleQuotedTokenizerState,
    HTMLDOCTYPESystemIdentifierSingleQuotedTokenizerState,
    HTMLAfterDOCTYPESystemIdentifierTokenizerState,
    HTMLBogusDOCTYPETokenizerState,

    // CDATA sections.
    HTMLCDATASectionTokenizerState,
};
81 |
--------------------------------------------------------------------------------
/TextBookParse/epubParse/htmlParse/HTMLTreeEnumerator.h:
--------------------------------------------------------------------------------
1 | // HTMLTreeEnumerator.h
2 | //
3 | // Public domain. https://github.com/nolanw/HTMLReader
4 |
5 | #import
6 | @class HTMLNode;
7 |
/**
 * HTMLTreeEnumerator walks a tree of HTMLNode instances and emits them either in tree order (preorder, depth-first) or in reverse tree order (preorder, depth-first, starting with the last child).
 */
@interface HTMLTreeEnumerator : NSEnumerator

/**
 * Initializes an enumerator. This is the designated initializer.
 *
 * @param node     The node where enumeration starts; it is the first node emitted.
 * @param reversed YES to enumerate in reverse tree order, NO to enumerate in tree order.
 */
- (id)initWithNode:(HTMLNode *)node reversed:(BOOL)reversed;

@end
19 |
--------------------------------------------------------------------------------
/TextBookParse/epubParse/htmlParse/HTMLTreeEnumerator.m:
--------------------------------------------------------------------------------
1 | // HTMLTreeEnumerator.m
2 | //
3 | // Public domain. https://github.com/nolanw/HTMLReader
4 |
5 | #import "HTMLTreeEnumerator.h"
6 | #import "HTMLNode.h"
7 |
// One level of the path from the starting node down to the pending node.
// The sibling count is cached per level so it is only queried once, when first descending.
typedef struct {
    NSUInteger index;         // which child is currently selected at this level
    NSUInteger siblingCount;  // how many children this level has
} Level;

// A growable stack of levels; `depth` entries are in use out of `capacity` allocated.
typedef struct {
    Level *levels;
    NSUInteger depth;
    NSUInteger capacity;
} LevelStack;

@implementation HTMLTreeEnumerator {
    HTMLNode *_pendingNode;
    BOOL _reversed;
    LevelStack _stack;
}

- (void)dealloc {
    // The level buffer is plain C memory, so ARC will not release it for us.
    free(_stack.levels);
}

- (id)initWithNode:(HTMLNode *)node reversed:(BOOL)reversed {
    if ((self = [super init])) {
        _pendingNode = node;
        _reversed = reversed;
    }
    return self;
}

/**
 * Returns the pending node and works out which node to emit on the following call.
 *
 * The enumerator always holds the node it will emit next, together with the stack of child indexes leading to it.
 */
- (id)nextObject {
    HTMLNode *emitted = _pendingNode;
    NSUInteger childCount = emitted.numberOfChildren;

    if (childCount > 0) {
        // Depth-first: descend into the emitted node's first (or, when reversed, last) child.
        if (_stack.depth == _stack.capacity) {
            _stack.capacity += 16;
            _stack.levels = reallocf(_stack.levels, _stack.capacity * sizeof(_stack.levels[0]));
        }
        Level *top = &_stack.levels[_stack.depth++];
        top->siblingCount = childCount;
        top->index = _reversed ? childCount - 1 : 0;
        _pendingNode = [emitted childAtIndex:top->index];
        return emitted;
    }

    // A leaf: climb until some level still has an unvisited sibling, popping exhausted levels on the way.
    HTMLNode *ancestor = emitted.parentNode;
    for (; _stack.depth > 0; _stack.depth--, ancestor = ancestor.parentNode) {
        Level *top = &_stack.levels[_stack.depth - 1];
        BOOL hasUnvisitedSibling = _reversed ? (top->index > 0) : (top->index + 1 < top->siblingCount);
        if (!hasUnvisitedSibling) continue;

        if (_reversed) {
            top->index--;
        } else {
            top->index++;
        }
        _pendingNode = [ancestor childAtIndex:top->index];
        return emitted;
    }

    // Every level is exhausted, so this was the final node.
    _pendingNode = nil;
    return emitted;
}

@end
91 |
--------------------------------------------------------------------------------
/TextBookParse/epubParse/htmlParse/NSString+HTMLEntities.h:
--------------------------------------------------------------------------------
1 | // NSString+HTMLEntities.h
2 | //
3 | // Public domain. https://github.com/nolanw/HTMLReader
4 |
5 | #import
6 |
/**
 * Escaping and unescaping of HTML entities.
 */
@interface NSString (HTMLEntities)

/**
 * Makes a copy of the receiver in which the characters that need it are escaped for HTML.
 *
 * The algorithm is described at http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#escapingString (it is not invoked in the "attribute mode").
 */
- (NSString *)html_stringByEscapingForHTML;

/**
 * Makes a copy of the receiver in which every recognized HTML entity is replaced by the code point(s) it stands for. The receiver itself may be returned when nothing needs replacing.
 */
- (NSString *)html_stringByUnescapingHTML;

@end
22 |
--------------------------------------------------------------------------------
/TextBookParse/epubParse/htmlParse/NSString+HTMLEntities.m:
--------------------------------------------------------------------------------
1 | // NSString+HTMLEntities.m
2 | //
3 | // Public domain. https://github.com/nolanw/HTMLReader
4 |
5 | #import "NSString+HTMLEntities.h"
6 | #import "HTMLEntities.h"
7 | #import "HTMLString.h"
8 |
@implementation NSString (HTMLEntities)

/**
 * Returns a copy of the receiver with `&`, U+00A0 (no-break space), `"`, `<` and `>` replaced by their named entities.
 */
- (NSString *)html_stringByEscapingForHTML
{
    NSMutableString *escaped = [self mutableCopy];
    void (^replace)(NSString *, NSString *) = ^(NSString *find, NSString *replacement) {
        [escaped replaceOccurrencesOfString:find withString:replacement options:0 range:NSMakeRange(0, escaped.length)];
    };
    // Ampersands go first; otherwise the ampersands introduced by the later replacements would be escaped again.
    replace(@"&", @"&amp;");
    replace(@"\u00A0", @"&nbsp;");
    replace(@"\"", @"&quot;");
    replace(@"<", @"&lt;");
    replace(@">", @"&gt;");
    return escaped;
}

/**
 * Returns a copy of the receiver with numeric (`&#65;`, `&#x41;`) and named entities replaced by the characters they stand for.
 *
 * The receiver itself is returned when it contains no ampersand at all. Text that follows an ampersand but does not form a recognized entity is left untouched.
 */
- (NSString *)html_stringByUnescapingHTML
{
    NSRange ampersand = [self rangeOfString:@"&" options:NSBackwardsSearch];
    if (ampersand.location == NSNotFound) return self;

    // These are expensive to create, so each is created lazily, at most once per unescaping operation.
    NSCharacterSet *decimalDigitCharacterSet;
    NSCharacterSet *hexadecimalDigitCharacterSet;

    NSRange searchRange = NSMakeRange(0, self.length);
    NSScanner *scanner = [NSScanner scannerWithString:self];
    // By default NSScanner silently skips whitespace before each scan, which would turn "& #65;" into an entity.
    scanner.charactersToBeSkipped = nil;
    NSMutableString *unescaped = [self mutableCopy];

    // Ampersands are visited from the end towards the start, so a replacement never shifts the
    // locations of the ampersands still to be visited; ranges in self stay valid in unescaped.
    do {
        // Must be updated before any `continue`, which jumps straight to the loop condition.
        searchRange.length = ampersand.location;

        // An ampersand that ends the string cannot begin an entity, but earlier ampersands still can.
        if (NSMaxRange(ampersand) == self.length) continue;

        NSString *replacement;

        // Numeric entity.
        scanner.scanLocation = NSMaxRange(ampersand);
        if ([scanner scanString:@"#" intoString:nil]) {

            UInt32 entity;

            // Hex number.
            if ([scanner scanString:@"x" intoString:nil]) {
                if (!hexadecimalDigitCharacterSet) {
                    hexadecimalDigitCharacterSet = [NSCharacterSet characterSetWithCharactersInString:@"0123456789ABCDEFabcdef"];
                }
                NSString *entityString;
                if ([scanner scanCharactersFromSet:hexadecimalDigitCharacterSet intoString:&entityString]) {
                    NSScanner *hexScanner = [NSScanner scannerWithString:entityString];
                    unsigned int hex;
                    [hexScanner scanHexInt:&hex];
                    entity = hex;
                } else {
                    continue;
                }
            }

            // Decimal number.
            else {
                if (!decimalDigitCharacterSet) {
                    decimalDigitCharacterSet = [NSCharacterSet characterSetWithCharactersInString:@"0123456789"];
                }
                NSString *entityString;
                if ([scanner scanCharactersFromSet:decimalDigitCharacterSet intoString:&entityString]) {
                    NSInteger decimal = entityString.integerValue;
                    if (decimal > 0x10FFFF) {
                        // Out of range; UINT32_MAX is rejected by the code point check below.
                        entity = UINT32_MAX;
                    } else {
                        entity = (UInt32)decimal;
                    }
                } else {
                    continue;
                }
            }

            UTF32Char win1252Replacement = ReplacementForNumericEntity(entity);
            if (win1252Replacement) {
                entity = win1252Replacement;
            }

            // Surrogates and values beyond the last code point become U+FFFD REPLACEMENT CHARACTER.
            if ((entity >= 0xD800 && entity <= 0xDFFF) || entity > 0x10FFFF) {
                entity = 0xFFFD;
            }

            replacement = StringWithLongCharacter(entity);

            // Optional semicolon.
            [scanner scanString:@";" intoString:nil];
        }

        // Named entity.
        else {
            // Hand over at most LongestEntityNameLength characters, clamped to the end of the string.
            NSRange nameRange = NSMakeRange(NSMaxRange(ampersand), LongestEntityNameLength);
            if (NSMaxRange(nameRange) > self.length) {
                nameRange.length = self.length - nameRange.location;
            }
            NSString *nameString = [self substringWithRange:nameRange];
            NSString *parsedEntity;
            replacement = StringForNamedEntity(nameString, &parsedEntity);
            if (replacement) {
                // Advance the scanner past the part of the name that was actually recognized.
                [scanner scanString:parsedEntity intoString:nil];
            } else {
                continue;
            }
        }

        [unescaped replaceCharactersInRange:NSMakeRange(ampersand.location, scanner.scanLocation - ampersand.location) withString:replacement];
    } while ((ampersand = [self rangeOfString:@"&" options:NSBackwardsSearch range:searchRange]).location != NSNotFound);
    return unescaped;
}

@end
119 |
--------------------------------------------------------------------------------
/TextBookParse/main.m:
--------------------------------------------------------------------------------
1 | //
2 | // main.m
3 | // TextBookParse
4 | //
5 | // Created by xxsy-ima001 on 14-8-5.
6 | // Copyright (c) 2014年 ___xiaoxiangwenxue___. All rights reserved.
7 | //
8 |
9 | #import
10 |
11 | #import "AppDelegate.h"
12 |
// Program entry point: hands control to UIKit, naming AppDelegate as the application delegate class.
int main(int argc, char * argv[])
{
    @autoreleasepool {
        NSString *delegateClassName = NSStringFromClass([AppDelegate class]);
        return UIApplicationMain(argc, argv, nil, delegateClassName);
    }
}
19 |
--------------------------------------------------------------------------------
/TextBookParseTests/TextBookParseTests-Info.plist:
--------------------------------------------------------------------------------
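1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 | 	<key>CFBundleDevelopmentRegion</key>
6 | 	<string>en</string>
7 | 	<key>CFBundleExecutable</key>
8 | 	<string>${EXECUTABLE_NAME}</string>
9 | 	<key>CFBundleIdentifier</key>
10 | 	<string>xiaoxiangwenxue.${PRODUCT_NAME:rfc1034identifier}</string>
11 | 	<key>CFBundleInfoDictionaryVersion</key>
12 | 	<string>6.0</string>
13 | 	<key>CFBundlePackageType</key>
14 | 	<string>BNDL</string>
15 | 	<key>CFBundleShortVersionString</key>
16 | 	<string>1.0</string>
17 | 	<key>CFBundleSignature</key>
18 | 	<string>????</string>
19 | 	<key>CFBundleVersion</key>
20 | 	<string>1</string>
21 | </dict>
22 | </plist>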
23 |
--------------------------------------------------------------------------------
/TextBookParseTests/TextBookParseTests.m:
--------------------------------------------------------------------------------
1 | //
2 | // TextBookParseTests.m
3 | // TextBookParseTests
4 | //
5 | // Created by xxsy-ima001 on 14-8-5.
6 | // Copyright (c) 2014年 ___xiaoxiangwenxue___. All rights reserved.
7 | //
8 |
9 | #import
10 |
/**
 * Test case for the TextBookParse target. It currently holds only a placeholder test.
 */
@interface TextBookParseTests : XCTestCase
@end

@implementation TextBookParseTests

// Called before each test method in this class is invoked; put per-test setup after the call to super.
- (void)setUp {
    [super setUp];
}

// Called after each test method in this class has been invoked; put per-test teardown before the call to super.
- (void)tearDown {
    [super tearDown];
}

// Placeholder that fails unconditionally until a real test replaces it.
- (void)testExample {
    XCTFail(@"No implementation for \"%s\"", __PRETTY_FUNCTION__);
}

@end
35 |
--------------------------------------------------------------------------------
/TextBookParseTests/en.lproj/InfoPlist.strings:
--------------------------------------------------------------------------------
1 | /* Localized versions of Info.plist keys */
2 |
3 |
--------------------------------------------------------------------------------