├── TODO.md ├── .gitmodules ├── readability ├── readability-Prefix.pch ├── NSString+ReplaceExtensions.h ├── htmls.h ├── JXWebResourceLoadingBarrier.h ├── NSString+Counting.h ├── NSString+JXRemoving.h ├── NSString+ReplaceExtensions.m ├── JXWebResourceLoadingBarrier.m ├── NSXMLNode+HTMLUtilities.h ├── NSString+Counting.m ├── NSString+JXRemoving.m ├── JXReadabilityDocument.h ├── readability.1 ├── NSXMLNode+HTMLUtilities.m ├── htmls.m ├── main.m └── JXReadabilityDocument.m ├── .gitignore ├── Configs └── Base.xcconfig ├── README.md ├── COPYING └── readability.xcodeproj └── project.pbxproj /TODO.md: -------------------------------------------------------------------------------- 1 | #To Do 2 | 3 | - copy title into ``. 4 | - evaluate caching -clean: results in a dictionary. -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "webarchiver"] 2 | path = webarchiver 3 | url = git@github.com:JanX2/webarchiver.git 4 | -------------------------------------------------------------------------------- /readability/readability-Prefix.pch: -------------------------------------------------------------------------------- 1 | // 2 | // Prefix header for all source files of the 'readability' target in the 'readability' project 3 | // 4 | 5 | #ifdef __OBJC__ 6 | #import <Foundation/Foundation.h> 7 | #endif 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Xcode 2 | build/* 3 | *.pbxuser 4 | !default.pbxuser 5 | *.mode1v3 6 | !default.mode1v3 7 | *.mode2v3 8 | !default.mode2v3 9 | *.perspectivev3 10 | !default.perspectivev3 11 | *.xcworkspace 12 | !default.xcworkspace 13 | xcuserdata 14 | profile 15 | *.moved-aside 16 | 17 | ## Ignore incredibly annoying .DS_Store files 18 | .DS_Store 19 | 
-------------------------------------------------------------------------------- /readability/NSString+ReplaceExtensions.h: -------------------------------------------------------------------------------- 1 | // 2 | // NSString+ReplaceExtensions.h 3 | // readability 4 | // 5 | // Created by Georg Fritzsche on 17.09.10. 6 | // http://stackoverflow.com/questions/3733980/replace-multiple-groups-of-characters-in-an-nsstring 7 | // 8 | 9 | #import <Foundation/Foundation.h> 10 | 11 | @interface NSString (ReplaceExtensions) 12 | - (NSString *)stringByReplacingStringsFromDictionary:(NSDictionary *)dict; 13 | @end 14 | -------------------------------------------------------------------------------- /readability/htmls.h: -------------------------------------------------------------------------------- 1 | // 2 | // htmls.h 3 | // readability 4 | // 5 | // Created by Jan on 24.03.12. 6 | // Copyright (c) 2012 geheimwerk.de. All rights reserved. 7 | // 8 | 9 | #import <Foundation/Foundation.h> 10 | 11 | NSString * lxmlCSSToXPath(NSString *cssExpr); 12 | void addMatch(NSMutableSet *collection, NSString *text, NSString *orig); 13 | NSString * getTitleInDocument(NSXMLDocument *doc); 14 | NSString * shortenTitleInDocument(NSXMLDocument *doc); 15 | -------------------------------------------------------------------------------- /readability/JXWebResourceLoadingBarrier.h: -------------------------------------------------------------------------------- 1 | // 2 | // JXWebResourceLoadingBarrier.h 3 | // readability 4 | // 5 | // Created by Jan on 04.06.12. 6 | // Copyright (c) 2012 geheimwerk.de. All rights reserved. 
7 | // 8 | 9 | #import <Foundation/Foundation.h> 10 | #import <WebKit/WebKit.h> 11 | 12 | @interface JXWebResourceLoadingBarrier : NSObject { 13 | BOOL localResourceLoadingOnly; 14 | } 15 | 16 | @property (nonatomic) BOOL localResourceLoadingOnly; 17 | 18 | @end 19 | -------------------------------------------------------------------------------- /readability/NSString+Counting.h: -------------------------------------------------------------------------------- 1 | // 2 | // NSString+Counting.h 3 | // readability 4 | // 5 | // Created by Jan on 05.03.12. 6 | // Copyright (c) 2012 geheimwerk.de. All rights reserved. 7 | // 8 | 9 | #import <Foundation/Foundation.h> 10 | 11 | @interface NSString (Counting) 12 | 13 | - (NSUInteger)countOccurancesOfString:(NSString *)needle; 14 | - (NSUInteger)countSubstringsWithOptions:(NSStringEnumerationOptions)opts; 15 | - (BOOL)countOfSubstringsWithOptions:(NSStringEnumerationOptions)opts isAtLeast:(NSUInteger)lowerBound; 16 | 17 | @end 18 | -------------------------------------------------------------------------------- /readability/NSString+JXRemoving.h: -------------------------------------------------------------------------------- 1 | // 2 | // NSString+JXRemoving.h 3 | // string-splitter 4 | // 5 | // Created by Jan on 11.01.12. 6 | // Copyright 2012 geheimwerk.de. All rights reserved. 
//

#import <Foundation/Foundation.h>


@interface NSString (Removing)

- (NSString *)jx_stringByRemovingPrefix:(NSString *)prefix;
- (NSString *)jx_stringByRemovingSuffix:(NSString *)suffix;
- (NSString *)jx_stringByRemovingSurroundingWhitespace;
- (NSString *)jx_stringByCollapsingAndRemovingSurroundingCharactersInSet:(NSCharacterSet *)collapsibleCharacterSet
                                                              intoString:(NSString *)replacementString;

@end

--------------------------------------------------------------------------------
/readability/NSString+ReplaceExtensions.m:
--------------------------------------------------------------------------------
//
// NSString+ReplaceExtensions.m
// readability
//
// Created by Georg Fritzsche on 17.09.10.
// http://stackoverflow.com/questions/3733980/replace-multiple-groups-of-characters-in-an-nsstring
//

#import "NSString+ReplaceExtensions.h"

@implementation NSString (ReplaceExtensions)

/// Returns a copy of the receiver in which every occurrence of each key of
/// `dict` has been replaced by the value stored under that key.
///
/// The replacements are applied one key after another on the same working
/// string, so text produced by an earlier replacement can be matched by a
/// later key. To keep the result reproducible the keys are applied in sorted
/// (`compare:`) order instead of the dictionary's own enumeration order.
///
/// @param dict Maps search strings to replacement strings. May be empty or nil.
/// @return A new immutable string; equal to the receiver when nothing matched.
- (NSString *)stringByReplacingStringsFromDictionary:(NSDictionary *)dict;
{
    NSMutableString *string = [self mutableCopy];

    // Sorting fixes the application order; plain fast enumeration of the
    // dictionary would make overlapping replacements order-dependent.
    NSArray *targets = [[dict allKeys] sortedArrayUsingSelector:@selector(compare:)];

    for (NSString *target in targets) {
        [string replaceOccurrencesOfString:target
                                withString:dict[target]
                                   options:0
                                     range:NSMakeRange(0, [string length])];
    }

    // Hand out an immutable snapshot rather than the mutable working copy.
    return [string copy];
}

@end

--------------------------------------------------------------------------------
/readability/JXWebResourceLoadingBarrier.m:
--------------------------------------------------------------------------------
//
// JXWebResourceLoadingBarrier.m
// readability
//
// Created by Jan on 04.06.12.
// Copyright (c) 2012 geheimwerk.de. All rights reserved.
//

#import "JXWebResourceLoadingBarrier.h"

@implementation JXWebResourceLoadingBarrier

@synthesize localResourceLoadingOnly;

/// Decides whether a resource request may proceed.
///
/// When `localResourceLoadingOnly` is NO every request is passed through
/// unchanged. When it is YES only requests whose URL scheme is exactly
/// "file" are passed through; for all other requests nil is returned.
///
/// NOTE(review): the selector matches WebKit's resource-load delegate
/// callback; how the web view reacts to a nil return is not visible here.
- (NSURLRequest *)webView:(WebView *)sender resource:(id)identifier willSendRequest:(NSURLRequest *)request redirectResponse:(NSURLResponse *)redirectResponse fromDataSource:(WebDataSource *)dataSource
{
    // No restriction configured: let everything through.
    if (!localResourceLoadingOnly) {
        return request;
    }

    NSString *scheme = [[request URL] scheme];
    BOOL isLocalFile = [scheme isEqualToString:@"file"];

    return isLocalFile ? request : nil;
}


@end

--------------------------------------------------------------------------------
/readability/NSXMLNode+HTMLUtilities.h:
--------------------------------------------------------------------------------
//
// NSXMLNode+HTMLUtilities.h
// readability
//
// Created by Jan on 26.02.12.
// Copyright (c) 2012 geheimwerk.de. All rights reserved.
//

#import <Foundation/Foundation.h>

extern NSString * const tagNameXPath;

@interface NSXMLNode (HTMLUtilities)

- (NSArray *)tagsWithNames:(NSString *)firstTagName, ... NS_REQUIRES_NIL_TERMINATION;
- (NSArray *)reverseTagsWithNames:(NSString *)firstTagName, ... 
NS_REQUIRES_NIL_TERMINATION; 17 | 18 | - (void)addCSSName:(NSString *)cssName toAttributeWithName:(NSString *)attributeName; 19 | 20 | - (NSString *)cssNamesForAttributeWithName:(NSString *)attributeName; 21 | #if 0 22 | - (NSArray *)cssNamesSetForAttributeWithName:(NSString *)attributeName; 23 | - (NSArray *)cssNamesForAttributeWithName:(NSString *)attributeName; 24 | #endif 25 | 26 | - (NSString *)lxmlText; 27 | - (NSXMLNode *)lxmlTextNode; 28 | - (NSXMLNode *)lxmlTailNode; 29 | 30 | - (NSString *)readabilityDescription; 31 | - (NSString *)readabilityDescriptionWithDepth:(NSUInteger)depth; 32 | 33 | @end 34 | -------------------------------------------------------------------------------- /Configs/Base.xcconfig: -------------------------------------------------------------------------------- 1 | ARCHS = $(ARCHS_STANDARD_64_BIT) 2 | SDKROOT = macosx 3 | MACOSX_DEPLOYMENT_TARGET = 10.7 4 | 5 | CLANG_ENABLE_OBJC_ARC = NO 6 | 7 | GCC_VERSION = com.apple.compilers.llvm.clang.1_0 8 | GCC_C_LANGUAGE_STANDARD = c99 9 | 10 | PREBINDING = NO 11 | GCC_WARN_CHECK_SWITCH_STATEMENTS = YES 12 | GCC_WARN_FOUR_CHARACTER_CONSTANTS = NO 13 | GCC_WARN_SHADOW = YES 14 | GCC_TREAT_WARNINGS_AS_ERRORS = NO 15 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES 16 | GCC_WARN_ABOUT_MISSING_FIELD_INITIALIZERS = YES 17 | GCC_WARN_INITIALIZER_NOT_FULLY_BRACKETED = YES 18 | GCC_WARN_ABOUT_RETURN_TYPE = YES 19 | GCC_WARN_MISSING_PARENTHESES = YES 20 | GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES 21 | GCC_WARN_ABOUT_MISSING_NEWLINE = YES 22 | GCC_WARN_NON_VIRTUAL_DESTRUCTOR = YES 23 | GCC_WARN_HIDDEN_VIRTUAL_FUNCTIONS = YES 24 | GCC_WARN_SIGN_COMPARE = YES 25 | GCC_WARN_TYPECHECK_CALLS_TO_PRINTF = YES 26 | GCC_WARN_UNDECLARED_SELECTOR = YES 27 | GCC_TREAT_IMPLICIT_FUNCTION_DECLARATIONS_AS_ERRORS = YES 28 | GCC_WARN_UNINITIALIZED_AUTOS = YES 29 | GCC_WARN_UNKNOWN_PRAGMAS = YES 30 | GCC_WARN_UNUSED_FUNCTION = YES 31 | GCC_WARN_UNUSED_LABEL = YES 32 | GCC_WARN_UNUSED_PARAMETER = NO 33 | GCC_WARN_UNUSED_VALUE = 
YES 34 | GCC_WARN_UNUSED_VARIABLE = YES 35 | 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #readability Objective-C 2 | 3 | This is an Objective-C port of a python port of a ruby port of [arc90's readability project](http://lab.arc90.com/experiments/readability/). 4 | 5 | The goal is that given a URL, an HTML document or a Safari webarchive, it will pull out the main body text and clean it up. 6 | 7 | Currently it deviates from the original in various ways: 8 | 9 | - Some implementation details were changed for performance reasons, others to speed up porting. 10 | - It does not accept and produce HTML strings directly. 11 | - main.m demonstrates how to use NSXMLDocument objects instead. 12 | - There still are bugs in this port that are a result of porting. 13 | 14 | readability-objc uses [KBWebArchiver](https://github.com/JanX2/webarchiver) to create a webarchive from the input if necessary. Amongst other things, this enables the automatic encoding detection implemented in WebKit. 15 | 16 | KBWebArchiver is included as a submodule. After cloning a main repository, you initialize submodules by typing: 17 | 18 | git submodule init 19 | git submodule update 20 | 21 | The code is licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). 22 | 23 | ##Based on: 24 | 25 | - [buriy’s python-readability fork](https://github.com/buriy/python-readability). 26 | - Github user contributions. 27 | 28 | ##Command-line usage demo: 29 | 30 | readability -url http://pypi.python.org/pypi/readability-lxml 31 | -------------------------------------------------------------------------------- /readability/NSString+Counting.m: -------------------------------------------------------------------------------- 1 | // 2 | // NSString+Counting.m 3 | // readability 4 | // 5 | // Created by Jan on 05.03.12. 
// Copyright (c) 2012 geheimwerk.de. All rights reserved.
//

#import "NSString+Counting.h"

@implementation NSString (Counting)

/// Counts the non-overlapping occurrences of `needle` in the receiver.
///
/// NOTE(review): the scanner's case sensitivity is left at its default, and
/// NSScanner is documented as case-insensitive by default, so "A" presumably
/// also counts "a" — confirm that this is intended.
///
/// @param needle The string to look for. A nil or empty needle yields 0.
/// @return The number of occurrences found.
- (NSUInteger)countOccurancesOfString:(NSString *)needle;
{
    if ([self length] == 0) return 0;

    // An empty needle can never be scanned; return early instead of relying
    // on how NSScanner treats an empty search/stop string.
    if ([needle length] == 0) return 0;

    NSUInteger count = 0;

    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    // Whitespace is significant here, so nothing may be skipped implicitly.
    [scanner setCharactersToBeSkipped:nil];

    while (![scanner isAtEnd]) {
        if ([scanner scanString:needle intoString:NULL]) count++;

        // Scan up to the start of the next occurrence of needle or to the end of the scanned string.
        [scanner scanUpToString:needle intoString:NULL];
    }

    return count;
}

/// Counts the substrings (words, lines, ... depending on `opts`) that
/// -enumerateSubstringsInRange:options:usingBlock: reports for the receiver.
///
/// @param opts Enumeration options; NSStringEnumerationSubstringNotRequired is
///             added because only the count is needed.
/// @return The number of enumerated substrings; 0 for an empty receiver.
- (NSUInteger)countSubstringsWithOptions:(NSStringEnumerationOptions)opts;
{
    if (self.length == 0) return 0;

    __block NSUInteger count = 0;

    [self enumerateSubstringsInRange:NSMakeRange(0, self.length)
                             options:(opts | NSStringEnumerationSubstringNotRequired)
                          usingBlock:^(NSString *substring, NSRange substringRange, NSRange enclosingRange, BOOL *stop) {
                              count++;
                          }];

    return count;
}

/// Tells whether the receiver contains at least `lowerBound` substrings of the
/// kind selected by `opts`, stopping the enumeration as soon as the bound is hit.
///
/// @param opts       Enumeration options (see -countSubstringsWithOptions:).
/// @param lowerBound The minimum number of substrings required.
/// @return YES when at least `lowerBound` substrings exist. A bound of 0 is
///         always satisfied, including for an empty receiver.
- (BOOL)countOfSubstringsWithOptions:(NSStringEnumerationOptions)opts isAtLeast:(NSUInteger)lowerBound;
{
    // Zero substrings are trivially available; this also avoids enumerating
    // the whole string without ever reaching the stop condition below.
    if (lowerBound == 0) return YES;
    if (self.length == 0) return NO;

    __block NSUInteger count = 0;

    [self enumerateSubstringsInRange:NSMakeRange(0, self.length)
                             options:(opts | NSStringEnumerationSubstringNotRequired)
                          usingBlock:^(NSString *substring, NSRange substringRange, NSRange enclosingRange, BOOL *stop) {
                              count++;
                              if (count == lowerBound) *stop = YES;
                          }];

    return (count >= lowerBound);
}

@end

--------------------------------------------------------------------------------
/readability/NSString+JXRemoving.m:
--------------------------------------------------------------------------------
//
// NSString+JXRemoving.m
// string-splitter
//
// Created by Jan on 11.01.12.
// Copyright 2012 geheimwerk.de. All rights reserved.
//

#import "NSString+JXRemoving.h"

// Based on OmniFoundation/NSString-OFReplacement

@implementation NSString (Removing)

/// Returns the receiver without `prefix` when it starts with it; otherwise the
/// receiver itself. A nil or empty prefix leaves the receiver untouched, which
/// mirrors how -jx_stringByRemovingSuffix: treats a nil suffix.
- (NSString *)jx_stringByRemovingPrefix:(NSString *)prefix;
{
    if ([prefix length] == 0)
        return self;

    NSRange aRange = [self rangeOfString:prefix options:NSAnchoredSearch];
    if ((aRange.length == 0) || (aRange.location != 0))
        return self;
    return [self substringFromIndex:aRange.length];
}

/// Returns the receiver without `suffix` when it ends with it; otherwise the
/// receiver itself.
- (NSString *)jx_stringByRemovingSuffix:(NSString *)suffix;
{
    if (![self hasSuffix:suffix])
        return self;
    return [self substringToIndex:[self length] - [suffix length]];
}

/// Returns the receiver with leading and trailing whitespace and newline
/// characters trimmed.
- (NSString *)jx_stringByRemovingSurroundingWhitespace;
{
    return [self stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
}

/// Splits the receiver at runs of characters from `collapsibleCharacterSet`
/// and joins the remaining pieces with `replacementString`. Runs at the start
/// and end of the receiver produce no separator, so they are dropped.
///
/// @param collapsibleCharacterSet Characters that act as separators.
/// @param replacementString       Inserted once between two adjacent pieces.
/// @return The collapsed string; @"" for an empty receiver.
- (NSString *)jx_stringByCollapsingAndRemovingSurroundingCharactersInSet:(NSCharacterSet *)collapsibleCharacterSet
                                                              intoString:(NSString *)replacementString;
{
    NSUInteger length = [self length];
    if (length == 0) return @""; // Trivial optimization

    NSScanner *stringScanner = [[NSScanner alloc] initWithString:self];
    // Skipping the collapsible characters lets every scan start at the next piece.
    [stringScanner setCharactersToBeSkipped:collapsibleCharacterSet];
    NSMutableString *collapsedString = [[NSMutableString alloc] initWithCapacity:length];
    BOOL firstSubstring = YES;
    NSString *nonWhitespaceSubstring = nil;
    while ([stringScanner scanUpToCharactersFromSet:collapsibleCharacterSet intoString:&nonWhitespaceSubstring]) {
        if (nonWhitespaceSubstring) {
            if (firstSubstring) {
                firstSubstring = NO;
            } else {
                [collapsedString appendString:replacementString];
            }
            [collapsedString appendString:nonWhitespaceSubstring];
        }
    }
    return collapsedString;
}


@end

-------------------------------------------------------------------------------- /readability/JXReadabilityDocument.h: -------------------------------------------------------------------------------- 1 | /* 2 | * JXReadabilityDocument 3 | * 4 | * Copyright (c) 2012 geheimwerk.de. 5 | * https://github.com/JanX2/readability-objc 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * 19 | * ObjC port: jan@geheimwerk.de (Jan Weiß) 20 | */ 21 | 22 | #import <Foundation/Foundation.h> 23 | 24 | // Class for cleaning up an NSXMLDocument to improve readability. 
25 | 26 | @interface JXReadabilityDocument : NSObject 27 | { 28 | NSString * input; 29 | NSXMLDocument * html; 30 | 31 | NSMutableDictionary * options; 32 | 33 | NSCharacterSet * whitespaceAndNewlineCharacterSet; 34 | 35 | NSRegularExpression * unlikelyCandidatesRe; 36 | NSRegularExpression * okMaybeItsACandidateRe; 37 | NSRegularExpression * positiveRe; 38 | NSRegularExpression * negativeRe; 39 | NSRegularExpression * divToPElementsRe; 40 | 41 | NSRegularExpression * newlinePlusSurroundingwhitespaceRe; 42 | NSRegularExpression * tabRunRe; 43 | NSRegularExpression * sentenceEndRe; 44 | 45 | NSSet * divToPElementsTagNames; 46 | } 47 | 48 | @property (nonatomic, copy) NSString *input; 49 | @property (nonatomic, strong) NSXMLDocument *html; 50 | 51 | @property (nonatomic, strong) NSMutableDictionary *options; 52 | /* 53 | Possible keys (in flux): 54 | - attributes: (currently disabled) 55 | - debug (NSNumber): output debug messages 56 | - minTextLength: 57 | - retryLength: 58 | - url: will allow adjusting links to be absolute 59 | */ 60 | 61 | @property (nonatomic, readonly) NSString *title; 62 | @property (nonatomic, readonly) NSString *shortTitle; 63 | 64 | - (id)initWithXMLDocument:(NSXMLDocument *)aDoc copyDocument:(BOOL)doCopy; 65 | - (id)initWithXMLDocument:(NSXMLDocument *)aDoc; // Same as above with doCopy == NO 66 | 67 | // Generate and return the summary of the HTML document 68 | - (NSXMLDocument *)summaryXMLDocument; 69 | 70 | @end 71 | -------------------------------------------------------------------------------- /readability/readability.1: -------------------------------------------------------------------------------- 1 | .\"Modified from man(1) of FreeBSD, the NetBSD mdoc.template, and mdoc.samples. 
2 | .\"See Also: 3 | .\"man mdoc.samples for a complete listing of options 4 | .\"man mdoc for the short list of editing options 5 | .\"/usr/share/misc/mdoc.template 6 | .Dd 23.02.12 \" DATE 7 | .Dt readability 1 \" Program name and manual section number 8 | .Os Darwin 9 | .Sh NAME \" Section Header - required - don't modify 10 | .Nm readability, 11 | .\" The following lines are read in generating the apropos(man -k) database. Use only key 12 | .\" words here as the database is built based on the words here and in the .ND line. 13 | .Nm Other_name_for_same_program(), 14 | .Nm Yet another name for the same program. 15 | .\" Use .Nm macro to designate other names for the documented program. 16 | .Nd This line parsed for whatis database. 17 | .Sh SYNOPSIS \" Section Header - required - don't modify 18 | .Nm 19 | .Op Fl abcd \" [-abcd] 20 | .Op Fl a Ar path \" [-a path] 21 | .Op Ar file \" [file] 22 | .Op Ar \" [file ...] 23 | .Ar arg0 \" Underlined argument - use .Ar anywhere to underline 24 | arg2 ... \" Arguments 25 | .Sh DESCRIPTION \" Section Header - required - don't modify 26 | Use the .Nm macro to refer to your program throughout the man page like such: 27 | .Nm 28 | Underlining is accomplished with the .Ar macro like this: 29 | .Ar underlined text . 
30 | .Pp \" Inserts a space 31 | A list of items with descriptions: 32 | .Bl -tag -width -indent \" Begins a tagged list 33 | .It item a \" Each item preceded by .It macro 34 | Description of item a 35 | .It item b 36 | Description of item b 37 | .El \" Ends the list 38 | .Pp 39 | A list of flags and their descriptions: 40 | .Bl -tag -width -indent \" Differs from above in tag removed 41 | .It Fl a \"-a flag as a list item 42 | Description of -a flag 43 | .It Fl b 44 | Description of -b flag 45 | .El \" Ends the list 46 | .Pp 47 | .\" .Sh ENVIRONMENT \" May not be needed 48 | .\" .Bl -tag -width "ENV_VAR_1" -indent \" ENV_VAR_1 is width of the string ENV_VAR_1 49 | .\" .It Ev ENV_VAR_1 50 | .\" Description of ENV_VAR_1 51 | .\" .It Ev ENV_VAR_2 52 | .\" Description of ENV_VAR_2 53 | .\" .El 54 | .Sh FILES \" File used or created by the topic of the man page 55 | .Bl -tag -width "/Users/joeuser/Library/really_long_file_name" -compact 56 | .It Pa /usr/share/file_name 57 | FILE_1 description 58 | .It Pa /Users/joeuser/Library/really_long_file_name 59 | FILE_2 description 60 | .El \" Ends the list 61 | .\" .Sh DIAGNOSTICS \" May not be needed 62 | .\" .Bl -diag 63 | .\" .It Diagnostic Tag 64 | .\" Diagnostic information here. 65 | .\" .It Diagnostic Tag 66 | .\" Diagnostic information here. 67 | .\" .El 68 | .Sh SEE ALSO 69 | .\" List links in ascending order by section, alphabetically within a section. 
.\" Please do not reference files that do not exist without filing a bug report
.Xr a 1 ,
.Xr b 1 ,
.Xr c 1 ,
.Xr a 2 ,
.Xr b 2 ,
.Xr a 3 ,
.Xr b 3
.\" .Sh BUGS \" Document known, unremedied bugs
.\" .Sh HISTORY \" Document history if command behaves in a unique manner
--------------------------------------------------------------------------------
/readability/NSXMLNode+HTMLUtilities.m:
--------------------------------------------------------------------------------
//
// NSXMLNode+HTMLUtilities.m
// readability
//
// Created by Jan on 26.02.12.
// Copyright (c) 2012 geheimwerk.de. All rights reserved.
//

#import "NSXMLNode+HTMLUtilities.h"


// Original XPath: @".//%@". Alternative XPath: @".//*[matches(name(),'%@','i')]"
NSString * const tagNameXPath = @".//*[lower-case(name())='%@']";


// Runs the tagNameXPath query on `node` once per name of the nil-terminated
// list (firstTagName followed by the names in tagNames) and concatenates the
// hits in list order. When `reversed` is YES the hits of each single name are
// reversed before they are appended.
static NSArray *JXNodesForTagNames(NSXMLNode *node, NSString *firstTagName, va_list tagNames, BOOL reversed)
{
    NSMutableArray *tags = [NSMutableArray array];

    for (NSString *tagName = firstTagName; tagName != nil; tagName = va_arg(tagNames, NSString *)) {
        NSArray *foundNodes = [node nodesForXPath:[NSString stringWithFormat:tagNameXPath, tagName]
                                            error:NULL];
        if (reversed) {
            foundNodes = [[foundNodes reverseObjectEnumerator] allObjects];
        }
        [tags addObjectsFromArray:foundNodes];
    }

    return tags;
}


@implementation NSXMLNode (HTMLUtilities)

/// Returns all descendants of the receiver whose lower-cased name equals one
/// of the given tag names, grouped by name in argument order.
///
/// @param firstTagName First entry of a nil-terminated list of tag names.
- (NSArray *)tagsWithNames:(NSString *)firstTagName, ... ;
{
    va_list tag_names;
    va_start (tag_names, firstTagName);
    NSArray *tags = JXNodesForTagNames(self, firstTagName, tag_names, NO);
    va_end (tag_names);

    return tags;
}

/// Same as -tagsWithNames:, except that the nodes found for each single tag
/// name are appended in reverse order.
///
/// @param firstTagName First entry of a nil-terminated list of tag names.
- (NSArray *)reverseTagsWithNames:(NSString *)firstTagName, ... ;
{
    va_list tag_names;
    va_start (tag_names, firstTagName);
    NSArray *tags = JXNodesForTagNames(self, firstTagName, tag_names, YES);
    va_end (tag_names);

    return tags;
}


/// Adds `cssName` to the whitespace-separated name list stored in the
/// attribute `attributeName` of the receiver, creating the attribute when it
/// does not exist yet. Does nothing when the receiver is not an element, when
/// `cssName` is nil or empty, or when the name is already in the list.
///
/// The membership test compares whole names, so adding "foo" to an attribute
/// holding "foobar" appends it instead of treating it as already present.
/// `cssName` is expected to be a single name without whitespace.
- (void)addCSSName:(NSString *)cssName toAttributeWithName:(NSString *)attributeName;
{
    if ([self kind] != NSXMLElementKind) return;
    if ([cssName length] == 0) return; // Nothing to add.

    NSXMLElement *selfElement = (NSXMLElement *)self;
    NSXMLNode *attribute = [selfElement attributeForName:attributeName];
    if (attribute == nil) {
        attribute = [NSXMLNode attributeWithName:attributeName
                                     stringValue:cssName];
        [selfElement addAttribute:attribute];
        return;
    }

    NSString *attributeStringValue = [attribute stringValue];
    if ([attributeStringValue length] == 0) {
        // Avoid producing a value with a leading separator.
        [attribute setStringValue:cssName];
        return;
    }

    NSArray *existingNames = [attributeStringValue componentsSeparatedByCharactersInSet:
                              [NSCharacterSet whitespaceAndNewlineCharacterSet]];
    if (![existingNames containsObject:cssName]) {
        [attribute setStringValue:[NSString stringWithFormat:@"%@ %@",
                                   attributeStringValue,
                                   cssName]
         ];
    }
}

/// Returns the raw string value of the attribute `attributeName`, or nil when
/// the receiver is not an element or has no such attribute.
- (NSString *)cssNamesForAttributeWithName:(NSString *)attributeName;
{
    if ([self kind] == NSXMLElementKind) {
        NSXMLElement *selfElement = (NSXMLElement *)self;
        NSXMLNode *attribute = [selfElement attributeForName:attributeName];
        return [attribute stringValue];
    }

    return nil;
}

#if 0
- (NSSet *)cssNamesSetForAttributeWithName:(NSString *)attributeName;
{
    if ([self kind] == NSXMLElementKind) {
        NSXMLElement *selfElement = (NSXMLElement *)self;
        NSXMLNode *attribute = [selfElement attributeForName:attributeName];
        if (attribute == nil) {
            return nil;
        } else {
            NSArray *cssNames = [[attribute 
stringValue] componentsSeparatedByCharactersInSet:[NSCharacterSet whitespaceCharacterSet]];

            NSMutableSet *cssNamesSet = [NSMutableSet setWithArray:cssNames];
            [cssNamesSet removeObject:@""];

            return cssNamesSet;
        }
    }

    return nil;
}

- (NSArray *)cssNamesForAttributeWithName:(NSString *)attributeName;
{
    if ([self kind] == NSXMLElementKind) {
        NSXMLElement *selfElement = (NSXMLElement *)self;
        NSXMLNode *attribute = [selfElement attributeForName:attributeName];
        if (attribute == nil) {
            return nil;
        } else {
            NSArray *cssNames = [[attribute stringValue] componentsSeparatedByCharactersInSet:[NSCharacterSet whitespaceCharacterSet]];

            NSIndexSet *emptyStringIndexes = [cssNames indexesOfObjectsPassingTest:^(id obj, NSUInteger index, BOOL *stop) {
                if ([(NSString *)obj length] == 0) {
                    return YES;
                }
                else {
                    return NO;
                }
            }];

            if ([emptyStringIndexes count] > 0) {
                NSMutableArray *cssNamesMutable = [cssNames mutableCopy];
                [cssNamesMutable removeObjectsAtIndexes:emptyStringIndexes];
                return [cssNamesMutable autorelease];
            } else {
                return cssNames;
            }

        }
    }

    return nil;
}
#endif


/// Returns the string value of the receiver's first child when that child is
/// a text node; nil when there are no children or the first child is not text.
- (NSString *)lxmlText;
{
    // Messaging the nil result of -lxmlTextNode yields nil, which covers the
    // "no leading text node" case.
    return [[self lxmlTextNode] stringValue];
}

/// Returns the receiver's first child when it is a text node, otherwise nil.
- (NSXMLNode *)lxmlTextNode;
{
    if ([self childCount] == 0) {
        return nil;
    }

    NSXMLNode *firstChild = [self childAtIndex:0];
    return ([firstChild kind] == NSXMLTextKind) ? firstChild : nil;
}

- (NSXMLNode *)lxmlTailNode;
{
    if ([self kind] != NSXMLTextKind) {
        NSXMLNode *tailNode = [self nextSibling];

if ((tailNode == nil) || ([tailNode kind] != NSXMLTextKind)) {
            return nil;
        } else {
            return tailNode;
        }
    }
    else {
        return nil;
    }
}


/// Short debug description of the receiver including its direct parent.
- (NSString *)readabilityDescription;
{
    return [self readabilityDescriptionWithDepth:1];
}

/// Builds a CSS-selector-like debug description of the receiver.
///
/// Nodes without a name are described as "[<kind name>]". Named nodes are
/// described as the node name followed by "#id" and ".class" parts, where
/// spaces inside the attribute values are turned into further "#" / "."
/// separators; the name is left out for "div" nodes that have an id or class.
/// While `depth` is greater than 0 the description of the parent is appended
/// after " - ", with `depth` reduced by one.
///
/// @param depth Number of ancestor levels to append.
/// @return The description string.
- (NSString *)readabilityDescriptionWithDepth:(NSUInteger)depth;
{
    NSString *selfName = self.name;
    if (selfName == nil) {
        // Indexed by the numeric value of -kind; the order is assumed to
        // mirror the NSXMLNodeKind enumeration.
        NSString *kinds[] = {
            @"NSXMLInvalidKind",
            @"NSXMLDocumentKind",
            @"NSXMLElementKind",
            @"NSXMLAttributeKind",
            @"NSXMLNamespaceKind",
            @"NSXMLProcessingInstructionKind",
            @"NSXMLCommentKind",
            @"NSXMLTextKind",
            @"NSXMLDTDKind",
            @"NSXMLEntityDeclarationKind",
            @"NSXMLAttributeDeclarationKind",
            @"NSXMLElementDeclarationKind",
            @"NSXMLNotationDeclarationKind"
        };
        const NSUInteger kindCount = sizeof(kinds) / sizeof(kinds[0]);

        NSUInteger kindIndex = (NSUInteger)self.kind;
        if (kindIndex >= kindCount) {
            // Never read past the table for an unexpected kind value.
            kindIndex = 0;
        }

        return [NSString stringWithFormat:@"[%@]", kinds[kindIndex]];
    }

    NSMutableString *name = [NSMutableString string];

    NSString *ids = [self cssNamesForAttributeWithName:@"id"];
    NSString *classes = [self cssNamesForAttributeWithName:@"class"];

    if (ids != nil) {
        [name appendFormat:@"#%@",
         [ids stringByReplacingOccurrencesOfString:@" "
                                        withString:@"#"
                                           options:NSLiteralSearch
                                             range:NSMakeRange(0, [ids length])]];
    }

    if (classes != nil) {
        [name appendFormat:@".%@",
         [classes stringByReplacingOccurrencesOfString:@" "
                                            withString:@"."
                                               options:NSLiteralSearch
                                                 range:NSMakeRange(0, [classes length])]];
    }

    // A bare "div" adds no information once an id or class is present.
    if (([name length] == 0) || ![selfName isEqualToString:@"div"]) {
        [name insertString:selfName atIndex:0];
    }

    if (depth > 0) {
        NSXMLNode *selfParent = [self parent];
        if (selfParent != nil) {
            [name appendFormat:@" - %@",
             [selfParent readabilityDescriptionWithDepth:(depth-1)]
             ];
        }
    }

    return name;
}

@end

--------------------------------------------------------------------------------
/readability/htmls.m:
--------------------------------------------------------------------------------
//
// htmls.m
// readability
//
// Created by Jan on 24.03.12.
// Copyright (c) 2012 geheimwerk.de. All rights reserved.
//

#import "htmls.h"

#import "NSString+Counting.h"
#import "NSXMLNode+HTMLUtilities.h"
#import "NSString+JXRemoving.h"
#import "NSString+ReplaceExtensions.h"

NSString * normalizeEntities(NSString *curTitle);
NSString * normTitle(NSString *title);

// Returns the element name captured in group 1 of `match`, or @"*" when the
// selector named no element (group missing or empty).
static NSString * JXElementNameForMatch(NSString *cssExpr, NSTextCheckingResult *match) {
    NSRange nameRange = [match rangeAtIndex:1];
    if ((nameRange.location == NSNotFound) || (nameRange.length == 0)) {
        return @"*";
    }
    return [cssExpr substringWithRange:nameRange];
}

/// Translates a simple CSS selector into an XPath expression rooted at
/// "descendant-or-self::".
///
/// Supported forms are a bare element name ("h1"), an id selector
/// ("#title", "div#title") and a class selector (".head", "div.head").
///
/// @param cssExpr The CSS selector.
/// @return The XPath expression, or nil when `cssExpr` is nil or has none of
///         the supported forms.
NSString * lxmlCSSToXPath(NSString *cssExpr) {
    static NSRegularExpression *elRe = nil;
    static NSRegularExpression *idRe = nil;
    static NSRegularExpression *classRe = nil;
    static dispatch_once_t onceToken;

    // Compile the patterns exactly once, even with concurrent first callers.
    dispatch_once(&onceToken, ^{
        elRe = [[NSRegularExpression alloc] initWithPattern:@"^(\\w+)\\s*$" options:0 error:NULL];
        idRe = [[NSRegularExpression alloc] initWithPattern:@"^(\\w*)#(\\w+)\\s*$" options:0 error:NULL];
        classRe = [[NSRegularExpression alloc] initWithPattern:@"^(\\w*)\\.(\\w+)\\s*$" options:0 error:NULL];
    });

    if (cssExpr == nil) return nil;

    NSString *prefix = @"descendant-or-self::";

    NSRange cssExprRange = NSMakeRange(0, cssExpr.length);
    NSTextCheckingResult *match;

    match = [elRe firstMatchInString:cssExpr options:0 range:cssExprRange];
    if (match != nil) {
        return [NSString stringWithFormat:@"%@%@", prefix, [cssExpr substringWithRange:[match rangeAtIndex:1]]];
    }

    match = [idRe firstMatchInString:cssExpr options:0 range:cssExprRange];
    if (match != nil) {
        NSString *elementName = JXElementNameForMatch(cssExpr, match);
        NSString *idName = [cssExpr substringWithRange:[match rangeAtIndex:2]];

        return [NSString stringWithFormat:@"%@%@[@id = '%@']", prefix, elementName, idName];
    }

    match = [classRe firstMatchInString:cssExpr options:0 range:cssExprRange];
    if (match != nil) {
        NSString *elementName = JXElementNameForMatch(cssExpr, match);
        NSString *className = [cssExpr substringWithRange:[match rangeAtIndex:2]];

        return [NSString stringWithFormat:@"%@%@[contains(concat(' ', normalize-space(@class), ' '), ' %@ ')]", prefix, elementName, className];
    }

    // Unsupported selector form.
    return nil;
}


/// Replaces typographic dashes, no-break spaces, guillemets and a few HTML
/// entities in `curTitle` with plain ASCII equivalents.
NSString * normalizeEntities(NSString *curTitle) {
    // Non-ASCII keys are written as escapes so they survive re-encoding of
    // this file; the entity keys are matched literally.
    NSDictionary *entities = @{@"\u2014": @"-", // EM DASH
                               @"\u2013": @"-", // EN DASH
                               @"&mdash;": @"-",
                               @"&ndash;": @"-",
                               @"\u00A0": @" ", // NO-BREAK SPACE
                               @"\u00AB": @"\"", // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
                               @"\u00BB": @"\"", // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
                               @"&quot;": @"\""};

    return [curTitle stringByReplacingStringsFromDictionary:entities];
}

/// Collapses whitespace/newline runs in `title` into single spaces, trims the
/// ends and then normalizes entities.
NSString * normTitle(NSString *title) {
    return normalizeEntities([title jx_stringByCollapsingAndRemovingSurroundingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet] intoString:@" "]);
}

NSString * getTitleInDocument(NSXMLDocument *doc) {
    NSString *title = nil;
    NSArray *titleNodes = [doc tagsWithNames:@"title", nil];

    if (titleNodes.count == 0) return 
@"[no-title]"; 90 | 91 | title = [titleNodes[0] lxmlText]; 92 | 93 | return normTitle(title); 94 | } 95 | 96 | void addMatch(NSMutableSet *collection, NSString *text, NSString *orig) { 97 | text = normTitle(text); 98 | 99 | if ((text.length >= 15) && [text countOfSubstringsWithOptions:NSStringEnumerationByWords isAtLeast:2]) { 100 | NSString *textWithoutQuotes = [text stringByReplacingOccurrencesOfString:@"\"" withString:@"" options:NSLiteralSearch range:NSMakeRange(0, text.length)]; 101 | NSString *origWithoutQuotes = [orig stringByReplacingOccurrencesOfString:@"\"" withString:@"" options:NSLiteralSearch range:NSMakeRange(0, orig.length)]; 102 | 103 | if (([origWithoutQuotes rangeOfString:textWithoutQuotes 104 | options:NSLiteralSearch 105 | range:NSMakeRange(0, origWithoutQuotes.length)].location) != NSNotFound) { 106 | [collection addObject:text]; 107 | } 108 | 109 | } 110 | } 111 | 112 | NSString * shortenTitleInDocument(NSXMLDocument *doc) { 113 | static BOOL firstRun = YES; 114 | static NSArray *cssXPaths = nil; 115 | static NSArray *delimiters = nil; 116 | 117 | if (firstRun) { 118 | NSArray *cssSelectors = @[@"#title", @"#head", @"#heading", @".pageTitle", @".newsTitle", @".title", @".head", @".heading", @".contentheading", @".smallHeaderRed"]; 119 | 120 | NSMutableArray *cssXPathsMutable = [[NSMutableArray alloc] initWithCapacity:cssSelectors.count]; 121 | 122 | for (NSString *selector in cssSelectors) { 123 | [cssXPathsMutable addObject:lxmlCSSToXPath(selector)]; 124 | } 125 | 126 | cssXPaths = [cssXPathsMutable copy]; 127 | 128 | delimiters = @[@" | ", @" - ", @" :: ", @" / "]; 129 | 130 | firstRun = NO; 131 | } 132 | 133 | NSString *title = nil; 134 | NSArray *titleNodes = [doc tagsWithNames:@"title", nil]; 135 | 136 | if (titleNodes.count == 0) return @""; 137 | 138 | title = [titleNodes[0] lxmlText]; 139 | 140 | NSString *orig; 141 | title = orig = normTitle(title); 142 | 143 | #warning How does NSXML treat HTML entities? 
144 | 145 | NSMutableSet *candidates = [NSMutableSet set]; 146 | 147 | for (NSXMLElement *e in [doc tagsWithNames:@"h1", @"h2", @"h3", nil]) { 148 | NSString *eText; 149 | 150 | eText = e.lxmlText; 151 | if (eText) { 152 | addMatch(candidates, eText, orig); 153 | } 154 | 155 | eText = e.stringValue; 156 | if (eText) { 157 | addMatch(candidates, eText, orig); 158 | } 159 | } 160 | 161 | for (NSString *item in cssXPaths) { 162 | NSArray *foundNodes = [doc nodesForXPath:item 163 | error:NULL]; 164 | 165 | for (NSXMLElement *e in foundNodes) { 166 | NSString *eText; 167 | 168 | eText = e.lxmlText; 169 | if (eText) { 170 | addMatch(candidates, eText, orig); 171 | } 172 | 173 | eText = e.stringValue; 174 | if (eText) { 175 | addMatch(candidates, eText, orig); 176 | } 177 | } 178 | } 179 | 180 | if (candidates) { 181 | NSSortDescriptor *candidatesAscendingDescriptor = [NSSortDescriptor sortDescriptorWithKey:@"length" 182 | ascending:YES]; 183 | 184 | NSArray *sortedCandidates = [[candidates allObjects] sortedArrayUsingDescriptors: 185 | @[candidatesAscendingDescriptor]]; 186 | 187 | 188 | title = [sortedCandidates lastObject]; 189 | } 190 | else { 191 | NSArray *parts; 192 | BOOL didBreak = NO; 193 | 194 | for (NSString *delimiter in delimiters) { 195 | if ([title rangeOfString:delimiter 196 | options:NSLiteralSearch].location != NSNotFound) { 197 | parts = [orig componentsSeparatedByString:delimiter]; 198 | 199 | NSString *titleCandidate; 200 | if (titleCandidate = parts[0], 201 | [titleCandidate countOfSubstringsWithOptions:NSStringEnumerationByWords isAtLeast:4]) { 202 | title = titleCandidate; 203 | didBreak = YES; 204 | break; 205 | } 206 | else if (titleCandidate = [parts lastObject], 207 | [titleCandidate countOfSubstringsWithOptions:NSStringEnumerationByWords isAtLeast:4]) { 208 | title = titleCandidate; 209 | didBreak = YES; 210 | break; 211 | } 212 | } 213 | } 214 | 215 | if (didBreak == NO) { 216 | NSString *delimiter = @": "; 217 | if ([title 
rangeOfString:delimiter 218 | options:NSLiteralSearch].location != NSNotFound) { 219 | parts = [orig componentsSeparatedByString:delimiter]; 220 | 221 | NSString *titleCandidate; 222 | if (titleCandidate = [parts lastObject], 223 | [titleCandidate countOfSubstringsWithOptions:NSStringEnumerationByWords isAtLeast:4]) { 224 | title = [parts lastObject]; 225 | } 226 | else { 227 | title = [[parts subarrayWithRange:NSMakeRange(1, (parts.count - 1))] componentsJoinedByString:delimiter]; 228 | } 229 | } 230 | } 231 | } 232 | 233 | NSUInteger titleLength = title.length; 234 | if ( !((15 < titleLength) && (titleLength < 150)) ) return orig; 235 | 236 | return title; 237 | } 238 | -------------------------------------------------------------------------------- /readability/main.m: -------------------------------------------------------------------------------- 1 | // 2 | // main.m 3 | // readability 4 | // 5 | // Created by Jan on 23.02.12. 6 | // Copyright (c) 2012 geheimwerk.de. All rights reserved. 
//

#import <Foundation/Foundation.h>

#import <AppKit/AppKit.h>

#import <WebKit/WebKit.h>
#import <WebKit/WebArchive.h>

#import "KBWebArchiver.h"
#import "JXReadabilityDocument.h"
#import "JXWebResourceLoadingBarrier.h"


BOOL dumpXMLDocumentToPath(NSXMLDocument *doc, NSString *output, NSUInteger xmlOutputOptions, NSString *tag, NSError **error);


// Serialises `doc` with `xmlOutputOptions` and writes it atomically next to
// `output`: the path extension of `output` is replaced by "html", and `tag`
// (if any) is appended to the base name first, e.g. "out.rtf" + "-tidy"
// becomes "out-tidy.html".
//
// Returns NO when `output` or `doc` is nil (in which case `*error` is left
// untouched) or when writing fails (in which case the write call is handed
// `error` to fill in).
BOOL dumpXMLDocumentToPath(NSXMLDocument *doc, NSString *output, NSUInteger xmlOutputOptions, NSString *tag, NSError **error) {
    if (output == nil) return NO;
    if (doc == nil) return NO;

    NSString *basePath = [output stringByDeletingPathExtension];
    if (tag != nil) {
        basePath = [basePath stringByAppendingString:tag];
    }
    NSString *outputPath = [basePath stringByAppendingPathExtension:@"html"];

    NSData *docData = [doc XMLDataWithOptions:xmlOutputOptions];
    return [docData writeToFile:outputPath
                        options:NSDataWritingAtomic
                          error:error];
}


// Command line entry point.
//
// Arguments are read through NSUserDefaults ("-key value" pairs):
//   -url, -webarchive : input; at least one of them is required
//   -local YES        : restricts resource loading to local resources
//   -verbose YES      : logs errors of the intermediate dumps
//   -output FILE      : destination; without it the summary goes to stdout.
//                       A "webarchive" or "rtf" extension additionally writes
//                       that format to FILE.
//
// Exits with EXIT_FAILURE when the input cannot be read, decoded or
// summarised, or when writing a requested output fails.
int main(int argc, const char * argv[])
{
    int exitStatus = EXIT_SUCCESS;

    @autoreleasepool {
        NSError *error = nil;

        NSUserDefaults *args = [NSUserDefaults standardUserDefaults];

        NSString *urlString         = [args stringForKey:@"url"];
        NSString *localOnlyString   = [args stringForKey:@"local"];
        NSString *webarchivePath    = [args stringForKey:@"webarchive"];
        NSString *verboseString     = [args stringForKey:@"verbose"];
        NSString *output            = [args stringForKey:@"output"];

        BOOL localOnly  = [localOnlyString isEqualToString:@"YES"];
        BOOL verbose    = [verboseString isEqualToString:@"YES"];

        if ((urlString == nil) && (webarchivePath == nil)) {
            fprintf(stderr, "readability 0.1.1\nUsage: \nreadability -url URL [-verbose YES|NO] -output FILE \n");

            return EXIT_FAILURE;
        }


        WebArchive *webarchive;
        if (webarchivePath == nil) {
            KBWebArchiver *archiver = [[KBWebArchiver alloc] initWithURLString:urlString];
            archiver.localResourceLoadingOnly = localOnly;
            webarchive = [archiver webArchive];
            NSData *data = [webarchive data];
            error = [archiver error];

            if ( webarchive == nil || data == nil ) {
                fprintf(stderr, "Error: Unable to create webarchive\n");
                if (error != nil)  fprintf(stderr, "%s\n", [[error description] UTF8String]);

                return EXIT_FAILURE;
            }
        }
        else {
            NSData *data = [NSData dataWithContentsOfFile:webarchivePath
                                                  options:0
                                                    error:&error];
            if (data == nil) {
                fprintf(stderr, "Error: Unable to read webarchive\n");
                if (error != nil)  fprintf(stderr, "%s\n", [[error description] UTF8String]);

                return EXIT_FAILURE;
            }

            webarchive = [[WebArchive alloc] initWithData:data];
        }

        WebResource *resource = [webarchive mainResource];
        NSData *resourceData = [resource data];

        if (resourceData == nil) {
            fprintf(stderr, "Error: Unable to read main resource of webarchive\n");

            return EXIT_FAILURE;
        }

        NSString *textEncodingName = [resource textEncodingName];

        NSStringEncoding encoding;
        if (textEncodingName == nil) {
            encoding = NSISOLatin1StringEncoding;
        }
        else {
            CFStringEncoding cfEnc = CFStringConvertIANACharSetNameToEncoding((__bridge CFStringRef)textEncodingName);
            if (kCFStringEncodingInvalidId == cfEnc) {
                encoding = NSUTF8StringEncoding;
            }
            else {
                encoding = CFStringConvertEncodingToNSStringEncoding(cfEnc);
            }
        }

        NSString *source = [[NSString alloc] initWithData:resourceData
                                                 encoding:encoding];

        if ((source == nil) && (encoding != NSISOLatin1StringEncoding)) {
            // The bytes are not valid in the announced encoding. ISO Latin-1
            // assigns a character to every byte value, so retry with it
            // instead of giving up on the document.
            encoding = NSISOLatin1StringEncoding;
            source = [[NSString alloc] initWithData:resourceData
                                           encoding:encoding];
        }

        if (source == nil) {
            fprintf(stderr, "Error: Unable to decode main resource\n");

            return EXIT_FAILURE;
        }

#if DEBUG
        if (output != nil) {
            NSString *outputRawPath = [[[output stringByDeletingPathExtension]
                                        stringByAppendingString:@"-raw"]
                                       stringByAppendingPathExtension:@"html"];
            BOOL OK;
            error = nil;
            OK = [source writeToFile:outputRawPath
                          atomically:YES
                            encoding:encoding
                               error:&error];

            if (!OK && verbose) {
                NSLog(@"\n%@", error);
            }
        }
#endif


        NSXMLDocumentContentKind contentKind = NSXMLDocumentXHTMLKind;
        NSUInteger xmlOutputOptions = (contentKind
                                       //| NSXMLNodePrettyPrint
                                       | NSXMLNodePreserveWhitespace
                                       | NSXMLNodeCompactEmptyElement
                                       );

        error = nil;
        NSXMLDocument *doc = [[NSXMLDocument alloc] initWithXMLString:source
                                                              options:NSXMLDocumentTidyHTML
                                                                error:&error];
        NSError *parseError = (doc == nil) ? error : nil;

#if DEBUG
        error = nil;
        if (!dumpXMLDocumentToPath(doc, output, xmlOutputOptions, @"-tidy", &error) && verbose) {
            NSLog(@"\n%@", error);
        }
#endif

        NSXMLDocument *cleanedDoc = nil;
        NSXMLDocument *summaryDoc = nil;

        if (doc != nil) {
            [doc setDocumentContentKind:contentKind];

            JXReadabilityDocument *readabilityDoc = [[JXReadabilityDocument alloc] initWithXMLDocument:doc
                                                                                          copyDocument:NO];
            summaryDoc = [readabilityDoc summaryXMLDocument];
            cleanedDoc = readabilityDoc.html;

            //NSLog(@"\nTitle: %@", readabilityDoc.title);
            //NSLog(@"\nShort Title: %@", readabilityDoc.shortTitle);
        }

#if DEBUG
        error = nil;
        if (!dumpXMLDocumentToPath(cleanedDoc, output, xmlOutputOptions, @"-cleaned", &error) && verbose) {
            NSLog(@"\n%@", error);
        }
#endif

        if (summaryDoc == nil) {
            // Without a summary there is nothing meaningful to emit; passing
            // a NULL C string to fprintf("%s") would be undefined behaviour.
            fprintf(stderr, "Error: Unable to create summary document\n");
            if (parseError != nil)  fprintf(stderr, "%s\n", [[parseError description] UTF8String]);

            return EXIT_FAILURE;
        }

        if (output == nil) {
            NSString *summaryString = [summaryDoc XMLString];
            fprintf(stdout, "%s\n", (summaryString != nil) ? [summaryString UTF8String] : "");
        }
        else {
            error = nil;
            if (!dumpXMLDocumentToPath(summaryDoc, output, xmlOutputOptions, nil, &error)) {
                if (verbose) {
                    NSLog(@"\n%@", error);
                }
                exitStatus = EXIT_FAILURE;
            }

            NSString *outputPathExtension = [output pathExtension];

            BOOL wantWebarchive = [outputPathExtension isEqualToString:@"webarchive"];
            BOOL wantRTF = [outputPathExtension isEqualToString:@"rtf"];

            if (wantWebarchive || wantRTF) {
                BOOL success;

                // Create a new webarchive with the processed markup as main content and the resources from the source webarchive
                NSData *docData = [summaryDoc XMLDataWithOptions:xmlOutputOptions];
                WebResource *mainResource = [[WebResource alloc] initWithData:docData
                                                                          URL:[resource URL]
                                                                     MIMEType:[resource MIMEType]
                                                             textEncodingName:[resource textEncodingName]
                                                                    frameName:nil];

                WebArchive *outWebarchive = [[WebArchive alloc] initWithMainResource:mainResource
                                                                        subresources:[webarchive subresources]
                                                                    subframeArchives:[webarchive subframeArchives]];

                NSData *outWebarchiveData = [outWebarchive data];

                error = nil;
                if (wantWebarchive) {
                    success = [outWebarchiveData writeToFile:output
                                                     options:NSDataWritingAtomic
                                                       error:&error];
                }
                else {
                    JXWebResourceLoadingBarrier *loadDelegate = [[JXWebResourceLoadingBarrier alloc] init];
                    loadDelegate.localResourceLoadingOnly = localOnly;
                    NSDictionary *options = @{NSWebResourceLoadDelegateDocumentOption: loadDelegate};
                    NSDictionary *documentAttributes = nil;
                    NSAttributedString *outAttributedString = [[NSAttributedString alloc] initWithData:outWebarchiveData
                                                                                               options:options
                                                                                    documentAttributes:&documentAttributes
                                                                                                 error:&error];
                    if (outAttributedString != nil) {
                        NSRange fullRange = NSMakeRange(0, outAttributedString.length);
                        NSData *outRTFData = [outAttributedString RTFFromRange:fullRange documentAttributes:documentAttributes];
                        success = [outRTFData writeToFile:output
                                                  options:NSDataWritingAtomic
                                                    error:&error];
                    }
                    else {
                        success = NO;
                    }
                }

                if (!success) {
                    NSLog(@"\n%@", error);
                    exitStatus = EXIT_FAILURE;
                }
            }
        }

    }

    return exitStatus;
}

-------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /readability.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 
2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 46; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | 3D706B8D14F6ABDD008ACC2E /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 3D706B8C14F6ABDD008ACC2E /* Foundation.framework */; }; 11 | 3D706B9014F6ABDD008ACC2E /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 3D706B8F14F6ABDD008ACC2E /* main.m */; }; 12 | 3D706B9414F6ABDD008ACC2E /* readability.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = 3D706B9314F6ABDD008ACC2E /* readability.1 */; }; 13 | 3DAB86C9157CC04500392401 /* JXWebResourceLoadingBarrier.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DAB86C8157CC04500392401 /* JXWebResourceLoadingBarrier.m */; }; 14 | 3DAB86CC157CC15700392401 /* AppKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 3DAB86CB157CC15700392401 /* AppKit.framework */; }; 15 | 3DACF75E15051755003A6BF7 /* NSString+Counting.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DACF75D15051755003A6BF7 /* NSString+Counting.m */; }; 16 | 3DC9BE3314F77F37007062BC /* WebKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 3DC9BE3214F77F37007062BC /* WebKit.framework */; }; 17 | 3DC9BE5314F7D3D4007062BC /* NSURL+ValidityChecking.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DC9BE5014F7D3D4007062BC /* NSURL+ValidityChecking.m */; }; 18 | 3DC9BE5414F7D3D4007062BC /* KBWebArchiver.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DC9BE5214F7D3D4007062BC /* KBWebArchiver.m */; }; 19 | 3DC9BE7214F93ECC007062BC /* JXReadabilityDocument.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DC9BE7114F93ECC007062BC /* JXReadabilityDocument.m */; }; 20 | 3DC9BEC414FA5F12007062BC /* NSXMLNode+HTMLUtilities.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DC9BEC314FA5F12007062BC /* NSXMLNode+HTMLUtilities.m */; }; 21 | 3DDBC67F151DCB9600D8CF54 /* htmls.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DDBC67E151DCB9600D8CF54 /* htmls.m */; }; 22 | 
3DDBC6AA151E306D00D8CF54 /* NSString+ReplaceExtensions.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DDBC6A9151E306D00D8CF54 /* NSString+ReplaceExtensions.m */; }; 23 | 3DDBC6AE151E349900D8CF54 /* NSString+JXRemoving.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DDBC6AD151E349900D8CF54 /* NSString+JXRemoving.m */; }; 24 | /* End PBXBuildFile section */ 25 | 26 | /* Begin PBXCopyFilesBuildPhase section */ 27 | 3D706B8614F6ABDC008ACC2E /* CopyFiles */ = { 28 | isa = PBXCopyFilesBuildPhase; 29 | buildActionMask = 2147483647; 30 | dstPath = /usr/share/man/man1/; 31 | dstSubfolderSpec = 0; 32 | files = ( 33 | 3D706B9414F6ABDD008ACC2E /* readability.1 in CopyFiles */, 34 | ); 35 | runOnlyForDeploymentPostprocessing = 1; 36 | }; 37 | /* End PBXCopyFilesBuildPhase section */ 38 | 39 | /* Begin PBXFileReference section */ 40 | 3D706B8814F6ABDC008ACC2E /* readability */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = readability; sourceTree = BUILT_PRODUCTS_DIR; }; 41 | 3D706B8C14F6ABDD008ACC2E /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; 42 | 3D706B8F14F6ABDD008ACC2E /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; }; 43 | 3D706B9214F6ABDD008ACC2E /* readability-Prefix.pch */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "readability-Prefix.pch"; sourceTree = "<group>"; }; 44 | 3D706B9314F6ABDD008ACC2E /* readability.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = readability.1; sourceTree = "<group>"; }; 45 | 3D706BA014F6AC76008ACC2E /* Base.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = Base.xcconfig; sourceTree = "<group>"; }; 46 | 3DAB86C7157CC04500392401 /* JXWebResourceLoadingBarrier.h */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = JXWebResourceLoadingBarrier.h; path = readability/JXWebResourceLoadingBarrier.h; sourceTree = "<group>"; }; 47 | 3DAB86C8157CC04500392401 /* JXWebResourceLoadingBarrier.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = JXWebResourceLoadingBarrier.m; path = readability/JXWebResourceLoadingBarrier.m; sourceTree = "<group>"; }; 48 | 3DAB86CB157CC15700392401 /* AppKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AppKit.framework; path = System/Library/Frameworks/AppKit.framework; sourceTree = SDKROOT; }; 49 | 3DACF75C15051755003A6BF7 /* NSString+Counting.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "NSString+Counting.h"; path = "readability/NSString+Counting.h"; sourceTree = "<group>"; }; 50 | 3DACF75D15051755003A6BF7 /* NSString+Counting.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "NSString+Counting.m"; path = "readability/NSString+Counting.m"; sourceTree = "<group>"; }; 51 | 3DC9BE3214F77F37007062BC /* WebKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = WebKit.framework; path = System/Library/Frameworks/WebKit.framework; sourceTree = SDKROOT; }; 52 | 3DC9BE4F14F7D3D4007062BC /* NSURL+ValidityChecking.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "NSURL+ValidityChecking.h"; path = "webarchiver/NSURL+ValidityChecking.h"; sourceTree = "<group>"; }; 53 | 3DC9BE5014F7D3D4007062BC /* NSURL+ValidityChecking.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "NSURL+ValidityChecking.m"; path = "webarchiver/NSURL+ValidityChecking.m"; sourceTree = "<group>"; }; 54 | 3DC9BE5114F7D3D4007062BC /* KBWebArchiver.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.h; name = KBWebArchiver.h; path = webarchiver/KBWebArchiver.h; sourceTree = "<group>"; }; 55 | 3DC9BE5214F7D3D4007062BC /* KBWebArchiver.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = KBWebArchiver.m; path = webarchiver/KBWebArchiver.m; sourceTree = "<group>"; }; 56 | 3DC9BE7014F93ECC007062BC /* JXReadabilityDocument.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = JXReadabilityDocument.h; path = readability/JXReadabilityDocument.h; sourceTree = "<group>"; }; 57 | 3DC9BE7114F93ECC007062BC /* JXReadabilityDocument.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = JXReadabilityDocument.m; path = readability/JXReadabilityDocument.m; sourceTree = "<group>"; }; 58 | 3DC9BEC214FA5F12007062BC /* NSXMLNode+HTMLUtilities.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "NSXMLNode+HTMLUtilities.h"; path = "readability/NSXMLNode+HTMLUtilities.h"; sourceTree = "<group>"; }; 59 | 3DC9BEC314FA5F12007062BC /* NSXMLNode+HTMLUtilities.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "NSXMLNode+HTMLUtilities.m"; path = "readability/NSXMLNode+HTMLUtilities.m"; sourceTree = "<group>"; }; 60 | 3DDBC67D151DCB9600D8CF54 /* htmls.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = htmls.h; path = readability/htmls.h; sourceTree = "<group>"; }; 61 | 3DDBC67E151DCB9600D8CF54 /* htmls.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = htmls.m; path = readability/htmls.m; sourceTree = "<group>"; }; 62 | 3DDBC6A8151E306D00D8CF54 /* NSString+ReplaceExtensions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "NSString+ReplaceExtensions.h"; path = "readability/NSString+ReplaceExtensions.h"; sourceTree = "<group>"; }; 63 | 
3DDBC6A9151E306D00D8CF54 /* NSString+ReplaceExtensions.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "NSString+ReplaceExtensions.m"; path = "readability/NSString+ReplaceExtensions.m"; sourceTree = "<group>"; }; 64 | 3DDBC6AC151E349900D8CF54 /* NSString+JXRemoving.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "NSString+JXRemoving.h"; path = "readability/NSString+JXRemoving.h"; sourceTree = "<group>"; }; 65 | 3DDBC6AD151E349900D8CF54 /* NSString+JXRemoving.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "NSString+JXRemoving.m"; path = "readability/NSString+JXRemoving.m"; sourceTree = "<group>"; }; 66 | /* End PBXFileReference section */ 67 | 68 | /* Begin PBXFrameworksBuildPhase section */ 69 | 3D706B8514F6ABDC008ACC2E /* Frameworks */ = { 70 | isa = PBXFrameworksBuildPhase; 71 | buildActionMask = 2147483647; 72 | files = ( 73 | 3DAB86CC157CC15700392401 /* AppKit.framework in Frameworks */, 74 | 3DC9BE3314F77F37007062BC /* WebKit.framework in Frameworks */, 75 | 3D706B8D14F6ABDD008ACC2E /* Foundation.framework in Frameworks */, 76 | ); 77 | runOnlyForDeploymentPostprocessing = 0; 78 | }; 79 | /* End PBXFrameworksBuildPhase section */ 80 | 81 | /* Begin PBXGroup section */ 82 | 3D706B7D14F6ABDB008ACC2E = { 83 | isa = PBXGroup; 84 | children = ( 85 | 3DC9BE6E14F93EB8007062BC /* Classes */, 86 | 3DC9BEBE14FA5CC3007062BC /* Categories */, 87 | 3DDBC675151DCB5200D8CF54 /* Other Sources */, 88 | 3D706B8E14F6ABDD008ACC2E /* readability */, 89 | 3DC9BE5514F7D3DF007062BC /* Third Party */, 90 | 3D706B9F14F6AC59008ACC2E /* Configs */, 91 | 3D706B8B14F6ABDC008ACC2E /* Frameworks */, 92 | 3D706B8914F6ABDC008ACC2E /* Products */, 93 | ); 94 | sourceTree = "<group>"; 95 | }; 96 | 3D706B8914F6ABDC008ACC2E /* Products */ = { 97 | isa = PBXGroup; 98 | children = ( 99 | 3D706B8814F6ABDC008ACC2E /* readability */, 100 | ); 101 | name = 
Products; 102 | sourceTree = "<group>"; 103 | }; 104 | 3D706B8B14F6ABDC008ACC2E /* Frameworks */ = { 105 | isa = PBXGroup; 106 | children = ( 107 | 3DAB86CB157CC15700392401 /* AppKit.framework */, 108 | 3D706B8C14F6ABDD008ACC2E /* Foundation.framework */, 109 | 3DC9BE3214F77F37007062BC /* WebKit.framework */, 110 | ); 111 | name = Frameworks; 112 | sourceTree = "<group>"; 113 | }; 114 | 3D706B8E14F6ABDD008ACC2E /* readability */ = { 115 | isa = PBXGroup; 116 | children = ( 117 | 3D706B8F14F6ABDD008ACC2E /* main.m */, 118 | 3D706B9314F6ABDD008ACC2E /* readability.1 */, 119 | 3D706B9114F6ABDD008ACC2E /* Supporting Files */, 120 | ); 121 | path = readability; 122 | sourceTree = "<group>"; 123 | }; 124 | 3D706B9114F6ABDD008ACC2E /* Supporting Files */ = { 125 | isa = PBXGroup; 126 | children = ( 127 | 3D706B9214F6ABDD008ACC2E /* readability-Prefix.pch */, 128 | ); 129 | name = "Supporting Files"; 130 | sourceTree = "<group>"; 131 | }; 132 | 3D706B9F14F6AC59008ACC2E /* Configs */ = { 133 | isa = PBXGroup; 134 | children = ( 135 | 3D706BA014F6AC76008ACC2E /* Base.xcconfig */, 136 | ); 137 | path = Configs; 138 | sourceTree = "<group>"; 139 | }; 140 | 3DC9BE5514F7D3DF007062BC /* Third Party */ = { 141 | isa = PBXGroup; 142 | children = ( 143 | 3DC9BE5614F7D3E3007062BC /* KBWebArchiver */, 144 | ); 145 | name = "Third Party"; 146 | sourceTree = "<group>"; 147 | }; 148 | 3DC9BE5614F7D3E3007062BC /* KBWebArchiver */ = { 149 | isa = PBXGroup; 150 | children = ( 151 | 3DC9BE4F14F7D3D4007062BC /* NSURL+ValidityChecking.h */, 152 | 3DC9BE5014F7D3D4007062BC /* NSURL+ValidityChecking.m */, 153 | 3DC9BE5114F7D3D4007062BC /* KBWebArchiver.h */, 154 | 3DC9BE5214F7D3D4007062BC /* KBWebArchiver.m */, 155 | ); 156 | name = KBWebArchiver; 157 | sourceTree = "<group>"; 158 | }; 159 | 3DC9BE6E14F93EB8007062BC /* Classes */ = { 160 | isa = PBXGroup; 161 | children = ( 162 | 3DC9BE7014F93ECC007062BC /* JXReadabilityDocument.h */, 163 | 3DC9BE7114F93ECC007062BC /* JXReadabilityDocument.m */, 
164 | 3DAB86C7157CC04500392401 /* JXWebResourceLoadingBarrier.h */, 165 | 3DAB86C8157CC04500392401 /* JXWebResourceLoadingBarrier.m */, 166 | ); 167 | name = Classes; 168 | sourceTree = "<group>"; 169 | }; 170 | 3DC9BEBE14FA5CC3007062BC /* Categories */ = { 171 | isa = PBXGroup; 172 | children = ( 173 | 3DC9BEC214FA5F12007062BC /* NSXMLNode+HTMLUtilities.h */, 174 | 3DC9BEC314FA5F12007062BC /* NSXMLNode+HTMLUtilities.m */, 175 | 3DACF75C15051755003A6BF7 /* NSString+Counting.h */, 176 | 3DACF75D15051755003A6BF7 /* NSString+Counting.m */, 177 | 3DDBC6AC151E349900D8CF54 /* NSString+JXRemoving.h */, 178 | 3DDBC6AD151E349900D8CF54 /* NSString+JXRemoving.m */, 179 | 3DDBC6A8151E306D00D8CF54 /* NSString+ReplaceExtensions.h */, 180 | 3DDBC6A9151E306D00D8CF54 /* NSString+ReplaceExtensions.m */, 181 | ); 182 | name = Categories; 183 | sourceTree = "<group>"; 184 | }; 185 | 3DDBC675151DCB5200D8CF54 /* Other Sources */ = { 186 | isa = PBXGroup; 187 | children = ( 188 | 3DDBC67D151DCB9600D8CF54 /* htmls.h */, 189 | 3DDBC67E151DCB9600D8CF54 /* htmls.m */, 190 | ); 191 | name = "Other Sources"; 192 | sourceTree = "<group>"; 193 | }; 194 | /* End PBXGroup section */ 195 | 196 | /* Begin PBXNativeTarget section */ 197 | 3D706B8714F6ABDC008ACC2E /* readability */ = { 198 | isa = PBXNativeTarget; 199 | buildConfigurationList = 3D706B9714F6ABDD008ACC2E /* Build configuration list for PBXNativeTarget "readability" */; 200 | buildPhases = ( 201 | 3D706B8414F6ABDC008ACC2E /* Sources */, 202 | 3D706B8514F6ABDC008ACC2E /* Frameworks */, 203 | 3D706B8614F6ABDC008ACC2E /* CopyFiles */, 204 | ); 205 | buildRules = ( 206 | ); 207 | dependencies = ( 208 | ); 209 | name = readability; 210 | productName = readability; 211 | productReference = 3D706B8814F6ABDC008ACC2E /* readability */; 212 | productType = "com.apple.product-type.tool"; 213 | }; 214 | /* End PBXNativeTarget section */ 215 | 216 | /* Begin PBXProject section */ 217 | 3D706B7F14F6ABDB008ACC2E /* Project object */ = { 218 | isa = 
PBXProject; 219 | attributes = { 220 | LastUpgradeCheck = 0500; 221 | ORGANIZATIONNAME = geheimwerk.de; 222 | }; 223 | buildConfigurationList = 3D706B8214F6ABDB008ACC2E /* Build configuration list for PBXProject "readability" */; 224 | compatibilityVersion = "Xcode 3.2"; 225 | developmentRegion = English; 226 | hasScannedForEncodings = 0; 227 | knownRegions = ( 228 | en, 229 | ); 230 | mainGroup = 3D706B7D14F6ABDB008ACC2E; 231 | productRefGroup = 3D706B8914F6ABDC008ACC2E /* Products */; 232 | projectDirPath = ""; 233 | projectRoot = ""; 234 | targets = ( 235 | 3D706B8714F6ABDC008ACC2E /* readability */, 236 | ); 237 | }; 238 | /* End PBXProject section */ 239 | 240 | /* Begin PBXSourcesBuildPhase section */ 241 | 3D706B8414F6ABDC008ACC2E /* Sources */ = { 242 | isa = PBXSourcesBuildPhase; 243 | buildActionMask = 2147483647; 244 | files = ( 245 | 3D706B9014F6ABDD008ACC2E /* main.m in Sources */, 246 | 3DC9BE5314F7D3D4007062BC /* NSURL+ValidityChecking.m in Sources */, 247 | 3DC9BE5414F7D3D4007062BC /* KBWebArchiver.m in Sources */, 248 | 3DC9BE7214F93ECC007062BC /* JXReadabilityDocument.m in Sources */, 249 | 3DC9BEC414FA5F12007062BC /* NSXMLNode+HTMLUtilities.m in Sources */, 250 | 3DACF75E15051755003A6BF7 /* NSString+Counting.m in Sources */, 251 | 3DDBC67F151DCB9600D8CF54 /* htmls.m in Sources */, 252 | 3DDBC6AA151E306D00D8CF54 /* NSString+ReplaceExtensions.m in Sources */, 253 | 3DDBC6AE151E349900D8CF54 /* NSString+JXRemoving.m in Sources */, 254 | 3DAB86C9157CC04500392401 /* JXWebResourceLoadingBarrier.m in Sources */, 255 | ); 256 | runOnlyForDeploymentPostprocessing = 0; 257 | }; 258 | /* End PBXSourcesBuildPhase section */ 259 | 260 | /* Begin XCBuildConfiguration section */ 261 | 3D706B9514F6ABDD008ACC2E /* Debug */ = { 262 | isa = XCBuildConfiguration; 263 | baseConfigurationReference = 3D706BA014F6AC76008ACC2E /* Base.xcconfig */; 264 | buildSettings = { 265 | ALWAYS_SEARCH_USER_PATHS = NO; 266 | CLANG_WARN_BOOL_CONVERSION = YES; 267 | 
CLANG_WARN_CONSTANT_CONVERSION = YES; 268 | CLANG_WARN_EMPTY_BODY = YES; 269 | CLANG_WARN_ENUM_CONVERSION = YES; 270 | CLANG_WARN_INT_CONVERSION = YES; 271 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 272 | COPY_PHASE_STRIP = NO; 273 | GCC_DYNAMIC_NO_PIC = NO; 274 | GCC_ENABLE_OBJC_EXCEPTIONS = YES; 275 | GCC_OPTIMIZATION_LEVEL = 0; 276 | GCC_PREPROCESSOR_DEFINITIONS = ( 277 | "DEBUG=1", 278 | "$(inherited)", 279 | ); 280 | GCC_SYMBOLS_PRIVATE_EXTERN = NO; 281 | ONLY_ACTIVE_ARCH = YES; 282 | }; 283 | name = Debug; 284 | }; 285 | 3D706B9614F6ABDD008ACC2E /* Release */ = { 286 | isa = XCBuildConfiguration; 287 | baseConfigurationReference = 3D706BA014F6AC76008ACC2E /* Base.xcconfig */; 288 | buildSettings = { 289 | ALWAYS_SEARCH_USER_PATHS = NO; 290 | CLANG_WARN_BOOL_CONVERSION = YES; 291 | CLANG_WARN_CONSTANT_CONVERSION = YES; 292 | CLANG_WARN_EMPTY_BODY = YES; 293 | CLANG_WARN_ENUM_CONVERSION = YES; 294 | CLANG_WARN_INT_CONVERSION = YES; 295 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 296 | COPY_PHASE_STRIP = YES; 297 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 298 | GCC_ENABLE_OBJC_EXCEPTIONS = YES; 299 | }; 300 | name = Release; 301 | }; 302 | 3D706B9814F6ABDD008ACC2E /* Debug */ = { 303 | isa = XCBuildConfiguration; 304 | buildSettings = { 305 | CLANG_ENABLE_OBJC_ARC = YES; 306 | GCC_PRECOMPILE_PREFIX_HEADER = YES; 307 | GCC_PREFIX_HEADER = "readability/readability-Prefix.pch"; 308 | PRODUCT_NAME = "$(TARGET_NAME)"; 309 | SDKROOT = macosx; 310 | }; 311 | name = Debug; 312 | }; 313 | 3D706B9914F6ABDD008ACC2E /* Release */ = { 314 | isa = XCBuildConfiguration; 315 | buildSettings = { 316 | CLANG_ENABLE_OBJC_ARC = YES; 317 | GCC_PRECOMPILE_PREFIX_HEADER = YES; 318 | GCC_PREFIX_HEADER = "readability/readability-Prefix.pch"; 319 | PRODUCT_NAME = "$(TARGET_NAME)"; 320 | SDKROOT = macosx; 321 | }; 322 | name = Release; 323 | }; 324 | /* End XCBuildConfiguration section */ 325 | 326 | /* Begin XCConfigurationList section */ 327 | 3D706B8214F6ABDB008ACC2E /* Build 
configuration list for PBXProject "readability" */ = { 328 | isa = XCConfigurationList; 329 | buildConfigurations = ( 330 | 3D706B9514F6ABDD008ACC2E /* Debug */, 331 | 3D706B9614F6ABDD008ACC2E /* Release */, 332 | ); 333 | defaultConfigurationIsVisible = 0; 334 | defaultConfigurationName = Release; 335 | }; 336 | 3D706B9714F6ABDD008ACC2E /* Build configuration list for PBXNativeTarget "readability" */ = { 337 | isa = XCConfigurationList; 338 | buildConfigurations = ( 339 | 3D706B9814F6ABDD008ACC2E /* Debug */, 340 | 3D706B9914F6ABDD008ACC2E /* Release */, 341 | ); 342 | defaultConfigurationIsVisible = 0; 343 | defaultConfigurationName = Release; 344 | }; 345 | /* End XCConfigurationList section */ 346 | }; 347 | rootObject = 3D706B7F14F6ABDB008ACC2E /* Project object */; 348 | } 349 | -------------------------------------------------------------------------------- /readability/JXReadabilityDocument.m: -------------------------------------------------------------------------------- 1 | /* 2 | * JXReadablilityDocument 3 | * 4 | * Copyright (c) 2012 geheimwerk.de. 5 | * https://github.com/JanX2/readability-objc 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
 *
 * ObjC port: jan@geheimwerk.de (Jan Weiß)
 */

#import "JXReadabilityDocument.h"

#import "htmls.h"
#import "NSString+Counting.h"
#import "NSXMLNode+HTMLUtilities.h"

// Fallback minimum cleaned text length, used when the "minTextLength" option is not set.
#define TEXT_LENGTH_THRESHOLD 25
#define RETRY_LENGTH 250

// Regular expression sources. Each is compiled once (options:0) in -initWithXMLDocument:.

// Matched against an element's "class id" string in -removeUnlikelyCandidates.
NSString * const unlikelyCandidates = 		@"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter";
// A match here vetoes removal of an element that matched unlikelyCandidates.
NSString * const okMaybeItsACandidate = 	@"and|article|body|column|main|shadow";
// A match in the class or id adds 25 to the weight computed by -classWeight:.
NSString * const positiveNames = 			@"article|body|content|entry|hentry|main|page|pagination|post|text|blog|story";
// A match in the class or id subtracts 25 from the weight computed by -classWeight:.
NSString * const negativeNames = 			@"combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget";
// Compiled into divToPElementsRe; the tag-name set below is what
// -transformMisusedDivsIntoParagraphs actually consults.
NSString * const divToPElements = 			@"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)";

// Used by -clean: to collapse a newline and its surrounding whitespace into a single "\n".
NSString * const newlinePlusSurroundingwhitespace = 	@"\\s*\n\\s*";
// Used by -clean: to collapse runs of two or more spaces/tabs into a single space.
NSString * const tabRun = 								@"[ \t]{2,}";
// Used by -getArticleForCandidates:… to detect a sentence end in short sibling paragraphs.
NSString * const sentenceEnd = 							@"\\.( |$)";


// "|"-delimited tag names; split into the divToPElementsTagNames set in -initWithXMLDocument:.
NSString * const divToPElementsTagNamesString = @"a|blockquote|dl|div|img|ol|p|pre|table|ul";


NSSet * stringSetForListStringDelimitedBy(NSString *listString, NSString *delimiter);


// Wrapper around an NSXMLNode that adopts NSCopying. Instances are used as the keys of the
// candidates dictionaries built in -scoreParagraphs. The implementation is not in this part
// of the file.
@interface HashableElement : NSObject <NSCopying> {
	NSXMLNode *_node;
}

@property (nonatomic, strong) NSXMLNode *node;

+ (id)elementForNode:(NSXMLNode *)aNode;
- (id)initWithNode:(NSXMLNode *)aNode;

@end

// Splits listString on delimiter and returns the resulting components as a set
// (duplicates collapse, order is not preserved).
NSSet * stringSetForListStringDelimitedBy(NSString *listString, NSString *delimiter) {
	NSArray *strings = [listString componentsSeparatedByString:delimiter];

	NSSet *stringSet = [NSSet setWithArray:strings];

	return stringSet;
}


@implementation JXReadabilityDocument

@synthesize input;
@synthesize html;

@synthesize 
options;

// Convenience initializer. When doCopy is YES the receiver works on a copy of aDoc,
// so the in-place tree edits made by the methods below do not touch the caller's document.
- (id)initWithXMLDocument:(NSXMLDocument *)aDoc copyDocument:(BOOL)doCopy
{
	if (doCopy) {
		return [self initWithXMLDocument:[aDoc copy]];
	} else {
		return [self initWithXMLDocument:aDoc];
	}
}

// Stores aDoc as the working document, creates an empty options dictionary and
// pre-compiles every regular expression / lookup set used by the scoring methods.
- (id)initWithXMLDocument:(NSXMLDocument *)aDoc
{
	self = [super init];

	if (self) {
		self.html = aDoc;
		self.options = [NSMutableDictionary dictionary];

		whitespaceAndNewlineCharacterSet = [NSCharacterSet whitespaceAndNewlineCharacterSet];

		unlikelyCandidatesRe = 		[[NSRegularExpression alloc] initWithPattern:unlikelyCandidates		 options:0 error:NULL];
		okMaybeItsACandidateRe = 	[[NSRegularExpression alloc] initWithPattern:okMaybeItsACandidate	 options:0 error:NULL];
		positiveRe = 				[[NSRegularExpression alloc] initWithPattern:positiveNames			 options:0 error:NULL];
		negativeRe = 				[[NSRegularExpression alloc] initWithPattern:negativeNames			 options:0 error:NULL];
		divToPElementsRe = 			[[NSRegularExpression alloc] initWithPattern:divToPElements			 options:0 error:NULL];

		newlinePlusSurroundingwhitespaceRe =
									[[NSRegularExpression alloc] initWithPattern:newlinePlusSurroundingwhitespace
																		 options:0
																		   error:NULL];
		tabRunRe = 					[[NSRegularExpression alloc] initWithPattern:tabRun
																		 options:0
																		   error:NULL];
		sentenceEndRe = 			[[NSRegularExpression alloc] initWithPattern:sentenceEnd
																		 options:0
																		   error:NULL];

		NSString *delimiter = @"|";
		divToPElementsTagNames = stringSetForListStringDelimitedBy(divToPElementsTagNamesString, delimiter);

	}

	return self;
}



// Title of the working document, as determined by getTitleInDocument().
- (NSString *)title
{
	return getTitleInDocument(self.html);
}

// Shortened title of the working document, as determined by shortenTitleInDocument().
- (NSString *)shortTitle
{
	return shortenTitleInDocument(self.html);
}


// Logs `a` via NSLog, but only when the "debug" option is set to a true value.
- (void)debug:(id)a
{
	if ([(NSNumber *)(self.options)[@"debug"] boolValue]) {
		NSLog(@"%@", a);
	}
}

// Returns the first node that comes after the whole subtree rooted at `node` in document
// order: the next sibling of `node`, or of its nearest ancestor that has one.
// Returns nil when the subtree extends to the end of the tree.
static NSXMLNode * JXNodeFollowingSubtree(NSXMLNode *node)
{
	for (NSXMLNode *cursor = node; cursor != nil; cursor = [cursor parent]) {
		NSXMLNode *sibling = [cursor nextSibling];
		if (sibling != nil)  return sibling;
	}

	return nil;
}

// Walks the working document in document order and detaches every element whose
// "class id" string matches unlikelyCandidatesRe but not okMaybeItsACandidateRe.
// <html> and <body> are never removed.
- (void)removeUnlikelyCandidates
{
	NSXMLNode *elem = self.html;

	while (elem != nil) {
		BOOL isUnlikely = NO;

		if ([elem kind] == NSXMLElementKind) {
			NSString *classes = [elem cssNamesForAttributeWithName:@"class"];
			NSString *ids = [elem cssNamesForAttributeWithName:@"id"];

			if (classes != nil || ids != nil) {
				NSString *s = [NSString stringWithFormat:@"%@ %@",
							   (classes == nil ? @"" : classes),
							   (ids == nil ? @"" : ids)];
				NSRange sRange = NSMakeRange(0, [s length]);

				//[self debug:s];

				isUnlikely = (sRange.length >= 2)
					&& ([unlikelyCandidatesRe rangeOfFirstMatchInString:s options:0 range:sRange].location != NSNotFound)
					&& ([okMaybeItsACandidateRe rangeOfFirstMatchInString:s options:0 range:sRange].location == NSNotFound)
					&& ![elem.name isEqualToString:@"html"]
					&& ![elem.name isEqualToString:@"body"];
			}
		}

		NSXMLNode *next;
		if (isUnlikely) {
			// The successor has to be determined BEFORE detaching: once detached, elem is the
			// root of its own tree, so -nextNode would only walk the removed subtree and then
			// return nil, ending the traversal after the first removal.
			next = JXNodeFollowingSubtree(elem);
			//[self debug:[NSString stringWithFormat:@"Removing unlikely candidate - %@", [elem readabilityDescription]]];
			[elem detach];
		}
		else {
			next = [elem nextNode];
		}

		elem = next;
	}
}

// Pass 1: renames every <div> that has no descendant with a tag name in
//         divToPElementsTagNames to <p>.
// Pass 2: for the remaining <div>s, wraps non-blank leading text and non-blank text
//         following a child ("tail" text) in new <p> elements and drops <br> children.
- (void)transformMisusedDivsIntoParagraphs
{
	NSArray *nodes;

	nodes = [self.html tagsWithNames:@"div", nil];
	for (NSXMLNode *elem in nodes) {
		// Transform <div>s that do not contain other block elements into <p>s

		// First node outside of elem's subtree. Using only [elem nextSibling] here is wrong for
		// a <div> that is a last child: the sentinel would be nil and the scan would run on to
		// the end of the document, finding block elements that are not inside the <div>.
		NSXMLNode *subtreeEnd = JXNodeFollowingSubtree(elem);
		NSXMLNode *descendant = elem;
		BOOL blockElementFound = NO;

		while (((descendant = [descendant nextNode]) != nil)
			   && (descendant != subtreeEnd)) {
			if ([divToPElementsTagNames containsObject:descendant.name]) {
				blockElementFound = YES;
				break;
			}
		}

		if (!blockElementFound) {
			//[self debug:[NSString stringWithFormat:@"Altering %@ to p", [elem readabilityDescription]]];
			[elem setName:@"p"];
			//NSLog(@"Fixed element %@", [elem readabilityDescription]);
		}
	}

	NSXMLElement *p;
	NSString *s;

	// Re-query: the <div>s renamed above are no longer part of the result.
	nodes = [self.html tagsWithNames:@"div", nil];
	for (NSXMLElement *elem in nodes) { // div tags always are elements

		NSXMLNode *firstTextNode = [elem lxmlTextNode];
		s = [firstTextNode stringValue];
		if ((s != nil)
			&& ([s length] != 0)
			&& ([[s stringByTrimmingCharactersInSet:whitespaceAndNewlineCharacterSet] length] != 0)) { // using -ws_isBlankString would be faster

			p = [NSXMLNode elementWithName:@"p"
							   stringValue:s];

			[firstTextNode detach];
			[elem insertChild:p atIndex:0];
			//NSLog(@"Appended %@ to %@", p, [elem readabilityDescription]);
		}

		// Reverse order, so that inserting/detaching behind `pos` does not shift the
		// positions of the children that are still to be visited.
		[[elem children] enumerateObjectsWithOptions:NSEnumerationReverse
										  usingBlock:^(id obj, NSUInteger pos, BOOL *stop) {
			NSXMLNode *child = obj;
			NSXMLElement *paragraph;

			NSXMLNode *tailNode = [child lxmlTailNode];

			NSString *childTailString = ((tailNode == nil) ? @"" : [tailNode stringValue]);

			if (([childTailString length] != 0)
				&& ([[childTailString stringByTrimmingCharactersInSet:whitespaceAndNewlineCharacterSet] length] != 0)) { // using -ws_isBlankString would be faster

				paragraph = [NSXMLNode elementWithName:@"p"
										   stringValue:childTailString];

				[tailNode detach]; // We could get [tailNode index] and insert there after detaching
				[elem insertChild:paragraph atIndex:(pos + 1)];
				//NSLog(@"Appended %@ to %@", p, [elem readabilityDescription]);
			}

			if ([[child name] isEqualToString:@"br"]) {
				[child detach];
				//NSLog(@"Dropped <br> at %@", [elem readabilityDescription]);
			}
		}];

	}
}

// Normalizes whitespace: a newline plus its surrounding whitespace becomes "\n", runs of
// two or more spaces/tabs become a single space, and leading/trailing whitespace is trimmed.
// Empty or nil input is returned unchanged.
- (NSString *)clean:(NSString *)_text
{
	NSUInteger textLength = [_text length];
	if (textLength == 0)  return _text;

	NSMutableString *text = [_text mutableCopy];

	[newlinePlusSurroundingwhitespaceRe replaceMatchesInString:text
													   options:0
														 range:NSMakeRange(0, textLength)
												  withTemplate:@"\n"];

	// The first replacement may have shortened the string, so the length is re-read.
	[tabRunRe replaceMatchesInString:text
							 options:0
							   range:NSMakeRange(0, [text length])
						withTemplate:@" "];

	CFStringTrimWhitespace((CFMutableStringRef)text);

	return text;
}

// Length of the cleaned string value of an element node; 0 for any other kind of node.
- (NSUInteger)textLength:(NSXMLNode *)i
{
	if ([i kind] == NSXMLElementKind) {
		NSString *s = [i stringValue];
		NSString *cleanS = (s != nil) ? 
[self clean:s] : @"";
		return [cleanS length];
	}
	else {
		return 0;
	}
}

// Weight derived from the element's "class" and "id" names: for each of the two
// attributes, -25 if it matches negativeRe and +25 if it matches positiveRe.
- (float)classWeight:(NSXMLElement *)e
{
	float weight = 0;

	for (NSString *attributeName in @[@"class", @"id"]) {
		NSString *s = [e cssNamesForAttributeWithName:attributeName];
		if (s == nil)  continue;

		NSRange sRange = NSMakeRange(0, [s length]);

		if ([negativeRe rangeOfFirstMatchInString:s options:0 range:sRange].location != NSNotFound)  weight -= 25;

		if ([positiveRe rangeOfFirstMatchInString:s options:0 range:sRange].location != NSNotFound)  weight += 25;
	}

	return weight;
}

// Creates the initial candidate record for elem: a mutable dictionary with
//   @"contentScore" -> NSNumber (class weight plus a bonus/penalty based on the tag name)
//   @"elem"         -> elem
- (NSMutableDictionary *)scoreNode:(NSXMLElement *)elem
{
	static NSSet *preTDBlockquote = nil;
	static NSSet *addressEtc = nil;
	static NSSet *headlines = nil;

	// One-time initialisation of the tag-name sets.
	static dispatch_once_t onceToken;
	dispatch_once(&onceToken, ^{
		preTDBlockquote = [[NSSet alloc] initWithObjects:@"pre", @"td", @"blockquote", nil];
		addressEtc = [[NSSet alloc] initWithObjects:@"address", @"ol", @"ul", @"dl", @"dd", @"dt", @"li", @"form", nil];
		headlines = [[NSSet alloc] initWithObjects:@"h1", @"h2", @"h3", @"h4", @"h5", @"h6", @"th", nil];
	});

	float contentScore = [self classWeight:elem];
	NSString *name = [elem.name lowercaseString];
	if ([name isEqualToString:@"div"]) {
		contentScore += 5;
	}
	else if ([preTDBlockquote containsObject:name]) {
		contentScore += 3;
	}
	else if ([addressEtc containsObject:name]) {
		contentScore -= 3;
	}
	else if ([headlines containsObject:name]) {
		contentScore -= 5;
	}

	return [NSMutableDictionary dictionaryWithObjectsAndKeys:
			@(contentScore), @"contentScore",
			elem, @"elem",
			nil];
}

// Builds a new html->body->div document and copies into the <div> the best candidate's
// element plus those of its siblings that look like related content.
// HTMLPartial is currently ignored (see the disabled section below).
- (NSXMLDocument *)getArticleForCandidates:(NSDictionary *)candidates
						  andBestCandidate:(NSDictionary *)bestCandidate
							   HTMLPartial:(BOOL)HTMLPartial
{
	// Now that we have the top candidate, look through its siblings for content that might also be related
	// Things like preambles, content split by ads that we removed, etc.

	float siblingScoreThreshold = MAX(10.0, ([bestCandidate[@"contentScore"] floatValue] * 0.2));

	// Create a new HTML document with a html->body->div
	NSXMLDocument *output = [[NSXMLDocument alloc] initWithXMLString:@"<html><head><title /></head><body><div id='readibility-root' /></body></html>"
															 options:NSXMLDocumentTidyHTML
															   error:NULL];
	[output setDocumentContentKind:NSXMLDocumentXHTMLKind];
	NSXMLElement *htmlDiv = [output nodesForXPath:@"/html/body/div"
											error:NULL][0];
#if 0
	// Disabled until we can figure out a good way to return an NSXMLDocument OR an NSXMLElement
	if (HTMLPartial) {
		output = htmlDiv;
	}
#endif
	NSXMLNode *bestElem = bestCandidate[@"elem"];

	BOOL append;
	NSDictionary *siblingScoreDict;
	HashableElement *siblingKey;
	for (NSXMLNode *sibling in [[bestElem parent] children]) {
		//if isinstance(sibling, NavigableString): continue
		// in lxml there no concept of simple text
		append = NO;

		if (sibling == bestElem)  append = YES;

		// A sibling that is itself a candidate is kept when its score reaches the threshold.
		if (append == NO) {
			siblingKey = [HashableElement elementForNode:sibling];
			siblingScoreDict = candidates[siblingKey];
			if ((siblingScoreDict != nil)
				&& ([siblingScoreDict[@"contentScore"] floatValue] >= siblingScoreThreshold)) {
				append = YES;
			}
		}

		// A <p> sibling is kept when it is long with few links, or short, link-free
		// and contains a sentence end.
		if ((append == NO)
			&& [sibling.name isEqualToString:@"p"]
			&& ([sibling kind] == NSXMLElementKind)) {

			float linkDensity = [self getLinkDensity:(NSXMLElement *)sibling];
			NSString *nodeContent = [sibling lxmlText];
			nodeContent = (nodeContent == nil) ? @"" : nodeContent;
			NSUInteger nodeLength = [nodeContent length];

			if ((nodeLength > 80)
				&& (linkDensity < 0.25)) {
				append = YES;
			}
			else if ((nodeLength <= 80)
					 && (linkDensity == 0.0)
					 && ([sentenceEndRe rangeOfFirstMatchInString:nodeContent options:0 range:NSMakeRange(0, [nodeContent length])].location != NSNotFound)) {
				append = YES;
			}
		}

		if (append)  [htmlDiv addChild:[sibling copy]];
	}

	//if output is not None:
	//	output.append(bestElem)

	return output;

}

// Returns the candidate record with the highest @"contentScore",
// or nil when candidates is empty.
- (NSDictionary *)selectBestCandidate:(NSDictionary *)candidates
{
	NSArray *allCandidates = [candidates allValues];
	if ([allCandidates count] == 0)  return nil;

	NSSortDescriptor *contentScoreDescendingDescriptor = [NSSortDescriptor sortDescriptorWithKey:@"contentScore"
																						ascending:NO];

	NSArray *sortedCandidates = [allCandidates sortedArrayUsingDescriptors:
								 @[contentScoreDescendingDescriptor]];

#if 0
	NSXMLElement *elem;
	NSArray *topFive = ([sortedCandidates count] >= 5) ? [sortedCandidates subarrayWithRange:NSMakeRange(0, 5)] : sortedCandidates;
	for (NSDictionary *candidate in topFive) {
		elem = [candidate objectForKey:@"elem"];
		[self debug:[NSString stringWithFormat:@"Top 5 : %6.3f %@", [candidate objectForKey:@"contentScore"], [elem readabilityDescription]]];
	}
#endif

	NSDictionary *bestCandidate = sortedCandidates[0];
	return bestCandidate;
}

// Ratio of the text inside descendant <a> elements to the text of elem as a whole.
// Both lengths are measured with -textLength: (i.e. on the cleaned text), so that
// whitespace inside links cannot inflate the ratio beyond what the total accounts for.
- (float)getLinkDensity:(NSXMLElement *)elem
{
	NSUInteger linkLength = 0;
	for (NSXMLNode *i in [elem nodesForXPath:@".//a" error:NULL]) {
		linkLength += [self textLength:i];
		//if len(elem.findall(".//div") or elem.findall(".//p")):
		//	linkLength = linkLength
	}
	NSUInteger totalLength = [self textLength:elem];
	// Guard against division by zero for elements without text.
	return (float)linkLength / MAX(totalLength, (NSUInteger)1);
}

// Scores the parents and grandparents of every sufficiently long <p>, <pre> and <td>.
// Returns a dictionary mapping HashableElement keys to the mutable records created by
// -scoreNode:, with @"contentScore" finally scaled by (1 - link density).
- (NSDictionary *)scoreParagraphs
{
	NSNumber *minLength = (self.options)[@"minTextLength"];
	NSUInteger minLen = (minLength != nil) ? [minLength unsignedIntegerValue] : TEXT_LENGTH_THRESHOLD;

	NSMutableDictionary *candidates = [NSMutableDictionary dictionary];

#if 0
	for (NSXMLNode *node in [self.html tagsWithNames:@"div", nil]) {
		[self debug:[node readabilityDescription]];
	}
#endif

	NSXMLElement *parentNode, *grandParentNode; // parents have to be elements
	NSString *elemTextContent, *innerText;
	NSUInteger innerTextLen;

	// Candidate elements in order of discovery; iterated for the link density pass below.
	NSMutableArray *ordered = [NSMutableArray array];
	HashableElement *hashableParent, *hashableGrandParent;
	for (NSXMLElement *elem in [self.html tagsWithNames:@"p", @"pre", @"td", nil]) {
		parentNode = (NSXMLElement *)[elem parent];
		if (parentNode == nil)  continue;
		grandParentNode = (NSXMLElement *)[parentNode parent];

		elemTextContent = [elem stringValue];
		innerText = (elemTextContent != nil) ? [self clean:elemTextContent] : @"";
		innerTextLen = [innerText length];

		// If this paragraph is less than 25 characters, don't even count it.
		if (innerTextLen < minLen)  continue;

		hashableParent = [HashableElement elementForNode:parentNode];
		if (candidates[hashableParent] == nil) {
			candidates[hashableParent] = [self scoreNode:parentNode];
			[ordered addObject:parentNode];
		}

		if (grandParentNode != nil) {
			hashableGrandParent = [HashableElement elementForNode:grandParentNode];
			if (candidates[hashableGrandParent] == nil) {
				candidates[hashableGrandParent] = [self scoreNode:grandParentNode];
				[ordered addObject:grandParentNode];
			}
		}

		// 1 base point, 1 point per comma-separated segment, up to 3 points for length.
		float contentScore = 1.0;
		contentScore += [innerText countOccurancesOfString:@","] + 1;
		contentScore += MIN((innerTextLen / 100), 3);
		//if elem not in candidates:
		//	candidates[elem] = self.scoreNode(elem)

		//WTF? candidates[elem]['contentScore'] += contentScore
		// The parent receives the full score, the grandparent half of it.
		float tempScore;
		NSMutableDictionary *scoreDict;
		scoreDict = candidates[hashableParent];
		tempScore = [scoreDict[@"contentScore"] floatValue] + contentScore;
		scoreDict[@"contentScore"] = @(tempScore);
		if (grandParentNode != nil) {
			scoreDict = candidates[hashableGrandParent];
			tempScore = [scoreDict[@"contentScore"] floatValue] + contentScore / 2.0;
			scoreDict[@"contentScore"] = @(tempScore);
		}
	}

	// Scale the final candidates score based on link density. Good content should have a
	// relatively small link density (5% or less) and be mostly unaffected by this operation.
	NSMutableDictionary *candidate;
	float ld;
	float score;

	for (NSXMLElement *elem in ordered) {
		HashableElement *hashableElem = [HashableElement elementForNode:elem];
		candidate = candidates[hashableElem];
		ld = [self getLinkDensity:elem];
		score = [candidate[@"contentScore"] floatValue];
		//[self debug:[NSString stringWithFormat:@"Candid: %6.3f %s link density %.3f -> %6.3f", score, [elem readabilityDescription], ld, score*(1-ld)]];
		score *= (1 - ld);
		candidate[@"contentScore"] = @(score);
	}

	return candidates;
}

// Sums a CFArray whose values are raw NSUInteger numbers stored directly in the
// pointer-sized slots (not objects).
NSUInteger sumCFArrayOfNSUInteger(CFArrayRef array);
NSUInteger sumCFArrayOfNSUInteger(CFArrayRef array) {
	NSUInteger siblingsSum = 0;

	CFIndex i, c = CFArrayGetCount(array);
	for (i = 0; i < c; i++) {
		siblingsSum += (NSUInteger)CFArrayGetValueAtIndex(array, i);
	}

	return siblingsSum;
}

- (NSXMLDocument *)sanitizeArticle:(NSXMLDocument *)node forCandidates:(NSDictionary *)candidates
{
#ifndef DEBUG_SANITIZE
#	define DEBUG_SANITIZE 0
#endif

	NSNumber *minTextLengthNum = (self.options)[@"minTextLength"];
	NSUInteger minLen = (minTextLengthNum != nil) ? 
[minTextLengthNum unsignedIntegerValue] : TEXT_LENGTH_THRESHOLD;
	// Drop headers with a negative class weight or too many links.
	for (NSXMLElement *header in [node tagsWithNames:@"h1", @"h2", @"h3", @"h4", @"h5", @"h6", nil]) {
		if ([self classWeight:header] < 0 || [self getLinkDensity:header] > 0.33) {
			[header detach];
		}
	}

	for (NSXMLElement *elem in [node tagsWithNames:@"form", @"iframe", @"textarea", nil]) {
		[elem detach];
	}

	CFMutableDictionaryRef allowed = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, NULL); // keys: HashableElement, values:raw BOOL

	NSDictionary *elDict;
	HashableElement *hashableEl;
	float weight;
	NSString *tag;
	float contentScore;
	CFIndex kindCount;
	NSArray *tagKinds = @[@"p", @"img", @"li", @"a", @"embed", @"input"];
	NSUInteger contentLength;
	float linkDensity;
	NSXMLNode *parentNode;

	BOOL toRemove;
#if DEBUG_SANITIZE
	NSString *reason;
#endif

	// Conditionally clean <table>s, <ul>s, and <div>s
	for (NSXMLElement *el in [node tagsWithNames:@"table", @"ul", @"div", nil]) {
		hashableEl = [HashableElement elementForNode:el];

		// Elements are stored in `allowed` as KEYS (the value is just a raw YES), so the
		// lookup has to be by key; a by-value lookup can never match an element.
		if (CFDictionaryContainsKey(allowed, (__bridge const void *)(hashableEl)))  continue;

		weight = [self classWeight:el];

		elDict = candidates[hashableEl];
		if (elDict != nil) {
			contentScore = [elDict[@"contentScore"] floatValue];
			//print '!',el, '-> %6.3f' % contentScore
		}
		else {
			contentScore = 0;
		}

		tag = el.name;

		if ((weight + contentScore) < 0.0) {
			//[self debug:[NSString stringWithFormat:@"Cleaned %@ with score %6.3f and weight %-3s", [el readabilityDescription], contentScore, weight]];
			[el detach];
		}
		else if ([[el stringValue] countOccurancesOfString:@","] < 10) {
			CFMutableDictionaryRef counts = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, NULL); // keys: NSString, values:raw CFIndex

			// Count the tags INSIDE el. Querying the whole article (`node`) would yield the
			// same numbers for every element and make the per-element checks below meaningless.
			for (NSString *kind in tagKinds) {
				kindCount = (CFIndex)[[el nodesForXPath:[NSString stringWithFormat:tagNameXPath, kind]
												  error:NULL] count];
				CFDictionaryAddValue(counts, (__bridge const void *)(kind), (void *)kindCount);
			}

			// ARC requires an explicit __bridge cast to pass an object as a `const void *` key.
			if (CFDictionaryGetValueIfPresent(counts, (__bridge const void *)(@"li"), (const void **)&kindCount)) {
				kindCount -= 100;
				CFDictionarySetValue(counts, (__bridge const void *)(@"li"), (void *)kindCount);
			}

			contentLength = [self textLength:el]; // Count the text length excluding any surrounding whitespace
			linkDensity = [self getLinkDensity:el];

			parentNode = [el parent];
			if (parentNode != nil) {

#if DEBUG_SANITIZE
				NSDictionary *parentNodeDict = [candidates objectForKey:[HashableElement elementForNode:parentNode]];
				if (parentNodeDict != nil) {
					contentScore = [[parentNodeDict objectForKey:@"contentScore"] floatValue];
				}
				else {
					contentScore = 0.0;
				}
#endif

				//if parentNode is not None:
				//	pweight = self.classWeight(parentNode) + contentScore
				//	pname = describe(parentNode)
				//else:
				//	pweight = 0
				//	pname = "no parent"

				toRemove = NO;
#if DEBUG_SANITIZE
				reason = @"";
#endif

				// Raw CFIndex stored for tag name A; 0 if A is not present.
#define countsFor(A)	(CFIndex)(CFDictionaryGetValue(counts, (__bridge const void *)(A)))

				//if el.tag == 'div' and counts["img"] >= 1:
				//	continue
				if (countsFor(@"p")
					&& (countsFor(@"img") > countsFor(@"p"))) {
#if DEBUG_SANITIZE
					reason = [NSString stringWithFormat:@"too many images (%ld)", (long)countsFor(@"img")];
#endif
					toRemove = YES;
				}
				else if ((countsFor(@"li") > countsFor(@"p"))
						 && ![tag isEqualToString:@"ul"]
						 && ![tag isEqualToString:@"ol"]) {
#if DEBUG_SANITIZE
					reason = @"more <li>s than <p>s";
#endif
					toRemove = YES;
				}
				else if (countsFor(@"input") > (countsFor(@"p") / 3)) {
#if DEBUG_SANITIZE
					reason = @"less than 3x <p>s than <input>s";
#endif
					toRemove = YES;
				}
				else if ((contentLength < minLen)
						 && ((countsFor(@"img") == 0)
							 || (countsFor(@"img") > 2))) {
#if DEBUG_SANITIZE
					reason = [NSString stringWithFormat:@"too short content length %lu without a single image", (unsigned long)contentLength];
#endif
					toRemove = YES;
				}
				else if (weight < 25 && linkDensity > 0.2) {
#if DEBUG_SANITIZE
					reason = [NSString stringWithFormat:@"too many links %.3f for its weight %.0f", linkDensity, weight];
#endif
					toRemove = YES;
				}
				else if (weight >= 25 && linkDensity > 0.5) {
#if DEBUG_SANITIZE
					reason = [NSString stringWithFormat:@"too many links %.3f for its weight %.0f", linkDensity, weight];
#endif
					toRemove = YES;
				}
				else if (((countsFor(@"embed") == 1) && (contentLength < 75)) || (countsFor(@"embed") > 1)) {
#if DEBUG_SANITIZE
					reason = @"<embed>s with too short content length, or too many <embed>s";
#endif
					toRemove = YES;
				}

#undef countsFor

				//if el.tag == 'div' and counts['img'] >= 1 and toRemove:
				//	imgs = el.findall('.//img')
				//	validImg = False
				//	self.debug(tounicode(el))
				//	for img in imgs:
				//
				//		height = img.get('height')
				//		textLength = img.get('textLength')
				//		self.debug ("height %s textLength %s" %(repr(height), repr(textLength)))
				//		if toInt(height) >= 100 or toInt(textLength)  >= 100:
				//			validImg = True
				//			self.debug("valid image" + tounicode(img))
				//			break
				//	if validImg:
				//		toRemove = False
				//		self.debug("Allowing %s" %el.textContent())
				//		for desnode in self.tags(el, "table", "ul", "div"):
				//			allowed[desnode] = True

				// Find x non-empty preceding and succeeding siblings
				NSUInteger i = 0, j = 0;
				NSUInteger x = 1;
				CFMutableArrayRef siblings = CFArrayCreateMutable(kCFAllocatorDefault, 0, NULL);
				NSUInteger sibContentLength;
				NSXMLNode *sib;

				sib = el;
				while ((sib = [sib nextSibling]) != nil) {
					//self.debug(sib.textContent())
					sibContentLength = [self textLength:sib];
					if (sibContentLength) {
						i += 1;
						CFArrayAppendValue(siblings, (void *)sibContentLength);
						if (i == x)  break;
					}
				}

				sib = el;
				while ((sib = [sib previousSibling]) != nil) {
					//self.debug(sib.textContent())
					sibContentLength = [self textLength:sib];
					if (sibContentLength) {
						j += 1;
						CFArrayAppendValue(siblings, (void *)sibContentLength);
						if (j == x)  break;
					}
				}

				//self.debug(str(siblings))

				// An element surrounded by a lot of text is kept, and its nested
				// <table>/<ul>/<div> descendants are exempted from this cleaning pass.
				if ((CFArrayGetCount(siblings) > 0)
					&& (sumCFArrayOfNSUInteger(siblings) > 1000)) {

					toRemove = NO;
					//[self debug:[NSString stringWithFormat:@"Allowing %@", [el readabilityDescription]]];

					BOOL yesBool = YES;
					for (NSXMLElement *desnode in [el tagsWithNames:@"table", @"ul", @"div", nil]) {
						CFDictionarySetValue(allowed, (__bridge const void *)([HashableElement elementForNode:desnode]), (void *)(intptr_t)yesBool);
					}
				}

				CFRelease(siblings);

				if (toRemove) {
#if DEBUG_SANITIZE
					[self debug:[NSString stringWithFormat:@"Cleaned %6.3f %@ with weight %f cause it has %@.", contentScore, [el readabilityDescription], weight, reason]];
#endif
					//print tounicode(el)
					//self.debug("pname %s pweight %.3f" %(pname, pweight))
					[el detach];
				}
			}

			CFRelease(counts);

		}
	}

	/*
	 // This doesn’t appear to do anything!
for el in ([node] + [n for n in node.iter()]):
        if not (self.options['attributes']):
            //el.attrib = {} //FIXME:Checkout the effects of disabling this
            pass
    */

    CFRelease(allowed);

    return node;
}

/**
 Builds the readable summary of the receiver's HTML document.

 Each pass strips comment, <noscript>, <script> and <style> nodes, tags every
 <body> with the "readabilityBody" id, optionally removes unlikely candidates
 (the "ruthless" mode), converts misused <div>s to paragraphs, scores the
 paragraphs and extracts the best candidate. The first pass is ruthless; if it
 finds no candidate, or if the sanitized result is shorter than the
 "retryLength" option (RETRY_LENGTH when the option is absent), one further,
 lenient pass is made.

 @param HTMLPartial YES is supposed to request the return of only the div of
        the document (not wrapped in <html> and <body> tags). Currently
        unsupported; the parameter exists to keep parity with
        python/lxml-readability. When NO, the <title> of the article (if it has
        one) is set to the receiver's title.
 @return The sanitized article, or nil when the receiver has no HTML document.
 */
- (NSXMLDocument *)summaryXMLDocument:(BOOL)HTMLPartial;
{
    if (self.html == nil)  return nil;

    BOOL ruthless = YES;
    while (1) {
        //[self _html:YES];
        // NOTE(review): with the re-parse above disabled, a second (lenient) pass
        // appears to operate on the tree as already modified by the first pass
        // rather than on a pristine copy -- confirm this is intended.

        // Remove comment nodes.
        // The successor is fetched *before* detaching, so every node in document
        // order is examined -- including a comment that is the very last node.
        // Comments have no children, so detaching one does not skip anything.
        NSXMLNode *thisNode = self.html;
        while (thisNode != nil) {
            NSXMLNode *nextNode = [thisNode nextNode];
            if ([thisNode kind] == NSXMLCommentKind) {
                [thisNode detach];
            }
            thisNode = nextNode;
        }

        // Delete non-content nodes
        for (NSXMLNode *nonContentNode in [self.html tagsWithNames:@"noscript", @"script", @"style", nil]) {
            [nonContentNode detach];
        }

        // Add readability CSS ID to body tag
        for (NSXMLNode *bodyNode in [self.html tagsWithNames:@"body", nil]) {
            [bodyNode addCSSName:@"readabilityBody" toAttributeWithName:@"id"];
        }

        if (ruthless)  [self removeUnlikelyCandidates];

        [self transformMisusedDivsIntoParagraphs];

        NSDictionary *candidates = [self scoreParagraphs];
        //NSLog(@"%@", candidates);

        NSDictionary *bestCandidate = [self selectBestCandidate:candidates];

        NSXMLDocument *article = nil;

        if (bestCandidate != nil) {
            article = [self getArticleForCandidates:candidates
                                   andBestCandidate:bestCandidate
                                        HTMLPartial:HTMLPartial];

            if (!HTMLPartial) {
                // -firstObject yields nil for an empty (or nil) result, so an
                // article without a <title> no longer raises NSRangeException.
                NSXMLElement *titleNode = [[article nodesForXPath:@"/html/head/title"
                                                            error:NULL] firstObject];
                [titleNode setStringValue:[self title]];
            }
        }
        else if (ruthless) {
            [self debug:@"Ruthless removal did not work. "];
            ruthless = NO;
            //[self debug:@"Ended up stripping too much - going for a safer _parse"];
            // Loop through and try again.
            continue;
        }
        else {
            [self debug:@"Ruthless and lenient parsing did not work. Returning raw html"];
            if ([self.html kind] == NSXMLElementKind) {
                // nil when there is no <body>, which lets the fallback below run
                // instead of throwing on an empty array.
                article = [[(NSXMLElement *)self.html elementsForName:@"body"] firstObject];
            }
            if (article == nil) {
                article = self.html;
            }
        }

        NSXMLDocument *cleanedArticle = [self sanitizeArticle:article forCandidates:candidates];
        //[self cleanAttributes:]
        NSUInteger cleanedArticleLength = (cleanedArticle == nil) ? 0 : [[cleanedArticle XMLString] length];
        NSNumber *retryLengthNum = (self.options)[@"retryLength"];
        NSUInteger retryLength = (retryLengthNum != nil) ? [retryLengthNum unsignedIntegerValue] : RETRY_LENGTH;
        BOOL ofAcceptableLength = (cleanedArticleLength >= retryLength);

        if (ruthless && !ofAcceptableLength) {
            // Too little survived the ruthless pass: loop through and try again.
            ruthless = NO;
            continue;
        }

        return cleanedArticle;
    }

}

/// Convenience for -summaryXMLDocument: with HTMLPartial == NO.
- (NSXMLDocument *)summaryXMLDocument;
{
    return [self summaryXMLDocument:NO];
}
@end


/**
 Wraps an NSXMLNode so that it can be used as a dictionary key.

 Equality and hash are derived from the wrapped node and from its children.
 */
@implementation HashableElement

@synthesize node = _node;

/// Returns a new wrapper around aNode.
+ (id)elementForNode:(NSXMLNode *)aNode;
{
    return [[self alloc] initWithNode:aNode];
}

/// Initializes the receiver as a wrapper around aNode.
- (id)initWithNode:(NSXMLNode *)aNode;
{
    self = [super init];
    if (self) {
        self.node = aNode;
    }
    return self;
}


/// Returns a new wrapper around the same node; the node itself is not copied here.
- (id)copyWithZone:(NSZone *)zone
{
    id newElement = [[[self class] allocWithZone:zone]
                     initWithNode:self.node];

    return newElement;
}


/// Forwards to the description of the wrapped node.
- (NSString *)description
{
    return [self.node description];
}

/// Generic equality: class check, then defers to -isEqualToElement:.
- (BOOL)isEqual:(id)obj
{
    if (obj == self)  return YES;
    if (![obj isKindOfClass:[HashableElement class]])  return NO; // Also rejects nil.

    return [self isEqualToElement:(HashableElement *)obj];
}

/// Typed equality: two wrappers are equal when their nodes and the nodes'
/// children compare equal.
- (BOOL)isEqualToElement:(HashableElement *)p
{
    if (p == nil)  return NO;
    if (p == self)  return YES;

    NSXMLNode *pNode = p.node;
    NSXMLNode *selfNode = self.node;

    // Wrappers around the very same node are always equal. This also covers
    // childless nodes, for which comparing two nil `children` values by
    // messaging nil would wrongly answer NO.
    if (pNode == selfNode)  return YES;

    if (![pNode isEqual:selfNode])  return NO;

    NSArray *pChildren = pNode.children;
    NSArray *selfChildren = selfNode.children;

    // Treat "no children" on both sides (nil or empty) as equal; both cases
    // contribute 0 to -hash, so the isEqual:/hash contract is kept.
    if (([pChildren count] == 0) && ([selfChildren count] == 0))  return YES;

    return [pChildren isEqual:selfChildren];
}

/// Combines the hashes of the wrapped node and of its children.
- (NSUInteger)hash
{
    NSXMLNode *selfNode = self.node;
    return ([selfNode hash] ^ [selfNode.children hash]);
}

@end

--------------------------------------------------------------------------------