s that do not contain other block elements into
s
178 | NSXMLNode *elemNextSibling = [elem nextSibling];
179 | NSXMLNode *descendant = elem;
180 | BOOL blockElementFound = NO;
181 |
182 | while ((descendant = [descendant nextNode]) != elemNextSibling) {
183 | if ([divToPElementsTagNames containsObject:descendant.name]) {
184 | blockElementFound = YES;
185 | break;
186 | }
187 | }
188 |
189 | if (blockElementFound == NO) {
190 | //[self debug:[NSString stringWithFormat:@"Altering %@ to p", [elem readabilityDescription]]];
191 | [elem setName:@"p"];
192 | //NSLog(@"Fixed element %@", [elem readabilityDescription]);
193 | }
194 | }
195 |
196 | NSXMLElement *p;
197 | NSString *s;
198 |
199 | nodes = [self.html tagsWithNames:@"div", nil];
200 | for (NSXMLElement *elem in nodes) { // div tags always are elements
201 |
202 | NSXMLNode *firstTextNode = [elem lxmlTextNode];
203 | s = [firstTextNode stringValue];
204 | if ((s != nil)
205 | && ([s length] != 0)
206 | && ([[s stringByTrimmingCharactersInSet:whitespaceAndNewlineCharacterSet] length] != 0)) { // using -ws_isBlankString would be faster
207 |
208 | p = [NSXMLNode elementWithName:@"p"
209 | stringValue:s];
210 |
211 | [firstTextNode detach];
212 | [elem insertChild:p atIndex:0];
213 | //NSLog(@"Appended %@ to %@", p, [elem readabilityDescription]);
214 | }
215 |
216 | [[elem children] enumerateObjectsWithOptions:NSEnumerationReverse
217 | usingBlock:^(id obj, NSUInteger pos, BOOL *stop) {
218 | NSXMLNode *child = obj;
219 | NSXMLElement *paragraph;
220 |
221 | NSXMLNode *tailNode = [child lxmlTailNode];
222 |
223 | NSString *childTailString = ((tailNode == nil) ? @"" : [tailNode stringValue]);
224 |
225 | if (([childTailString length] != 0)
226 | && ([[childTailString stringByTrimmingCharactersInSet:whitespaceAndNewlineCharacterSet] length] != 0)) { // using -ws_isBlankString would be faster
227 |
228 | paragraph = [NSXMLNode elementWithName:@"p"
229 | stringValue:childTailString];
230 |
231 | [tailNode detach]; // We could get [tailNode index] and insert there after detaching
232 | [elem insertChild:paragraph atIndex:(pos + 1)];
233 | //NSLog(@"Appended %@ to %@", p, [elem readabilityDescription]);
234 | }
235 |
236 | if ([[child name] isEqualToString:@"br"]) {
237 | [child detach];
238 | //NSLog(@"Dropped <br> at %@", [elem readabilityDescription]);
239 | }
240 | }];
241 |
242 | }
243 | }
244 |
// Normalises whitespace in a piece of text.
//
// Every match of newlinePlusSurroundingwhitespaceRe is replaced by "\n", every
// match of tabRunRe by a single space, and the result is trimmed of leading and
// trailing whitespace.
//
// Returns _text itself when it is nil or empty, otherwise a cleaned mutable copy.
- (NSString *)clean:(NSString *)_text
{
    if ([_text length] == 0) return _text;

    NSMutableString *cleaned = [_text mutableCopy];

    // First pass over the full, still unmodified copy.
    NSRange wholeString = NSMakeRange(0, [cleaned length]);
    [newlinePlusSurroundingwhitespaceRe replaceMatchesInString:cleaned
                                                        options:0
                                                          range:wholeString
                                                   withTemplate:@"\n"];

    // The first pass may have changed the length, so measure the range again.
    wholeString = NSMakeRange(0, [cleaned length]);
    [tabRunRe replaceMatchesInString:cleaned
                             options:0
                               range:wholeString
                        withTemplate:@" "];

    CFStringTrimWhitespace((CFMutableStringRef)cleaned);

    return cleaned;
}
266 |
// Length of the cleaned (see -clean:) string value of `i`.
// Nodes that are not elements, and elements without a string value, count as 0.
- (NSUInteger)textLength:(NSXMLNode *)i
{
    if ([i kind] != NSXMLElementKind) return 0;

    NSString *rawValue = [i stringValue];
    if (rawValue == nil) return 0;

    return [[self clean:rawValue] length];
}
278 |
// Heuristic weight derived from the element's "class" and "id" attributes.
//
// For each of the two attributes that yields CSS names, a match of negativeRe
// subtracts 25 and a match of positiveRe adds 25; both may apply to the same
// attribute. The result therefore lies between -50 and +50.
- (float)classWeight:(NSXMLElement *)e
{
    float weight = 0;

    for (NSString *attributeName in @[@"class", @"id"]) {
        NSString *cssNames = [e cssNamesForAttributeWithName:attributeName];
        if (cssNames == nil) continue;

        NSRange everything = NSMakeRange(0, [cssNames length]);

        BOOL matchesNegative =
            ([negativeRe rangeOfFirstMatchInString:cssNames options:0 range:everything].location != NSNotFound);
        BOOL matchesPositive =
            ([positiveRe rangeOfFirstMatchInString:cssNames options:0 range:everything].location != NSNotFound);

        if (matchesNegative) weight -= 25;
        if (matchesPositive) weight += 25;
    }

    return weight;
}
302 |
// Builds the initial score record for a candidate element.
//
// The starting score is the element's -classWeight:, adjusted by tag name
// (compared lower-cased):
//   div                                      +5
//   pre, td, blockquote                      +3
//   address, ol, ul, dl, dd, dt, li, form    -3
//   h1 ... h6, th                            -5
//
// Returns a mutable dictionary with the keys @"contentScore" (NSNumber wrapping
// a float) and @"elem" (the element itself).
- (NSMutableDictionary *)scoreNode:(NSXMLElement *)elem
{
    static dispatch_once_t tagSetsOnce;
    static NSSet *preTDBlockquote = nil;
    static NSSet *addressEtc = nil;
    static NSSet *headlines = nil;

    // dispatch_once replaces the former unsynchronised `firstRun` flag, so the
    // shared lookup sets are created exactly once even if two threads score
    // their first node at the same time.
    dispatch_once(&tagSetsOnce, ^{
        preTDBlockquote = [[NSSet alloc] initWithObjects:@"pre", @"td", @"blockquote", nil];
        addressEtc = [[NSSet alloc] initWithObjects:@"address", @"ol", @"ul", @"dl", @"dd", @"dt", @"li", @"form", nil];
        headlines = [[NSSet alloc] initWithObjects:@"h1", @"h2", @"h3", @"h4", @"h5", @"h6", @"th", nil];
    });

    float contentScore = [self classWeight:elem];
    NSString *name = [elem.name lowercaseString];

    if ([name isEqualToString:@"div"]) {
        contentScore += 5;
    }
    else if ([preTDBlockquote containsObject:name]) {
        contentScore += 3;
    }
    else if ([addressEtc containsObject:name]) {
        contentScore -= 3;
    }
    else if ([headlines containsObject:name]) {
        contentScore -= 5;
    }

    return [NSMutableDictionary dictionaryWithObjectsAndKeys:
            @(contentScore), @"contentScore",
            elem, @"elem",
            nil];
}
337 |
// Assembles the article document around the best candidate.
//
// A fresh html -> body -> div document is created, and copies of the best
// candidate plus every related sibling are appended to that div. A sibling of
// the best candidate (a child of the same parent) is included when one of these
// holds:
//   - it is the best candidate itself;
//   - it is a scored candidate whose contentScore reaches the threshold
//     MAX(10, 20% of the best candidate's score);
//   - it is a <p> element that is either longer than 80 characters with a link
//     density below 0.25, or at most 80 characters with a link density of 0 and
//     text matching sentenceEndRe.
//
// candidates     Score records keyed by HashableElement.
// bestCandidate  Score record with the keys @"contentScore" and @"elem".
// HTMLPartial    Currently unused; see the disabled block below.
//
// Returns the new document. It has no article content when the container div
// cannot be located, and is nil if the document could not be created.
- (NSXMLDocument *)getArticleForCandidates:(NSDictionary *)candidates
                          andBestCandidate:(NSDictionary *)bestCandidate
                               HTMLPartial:(BOOL)HTMLPartial
{
    // Now that we have the top candidate, look through its siblings for content that might also be related
    // Things like preambles, content split by ads that we removed, etc.

    float siblingScoreThreshold = MAX(10.0, ([bestCandidate[@"contentScore"] floatValue] * 0.2));

    // Create a new HTML document with a html->body->div
    NSXMLDocument *output = [[NSXMLDocument alloc] initWithXMLString:@"<html><body><div></div></body></html>"
                                                             options:NSXMLDocumentTidyHTML
                                                               error:NULL];
    [output setDocumentContentKind:NSXMLDocumentXHTMLKind];

    // Look the container up defensively: indexing an empty result would raise.
    NSArray *containers = [output nodesForXPath:@"/html/body/div"
                                          error:NULL];
    if ([containers count] == 0) return output;
    NSXMLElement *htmlDiv = containers[0];

#if 0
    // Disabled until we can figure out a good way to return an NSXMLDocument OR an NSXMLElement
    if (HTMLPartial) {
        output = htmlDiv;
    }
#endif

    NSXMLNode *bestElem = bestCandidate[@"elem"];

    for (NSXMLNode *sibling in [[bestElem parent] children]) {
        //if isinstance(sibling, NavigableString): continue
        // in lxml there no concept of simple text
        BOOL append = (sibling == bestElem);

        if (append == NO) {
            HashableElement *siblingKey = [HashableElement elementForNode:sibling];
            NSDictionary *siblingScoreDict = candidates[siblingKey];
            if ((siblingScoreDict != nil)
                && ([siblingScoreDict[@"contentScore"] floatValue] >= siblingScoreThreshold)) {
                append = YES;
            }
        }

        if ((append == NO)
            && [sibling.name isEqualToString:@"p"]
            && ([sibling kind] == NSXMLElementKind)) {

            float linkDensity = [self getLinkDensity:(NSXMLElement *)sibling];
            NSString *nodeContent = [sibling lxmlText];
            nodeContent = (nodeContent == nil) ? @"" : nodeContent;
            NSUInteger nodeLength = [nodeContent length];

            if ((nodeLength > 80)
                && (linkDensity < 0.25)) {
                append = YES;
            }
            else if ((nodeLength <= 80)
                     && (linkDensity == 0.0)
                     && ([sentenceEndRe rangeOfFirstMatchInString:nodeContent options:0 range:NSMakeRange(0, nodeLength)].location != NSNotFound)) {
                append = YES;
            }
        }

        // A copy is appended so the sibling stays attached to the source tree.
        if (append) [htmlDiv addChild:[sibling copy]];
    }

    //if output is not None:
    //    output.append(bestElem)

    return output;
}
410 |
// Returns the score record with the highest @"contentScore" among the values of
// `candidates`, or nil when there are no candidates.
- (NSDictionary *)selectBestCandidate:(NSDictionary *)candidates
{
    NSArray *unsorted = [candidates allValues];
    if ([unsorted count] < 1) return nil;

    // Highest score first, so the winner ends up at index 0.
    NSSortDescriptor *byScoreDescending = [NSSortDescriptor sortDescriptorWithKey:@"contentScore"
                                                                        ascending:NO];
    NSArray *sortedCandidates = [unsorted sortedArrayUsingDescriptors:@[byScoreDescending]];

#if 0
    NSXMLElement *elem;
    NSArray *topFive = ([sortedCandidates count] >= 5) ? [sortedCandidates subarrayWithRange:NSMakeRange(0, 5)] : sortedCandidates;
    for (NSDictionary *candidate in topFive) {
        elem = [candidate objectForKey:@"elem"];
        [self debug:[NSString stringWithFormat:@"Top 5 : %6.3f %@", [candidate objectForKey:@"contentScore"], [elem readabilityDescription]]];
    }
#endif

    return sortedCandidates[0];
}
434 |
// Ratio of link text to overall text for `elem`.
//
// The numerator is the summed string-value length of every descendant <a>
// (XPath ".//a"); the denominator is -textLength: of the element, clamped to a
// minimum of 1 so the division is always defined.
- (float)getLinkDensity:(NSXMLElement *)elem
{
    NSArray *anchors = [elem nodesForXPath:@".//a" error:NULL];

    NSUInteger anchorTextLength = 0;
    for (NSXMLNode *anchor in anchors) {
        anchorTextLength += [[anchor stringValue] length];
        //if len(elem.findall(".//div") or elem.findall(".//p")):
        //    linkLength = linkLength
    }

    NSUInteger overallLength = [self textLength:elem];
    if (overallLength < 1) overallLength = 1;

    return (float)anchorTextLength / overallLength;
}
446 |
// Scores the parents and grandparents of all <p>, <pre> and <td> elements.
//
// For every such element whose cleaned text is at least the minimum length
// (option @"minTextLength", falling back to TEXT_LENGTH_THRESHOLD):
//   - its parent and, if present, its grandparent get a score record from
//     -scoreNode: the first time they are seen;
//   - a paragraph score of 1 + (number of commas + 1) + MIN(length / 100, 3) is
//     added in full to the parent and halved to the grandparent.
// Afterwards every record's score is multiplied by (1 - link density).
//
// Returns a dictionary mapping HashableElement keys to mutable score records
// (@"contentScore", @"elem").
- (NSDictionary *)scoreParagraphs
{
    NSNumber *configuredMinimum = (self.options)[@"minTextLength"];
    NSUInteger shortestScoredLength = TEXT_LENGTH_THRESHOLD;
    if (configuredMinimum != nil) {
        shortestScoredLength = [configuredMinimum unsignedIntegerValue];
    }

    NSMutableDictionary *candidates = [NSMutableDictionary dictionary];

#if 0
    for (NSXMLNode *node in [self.html tagsWithNames:@"div", nil]) {
        [self debug:[node readabilityDescription]];
    }
#endif

    // Candidate elements in the order in which they were first scored.
    NSMutableArray *scoredInOrder = [NSMutableArray array];

    for (NSXMLElement *paragraph in [self.html tagsWithNames:@"p", @"pre", @"td", nil]) {
        // parents have to be elements
        NSXMLElement *parent = (NSXMLElement *)[paragraph parent];
        if (parent == nil) continue;
        NSXMLElement *grandParent = (NSXMLElement *)[parent parent];

        NSString *rawText = [paragraph stringValue];
        NSString *cleanedText = (rawText == nil) ? @"" : [self clean:rawText];
        NSUInteger cleanedLength = [cleanedText length];

        // Paragraphs shorter than the minimum length are not counted at all.
        if (cleanedLength < shortestScoredLength) continue;

        // Fetch or create the parent's record. The dictionary stores the very
        // same mutable object, so it can be updated through this reference.
        HashableElement *parentKey = [HashableElement elementForNode:parent];
        NSMutableDictionary *parentRecord = candidates[parentKey];
        if (parentRecord == nil) {
            parentRecord = [self scoreNode:parent];
            candidates[parentKey] = parentRecord;
            [scoredInOrder addObject:parent];
        }

        // Same for the grandparent, when there is one.
        NSMutableDictionary *grandParentRecord = nil;
        if (grandParent != nil) {
            HashableElement *grandParentKey = [HashableElement elementForNode:grandParent];
            grandParentRecord = candidates[grandParentKey];
            if (grandParentRecord == nil) {
                grandParentRecord = [self scoreNode:grandParent];
                candidates[grandParentKey] = grandParentRecord;
                [scoredInOrder addObject:grandParent];
            }
        }

        float paragraphScore = 1.0;
        paragraphScore += [cleanedText countOccurancesOfString:@","] + 1;
        paragraphScore += MIN((cleanedLength / 100), 3);
        //if elem not in candidates:
        //    candidates[elem] = self.scoreNode(elem)

        //WTF? candidates[elem]['contentScore'] += contentScore
        float updatedScore = [parentRecord[@"contentScore"] floatValue] + paragraphScore;
        parentRecord[@"contentScore"] = @(updatedScore);

        if (grandParent != nil) {
            updatedScore = [grandParentRecord[@"contentScore"] floatValue] + paragraphScore / 2.0;
            grandParentRecord[@"contentScore"] = @(updatedScore);
        }
    }

    // Scale the final candidates score based on link density. Good content should have a
    // relatively small link density (5% or less) and be mostly unaffected by this operation.
    for (NSXMLElement *scoredElement in scoredInOrder) {
        HashableElement *scoredKey = [HashableElement elementForNode:scoredElement];
        NSMutableDictionary *record = candidates[scoredKey];

        float density = [self getLinkDensity:scoredElement];
        float scaledScore = [record[@"contentScore"] floatValue];
        scaledScore *= (1 - density);

        record[@"contentScore"] = @(scaledScore);
    }

    return candidates;
}
529 |
// Sums a CFArray whose values are raw NSUInteger numbers stored directly in the
// pointer slots (not boxed objects). Returns 0 for an empty array.
NSUInteger sumCFArrayOfNSUInteger(CFArrayRef array);
NSUInteger sumCFArrayOfNSUInteger(CFArrayRef array) {
    NSUInteger total = 0;

    // Walk from the last index down to 0; addition order does not affect the sum.
    CFIndex remaining = CFArrayGetCount(array);
    while (remaining > 0) {
        remaining -= 1;
        total += (NSUInteger)CFArrayGetValueAtIndex(array, remaining);
    }

    return total;
}
541 |
542 | - (NSXMLDocument *)sanitizeArticle:(NSXMLDocument *)node forCandidates:(NSDictionary *)candidates
543 | {
544 | #ifndef DEBUG_SANITIZE
545 | # define DEBUG_SANITIZE 0
546 | #endif
547 |
548 | NSNumber *minTextLengthNum = (self.options)[@"minTextLength"];
549 | NSUInteger minLen = (minTextLengthNum != nil) ? [minTextLengthNum unsignedIntegerValue] : TEXT_LENGTH_THRESHOLD;
550 | for (NSXMLElement *header in [node tagsWithNames:@"h1", @"h2", @"h3", @"h4", @"h5", @"h6", nil]) {
551 | if ([self classWeight:header] < 0 || [self getLinkDensity:header] > 0.33) {
552 | [header detach];
553 | }
554 | }
555 |
556 | for (NSXMLElement *elem in [node tagsWithNames:@"form", @"iframe", @"textarea", nil]) {
557 | [elem detach];
558 | }
559 |
560 | CFMutableDictionaryRef allowed = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, NULL); // keys: HashableElement, values:raw BOOL
561 |
562 | NSDictionary *elDict;
563 | HashableElement *hashableEl;
564 | float weight;
565 | NSString *tag;
566 | float contentScore;
567 | CFIndex kindCount;
568 | NSArray *tagKinds = @[@"p", @"img", @"li", @"a", @"embed", @"input"];
569 | NSUInteger contentLength;
570 | float linkDensity;
571 | NSXMLNode *parentNode;
572 |
573 | BOOL toRemove;
574 | #if DEBUG_SANITIZE
575 | NSString *reason;
576 | #endif
577 |
578 | // Conditionally clean <table>s, <ul>s, and <div>s
579 | for (NSXMLElement *el in [node tagsWithNames:@"table", @"ul", @"div", nil]) {
580 | hashableEl = [HashableElement elementForNode:el];
581 |
582 | if (CFDictionaryContainsValue(allowed, (__bridge const void *)(hashableEl))) continue;
583 |
584 | weight = [self classWeight:el];
585 |
586 | elDict = candidates[hashableEl];
587 | if (elDict != nil) {
588 | contentScore = [elDict[@"contentScore"] floatValue];
589 | //print '!',el, '-> %6.3f' % contentScore
590 | }
591 | else {
592 | contentScore = 0;
593 | }
594 |
595 | tag = el.name;
596 |
597 | if ((weight + contentScore) < 0.0) {
598 | //[self debug:[NSString stringWithFormat:@"Cleaned %@ with score %6.3f and weight %-3s", [el readabilityDescription], contentScore, weight]];
599 | [el detach];
600 | }
601 | else if ([[el stringValue] countOccurancesOfString:@","] < 10) {
602 | CFMutableDictionaryRef counts = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, NULL); // keys: NSString, values:raw CFIndex
603 |
604 | for (NSString *kind in tagKinds) {
605 | kindCount = (CFIndex)[[node nodesForXPath:[NSString stringWithFormat:tagNameXPath, kind]
606 | error:NULL] count];
607 | CFDictionaryAddValue(counts, (__bridge const void *)(kind), (void *)kindCount);
608 | }
609 |
610 | if (CFDictionaryGetValueIfPresent(counts, @"li", (const void **)&kindCount)) {
611 | kindCount -= 100;
612 | CFDictionarySetValue(counts, @"li", (void *)kindCount);
613 | }
614 |
615 | contentLength = [self textLength:el]; // Count the text length excluding any surrounding whitespace
616 | linkDensity = [self getLinkDensity:el];
617 |
618 | parentNode = [el parent];
619 | if (parentNode != nil) {
620 |
621 | #if DEBUG_SANITIZE
622 | NSDictionary *parentNodeDict = [candidates objectForKey:[HashableElement elementForNode:parentNode]];
623 | if (parentNodeDict != nil) {
624 | contentScore = [[parentNodeDict objectForKey:@"contentScore"] floatValue];
625 | }
626 | else {
627 | contentScore = 0.0;
628 | }
629 | #endif
630 |
631 | //if parentNode is not None:
632 | // pweight = self.classWeight(parentNode) + contentScore
633 | // pname = describe(parentNode)
634 | //else:
635 | // pweight = 0
636 | // pname = "no parent"
637 |
638 | toRemove = NO;
639 | #if DEBUG_SANITIZE
640 | reason = @"";
641 | #endif
642 |
643 | #define countsFor(A) (CFIndex)(CFDictionaryGetValue(counts, (A)))
644 |
645 | //if el.tag == 'div' and counts["img"] >= 1:
646 | // continue
647 | if (countsFor(@"p")
648 | && (countsFor(@"img") > countsFor(@"p"))) {
649 | #if DEBUG_SANITIZE
650 | reason = [NSString stringWithFormat:@"too many images (%ld)", (long)countsFor(@"img")];
651 | #endif
652 | toRemove = YES;
653 | }
654 | else if ((countsFor(@"li") > countsFor(@"p"))
655 | && ![tag isEqualToString:@"ul"]
656 | && ![tag isEqualToString:@"ol"]) {
657 | #if DEBUG_SANITIZE
658 | reason = @"more
- s than
s";
659 | #endif
660 | toRemove = YES;
661 | }
662 | else if (countsFor(@"input") > (countsFor(@"p") / 3)) {
663 | #if DEBUG_SANITIZE
664 | reason = @"less than 3x
s than s";
665 | #endif
666 | toRemove = YES;
667 | }
668 | else if ((contentLength < minLen)
669 | && ((countsFor(@"img") == 0)
670 | || (countsFor(@"img") > 2))) {
671 | #if DEBUG_SANITIZE
672 | reason = [NSString stringWithFormat:@"too short content length %lu without a single image", (unsigned long)contentLength];
673 | #endif
674 | toRemove = YES;
675 | }
676 | else if (weight < 25 && linkDensity > 0.2) {
677 | #if DEBUG_SANITIZE
678 | reason = [NSString stringWithFormat:@"too many links %.3f for its weight %.0f", linkDensity, weight];
679 | #endif
680 | toRemove = YES;
681 | }
682 | else if (weight >= 25 && linkDensity > 0.5) {
683 | #if DEBUG_SANITIZE
684 | reason = [NSString stringWithFormat:@"too many links %.3f for its weight %.0f", linkDensity, weight];
685 | #endif
686 | toRemove = YES;
687 | }
688 | else if (((countsFor(@"embed") == 1) && (contentLength < 75)) || (countsFor(@"embed") > 1)) {
689 | #if DEBUG_SANITIZE
690 | reason = @"