├── .clang-format ├── .github └── workflows │ └── ant.yml ├── .gitignore ├── LICENSE ├── LICENSE.txt.boilerpipe ├── NOTICE.txt.boilerpipe ├── OWNERS ├── PRESUBMIT.py ├── Proto.gwt.xml ├── README.md ├── TestProto.gwt.xml ├── Vagrantfile ├── build.xml ├── codereview.settings ├── create-hook-symlinks ├── create_standalone_js.py ├── create_wrapped_standalone_js.py ├── extension ├── background.js ├── devtools.html ├── devtools.js ├── devtools_panel.html ├── devtools_panel.js ├── extract.js ├── icon19.png ├── icon24.png ├── icon38.png ├── icon48.png ├── icon96.png ├── manifest.json └── preview.js ├── heuristics └── distillable │ ├── README.md │ ├── calculate_derived_features.py │ ├── check_derived_features.py │ ├── check_distilled_mhtml.py │ ├── extract_features.js │ ├── gen_mhtml_corpus.py │ ├── get_screenshots.py │ ├── index.html │ ├── index.js │ ├── server.py │ └── write_features_csv.py ├── hooks └── pre-commit ├── install-build-deps.sh ├── java ├── DomDistiller.gwt.xml └── org │ └── chromium │ └── distiller │ ├── BoilerpipeFilter.java │ ├── ContentExtractor.java │ ├── DocumentTitleGetter.java │ ├── DomDistiller.java │ ├── DomDistillerEntry.java │ ├── DomUtil.java │ ├── DomWalker.java │ ├── IEReadingViewParser.java │ ├── JavaScript.java │ ├── LogUtil.java │ ├── MarkupParser.java │ ├── MonotonicPageInfosGroups.java │ ├── NodeListExpander.java │ ├── NodeTree.java │ ├── OpenGraphProtocolParser.java │ ├── OpenGraphProtocolParserAccessor.java │ ├── OrderedNodeMatcher.java │ ├── PageLinkInfo.java │ ├── PageParamInfo.java │ ├── PageParameterDetector.java │ ├── PageParameterParser.java │ ├── PagingLinksFinder.java │ ├── ParsedUrl.java │ ├── PathComponentPagePattern.java │ ├── QueryParamPagePattern.java │ ├── SchemaOrgParser.java │ ├── SchemaOrgParserAccessor.java │ ├── StringUtil.java │ ├── TableClassifier.java │ ├── TreeCloneBuilder.java │ ├── document │ ├── TextBlock.java │ ├── TextDocument.java │ └── TextDocumentStatistics.java │ ├── extractors │ ├── ArticleExtractor.java 
│ ├── KeepEverythingExtractor.java │ ├── KeepEverythingWithMinKWordsExtractor.java │ └── embeds │ │ ├── EmbedExtractor.java │ │ ├── ImageExtractor.java │ │ ├── TwitterExtractor.java │ │ ├── VimeoExtractor.java │ │ └── YouTubeExtractor.java │ ├── filters │ ├── debug │ │ └── PrintDebugFilter.java │ ├── english │ │ ├── NumWordsRulesClassifier.java │ │ └── TerminatingBlocksFinder.java │ ├── heuristics │ │ ├── BlockProximityFusion.java │ │ ├── DocumentTitleMatchClassifier.java │ │ ├── ExpandTitleToContentFilter.java │ │ ├── HeadingFusion.java │ │ ├── KeepLargestBlockFilter.java │ │ ├── LargeBlockSameTagLevelToContentFilter.java │ │ ├── ListAtEndFilter.java │ │ └── SimilarSiblingContentExpansion.java │ └── simple │ │ ├── BoilerplateBlockFilter.java │ │ ├── LabelToBoilerplateFilter.java │ │ ├── MarkEverythingBoilerplateFilter.java │ │ ├── MarkEverythingContentFilter.java │ │ └── MinWordsFilter.java │ ├── labels │ ├── DefaultLabels.java │ └── LabelAction.java │ └── webdocument │ ├── DomConverter.java │ ├── ElementAction.java │ ├── WebDocument.java │ ├── WebDocumentBuilder.java │ ├── WebDocumentBuilderInterface.java │ ├── WebElement.java │ ├── WebEmbed.java │ ├── WebFigure.java │ ├── WebImage.java │ ├── WebTable.java │ ├── WebTag.java │ ├── WebText.java │ ├── WebTextBuilder.java │ ├── WebVideo.java │ └── filters │ ├── LeadImageFinder.java │ ├── NestedElementRetainer.java │ ├── RelevantElements.java │ └── images │ ├── AreaScorer.java │ ├── BaseImageScorer.java │ ├── DimensionsRatioScorer.java │ ├── DomDistanceScorer.java │ ├── HasFigureScorer.java │ └── ImageScorer.java ├── javatests ├── DomDistillerJsTest.gwt.xml └── org │ └── chromium │ └── distiller │ ├── Assert.java │ ├── AssertTest.java │ ├── BlockProximityFusionTest.java │ ├── ContentExtractorTest.java │ ├── DocumentTitleGetterTest.java │ ├── DocumentTitleMatchClassifierTest.java │ ├── DomDistillerJsTestCase.java │ ├── DomUtilTest.java │ ├── DomWalkerTest.java │ ├── EmbedExtractorTest.java │ ├── 
GwtOverlayProtoTest.java │ ├── HeadingFusionTest.java │ ├── IEReadingViewParserTest.java │ ├── ImageHeuristicsTest.java │ ├── JavaScriptTest.java │ ├── JsTestCase.java │ ├── JsTestEntry.java │ ├── JsTestSuiteBase.java │ ├── JsTestSuiteBaseTest.java │ ├── JsTestSuiteBuilder.java │ ├── MarkupParserProtoTest.java │ ├── MarkupParserTest.java │ ├── MonotonicPageInfosGroupsTest.java │ ├── NodeDirectionalityTest.java │ ├── NodeListExpanderTest.java │ ├── OpenGraphProtocolParserAccessorTest.java │ ├── OrderedNodeMatcherTest.java │ ├── PageParamContentInfo.java │ ├── PageParamInfoTest.java │ ├── PageParameterDetectorTest.java │ ├── PageParameterParserTest.java │ ├── PagingLinksFinderTest.java │ ├── ParsedUrlTest.java │ ├── PathComponentPagePatternTest.java │ ├── QueryParamPagePatternTest.java │ ├── SchemaOrgParserAccessorTest.java │ ├── SimilarSiblingContentExpansionTest.java │ ├── SimpleTest.java │ ├── StringUtilTest.java │ ├── TableClassifierTest.java │ ├── TerminatingBlocksFinderTest.java │ ├── TestLogger.java │ ├── TestTextBlockBuilder.java │ ├── TestTextDocumentBuilder.java │ ├── TestUtil.java │ ├── TestUtilTest.java │ ├── TextDocumentConstructionTest.java │ ├── TextDocumentStatisticsTest.java │ ├── TreeCloneBuilderTest.java │ ├── document │ └── TextDocumentTestUtil.java │ ├── rebind │ └── JsTestEntryGenerator.java │ └── webdocument │ ├── DomConverterTest.java │ ├── ElementActionTest.java │ ├── FakeWebDocumentBuilder.java │ ├── TestWebDocumentBuilder.java │ ├── TestWebTextBuilder.java │ ├── WebDocumentBuilderTest.java │ ├── WebImageTest.java │ ├── WebTableTest.java │ ├── WebTagTest.java │ ├── WebTextBuilderTest.java │ ├── WebTextTest.java │ ├── WebVideoTest.java │ └── filters │ ├── LeadImageFinderTest.java │ ├── NestedElementRetainerTest.java │ └── RelevantElementsTest.java ├── land-external-contributor-cl.sh ├── proto └── dom_distiller.proto ├── protoc_plugins ├── README ├── gwt_overlay.py ├── json_values_converter.py ├── json_values_converter_tests.py └── util │ ├── 
__init__.py │ ├── plugin.py │ ├── plugin_protos.py │ ├── types.py │ └── writer.py ├── run_jstests.py ├── test └── proto │ └── test.proto ├── third_party ├── gwt-2.7.0 │ ├── COPYING │ ├── COPYING.html │ ├── about.html │ ├── about.txt │ ├── gwt-api-checker.jar │ ├── gwt-codeserver.jar │ ├── gwt-dev.jar │ ├── gwt-elemental.jar │ ├── gwt-ll.dll │ ├── gwt-module.dtd │ ├── gwt-servlet-deps.jar │ ├── gwt-servlet.jar │ ├── gwt-user.jar │ ├── i18nCreator │ ├── i18nCreator.cmd │ ├── release_notes.html │ ├── requestfactory-apt-src.jar │ ├── requestfactory-apt.jar │ ├── requestfactory-client+src.jar │ ├── requestfactory-client-src.jar │ ├── requestfactory-client.jar │ ├── requestfactory-server+src.jar │ ├── requestfactory-server-src.jar │ ├── requestfactory-server.jar │ ├── validation-api-1.0.0.GA-sources.jar │ ├── validation-api-1.0.0.GA.jar │ ├── webAppCreator │ └── webAppCreator.cmd ├── junit │ └── junit-4.11.jar └── protobuf │ ├── COPYING.txt │ └── google │ └── protobuf │ ├── compiler │ └── plugin.proto │ └── descriptor.proto ├── tools └── UnicodePatternGenerator.java └── war ├── DomDistiller.html ├── favicon.ico ├── test.html └── wrapped_domdistiller_template.js /.clang-format: -------------------------------------------------------------------------------- 1 | # Defines the DomDistiller style for automatic reformatting. 2 | # http://clang.llvm.org/docs/ClangFormatStyleOptions.html 3 | # To use `git cl format`, several things are expected to be in the buildtools/ 4 | # directory. You can create a symlink to the relevant directories in a normal 5 | # Chromium checkout to make git cl format work. 6 | # See README.md for instructions. 
7 | BasedOnStyle: Chromium 8 | 9 | Language: Java 10 | IndentWidth: 4 11 | -------------------------------------------------------------------------------- /.github/workflows/ant.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Ant 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-ant 3 | 4 | name: Java CI 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | schedule: 12 | - cron: '30 5 * * 2' 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | include: 20 | - chrome-min-version: 49 21 | - chrome-min-version: 999 22 | env: 23 | CHROME_MIN_VERSION: ${{ matrix.chrome-min-version }} 24 | steps: 25 | - uses: actions/checkout@v2 26 | - name: Set up JDK 8 27 | uses: actions/setup-java@v2 28 | with: 29 | java-version: '8' 30 | distribution: 'adopt' 31 | - name: Show history 32 | run: git log --oneline -n 5 33 | - name: Install deps 34 | run: | 35 | google-chrome --version 36 | sudo -E ./install-build-deps.sh 37 | - name: Check environment 38 | run: | 39 | which java 40 | java -Xmx32m -version 41 | javac -J-Xmx32m -version 42 | google-chrome --version 43 | - name: Run pre-commit tests 44 | run: hooks/pre-commit 45 | - name: Run tests 46 | run: ant test -Dtest.shuffle=1 -Dtest.repeat=10 47 | - name: Package 48 | run: ant package 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | DomDistiller.war 2 | /buildtools 3 | /gwt-unitCache/ 4 | /out/ 5 | /war/WEB-INF/ 6 | /war/domdistiller 7 | /war/domdistillerjstest 8 | /war/*.JUnit/ 9 | .*.swp 10 | *.pyc 11 | junitvmwatcher*.properties 12 | -------------------------------------------------------------------------------- /LICENSE.txt.boilerpipe: 
-------------------------------------------------------------------------------- 1 | 2 | boilerpipe 3 | 4 | Copyright (c) 2009-2011 Christian Kohlschütter 5 | 6 | The author licenses this file to You under the Apache License, Version 2.0 7 | (the "License"); you may not use this file except in compliance with 8 | the License. You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | -------------------------------------------------------------------------------- /NOTICE.txt.boilerpipe: -------------------------------------------------------------------------------- 1 | 2 | boilerpipe 3 | 4 | Copyright (c) 2009-2011 Christian Kohlschütter 5 | 6 | The author licenses this file to You under the Apache License, Version 2.0 7 | (the "License"); you may not use this file except in compliance with 8 | the License. You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
# Copyright 2014 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Top-level presubmit script for DOM Distiller.

See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts
for more details about the presubmit API built into git cl.
"""

import subprocess
import sys


def _Git(args):
    """Runs the requested git command and returns the first line of output.

    Args:
        args: list of command-line arguments passed to ``git``.

    Returns:
        The first line of the command's stdout as text (no trailing newline).

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    # universal_newlines=True makes check_output return text on both
    # Python 2 and Python 3; without it Python 3 returns bytes and the
    # split('\n') below raises TypeError.
    output = subprocess.check_output(['git'] + args, universal_newlines=True)
    return output.split('\n')[0]


def _GitConfig(key):
    """Returns the local git config value for |key|, or '' when it is unset."""
    try:
        return _Git(['config', '--local', key])
    except subprocess.CalledProcessError:
        # `git config` exits non-zero when the key does not exist, e.g. for a
        # branch that has no upstream configured at all.
        return ''


def _CheckUpstream(input_api, output_api):
    """Checks that the upstream branch is remote.

    git cl push will push the issue's change to the branch's upstream branch.
    This should be origin/master (or maybe origin/some_branch) to work as
    expected. Otherwise, git cl push will push the change to some local branch
    and close the issue.

    Args:
        input_api: presubmit input API object (unused here).
        output_api: presubmit output API object; only PresubmitError is used.

    Returns:
        An empty list when the current branch tracks 'origin', otherwise a
        one-element list holding a PresubmitError with recovery instructions.
    """
    branch = _Git(['symbolic-ref', 'HEAD'])
    shortbranch = branch.replace('refs/heads/', '')
    remote = _GitConfig('branch.%s.remote' % shortbranch)
    if remote == 'origin':
        return []

    upstream = _GitConfig('branch.%s.merge' % shortbranch)
    shortupstream = upstream.replace('refs/heads/', '')
    hint = [
        'Changes should be pushed to origin/master.',
        'Try this:',
        ' git branch -u origin/master',
        ' git cl push',
    ]
    if shortupstream:
        # Only suggest restoring the previous upstream when there was one.
        hint.append(' git branch -u %s' % shortupstream)
    return [output_api.PresubmitError('\n'.join(hint))]


def CheckChangeOnCommit(input_api, output_api):
    """Presubmit entry point run on commit: OWNERS check plus upstream check."""
    results = []
    results.extend(input_api.canned_checks.CheckOwners(input_api, output_api))
    results.extend(_CheckUpstream(input_api, output_api))
    return results
12 | 13 | # Every Vagrant development environment requires a box. You can search for 14 | # boxes at https://atlas.hashicorp.com/search. 15 | config.vm.box = "ubuntu/trusty64" 16 | 17 | # Disable automatic box update checking. If you disable this, then 18 | # boxes will only be checked for updates when the user runs 19 | # `vagrant box outdated`. This is not recommended. 20 | # config.vm.box_check_update = false 21 | 22 | # Create a forwarded port mapping which allows access to a specific port 23 | # within the machine from a port on the host machine. In the example below, 24 | # accessing "localhost:8080" will access port 80 on the guest machine. 25 | # config.vm.network "forwarded_port", guest: 80, host: 8080 26 | 27 | # Create a private network, which allows host-only access to the machine 28 | # using a specific IP. 29 | # config.vm.network "private_network", ip: "192.168.33.10" 30 | 31 | # Create a public network, which generally matched to bridged network. 32 | # Bridged networks make the machine appear as another physical device on 33 | # your network. 34 | # config.vm.network "public_network" 35 | 36 | # Share an additional folder to the guest VM. The first argument is 37 | # the path on the host to the actual folder. The second argument is 38 | # the path on the guest to mount the folder. And the optional third 39 | # argument is a set of non-required options. 40 | # config.vm.synced_folder "../data", "/vagrant_data" 41 | 42 | # Provider-specific configuration so you can fine-tune various 43 | # backing providers for Vagrant. These expose provider-specific options. 44 | # Example for VirtualBox: 45 | # 46 | # config.vm.provider "virtualbox" do |vb| 47 | # # Display the VirtualBox GUI when booting the machine 48 | # vb.gui = true 49 | # 50 | # # Customize the amount of memory on the VM: 51 | # vb.memory = "1024" 52 | # end 53 | # 54 | # View the documentation for the provider you are using for more 55 | # information on available options. 
56 | 57 | # Define a Vagrant Push strategy for pushing to Atlas. Other push strategies 58 | # such as FTP and Heroku are also available. See the documentation at 59 | # https://docs.vagrantup.com/v2/push/atlas.html for more information. 60 | # config.push.define "atlas" do |push| 61 | # push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME" 62 | # end 63 | 64 | # Enable provisioning with a shell script. Additional provisioners such as 65 | # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the 66 | # documentation for more information about their specific syntax and use. 67 | config.vm.provision "shell", inline: <<-SHELL 68 | yes | /vagrant/install-build-deps.sh 69 | SHELL 70 | end 71 | -------------------------------------------------------------------------------- /codereview.settings: -------------------------------------------------------------------------------- 1 | # This file is used by git-cl to get repository specific information. 2 | GERRIT_HOST: True 3 | PROJECT: dom_distiller 4 | -------------------------------------------------------------------------------- /create-hook-symlinks: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install hook scripts by making symlinks to $GIT_ROOT/hooks. 4 | 5 | HOOK_DIR=.git/hooks 6 | 7 | for hook in $(ls hooks); do 8 | ln -s -f ../../hooks/$hook $HOOK_DIR/$hook 9 | done 10 | -------------------------------------------------------------------------------- /create_standalone_js.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 The Chromium Authors 2 | # Use of this source code is governed by a BSD-style license that can be 3 | # found in the LICENSE file. 4 | 5 | """Converts gwt-compiled javascript to standalone javascript 6 | 7 | gwt-compiled javascript is in the form of an js file that is expected to be 8 | loaded into its own script tag. 
This reads such a compiled file and converts it 9 | to standalone javascript that can be loaded as Chrome does. 10 | """ 11 | 12 | # TODO(cjhopman): The proper way to do this is to write a gwt Linker 13 | # (gwt.core.ext.Linker) and use that for compilation. See 14 | # http://crbug.com/437113 15 | 16 | import glob 17 | import optparse 18 | import os 19 | import re 20 | import shutil 21 | import sys 22 | 23 | def ExtractJavascript(content, module): 24 | """ Extracts javascript from within 4 | 5 | 6 | -------------------------------------------------------------------------------- /extension/devtools.js: -------------------------------------------------------------------------------- 1 | chrome.devtools.panels.create( 2 | "Dom Distiller", 3 | null, 4 | "devtools_panel.html", 5 | null); 6 | -------------------------------------------------------------------------------- /extension/devtools_panel.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /extension/devtools_panel.js: -------------------------------------------------------------------------------- 1 | function sendRequest(url, callback) { 2 | var xhr = new XMLHttpRequest(); 3 | xhr.onreadystatechange = function() { 4 | if (xhr.readyState == 4) { 5 | callback((xhr.status == 200) || (xhr.status == 304), xhr.responseText); 6 | } 7 | } 8 | xhr.onerror = function() { 9 | callback(false, undefined); 10 | } 11 | xhr.open("GET", url, true); 12 | xhr.send(); 13 | } 14 | 15 | // These will hold the contents of the respective files for injecting into the 16 | // inspected window. 
17 | var extract, domdistiller; 18 | 19 | sendRequest("extract.js", function(success, val) { 20 | if (success) extract = val; 21 | updateButtonReady(); 22 | }); 23 | sendRequest("domdistiller.js", function(success, val) { 24 | if (success) domdistiller = val; 25 | updateButtonReady(); 26 | }); 27 | 28 | var button 29 | var extracting = false; 30 | 31 | function updateButtonReady() { 32 | if (!button) return; 33 | var ready = (extract !== undefined) && (domdistiller !== undefined) && !extracting; 34 | button.disabled = !ready; 35 | } 36 | 37 | window.onload = function() { 38 | button = document.getElementById("button"); 39 | updateButtonReady(); 40 | button.onclick = function() { 41 | extracting = true; 42 | updateButtonReady(); 43 | 44 | chrome.devtools.inspectedWindow.eval(domdistiller); 45 | chrome.devtools.inspectedWindow.eval(extract, function(res, err) { 46 | extracting = false; 47 | updateButtonReady(); 48 | }); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /extension/extract.js: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | var options = {}; 6 | console.profile("Extraction"); 7 | var res = org.chromium.distiller.DomDistiller.applyWithOptions(options); 8 | console.profileEnd("Extraction"); 9 | console.log(res) 10 | -------------------------------------------------------------------------------- /extension/icon19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/extension/icon19.png -------------------------------------------------------------------------------- /extension/icon24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/extension/icon24.png -------------------------------------------------------------------------------- /extension/icon38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/extension/icon38.png -------------------------------------------------------------------------------- /extension/icon48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/extension/icon48.png -------------------------------------------------------------------------------- /extension/icon96.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/extension/icon96.png -------------------------------------------------------------------------------- /extension/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 2, 3 | 4 | "name": "Dom Distiller Dev", 5 | "description": 
"This extension does various things to support DomDistillerJS development", 6 | "version": "1.0", 7 | 8 | "icons": { 9 | "24": "icon24.png", 10 | "48": "icon48.png", 11 | "96": "icon96.png" 12 | }, 13 | 14 | "background": { 15 | "scripts": ["background.js"], 16 | "persistent": false 17 | }, 18 | 19 | "page_action": { 20 | "default_title": "Profile Extraction", 21 | "default_icon": { 22 | "19": "icon19.png", 23 | "38": "icon38.png" 24 | } 25 | }, 26 | 27 | "devtools_page": "devtools.html", 28 | 29 | "permissions": [ 30 | "tabs", 31 | "activeTab" 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /extension/preview.js: -------------------------------------------------------------------------------- 1 | document.head.innerHTML = "" + res[1] + ""; 2 | document.body.innerHTML = "

" + res[1] + "
#!/usr/bin/env python
# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import argparse
import csv
import json
import os
import shutil
import sys
import unittest


def isAlmostEqual(a, b, header, eps=0.001):
    """Return True when every pair of values in a and b differs by at most eps.

    Args:
        a: sequence of numbers.
        b: sequence of numbers, same length as a.
        header: sequence of column names, indexed in parallel with a and b;
            only read when a mismatch has to be reported.
        eps: maximum tolerated absolute difference.

    Returns:
        True if all pairs are within eps; False at the first mismatch, which
        is also printed.

    Raises:
        AssertionError: if a and b have different lengths.
    """
    assert len(a) == len(b)
    for i, (x, y) in enumerate(zip(a, b)):
        if abs(x - y) > eps:
            print('%s mismatch: a[%d] = %f, b[%d] = %f' % (header[i], i, x, i, y))
            return False
    return True


def compareDerivedFeatures(features, from_mhtml):
    """Compare the derived features from the JS vs. native impl

    Args:
        features: the JSON dump of features
        from_mhtml (bool): whether the features are collected from mhtml archive
    """
    # Imported lazily so the pure helpers in this module can be imported (and
    # tested) without write_features_csv being on the path.
    from write_features_csv import filter_fields, getGroups

    # Feature arrays alternate name, value, name, value, ...
    # Real lists (not map objects) are needed: they are indexed and printed.
    header = [str(name) for name in features[0]['features'][::2]]
    err = 0
    skipped = 0
    for f in features:
        if 'native' not in f:
            print('Skipped %s' % (f['url']))
            skipped += 1
            continue
        data = [[float(value) for value in f['features'][1::2]]]
        (h, data) = filter_fields(header, data, getGroups(header)['v1'])
        js = data[0]
        # js is now the derived features from JS aligned with native impl.
        if not from_mhtml and js[17] != f['native']['features']['elementCount']:
            # elementCount is simple enough so assume it's correct.
            # If elementCount doesn't match, the DOM might've changed between
            # JS and native runs.
            # For mhtml, this should not be possible since DOM is static.
            print('Skipped %s' % (f['url']))
            skipped += 1
            continue

        native = [float(value) for value in f['native']['derived_features']]
        data = [js, native]
        # Filter out the features derived from path if it is from mhtml,
        # because the url from native impl would be the file:// one.
        if from_mhtml:
            (h, data) = filter_fields(h, data, getGroups(header)['v1NoPath'])
        if not isAlmostEqual(data[0], data[1], h):
            err += 1
            print(f['url'])
            if from_mhtml:
                print('%s.mhtml' % f['index'])
            print(data[0])
            print(data[1])
            print('')
    print('%d/%d have mismatching derived features, %d were skipped.' % (err, len(features), skipped))


def main(argv):
    """Parse command-line arguments, load the feature dump and compare it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--features', required=True, help="filename of aggregated derived features")
    parser.add_argument('--from-mhtml', action='store_true', help="whether the features are from mhtml")
    options = parser.parse_args(argv)

    with open(options.features) as features_file:
        features = json.load(features_file)
    compareDerivedFeatures(features, options.from_mhtml)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
import argparse
import json
import os
import sys


def compare_innerText(dfeature, mdfeature):
    """Compare the distilled content from the original page with the one from the mhtml archive

    Args:
        dfeature (str): filename of the distilled feature from the original page
        mdfeature (str): filename of the distilled feature from mhtml archive

    Returns:
        True if the content is the same, or if the mhtml text is contained in
        the text from the original page; False otherwise (a diagnostic is
        printed in that case).
    """
    with open(dfeature) as f:
        d = json.load(f)
    with open(mdfeature) as f:
        md = json.load(f)
    mhtml = os.path.splitext(mdfeature)[0] + '.mhtml'

    original_text = d['features']['innerText']
    mhtml_text = md['features']['innerText']
    if original_text == mhtml_text:
        return True
    if mhtml_text in original_text:
        # The one from the original might have next page stitched.
        return True

    if mhtml_text == 'No data found.':
        print('%s failed to distill, but %s can' % (mhtml, d['url']))
    else:
        print('\n[ERROR] Different distilled content.\nFrom original (%s):\n"%s"\n\n\nFrom mhtml (%s):\n"%s"\n' % (
            d['url'], original_text,
            mhtml, mhtml_text
        ))
    return False


def compare_distilled(dir):
    """Compare all the distilled contents from the original pages with those from the mhtml archives

    Args:
        dir (str): directory containing all the extracted features

    Returns:
        The number of .mdfeature files whose content differs from the
        matching .dfeature file.
    """
    # Sorted so the report order does not depend on filesystem enumeration.
    files = [os.path.join(dir, f) for f in sorted(os.listdir(dir))]
    mdfeatures = [f for f in files
                  if os.path.isfile(f) and os.path.splitext(f)[1] == '.mdfeature']
    err = 0
    for mdfeature in mdfeatures:
        dfeature = os.path.splitext(mdfeature)[0] + '.dfeature'
        if not compare_innerText(dfeature, mdfeature):
            err += 1
    print('%d/%d have different distilled content from mhtml' % (err, len(mdfeatures)))
    return err


def main(argv):
    """Parse command-line arguments and run the comparison on --dir."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', required=True, help="data directory")
    options = parser.parse_args(argv)

    compare_distilled(options.dir)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
directory") 60 | options = parser.parse_args(argv) 61 | 62 | compare_distilled(options.dir) 63 | 64 | if __name__ == '__main__': 65 | sys.exit(main(sys.argv[1:])) 66 | -------------------------------------------------------------------------------- /heuristics/distillable/extract_features.js: -------------------------------------------------------------------------------- 1 | return (function() { 2 | function hasOGArticle() { 3 | var elems = document.head.querySelectorAll( 4 | 'meta[property="og:type"],meta[name="og:type"]'); 5 | for (var i in elems) { 6 | if (elems[i].content && elems[i].content.toUpperCase() == 'ARTICLE') { 7 | return true; 8 | } 9 | } 10 | return false; 11 | } 12 | 13 | function isVisible(e) { 14 | var bounds = e.getBoundingClientRect() 15 | var style = window.getComputedStyle(e); 16 | return !( 17 | (bounds.height == 0 && bounds.width == 0) || 18 | style.display == "none" || 19 | style.visibility == "hidden" || 20 | style.opacity == 0 21 | ) 22 | } 23 | 24 | function countVisible(nodes) { 25 | var count = 0; 26 | for (var i = 0; i < nodes.length; i++) { 27 | var node = nodes[i]; 28 | if (!isVisible(node)) { 29 | continue; 30 | } 31 | count++; 32 | } 33 | return count; 34 | } 35 | 36 | var unlikelyCandidates = /banner|combx|comment|community|disqus|extra|foot|header|menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i; 37 | var okMaybeItsACandidate = /and|article|body|column|main|shadow/i; 38 | 39 | function mozScore() { 40 | return _mozScore(true, 0.5, 140, true, 1e100); 41 | } 42 | 43 | function _mozScore(trim, power, cut, excludeLi, saturate) { 44 | var score = 0; 45 | 46 | var nodes = document.querySelectorAll('p,pre') 47 | for (var i = 0; i < nodes.length; i++) { 48 | var node = nodes[i]; 49 | if (!isVisible(node)) { 50 | continue; 51 | } 52 | var matchString = node.className + " " + node.id; 53 | if (unlikelyCandidates.test(matchString) && 54 | !okMaybeItsACandidate.test(matchString)) 
{ 55 | continue; 56 | } 57 | 58 | if (excludeLi && node.matches && node.matches("li p")) { 59 | continue; 60 | } 61 | 62 | var textContent = node.textContent; 63 | if (trim) textContent = textContent.trim(); 64 | var textContentLength = textContent.length; 65 | textContentLength = Math.min(saturate, textContentLength) 66 | if (textContentLength < cut) { 67 | continue; 68 | } 69 | 70 | score += Math.pow(textContentLength - cut, power); 71 | } 72 | return score; 73 | } 74 | 75 | var body = document.body; 76 | var features = { 77 | 'opengraph': hasOGArticle(), 78 | 'url': document.location.href, 79 | 'title': document.title, 80 | 'numElements': body.querySelectorAll('*').length, 81 | 'numAnchors': body.querySelectorAll('a').length, 82 | 'numForms': body.querySelectorAll('form').length, 83 | 'numTextInput': body.querySelectorAll('input[type="text"]').length, 84 | 'numPasswordInput': body.querySelectorAll('input[type="password"]').length, 85 | 'numPPRE': body.querySelectorAll('p,pre').length, 86 | 'innerText': body.innerText, 87 | 'textContent': body.textContent, 88 | 'innerHTML': body.innerHTML, 89 | 'mozScore': Math.min(6 * Math.sqrt(1000 - 140), _mozScore(false, 0.5, 140, true, 1000)), 90 | 'mozScoreAllSqrt': Math.min(6 * Math.sqrt(1000), _mozScore(false, 0.5, 0, true, 1000)), 91 | 'mozScoreAllLinear': Math.min(6 * 1000, _mozScore(false, 1, 0, true, 1000)), 92 | 'visibleElements': countVisible(body.querySelectorAll('*')), 93 | 'visiblePPRE': countVisible(body.querySelectorAll('p,pre')), 94 | } 95 | return features; 96 | })() 97 | -------------------------------------------------------------------------------- /heuristics/distillable/gen_mhtml_corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import os 6 | 7 | """Converter from scrawled MHTML to DOM Distiller eval corpora. 8 | 9 | Usage: 10 | - Scrawl with get_screenshots.py with MHTML output. 
def gen_corpus(id):
    """Write one eval-corpus entry for the crawled page ``id`` to stdout.

    Reads ``<id>.feature`` (JSON holding the page URL and title) and
    ``<id>.mhtml`` from the current directory. ``<id>.info`` must exist but is
    not read. Progress and problems are reported on stderr:

    - a missing ``.feature`` or ``.info`` file is reported and skipped;
    - an MHTML file that does not start with ``From:`` is reported and skipped;
    - an MHTML file that is not valid UTF-8 is reported and skipped;
    - a ``.`` is written for every entry emitted.

    Args:
        id: Identifier of the crawled page, used as the file name stem.
    """
    info = '%s.info' % id
    feature = '%s.feature' % id
    if not os.path.exists(feature) or not os.path.exists(info):
        sys.stderr.write("\nERROR ID %s doesn't exist\n" % id)
        return
    with open(feature, 'r') as feature_file:
        features = json.load(feature_file)
    url = features['url']
    title = features['features']['title']
    try:
        with open('%s.mhtml' % id, 'rb') as mhtml_file:
            raw = mhtml_file.read()
        # Check the signature on the raw bytes before decoding, so a file that
        # is not MHTML is reported as skipped rather than as a decoding error.
        if not raw.startswith(b'From:'):
            sys.stderr.write("\nSKIPPED ID %s\n" % id)
            return
        # Only UTF-8 is supported; anything else raises UnicodeDecodeError.
        mhtml = raw.decode('utf-8')
        res = [
            url,
            'url: ' + json.dumps(url),
            'title: ' + json.dumps(title),
            'html: ' + json.dumps(mhtml),
            'content: ""',
            '',
        ]
        sys.stdout.write('\n'.join(res) + '\n')
        sys.stderr.write('.')
    except UnicodeDecodeError as e:
        sys.stderr.write("\nERROR handling ID %s\n" % id)
        sys.stderr.write('%s\n' % e)


def main(argv):
    """Convert every whitespace-separated ID read from stdin.

    Args:
        argv: Command-line arguments (unused).

    Returns:
        The process exit status, always 0.
    """
    ids = sys.stdin.read().split()
    for entry_id in ids:
        gen_corpus(entry_id)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
#!/bin/bash
# Copyright 2014 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Installs required build dependencies (to buildtools/ and the local system).
# Must be run as root, normally through sudo so that files placed under
# buildtools/ end up owned by the invoking user.

# If Chrome is older than CHROME_MIN_VERSION, force update.
[ -z "$CHROME_MIN_VERSION" ] && CHROME_MIN_VERSION=49

# Prints the major version number of the installed google-chrome.
chrome_major_version() {
  google-chrome --version | tr " " "\n" | awk '/[0-9.]/{print ($1+0); exit}'
}

(
  set -e
  if [ "$(id -u)" != "0" ]; then
    echo "Please run this as root."
    exit 1
  fi

  apt-get update
  apt-get install -y \
    ant \
    openjdk-8-jdk \
    protobuf-compiler \
    python \
    python-setuptools \
    python-protobuf \
    unzip \
    wget \
    xdotool \
    xvfb

  # Specify JDK version in case there are other versions installed.
  # We are already root here, so no nested sudo is needed.
  update-alternatives --set java \
    "$(update-alternatives --list java | grep java-8 | head -n 1)"

  if ! command -v google-chrome >/dev/null 2>&1; then
    wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
    # Overwrite rather than append so that re-running after a failed attempt
    # does not leave duplicate entries in the list file.
    echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" \
      > /etc/apt/sources.list.d/google-chrome.list
    apt-get update
    apt-get install -y google-chrome-stable
  fi

  # Update chrome if it is too old, and keep the default channel.
  CHROME_VERSION=$(chrome_major_version)
  if (( CHROME_VERSION < CHROME_MIN_VERSION )); then
    echo "Current Chrome version is $CHROME_VERSION. Updating to latest."
    case "$(google-chrome --version)" in
      *dev*)
        apt-get install -y google-chrome-unstable
        ;;
      *beta*)
        apt-get install -y google-chrome-beta
        ;;
      *)
        apt-get install -y google-chrome-stable
        ;;
    esac
  fi

  # SUDO_USER is empty when the script is started from a root shell; fall back
  # to the current user so the "sudo -u" calls below still get a user name.
  user=${SUDO_USER:-$(id -un)}
  bit=$(getconf LONG_BIT)
  domdistiller=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
  pkg=selenium-3.141.0
  tar=${pkg}.tar.gz
  zip=chromedriver_linux${bit}.zip
  tools=$domdistiller/buildtools

  # Use an unpredictable scratch directory and remove it however we exit.
  tmp=$(mktemp -d /tmp/domdistiller-XXXXXX)
  trap 'rm -rf "$tmp"' EXIT
  # mktemp creates the directory as 0700; $user must be able to read the
  # downloaded archive from it when unzip runs under "sudo -u".
  chmod 755 "$tmp"
  cd "$tmp"

  # The version scheme changed after 2.46. See http://chromedriver.chromium.org/downloads/version-selection
  CHROME_VERSION=$(chrome_major_version)
  # A failed lookup must not abort the script; fall back to a fixed version.
  VERSION=$(wget -q --output-document - \
    "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_VERSION}" || true)
  [ -z "${VERSION}" ] && VERSION=2.24
  wget "https://chromedriver.storage.googleapis.com/${VERSION}/$zip"
  chmod a+r "$zip"
  sudo -u "$user" mkdir -p "$tools"
  sudo -u "$user" unzip -o -d "$tools" "$zip"
  chmod u+x "$tools/chromedriver"

  wget "https://pypi.python.org/packages/source/s/selenium/$tar"
  tar -xf "$tar"
  cd "$pkg"

  python setup.py install
)
-------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller; 23 | 24 | import org.chromium.distiller.document.TextDocument; 25 | 26 | /** 27 | * A generic {@link BoilerpipeFilter}. Takes a {@link TextDocument} and 28 | * processes it somehow. 29 | * 30 | * @author Christian Kohlschütter 31 | */ 32 | public interface BoilerpipeFilter { 33 | /** 34 | * Processes the given document doc. 35 | * 36 | * @param doc 37 | * The {@link TextDocument} that is to be processed. 38 | * @return true if changes have been made to the 39 | * {@link TextDocument}. 40 | */ 41 | boolean process(final TextDocument doc); 42 | } 43 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/DomDistillerEntry.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
/**
 * Used to walk the subtree of the DOM rooted at a particular Node. It provides a Visitor interface
 * to allow some processing to be done at each node of the walk.
 *
 * The walk is iterative (no recursion) and pre-order: a node is visited before its children, and
 * children are visited in document order.
 */
public class DomWalker {
    // Receives the visit()/exit() callbacks for every node reached by walk().
    private final Visitor visitor;

    public interface Visitor {
        /**
         * Called when reaching a Node during the walk.
         *
         * @return Whether to process the subtree rooted at this node. If false, all children of
         * this node will be skipped and exit() will not be called for this node.
         */
        public boolean visit(Node n);

        /**
         * Called when exiting a node. I.e. after visiting all of its children.
         */
        public void exit(Node n);

        /**
         * Called when skipping an element. A normal walk doesn't skip any elements.
         * DomWalker.walk() itself never invokes this method.
         */
        public void skip(Element e);
    }

    /**
     * @param v The visitor notified about every node reached during a walk.
     */
    public DomWalker(Visitor v) {
        visitor = v;
    }

    /**
     * Walk the subtree rooted at top. Siblings and ancestors of top are never visited.
     *
     * @param top The root of the subtree to walk.
     */
    public void walk(Node top) {
        // Conceptually, this maintains a pointer to the currently "walked" node. When first seeing
        // the node, it calls visit() on it. The next node to visit is then (1) the first child, (2)
        // the next sibling, or (3) the next sibling of the first ancestor w/ a next sibling.
        //
        // Every time the walk "crosses" the "exit" of a node (i.e. when the pointer goes from
        // somewhere in the node's subtree to somewhere outside of that subtree), exit() is called
        // for that node (unless visit() for that node returned false).
        if (!visitor.visit(top)) return;
        Node n = top.getFirstChild();
        if (n != null) {
            // The pointer returns to top only after the whole subtree has been processed.
            while (n != top) {
                // shouldExit is used to suppress the exit call for the current node when visit()
                // returns false.
                boolean shouldExit = false;
                if (visitor.visit(n)) {
                    Node c = n.getFirstChild();
                    if (c != null) {
                        // Descend first; n is exited later, when the walk climbs back to it.
                        n = c;
                        continue;
                    }
                    // Visited leaf: it is exited immediately in the loop below.
                    shouldExit = true;
                }

                // Move to the next sibling, climbing (and exiting) ancestors until one has a
                // next sibling or top is reached.
                while (n != top) {
                    if (shouldExit) visitor.exit(n);
                    Node s = n.getNextSibling();
                    if (s != null) {
                        n = s;
                        break;
                    }
                    n = n.getParentNode();
                    // Any ancestor reached here had visit() return true, so it must be exited.
                    shouldExit = true;
                }
            }
        }
        // top is exited exactly once, after all of its descendants.
        visitor.exit(top);
    }
}
/**
 * Provides implementations of various javascript/DOM functions not available in GWT.
 * All members are static; the class cannot be instantiated.
 */
public final class JavaScript {
    /**
     * Calls the DOM <code>contains</code> method of <code>l</code> with <code>r</code>.
     *
     * @param l The node whose <code>contains</code> method is invoked.
     * @param r The node passed to it.
     * @return The result of <code>l.contains(r)</code>.
     */
    public static native boolean contains(Node l, Node r) /*-{
        return l.contains(r);
    }-*/;

    /**
     * Calls the JavaScript global <code>parseFloat</code> on <code>s</code>.
     *
     * @param s The text to parse.
     * @return The parsed number, as returned by JavaScript.
     */
    public static native double parseFloat(String s) /*-{
        return parseFloat(s);
    }-*/;

    /**
     * Parses <code>s</code> as a base-10 integer; see {@link #parseInt(String, int)}.
     */
    public static int parseInt(String s) {
        return parseInt(s, 10);
    }

    /**
     * Calls the JavaScript global <code>parseInt</code> with the given radix.
     *
     * @param s The text to parse.
     * @param radix The numeric base passed to JavaScript.
     * @return The parsed value converted to a 32-bit integer.
     */
    public static native int parseInt(String s, int radix) /*-{
        // "| 0" converts the result to a 32-bit integer, so NaN becomes 0.
        return parseInt(s, radix) | 0;
    }-*/;

    // Not instantiable.
    private JavaScript() {
    }
}
52 | * 53 | * @return The root node of the cloned tree 54 | */ 55 | public Node cloneSubtreeRetainDirection() { 56 | Node clone = node.cloneNode(false); 57 | if (node.getNodeType() == Node.ELEMENT_NODE) { 58 | String direction = DomUtil.getComputedStyle(Element.as(node)).getProperty("direction"); 59 | if (direction.isEmpty()) { 60 | direction = "auto"; 61 | } 62 | Element.as(clone).setAttribute("dir", direction); 63 | } 64 | for (NodeTree child : children) { 65 | clone.appendChild(child.cloneSubtreeRetainDirection()); 66 | } 67 | return clone; 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/OrderedNodeMatcher.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller; 6 | 7 | import com.google.gwt.dom.client.Node; 8 | 9 | import java.util.Iterator; 10 | import java.util.List; 11 | 12 | /** 13 | * Used to match a list of nodes in order. 14 | */ 15 | public class OrderedNodeMatcher { 16 | private final Iterator nodeIter; 17 | private Node nextNode; 18 | 19 | public OrderedNodeMatcher(List nodes) { 20 | this.nodeIter = nodes.iterator(); 21 | if (!nodes.isEmpty()) { 22 | nextNode = this.nodeIter.next(); 23 | } 24 | } 25 | 26 | /** 27 | * @return Whether the node matches the next one in the list. If true, the expected next node 28 | * will be advanced. 29 | */ 30 | public boolean match(Node n) { 31 | if (!n.equals(nextNode)) return false; 32 | 33 | nextNode = nodeIter.hasNext() ? nodeIter.next() : null; 34 | return true; 35 | } 36 | 37 | public Node peek() { 38 | return nextNode; 39 | } 40 | 41 | /** 42 | * @return Whether there are any nodes left to match in the list. 
43 | */ 44 | public boolean isFinished() { 45 | return nextNode == null; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/PageLinkInfo.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller; 6 | 7 | /** 8 | * This class stores information about the link (anchor) after PageParameterDetector has detected 9 | * the page parameter: 10 | * - the page number (as represented by the original plain text) for the link 11 | * - the original page parameter numeric component in the URL (this component would be replaced 12 | * by PageParameterDetector.PAGE_PARAM_PLACEHOLDER in the URL pattern) 13 | * - the position of this link in the list of ascending numbers. 14 | */ 15 | class PageLinkInfo { 16 | int mPageNum; 17 | int mPageParamValue; 18 | int mPosInAscendingList; 19 | 20 | PageLinkInfo(int pageNum, int pageParamValue, int posInAscendingList) { 21 | mPageNum = pageNum; 22 | mPageParamValue = pageParamValue; 23 | mPosInAscendingList = posInAscendingList; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/TreeCloneBuilder.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller; 6 | 7 | import com.google.gwt.dom.client.Element; 8 | import com.google.gwt.dom.client.Node; 9 | 10 | import java.util.List; 11 | 12 | /** 13 | * This provides a way to extract the minimal part of the DOM tree that contains the provided list 14 | * of leaf nodes (the result is still a tree). 
This is a faster method of generating output than 15 | * the NodeListExpander. 16 | */ 17 | public class TreeCloneBuilder { 18 | /** 19 | * Clone the provided node and attempt to specify text directionality ("dir" attribute). 20 | * @param node The node to clone. 21 | * @return The cloned node. 22 | */ 23 | public static Node cloneNode(Node node) { 24 | Node clone = node.cloneNode(false); 25 | if (node.getNodeType() == Node.ELEMENT_NODE) { 26 | String direction = DomUtil.getComputedStyle(Element.as(node)).getProperty("direction"); 27 | if (direction.isEmpty()) { 28 | direction = "auto"; 29 | } 30 | Element.as(clone).setAttribute("dir", direction); 31 | } 32 | return clone; 33 | } 34 | 35 | private static Node cloneChild(Node clone, Node newChild) { 36 | Node cl = cloneNode(newChild); 37 | clone.appendChild(cl); 38 | return cl; 39 | } 40 | 41 | private static Node cloneParent(Node clone, Node newParent) { 42 | Node p = clone.getParentNode(); 43 | if (p == null) { 44 | p = cloneNode(newParent); 45 | p.appendChild(clone); 46 | } 47 | return p; 48 | } 49 | 50 | /** 51 | * This method takes a list of nodes and returns a clone of the minimum tree in the DOM that 52 | * contains all of them. This is done by going through each node, cloning its parent and adding 53 | * children to that parent until the next node is not contained in that parent (originally). 54 | * The list cannot contain a parent of any of the other nodes. Children of the nodes in the 55 | * provided list are excluded. 56 | * @param nodes The list of nodes. 57 | * @return Root node of cloned tree. 
58 | */ 59 | public static Node buildTreeClone(List nodes) { 60 | if (nodes.size() == 1) { 61 | return new NodeTree(nodes.get(0)).cloneSubtree(); 62 | } 63 | Node n = nodes.get(0); 64 | Node clone = n.cloneNode(false); 65 | OrderedNodeMatcher matcher = new OrderedNodeMatcher(nodes); 66 | while (!matcher.isFinished()) { 67 | if (matcher.match(n)) { 68 | if (matcher.isFinished()) break; 69 | } else { 70 | n = n.getFirstChild(); 71 | while (!JavaScript.contains(n, matcher.peek())) { 72 | n = n.getNextSibling(); 73 | } 74 | clone = cloneChild(clone, n); 75 | continue; 76 | } 77 | while (true) { 78 | Node s = n.getNextSibling(); 79 | while (s != null && !JavaScript.contains(s, matcher.peek())) { 80 | s = s.getNextSibling(); 81 | } 82 | if (s != null) { 83 | clone = cloneParent(clone, n.getParentNode()); 84 | clone = cloneChild(clone, s); 85 | n = s; 86 | break; 87 | } 88 | n = n.getParentNode(); 89 | clone = cloneParent(clone, n); 90 | } 91 | } 92 | while (clone.getParentNode() != null) { 93 | clone = clone.getParentNode(); 94 | } 95 | return clone; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/document/TextDocument.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | /** 5 | * boilerpipe 6 | * 7 | * Copyright (c) 2009 Christian Kohlschütter 8 | * 9 | * The author licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. 
You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | */ 21 | package org.chromium.distiller.document; 22 | 23 | import java.util.List; 24 | 25 | /** 26 | * A text document, consisting of one or more {@link TextBlock}s. 27 | * 28 | * @author Christian Kohlschütter 29 | */ 30 | public class TextDocument implements Cloneable { 31 | private final List textBlocks; 32 | 33 | /** 34 | * Creates a new {@link TextDocument} with given {@link TextBlock}s, and no 35 | * title. 36 | * 37 | * @param textBlocks 38 | * The text blocks of this document. 39 | */ 40 | public TextDocument(final List textBlocks) { 41 | this.textBlocks = textBlocks; 42 | } 43 | 44 | /** 45 | * Returns the {@link TextBlock}s of this document. 46 | * 47 | * @return A list of {@link TextBlock}s, in sequential order of appearance. 48 | */ 49 | public List getTextBlocks() { 50 | return textBlocks; 51 | } 52 | 53 | public void applyToModel() { 54 | for (TextBlock tb : getTextBlocks()) { 55 | tb.applyToModel(); 56 | } 57 | } 58 | 59 | /** 60 | * Returns detailed debugging information about the contained {@link TextBlock}s. 
61 | */ 62 | public String debugString() { 63 | String s = ""; 64 | for(TextBlock tb : getTextBlocks()) { 65 | s += tb.toString() + "\n"; 66 | } 67 | return s; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/document/TextDocumentStatistics.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.document; 6 | 7 | /** 8 | * Provides shallow statistics on a given TextDocument. 9 | */ 10 | public final class TextDocumentStatistics { 11 | /** 12 | * @return the sum of number of words in content blocks. 13 | */ 14 | public static int countWordsInContent(TextDocument document) { 15 | int numWords = 0; 16 | for (TextBlock tb : document.getTextBlocks()) { 17 | if (tb.isContent()) numWords += tb.getNumWords(); 18 | } 19 | return numWords; 20 | } 21 | 22 | private TextDocumentStatistics() { 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/extractors/KeepEverythingExtractor.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. 
You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.extractors; 23 | 24 | import org.chromium.distiller.document.TextDocument; 25 | import org.chromium.distiller.filters.simple.MarkEverythingContentFilter; 26 | 27 | /** 28 | * Marks everything as content. 29 | * 30 | * @author Christian Kohlschütter 31 | */ 32 | public final class KeepEverythingExtractor { 33 | 34 | public static final KeepEverythingExtractor INSTANCE = new KeepEverythingExtractor(); 35 | 36 | private KeepEverythingExtractor() { 37 | 38 | } 39 | 40 | public boolean process(TextDocument doc) { 41 | return MarkEverythingContentFilter.INSTANCE.process(doc); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/extractors/KeepEverythingWithMinKWordsExtractor.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. 
/**
 * An extractor that first runs {@link MarkEverythingContentFilter} and then a
 * {@link MinWordsFilter} built with the given minimum word count.
 *
 * NOTE(review): the previous description ("extracts the largest text component
 * of a page") does not match this code; presumably the {@link MinWordsFilter}
 * drops blocks with fewer than kMin words - confirm against that class.
 *
 * @author Christian Kohlschütter
 */
public final class KeepEverythingWithMinKWordsExtractor {

    // Second processing step, configured once with kMin.
    private final MinWordsFilter filter;

    /**
     * @param kMin The value passed to the {@link MinWordsFilter} constructor.
     */
    public KeepEverythingWithMinKWordsExtractor(final int kMin) {
        this.filter = new MinWordsFilter(kMin);
    }

    /**
     * Runs both filters on the document, in order.
     *
     * @param doc The document to process.
     * @return true if either filter returned true.
     */
    public boolean process(TextDocument doc) {
        // Non-short-circuit "|" so that the MinWordsFilter always runs, even when the first
        // filter already returned true.
        return MarkEverythingContentFilter.INSTANCE.process(doc)
                | filter.process(doc);
    }

}
4 | 5 | package org.chromium.distiller.extractors.embeds; 6 | 7 | import org.chromium.distiller.webdocument.WebElement; 8 | 9 | import com.google.gwt.dom.client.Element; 10 | 11 | import java.util.Set; 12 | 13 | /** 14 | * This interface is used to represent an extractor for a particular type of embedded element. 15 | */ 16 | public interface EmbedExtractor { 17 | 18 | /** 19 | * Get a set of HTML tag names that are relevant to this extractor. 20 | * @return set of HTML tag names. 21 | */ 22 | public Set getRelevantTagNames(); 23 | 24 | /** 25 | * Give a particular element, detect if it should be extracted as an embedded element; if not 26 | * return null. 27 | * @param e The element to test. 28 | * @return A {@link WebElement} object that contains information about the embed including type. 29 | */ 30 | public WebElement extract(Element e); 31 | } 32 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/extractors/embeds/VimeoExtractor.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.extractors.embeds; 6 | 7 | import org.chromium.distiller.DomUtil; 8 | import org.chromium.distiller.LogUtil; 9 | import org.chromium.distiller.webdocument.WebEmbed; 10 | 11 | import com.google.gwt.dom.client.AnchorElement; 12 | import com.google.gwt.dom.client.Document; 13 | import com.google.gwt.dom.client.Element; 14 | import com.google.gwt.dom.client.IFrameElement; 15 | 16 | import java.util.HashSet; 17 | import java.util.Map; 18 | import java.util.Set; 19 | 20 | /** 21 | * VimeoExtractor is used for extracting Vimeo videos and relevant information. 
22 | */ 23 | public class VimeoExtractor implements EmbedExtractor { 24 | 25 | private static final Set relevantTags = new HashSet<>(); 26 | static { 27 | relevantTags.add("IFRAME"); 28 | } 29 | 30 | @Override 31 | public Set getRelevantTagNames() { 32 | return relevantTags; 33 | } 34 | 35 | @Override 36 | public WebEmbed extract(Element e) { 37 | if (e == null || !relevantTags.contains(e.getTagName())) { 38 | return null; 39 | } 40 | String src = IFrameElement.as(e).getSrc(); 41 | if (!DomUtil.hasRootDomain(src, "player.vimeo.com")) { 42 | return null; 43 | } 44 | 45 | // Get specific attributes about the Vimeo embed. 46 | AnchorElement anchor = Document.get().createAnchorElement(); 47 | anchor.setHref(src); 48 | String path = anchor.getPropertyString("pathname"); 49 | 50 | Map paramMap = 51 | DomUtil.splitUrlParams(anchor.getPropertyString("search").substring(1)); 52 | 53 | String id = getVimeoIdFromPath(path); 54 | if (id == null) { 55 | return null; 56 | } 57 | 58 | if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) { 59 | LogUtil.logToConsole("Vimeo embed extracted:"); 60 | LogUtil.logToConsole(" ID: " + id); 61 | } 62 | 63 | return new WebEmbed(e, "vimeo", id, paramMap); 64 | } 65 | 66 | /** 67 | * Get the last non-empty part of the path for a Vimeo URL. Stop searching after "video" as it 68 | * is the section just before the ID. 69 | * @param path The full path of the URL. 70 | * @return Either the ID of the video or null. 71 | */ 72 | private String getVimeoIdFromPath(String path) { 73 | // Video ID will be the last part of the path, account for possible tail slash/empty path 74 | // sections. 
75 | String[] pathSplit = path.split("/"); 76 | for (int i = pathSplit.length-1; i >=0; i--) { 77 | if ("video".equals(pathSplit[i])) { 78 | return null; 79 | } else if (pathSplit[i].length() > 0) { 80 | return pathSplit[i]; 81 | } 82 | } 83 | return null; 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/debug/PrintDebugFilter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2012 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.filters.debug; 23 | 24 | import org.chromium.distiller.BoilerpipeFilter; 25 | import org.chromium.distiller.LogUtil; 26 | import org.chromium.distiller.document.TextDocument; 27 | 28 | 29 | /** 30 | * Prints debug information about the current state of the TextDocument. (= 31 | * calls {@link TextDocument#debugString()}. 
32 | * 33 | * @author Christian Kohlschütter 34 | */ 35 | public final class PrintDebugFilter implements BoilerpipeFilter { 36 | /** 37 | * Returns the default instance for {@link PrintDebugFilter}, 38 | * which dumps debug information to System.out 39 | */ 40 | public static final PrintDebugFilter INSTANCE = new PrintDebugFilter(); 41 | 42 | 43 | @Override 44 | public boolean process(TextDocument doc) { 45 | if (!LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_BOILER_PIPE_PHASES)) return false; 46 | LogUtil.logToConsole(doc.debugString()); 47 | return false; 48 | } 49 | 50 | public boolean process(TextDocument doc, boolean changed, String header) { 51 | if (!LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_BOILER_PIPE_PHASES)) return false; 52 | if (changed) { 53 | LogUtil.logToConsole(LogUtil.kBlue + "<<<<< " + header + " >>>>>"); 54 | process(doc); 55 | LogUtil.logToConsole(LogUtil.kBlue + "<<<<< >>>>>"); 56 | } else { 57 | LogUtil.logToConsole(LogUtil.kRed + "~~~~~ No Changes: " + header + " ~~~~~"); 58 | } 59 | return false; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/english/TerminatingBlocksFinder.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. 
You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.filters.english; 23 | 24 | import org.chromium.distiller.BoilerpipeFilter; 25 | import org.chromium.distiller.StringUtil; 26 | import org.chromium.distiller.document.TextBlock; 27 | import org.chromium.distiller.document.TextDocument; 28 | import org.chromium.distiller.labels.DefaultLabels; 29 | 30 | import com.google.gwt.regexp.shared.RegExp; 31 | 32 | /** 33 | * Finds blocks which are potentially indicating the end of an article text and 34 | * marks them with {@link DefaultLabels#STRICTLY_NOT_CONTENT}. 35 | * 36 | * @author Christian Kohlschütter 37 | * @see IgnoreBlocksAfterContentFilter 38 | */ 39 | public class TerminatingBlocksFinder implements BoilerpipeFilter { 40 | public static final TerminatingBlocksFinder INSTANCE = new TerminatingBlocksFinder(); 41 | 42 | /** 43 | * Returns the singleton instance for TerminatingBlocksFinder. 44 | */ 45 | public static TerminatingBlocksFinder getInstance() { 46 | return INSTANCE; 47 | } 48 | 49 | public static final RegExp REG_TERMINATING = RegExp.compile("(" 50 | + "^(comments|© reuters|please rate this|post a comment|" 51 | + "\\d+\\s+(comments|users responded in)" 52 | + ")" 53 | + "|what you think\\.\\.\\." 
54 | + "|add your comment" 55 | + "|add comment" 56 | + "|reader views" 57 | + "|have your say" 58 | + "|reader comments" 59 | + "|rätta artikeln" 60 | + "|^thanks for your comments - this feedback is now closed$" 61 | + ")", 62 | "i"); 63 | 64 | public static boolean isTerminatingText(String longText) { 65 | return REG_TERMINATING.test(longText); 66 | } 67 | 68 | public static boolean isTerminating(TextBlock tb) { 69 | if (tb.getNumWords() > 14) return false; 70 | String text = StringUtil.jsTrim(tb.getText()); 71 | 72 | if (text.length() >= 8) { 73 | return isTerminatingText(text); 74 | } else if (tb.getLinkDensity() == 1.0) { 75 | return text.equals("Comment"); 76 | } else if (text.equals("Shares")) { 77 | // Skip social and sharing elements. 78 | // See crbug.com/692553 79 | return true; 80 | } 81 | return false; 82 | } 83 | 84 | @Override 85 | public boolean process(TextDocument doc) { 86 | boolean changes = false; 87 | 88 | for (TextBlock tb : doc.getTextBlocks()) { 89 | if (isTerminating(tb)) { 90 | tb.addLabel(DefaultLabels.STRICTLY_NOT_CONTENT); 91 | changes = true; 92 | } 93 | } 94 | 95 | return changes; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/heuristics/ExpandTitleToContentFilter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. 
You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.filters.heuristics; 23 | 24 | import org.chromium.distiller.BoilerpipeFilter; 25 | import org.chromium.distiller.document.TextBlock; 26 | import org.chromium.distiller.document.TextDocument; 27 | import org.chromium.distiller.labels.DefaultLabels; 28 | 29 | /** 30 | * Marks all {@link TextBlock}s "content" which are between the headline and the part that 31 | * has already been marked content, if they are marked {@link DefaultLabels#MIGHT_BE_CONTENT}. 32 | * 33 | * This filter is quite specific to the news domain. 34 | * 35 | * @author Christian Kohlschütter 36 | */ 37 | public final class ExpandTitleToContentFilter implements BoilerpipeFilter { 38 | public static final ExpandTitleToContentFilter INSTANCE = new ExpandTitleToContentFilter(); 39 | 40 | /** 41 | * Returns the singleton instance for ExpandTitleToContentFilter. 
42 | */ 43 | public static ExpandTitleToContentFilter getInstance() { 44 | return INSTANCE; 45 | } 46 | 47 | @Override 48 | public boolean process(TextDocument doc) { 49 | int i = 0; 50 | int title = -1; 51 | int contentStart = -1; 52 | for (TextBlock tb : doc.getTextBlocks()) { 53 | if (contentStart == -1 && tb.hasLabel(DefaultLabels.TITLE)) { 54 | title = i; 55 | contentStart = -1; 56 | } 57 | if (contentStart == -1 && tb.isContent()) { 58 | contentStart = i; 59 | } 60 | 61 | i++; 62 | } 63 | 64 | if (contentStart <= title || title == -1) { 65 | return false; 66 | } 67 | boolean changes = false; 68 | for (TextBlock tb : doc.getTextBlocks().subList(title, contentStart)) { 69 | if (tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT)) { 70 | changes = tb.setIsContent(true) | changes; 71 | } 72 | } 73 | return changes; 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/heuristics/HeadingFusion.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.filters.heuristics; 6 | 7 | import org.chromium.distiller.BoilerpipeFilter; 8 | import org.chromium.distiller.document.TextBlock; 9 | import org.chromium.distiller.document.TextDocument; 10 | import org.chromium.distiller.labels.DefaultLabels; 11 | 12 | import java.util.List; 13 | import java.util.ListIterator; 14 | 15 | /** 16 | * Fuses headings with the blocks after them. 17 | * 18 | * If the heading was marked as boilerplate, the fused block will be labeled to prevent 19 | * BlockProximityFusion from merging through it. 20 | */ 21 | public final class HeadingFusion implements BoilerpipeFilter { 22 | 23 | /** 24 | * Creates a new {@link HeadingFusion} instance. 
25 | */ 26 | public HeadingFusion() { 27 | } 28 | 29 | @Override 30 | public boolean process(TextDocument doc) { 31 | List textBlocks = doc.getTextBlocks(); 32 | if (textBlocks.size() < 2) { 33 | return false; 34 | } 35 | 36 | boolean changes = false; 37 | ListIterator it = textBlocks.listIterator(); 38 | TextBlock prevBlock = null, currBlock = it.next(); 39 | while (it.hasNext()) { 40 | prevBlock = currBlock; 41 | currBlock = it.next(); 42 | 43 | if (!prevBlock.hasLabel(DefaultLabels.HEADING)) { 44 | continue; 45 | } 46 | 47 | if (prevBlock.hasLabel(DefaultLabels.STRICTLY_NOT_CONTENT) 48 | || currBlock.hasLabel(DefaultLabels.STRICTLY_NOT_CONTENT)) { 49 | continue; 50 | } 51 | 52 | if (prevBlock.hasLabel(DefaultLabels.TITLE) 53 | || currBlock.hasLabel(DefaultLabels.TITLE)) { 54 | continue; 55 | } 56 | 57 | if (currBlock.isContent()) { 58 | changes = true; 59 | 60 | boolean headingWasContent = prevBlock.isContent(); 61 | prevBlock.mergeNext(currBlock); 62 | currBlock = prevBlock; 63 | it.remove(); 64 | 65 | currBlock.removeLabel(DefaultLabels.HEADING); 66 | if (!headingWasContent) { 67 | currBlock.addLabel(DefaultLabels.BOILERPLATE_HEADING_FUSED); 68 | } 69 | } else if (prevBlock.isContent()) { 70 | changes = true; 71 | prevBlock.setIsContent(false); 72 | } 73 | } 74 | 75 | return changes; 76 | } 77 | 78 | @Override 79 | public String toString() { 80 | return getClass().getName(); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/heuristics/LargeBlockSameTagLevelToContentFilter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.filters.heuristics; 23 | 24 | import org.chromium.distiller.BoilerpipeFilter; 25 | import org.chromium.distiller.document.TextBlock; 26 | import org.chromium.distiller.document.TextDocument; 27 | import org.chromium.distiller.labels.DefaultLabels; 28 | 29 | /** 30 | * Marks all blocks as content that: 31 | *
<ol> 32 | * <li>are on the same tag-level as very likely main content (usually the level of the largest block)</li> 33 | * <li>have a significant number of words, currently: at least 100</li> 34 | * </ol>
35 | * 36 | * @author Christian Kohlschütter 37 | */ 38 | public final class LargeBlockSameTagLevelToContentFilter implements BoilerpipeFilter { 39 | public static final LargeBlockSameTagLevelToContentFilter INSTANCE = new LargeBlockSameTagLevelToContentFilter(); 40 | private LargeBlockSameTagLevelToContentFilter() { 41 | } 42 | 43 | @Override 44 | public boolean process(final TextDocument doc) { 45 | boolean changes = false; 46 | 47 | int tagLevel = -1; 48 | for (TextBlock tb : doc.getTextBlocks()) { 49 | if(tb.isContent() && tb.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) { 50 | tagLevel = tb.getTagLevel(); 51 | break; 52 | } 53 | } 54 | 55 | if(tagLevel == -1) { 56 | return false; 57 | } 58 | 59 | for (TextBlock tb : doc.getTextBlocks()) { 60 | if (!tb.isContent()) { 61 | 62 | if(tb.getNumWords() >= 100 && tb.getTagLevel() == tagLevel) { 63 | tb.setIsContent(true); 64 | changes = true; 65 | } 66 | } 67 | } 68 | 69 | return changes; 70 | 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/heuristics/ListAtEndFilter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.filters.heuristics; 23 | 24 | import org.chromium.distiller.BoilerpipeFilter; 25 | import org.chromium.distiller.document.TextBlock; 26 | import org.chromium.distiller.document.TextDocument; 27 | import org.chromium.distiller.labels.DefaultLabels; 28 | 29 | /** 30 | * Marks nested list-item blocks after the end of the main content. 31 | * 32 | * @author Christian Kohlschütter 33 | */ 34 | public final class ListAtEndFilter implements BoilerpipeFilter { 35 | public static final ListAtEndFilter INSTANCE = new ListAtEndFilter(); 36 | 37 | private ListAtEndFilter() { 38 | } 39 | 40 | @Override 41 | public boolean process(final TextDocument doc) { 42 | boolean changes = false; 43 | 44 | int tagLevel = Integer.MAX_VALUE; 45 | for (TextBlock tb : doc.getTextBlocks()) { 46 | if (tb.isContent() 47 | && tb.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) { 48 | tagLevel = tb.getTagLevel(); 49 | } else { 50 | if (tb.getTagLevel() > tagLevel 51 | && tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT) 52 | && tb.hasLabel(DefaultLabels.LI) 53 | && tb.getLinkDensity() == 0 54 | ) { 55 | tb.setIsContent(true); 56 | changes = true; 57 | } else { 58 | tagLevel = Integer.MAX_VALUE; 59 | } 60 | } 61 | } 62 | 63 | return changes; 64 | 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/simple/BoilerplateBlockFilter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.filters.simple; 23 | 24 | import org.chromium.distiller.BoilerpipeFilter; 25 | import org.chromium.distiller.document.TextBlock; 26 | import org.chromium.distiller.document.TextDocument; 27 | import org.chromium.distiller.labels.DefaultLabels; 28 | 29 | import java.util.Iterator; 30 | import java.util.List; 31 | 32 | /** 33 | * Removes {@link TextBlock}s which have explicitly been marked as 34 | * "not content". 35 | * 36 | * @author Christian Kohlschütter 37 | */ 38 | public final class BoilerplateBlockFilter implements BoilerpipeFilter { 39 | public static final BoilerplateBlockFilter INSTANCE = new BoilerplateBlockFilter( 40 | null); 41 | public static final BoilerplateBlockFilter INSTANCE_KEEP_TITLE = new BoilerplateBlockFilter( 42 | DefaultLabels.TITLE); 43 | private final String labelToKeep; 44 | 45 | /** 46 | * Returns the singleton instance for BoilerplateBlockFilter. 
47 | */ 48 | public static BoilerplateBlockFilter getInstance() { 49 | return INSTANCE; 50 | } 51 | 52 | public BoilerplateBlockFilter(final String labelToKeep) { 53 | this.labelToKeep = labelToKeep; 54 | } 55 | 56 | @Override 57 | public boolean process(TextDocument doc) { 58 | List textBlocks = doc.getTextBlocks(); 59 | boolean hasChanges = false; 60 | 61 | for (Iterator it = textBlocks.iterator(); it.hasNext();) { 62 | TextBlock tb = it.next(); 63 | if (!tb.isContent() 64 | && (labelToKeep == null || !tb 65 | .hasLabel(DefaultLabels.TITLE))) { 66 | it.remove(); 67 | hasChanges = true; 68 | } 69 | } 70 | 71 | return hasChanges; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/simple/LabelToBoilerplateFilter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 
21 | */ 22 | package org.chromium.distiller.filters.simple; 23 | 24 | import org.chromium.distiller.BoilerpipeFilter; 25 | import org.chromium.distiller.document.TextBlock; 26 | import org.chromium.distiller.document.TextDocument; 27 | import org.chromium.distiller.labels.DefaultLabels; 28 | 29 | /** 30 | * Marks all blocks that contain a given label as "boilerplate". 31 | * 32 | * @author Christian Kohlschütter 33 | */ 34 | public final class LabelToBoilerplateFilter implements BoilerpipeFilter { 35 | public static final LabelToBoilerplateFilter INSTANCE_STRICTLY_NOT_CONTENT = new LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT); 36 | 37 | private String[] labels; 38 | 39 | public LabelToBoilerplateFilter(final String... label) { 40 | this.labels = label; 41 | } 42 | 43 | @Override 44 | public boolean process(final TextDocument doc) { 45 | boolean changes = false; 46 | 47 | BLOCK_LOOP: for (TextBlock tb : doc.getTextBlocks()) { 48 | if (tb.isContent()) { 49 | for (String label : labels) { 50 | if (tb.hasLabel(label)) { 51 | tb.setIsContent(false); 52 | changes = true; 53 | continue BLOCK_LOOP; 54 | } 55 | } 56 | } 57 | } 58 | 59 | return changes; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/simple/MarkEverythingBoilerplateFilter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. 
You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.filters.simple; 23 | 24 | import org.chromium.distiller.BoilerpipeFilter; 25 | import org.chromium.distiller.document.TextBlock; 26 | import org.chromium.distiller.document.TextDocument; 27 | 28 | /** 29 | * Marks all blocks as boilerplate. 30 | * 31 | * @author Christian Kohlschütter 32 | */ 33 | public final class MarkEverythingBoilerplateFilter implements BoilerpipeFilter { 34 | public static final MarkEverythingBoilerplateFilter INSTANCE = new MarkEverythingBoilerplateFilter(); 35 | private MarkEverythingBoilerplateFilter() { 36 | } 37 | 38 | @Override 39 | public boolean process(final TextDocument doc) { 40 | boolean changes = false; 41 | 42 | for (TextBlock tb : doc.getTextBlocks()) { 43 | if (tb.isContent()) { 44 | tb.setIsContent(false); 45 | changes = true; 46 | } 47 | } 48 | 49 | return changes; 50 | 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/simple/MarkEverythingContentFilter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. 
You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.filters.simple; 23 | 24 | import org.chromium.distiller.BoilerpipeFilter; 25 | import org.chromium.distiller.document.TextBlock; 26 | import org.chromium.distiller.document.TextDocument; 27 | 28 | /** 29 | * Marks all blocks as content. 30 | * 31 | * @author Christian Kohlschütter 32 | */ 33 | public final class MarkEverythingContentFilter implements BoilerpipeFilter { 34 | public static final MarkEverythingContentFilter INSTANCE = new MarkEverythingContentFilter(); 35 | private MarkEverythingContentFilter() { 36 | } 37 | 38 | @Override 39 | public boolean process(final TextDocument doc) { 40 | boolean changes = false; 41 | 42 | for (TextBlock tb : doc.getTextBlocks()) { 43 | if (!tb.isContent()) { 44 | tb.setIsContent(true); 45 | changes = true; 46 | } 47 | } 48 | 49 | return changes; 50 | 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/filters/simple/MinWordsFilter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. 
You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.filters.simple; 23 | 24 | import org.chromium.distiller.BoilerpipeFilter; 25 | import org.chromium.distiller.document.TextBlock; 26 | import org.chromium.distiller.document.TextDocument; 27 | 28 | /** 29 | * Keeps only those content blocks which contain at least k words. 30 | * 31 | * @author Christian Kohlschütter 32 | */ 33 | public final class MinWordsFilter implements BoilerpipeFilter { 34 | private final int minWords; 35 | 36 | public MinWordsFilter(final int minWords) { 37 | this.minWords = minWords; 38 | } 39 | 40 | @Override 41 | public boolean process(final TextDocument doc) { 42 | boolean changes = false; 43 | 44 | for (TextBlock tb : doc.getTextBlocks()) { 45 | if (!tb.isContent()) { 46 | continue; 47 | } 48 | if (tb.getNumWords() < minWords) { 49 | tb.setIsContent(false); 50 | changes = true; 51 | } 52 | 53 | } 54 | 55 | return changes; 56 | 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/labels/DefaultLabels.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
/**
 * Label constants for use with {@link TextBlock#addLabel(String)} and
 * {@link TextBlock#hasLabel(String)}. This class only holds constants.
 *
 * @author Christian Kohlschütter
 */
public final class DefaultLabels {
    private DefaultLabels() {
        // not to be instantiated
    }

    // Labels carrying the original boilerpipe namespace prefix.
    public static final String TITLE = "de.l3s.boilerpipe/TITLE";
    public static final String ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA";
    public static final String MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT";
    public static final String VERY_LIKELY_CONTENT = "de.l3s.boilerpipe/VERY_LIKELY_CONTENT";
    public static final String HR = "de.l3s.boilerpipe/HR";
    public static final String LI = "de.l3s.boilerpipe/LI";

    // Heading labels.
    public static final String HEADING = "de.l3s.boilerpipe/HEADING";
    public static final String H1 = "de.l3s.boilerpipe/H1";
    public static final String H2 = "de.l3s.boilerpipe/H2";
    public static final String H3 = "de.l3s.boilerpipe/H3";

    public static final String MARKUP_PREFIX = "<";

    // Labels without the namespace prefix.
    public static final String BOILERPLATE_HEADING_FUSED = "BOILERPLATE_HEADING_FUSED";
    public static final String STRICTLY_NOT_CONTENT = "STRICTLY_NOT_CONTENT";
    public static final String SIBLING_OF_MAIN_CONTENT = "SIBLING_OF_MAIN_CONTENT";
}
STRICTLY_NOT_CONTENT = "STRICTLY_NOT_CONTENT"; 50 | public static final String SIBLING_OF_MAIN_CONTENT = "SIBLING_OF_MAIN_CONTENT"; 51 | 52 | private DefaultLabels() { 53 | // not to be instantiated 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/labels/LabelAction.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | /** 6 | * boilerpipe 7 | * 8 | * Copyright (c) 2009, 2010 Christian Kohlschütter 9 | * 10 | * The author licenses this file to You under the Apache License, Version 2.0 11 | * (the "License"); you may not use this file except in compliance with 12 | * the License. You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | package org.chromium.distiller.labels; 23 | 24 | import org.chromium.distiller.document.TextBlock; 25 | 26 | import java.util.Arrays; 27 | 28 | /** 29 | * Helps adding labels to {@link TextBlock}s. 30 | * 31 | * @author Christian Kohlschütter 32 | * @see ConditionalLabelAction 33 | */ 34 | public class LabelAction { 35 | protected final String[] labels; 36 | 37 | public LabelAction(String... 
labels) { 38 | this.labels = labels; 39 | } 40 | 41 | public void addTo(final TextBlock tb) { 42 | addLabelsTo(tb); 43 | } 44 | 45 | protected final void addLabelsTo(final TextBlock tb) { 46 | tb.addLabels(labels); 47 | } 48 | 49 | public String[] getLabels() { 50 | return labels; 51 | } 52 | 53 | @Override 54 | public String toString() { 55 | return super.toString()+"{"+Arrays.asList(labels)+"}"; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/WebDocumentBuilderInterface.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument; 6 | 7 | import com.google.gwt.dom.client.Element; 8 | import com.google.gwt.dom.client.Node; 9 | import com.google.gwt.dom.client.Text; 10 | 11 | public interface WebDocumentBuilderInterface { 12 | void skipElement(Element e); 13 | void startElement(Element element); 14 | void endElement(); 15 | void textNode(Text textNode); 16 | void lineBreak(Node node); 17 | void dataTable(Element e); 18 | void tag(WebTag tag); 19 | void embed(WebElement embedNode); 20 | } 21 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/WebElement.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument; 6 | 7 | /** 8 | * A WebElement is some logical part of a web document (text block, image, video, table, etc.). 
/**
 * A WebElement is some logical part of a web document (text block, image, video, table, etc.).
 */
public abstract class WebElement {
    // Whether this element is currently considered content; false until set.
    private boolean isContent;

    public boolean getIsContent() {
        return isContent;
    }

    public void setIsContent(boolean isContent) {
        this.isContent = isContent;
    }

    /**
     * Generate the output for this WebElement.
     * @param textOnly Selects a text-only rendering instead of HTML; the exact result is up to
     *                 each subclass.
     * @return Displayable content representing this WebElement.
     */
    public abstract String generateOutput(boolean textOnly);
}
36 | */ 37 | public WebEmbed(Element e, String t, String embedId, Map params) { 38 | embedNodes = new ArrayList<>(); 39 | id = embedId; 40 | embedNodes.add(e); 41 | setType(t); 42 | if (params == null) { 43 | altParams = new HashMap<>(); 44 | } else { 45 | altParams = params; 46 | } 47 | } 48 | 49 | @Override 50 | public String generateOutput(boolean textOnly) { 51 | if (textOnly) return ""; 52 | // Generate a placeholder for javascript to replace with the real embed. 53 | Element embed = Document.get().createDivElement(); 54 | embed.setClassName("embed-placeholder"); 55 | embed.setAttribute("data-type", type); 56 | embed.setAttribute("data-id", id); 57 | return embed.getString(); 58 | } 59 | 60 | /** 61 | * Get the map of parameters associated with this embed. 62 | * @return A map of the parameters or an empty map if there are no parameters. 63 | */ 64 | public Map getParams() { 65 | return altParams; 66 | } 67 | 68 | /** 69 | * Get the ID of this embed. 70 | * @return Embed ID. 71 | */ 72 | public String getId() { 73 | return id; 74 | } 75 | 76 | protected void setType(String t) { 77 | type = t; 78 | } 79 | 80 | public String getType() { 81 | if (type == null) return ""; 82 | return type; 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/WebFigure.java: -------------------------------------------------------------------------------- 1 | package org.chromium.distiller.webdocument; 2 | 3 | import com.google.gwt.dom.client.Document; 4 | import com.google.gwt.dom.client.Element; 5 | import org.chromium.distiller.DomUtil; 6 | 7 | /** 8 | * WebFigure represents a figure element, containing an image and optionally a caption. 9 | */ 10 | public class WebFigure extends WebImage { 11 | 12 | private Element figCaption; 13 | 14 | /** 15 | * Build a figure element. 16 | * @param e The element detected as an image. 17 | * @param w The original width of the image. 
18 | * @param h The original height of the image. 19 | * @param src The source URL of the image being extracted. 20 | * @param caption The element containing the caption of the image. 21 | */ 22 | public WebFigure(Element e, int w, int h, String src, Element caption) { 23 | super(e, w, h, src); 24 | figCaption = caption; 25 | } 26 | 27 | /** 28 | * WebFigure extends WebImage so it can use WebImage generated output 29 | * and just handle the caption since an html figure is basically a 30 | * placeholder for an image and a caption. 31 | */ 32 | @Override 33 | public String generateOutput(boolean textOnly) { 34 | Element figcaption = DomUtil.cloneAndProcessTree(figCaption); 35 | if (textOnly) { 36 | return DomUtil.getTextFromTreeForTest(figcaption); 37 | } 38 | 39 | Element figure = Document.get().createElement("FIGURE"); 40 | figure.appendChild(getProcessedNode()); 41 | if (!figCaption.getInnerHTML().isEmpty()) { 42 | figure.appendChild(figcaption); 43 | } 44 | return figure.getString(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/WebTable.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument; 6 | 7 | import org.chromium.distiller.DomUtil; 8 | 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | 12 | import com.google.gwt.dom.client.Element; 13 | import com.google.gwt.dom.client.ImageElement; 14 | import com.google.gwt.dom.client.NodeList; 15 | 16 | public class WebTable extends WebElement { 17 | private Element tableElement; 18 | // Cloned and processed table element. 
19 | private Element cloned; 20 | 21 | public WebTable(Element tableRoot) { 22 | tableElement = tableRoot; 23 | } 24 | 25 | private void cloneAndProcessNode() { 26 | cloned = DomUtil.cloneAndProcessTree(tableElement); 27 | } 28 | 29 | @Override 30 | public String generateOutput(boolean textOnly) { 31 | if (cloned == null) { 32 | cloneAndProcessNode(); 33 | } 34 | if (textOnly) { 35 | return DomUtil.getTextFromTreeForTest(cloned); 36 | } 37 | return Element.as(cloned).getString(); 38 | } 39 | 40 | public Element getTableElement() { 41 | return tableElement; 42 | } 43 | 44 | /** 45 | * Get the list of source URLs of this image. 46 | * It's more efficient to call after generateOutput(). 47 | * @return Source URLs or an empty List. 48 | */ 49 | public List getImageUrlList() { 50 | if (cloned == null) { 51 | cloneAndProcessNode(); 52 | } 53 | List imgUrls = new ArrayList<>(); 54 | NodeList imgs = DomUtil.querySelectorAll(cloned, "IMG, SOURCE"); 55 | for (int i = 0; i < imgs.getLength(); i++) { 56 | ImageElement ie = (ImageElement) imgs.getItem(i); 57 | if (!ie.getSrc().isEmpty()) { 58 | imgUrls.add(ie.getSrc()); 59 | } 60 | imgUrls.addAll(DomUtil.getAllSrcSetUrls(ie)); 61 | } 62 | return imgUrls; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/WebTag.java: -------------------------------------------------------------------------------- 1 | package org.chromium.distiller.webdocument; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | /** 7 | * This class represents HTML tags that need to be preserved over 8 | * the distillation process. 
9 | */ 10 | public class WebTag extends WebElement { 11 | private String tagName; 12 | private TagType tagType; 13 | 14 | public enum TagType { 15 | START, END 16 | } 17 | 18 | private static Set nestingTags; 19 | static { 20 | nestingTags = new HashSet(); 21 | nestingTags.add("UL"); 22 | nestingTags.add("OL"); 23 | nestingTags.add("LI"); 24 | nestingTags.add("BLOCKQUOTE"); 25 | nestingTags.add("PRE"); 26 | } 27 | 28 | public WebTag(String tagName, TagType tagType) { 29 | this.tagName = tagName; 30 | this.tagType = tagType; 31 | } 32 | 33 | public boolean isStartTag() { 34 | return tagType == TagType.START; 35 | } 36 | 37 | public String getTagName() { 38 | return tagName; 39 | } 40 | 41 | @Override 42 | public String generateOutput(boolean textOnly) { 43 | if (textOnly) { 44 | return ""; 45 | } 46 | return "<" + (isStartTag() ? "" : "/") + tagName + ">"; 47 | } 48 | 49 | public static boolean canBeNested(String tagName) { 50 | return nestingTags.contains(tagName); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/WebTextBuilder.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | package org.chromium.distiller.webdocument; 6 | 7 | import org.chromium.distiller.StringUtil; 8 | 9 | import com.google.gwt.dom.client.Node; 10 | import com.google.gwt.dom.client.Text; 11 | 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | public class WebTextBuilder { 16 | private String textBuffer = ""; 17 | private int numWords; 18 | private int numAnchorWords; 19 | 20 | private int blockTagLevel = -1; 21 | private boolean inAnchor; 22 | 23 | private final List allTextNodes = new ArrayList(); 24 | private int firstNode; 25 | private int firstNonWhitespaceNode = -1; 26 | private int lastNonWhitespaceNode; 27 | 28 | public void textNode(Text textNode, int tagLevel) { 29 | String text = textNode.getData(); 30 | 31 | if (text.isEmpty()) { 32 | return; 33 | } 34 | 35 | textBuffer += text; 36 | allTextNodes.add(textNode); 37 | 38 | if (StringUtil.isStringAllWhitespace(text)) { 39 | return; 40 | } 41 | 42 | int thisWords = StringUtil.countWords(text); 43 | numWords += thisWords; 44 | if (inAnchor) { 45 | numAnchorWords += thisWords; 46 | } 47 | 48 | lastNonWhitespaceNode = allTextNodes.size() - 1; 49 | if (firstNonWhitespaceNode < firstNode) { 50 | firstNonWhitespaceNode = lastNonWhitespaceNode; 51 | } 52 | 53 | if (blockTagLevel == -1) { 54 | blockTagLevel = tagLevel; 55 | } 56 | } 57 | 58 | public void lineBreak(Node node) { 59 | textBuffer += "\n"; 60 | allTextNodes.add(node); 61 | } 62 | 63 | public void reset() { 64 | textBuffer = ""; 65 | numWords = 0; 66 | numAnchorWords = 0; 67 | firstNode = allTextNodes.size(); 68 | blockTagLevel = -1; 69 | } 70 | 71 | public WebText build(int offsetBlock) { 72 | if (firstNode == allTextNodes.size()) { 73 | return null; 74 | } 75 | 76 | if (firstNonWhitespaceNode < firstNode) { 77 | reset(); 78 | return null; 79 | } 80 | 81 | WebText tb = new WebText(textBuffer, allTextNodes, firstNode, allTextNodes.size(), 82 | firstNonWhitespaceNode, lastNonWhitespaceNode, numWords, numAnchorWords, 83 | 
blockTagLevel, offsetBlock); 84 | reset(); 85 | return tb; 86 | } 87 | 88 | public void enterAnchor() { 89 | inAnchor = true; 90 | textBuffer += ' '; 91 | } 92 | 93 | public void exitAnchor() { 94 | inAnchor = false; 95 | textBuffer += ' '; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/WebVideo.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument; 6 | 7 | import com.google.gwt.dom.client.Document; 8 | import com.google.gwt.dom.client.Element; 9 | import com.google.gwt.dom.client.Node; 10 | import com.google.gwt.dom.client.VideoElement; 11 | import org.chromium.distiller.DomUtil; 12 | 13 | /** 14 | * WebVideo represents a video in the WebDocument potentially needing extraction. 15 | */ 16 | public class WebVideo extends WebElement { 17 | /** The main video element. */ 18 | private final Element videoElement; 19 | /** The original width of the video in pixels. */ 20 | private final int width; 21 | /** The original height of the video in pixels. */ 22 | private final int height; 23 | 24 | /** 25 | * Build an video element. 26 | * @param e The element detected as an video. 27 | * @param w The original width of the video. 28 | * @param h The original height of the video. 29 | */ 30 | public WebVideo(Element e, int w, int h) { 31 | // TODO(mdjones): Handle multiple nested "source" and "track" tags. 
32 | videoElement = e; 33 | width = w; 34 | height = h; 35 | } 36 | 37 | @Override 38 | public String generateOutput(boolean textOnly) { 39 | if (textOnly) return ""; 40 | VideoElement ve = (VideoElement) videoElement.cloneNode(false); 41 | for (int i = 0; i < videoElement.getChildCount(); i++) { 42 | Node curNode = videoElement.getChild(i); 43 | if (curNode.getNodeType() != Node.ELEMENT_NODE) continue; 44 | 45 | Element el = Element.as(curNode); 46 | // Only take "source" and "track" children. 47 | if ("SOURCE".equals(el.getTagName()) || "TRACK".equals(el.getTagName())) { 48 | ve.appendChild(el.cloneNode(false)); 49 | } 50 | } 51 | 52 | if (!ve.getPoster().isEmpty()) { 53 | ve.setPoster(ve.getPoster()); 54 | } 55 | DomUtil.makeAllSrcAttributesAbsolute(ve); 56 | DomUtil.stripIds(ve); 57 | DomUtil.stripAllUnsafeAttributes(ve); 58 | 59 | return ve.getString(); 60 | } 61 | 62 | /** 63 | * Get the video element of this WebVideo. 64 | * @return Video element or null. 65 | */ 66 | public Element getVideoElement() { 67 | return videoElement; 68 | } 69 | 70 | /** 71 | * Get the width of this video in pixels. 72 | * @return The width of this video in pixels. 73 | */ 74 | public int getWidth() { 75 | return width; 76 | } 77 | 78 | /** 79 | * Get the height of this video in pixels. 80 | * @return The height of this video in pixels. 
81 | */ 82 | public int getHeight() { 83 | return height; 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/filters/NestedElementRetainer.java: -------------------------------------------------------------------------------- 1 | package org.chromium.distiller.webdocument.filters; 2 | 3 | import org.chromium.distiller.webdocument.WebDocument; 4 | import org.chromium.distiller.webdocument.WebElement; 5 | import org.chromium.distiller.webdocument.WebTag; 6 | 7 | import java.util.Stack; 8 | 9 | /** 10 | * This class is used to identify what WebTag should be 11 | * marked as isContent based on its {@link WebElement}s inside. 12 | * A {@link WebTag} is content when: 13 | *
<ul> 14 | * <li>Has any {@link WebElement} which is content.</li> 15 | * <li>Has at least one nested {@link WebTag} which is content.</li> 16 | * </ul>
17 | */ 18 | public class NestedElementRetainer { 19 | public static void process(WebDocument document) { 20 | boolean isContent = false; 21 | int stackMark = -1; 22 | Stack stack = new Stack<>(); 23 | 24 | for (WebElement e : document.getElements()) { 25 | if (!(e instanceof WebTag)) { 26 | if (!isContent) { 27 | isContent = e.getIsContent(); 28 | } 29 | } else { 30 | WebTag webTag = (WebTag) e; 31 | if (webTag.isStartTag()) { 32 | webTag.setIsContent(isContent); 33 | stack.push(webTag); 34 | isContent = false; 35 | } else { 36 | WebTag startWebTag = stack.pop(); 37 | isContent |= stackMark >= stack.size(); 38 | if (isContent) { 39 | stackMark = stack.size() - 1; 40 | } 41 | boolean wasContent = startWebTag.getIsContent(); 42 | startWebTag.setIsContent(isContent); 43 | webTag.setIsContent(isContent); 44 | isContent = wasContent; 45 | } 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/filters/RelevantElements.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | package org.chromium.distiller.webdocument.filters; 6 | 7 | import org.chromium.distiller.webdocument.WebDocument; 8 | import org.chromium.distiller.webdocument.WebElement; 9 | import org.chromium.distiller.webdocument.WebText; 10 | 11 | public class RelevantElements { 12 | public static boolean process(WebDocument document) { 13 | boolean changes = false; 14 | boolean inContent = false; 15 | 16 | for (WebElement e : document.getElements()) { 17 | if (e.getIsContent()) { 18 | inContent = true; 19 | } else if (e instanceof WebText) { 20 | inContent = false; 21 | } else { 22 | if (inContent) { 23 | e.setIsContent(true); 24 | changes = true; 25 | } 26 | } 27 | } 28 | return changes; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/filters/images/AreaScorer.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument.filters.images; 6 | 7 | import com.google.gwt.dom.client.Element; 8 | 9 | /** 10 | * ImageScorer that uses image area (length*width) as its heuristic. 11 | */ 12 | public class AreaScorer extends BaseImageScorer { 13 | public final int maxScore; 14 | public final int minArea; 15 | public final int maxArea; 16 | 17 | /** 18 | * Initialize this ImageScorer with appropriate info. 19 | * @param maximumScore The maximum score that should be given to any image. 20 | * @param minimumArea The smallest area in px an image can consume and still be scored. 21 | * @param maximumArea The largest area in px an image can consume and still be scored. 
22 | */ 23 | public AreaScorer(int maximumScore, int minimumArea, int maximumArea) { 24 | maxScore = maximumScore; 25 | minArea = minimumArea; 26 | maxArea = maximumArea; 27 | } 28 | 29 | @Override 30 | protected int computeScore(Element e) { 31 | int area = e.getOffsetWidth() * e.getOffsetHeight(); 32 | if (area < minArea) return 0; 33 | 34 | int score = (int) ((float) (area - minArea) / (maxArea - minArea) * maxScore); 35 | return Math.min(score, maxScore); 36 | } 37 | 38 | @Override 39 | public int getMaxScore() { 40 | return maxScore; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/filters/images/BaseImageScorer.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument.filters.images; 6 | 7 | import org.chromium.distiller.LogUtil; 8 | 9 | import com.google.gwt.dom.client.Element; 10 | 11 | /** 12 | * Base class for an image scorer that handles logging, null checks, and other operations not 13 | * directly related to score computation. 14 | */ 15 | public abstract class BaseImageScorer implements ImageScorer { 16 | @Override 17 | public int getImageScore(Element e) { 18 | int score = 0; 19 | if (e != null) { 20 | score = computeScore(e); 21 | } 22 | if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) { 23 | LogUtil.logToConsole(getClass().getSimpleName() + ": " + score + "/" + getMaxScore()); 24 | } 25 | return Math.min(score, getMaxScore()); 26 | } 27 | 28 | @Override public abstract int getMaxScore(); 29 | 30 | /** 31 | * Do the actual computation of the score. This method is never called if the Element provided 32 | * to getImageScore is null. 33 | * @param e The image element to score. 
34 | * @return An integer score for the provided image element. 35 | */ 36 | protected abstract int computeScore(Element e); 37 | } 38 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/filters/images/DimensionsRatioScorer.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument.filters.images; 6 | 7 | import com.google.gwt.dom.client.Element; 8 | 9 | /** 10 | * ImageScorer that uses dimension ratio (width/length) as its heuristic. 11 | */ 12 | public class DimensionsRatioScorer extends BaseImageScorer { 13 | public final int maxScore; 14 | 15 | /** 16 | * Initialize this ImageScorer with appropriate info. 17 | * @param maximumScore The maximum score that should be given to any image. 18 | */ 19 | public DimensionsRatioScorer(int maximumScore) { 20 | maxScore = maximumScore; 21 | } 22 | 23 | @Override 24 | protected int computeScore(Element e) { 25 | int height = e.getOffsetHeight(); 26 | // For divide by 0 errors. 27 | if (height <= 0) return 0; 28 | 29 | int width = e.getOffsetWidth(); 30 | float multiplier = 0.0f; 31 | // We are mainly interested in wide images. 
32 | float ratio = (float) width / height; 33 | if (ratio > 1.45f && ratio < 1.8f) { 34 | multiplier = 1.0f; 35 | } else if (ratio > 1.3f && ratio < 2.2f) { 36 | multiplier = 0.4f; 37 | } 38 | return (int) (maxScore * multiplier); 39 | } 40 | 41 | @Override 42 | public int getMaxScore() { 43 | return maxScore; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/filters/images/DomDistanceScorer.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument.filters.images; 6 | 7 | import org.chromium.distiller.DomUtil; 8 | 9 | import com.google.gwt.dom.client.Element; 10 | import com.google.gwt.dom.client.Node; 11 | 12 | /** 13 | * ImageScorer that uses DOM distance as its heuristic. 14 | */ 15 | public class DomDistanceScorer extends BaseImageScorer { 16 | public final Node firstContentNode; 17 | public final int maxScore; 18 | 19 | /** 20 | * Initialize this ImageScorer with appropriate info. 21 | * @param maximumScore The maximum score that should be given to any image. 22 | * @param firstContent The first content node as identified by Boilerpipe. 
23 | */ 24 | public DomDistanceScorer(int maximumScore, Node firstContent) { 25 | maxScore = maximumScore; 26 | firstContentNode = firstContent; 27 | } 28 | 29 | @Override 30 | protected int computeScore(Element e) { 31 | if (firstContentNode == null) return 0; 32 | 33 | int depthDiff = DomUtil.getNodeDepth(firstContentNode) 34 | - DomUtil.getNodeDepth(DomUtil.getNearestCommonAncestor(firstContentNode, e)); 35 | float multiplier = 0.0f; 36 | if (depthDiff < 4) { 37 | multiplier = 1.0f; 38 | } else if (depthDiff < 6) { 39 | multiplier = 0.6f; 40 | } else if (depthDiff < 8) { 41 | multiplier = 0.2f; 42 | } 43 | return (int) (maxScore * multiplier); 44 | } 45 | 46 | @Override 47 | public int getMaxScore() { 48 | return maxScore; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/filters/images/HasFigureScorer.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument.filters.images; 6 | 7 | import org.chromium.distiller.DomUtil; 8 | 9 | import com.google.gwt.dom.client.Element; 10 | import com.google.gwt.dom.client.Node; 11 | 12 | import java.util.List; 13 | 14 | /** 15 | * ImageScorer that scores based on if the image has a "figure" node as an ancestor. 16 | */ 17 | public class HasFigureScorer extends BaseImageScorer { 18 | public final int maxScore; 19 | 20 | /** 21 | * Initialize this ImageScorer with appropriate info. 22 | * @param maximumScore The maximum score that should be given to any image. 
23 | */ 24 | public HasFigureScorer(int maximumScore) { 25 | maxScore = maximumScore; 26 | } 27 | 28 | @Override 29 | protected int computeScore(Element e) { 30 | List parents = DomUtil.getParentNodes(e); 31 | for (Node n : parents) { 32 | if (n.getNodeType() == Node.ELEMENT_NODE 33 | && "FIGURE".equals(Element.as(n).getTagName())) { 34 | return maxScore; 35 | } 36 | } 37 | return 0; 38 | } 39 | 40 | @Override 41 | public int getMaxScore() { 42 | return maxScore; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /java/org/chromium/distiller/webdocument/filters/images/ImageScorer.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument.filters.images; 6 | 7 | import com.google.gwt.dom.client.Element; 8 | 9 | /** 10 | * This interface is used to represent a single heuristic used in image extraction. The 11 | * provided image will be given a score based on the heuristic and a max score. 12 | */ 13 | public interface ImageScorer { 14 | /** 15 | * Give a particular image a score based on the heuristic implemented in this ImageScorer and 16 | * what the max score is set to. 17 | * @param e The element to score. 18 | * @return An integer score for the image. 19 | */ 20 | public int getImageScore(Element e); 21 | 22 | /** 23 | * Get the maximum possible score that this ImageScorer can return. 24 | * @return The max score for this ImageScorer. 
25 | */ 26 | public int getMaxScore(); 27 | } 28 | -------------------------------------------------------------------------------- /javatests/DomDistillerJsTest.gwt.xml: -------------------------------------------------------------------------------- 1 | 2 | 7 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/DomDistillerJsTestCase.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller; 6 | 7 | import com.google.gwt.core.client.JsArray; 8 | import com.google.gwt.dom.client.Document; 9 | import com.google.gwt.dom.client.Element; 10 | import com.google.gwt.dom.client.Node; 11 | import com.google.gwt.dom.client.NodeList; 12 | 13 | /** 14 | * Base test case for all DomDistiller js tests. Ensures that each test starts 15 | * with a fresh document. 
16 | */ 17 | public class DomDistillerJsTestCase extends JsTestCase { 18 | protected Element mRoot; 19 | protected Element mHead; 20 | protected Element mBody; 21 | 22 | @Override 23 | public void setUp() throws Exception { 24 | gwtSetUp(); 25 | } 26 | 27 | protected void gwtSetUp() throws Exception { 28 | mRoot = Document.get().getDocumentElement(); 29 | JsArray attrs = DomUtil.getAttributes(mRoot); 30 | String[] attrNames = new String[attrs.length()]; 31 | for (int i = 0; i < attrs.length(); i++) { 32 | attrNames[i] = attrs.get(i).getNodeName(); 33 | } 34 | for (int i = 0; i < attrNames.length; i++) { 35 | mRoot.removeAttribute(attrNames[i]); 36 | } 37 | assertEquals(0, DomUtil.getAttributes(mRoot).length()); 38 | NodeList children = mRoot.getChildNodes(); 39 | for (int i = children.getLength() - 1; i >= 0; i--) { 40 | children.getItem(i).removeFromParent(); 41 | } 42 | assertEquals(0, mRoot.getChildNodes().getLength()); 43 | mHead = Document.get().createElement("head"); 44 | mRoot.appendChild(mHead); 45 | mBody = Document.get().createElement("body"); 46 | mRoot.appendChild(mBody); 47 | // With this, the width of chrome window won't affect the layout. 48 | mRoot.getStyle().setProperty("width", "800px"); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/HeadingFusionTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | package org.chromium.distiller; 6 | 7 | import org.chromium.distiller.document.TextDocument; 8 | import org.chromium.distiller.document.TextDocumentTestUtil; 9 | import org.chromium.distiller.filters.heuristics.HeadingFusion; 10 | import org.chromium.distiller.labels.DefaultLabels; 11 | 12 | public class HeadingFusionTest extends DomDistillerJsTestCase { 13 | private static final String HEADING_TEXT = 14 | "Heading"; 15 | private static final String LONG_TEXT = 16 | "Leading text that's used to start a document but just to offset a " + 17 | "few text blocks. This will allow testing in-page merges."; 18 | private static final String SHORT_TEXT = "I might be a header."; 19 | 20 | public void testHeadingFused() throws Exception { 21 | TextDocument document = new TestTextDocumentBuilder() 22 | .addContentBlock(HEADING_TEXT, DefaultLabels.HEADING) 23 | .addContentBlock(LONG_TEXT) 24 | .addContentBlock(SHORT_TEXT) 25 | .build(); 26 | 27 | assertTrue(new HeadingFusion().process(document)); 28 | 29 | assertEquals(2, document.getTextBlocks().size()); 30 | assertFalse(document.getTextBlocks().get(0).hasLabel(DefaultLabels.HEADING)); 31 | assertFalse( 32 | document.getTextBlocks().get(0).hasLabel(DefaultLabels.BOILERPLATE_HEADING_FUSED)); 33 | assertTrue(TextDocumentTestUtil.getContent(document).contains(HEADING_TEXT)); 34 | assertTrue(TextDocumentTestUtil.getContent(document).contains(LONG_TEXT)); 35 | assertTrue(TextDocumentTestUtil.getContent(document).contains(SHORT_TEXT)); 36 | } 37 | 38 | public void testBoilerplateHeadingFused() throws Exception { 39 | TextDocument document = new TestTextDocumentBuilder() 40 | .addNonContentBlock(HEADING_TEXT, DefaultLabels.HEADING) 41 | .addContentBlock(LONG_TEXT) 42 | .addContentBlock(SHORT_TEXT) 43 | .build(); 44 | 45 | assertTrue(new HeadingFusion().process(document)); 46 | 47 | assertEquals(2, document.getTextBlocks().size()); 48 | assertFalse(document.getTextBlocks().get(0).hasLabel(DefaultLabels.HEADING)); 49 | 
assertTrue( 50 | document.getTextBlocks().get(0).hasLabel(DefaultLabels.BOILERPLATE_HEADING_FUSED)); 51 | assertTrue(TextDocumentTestUtil.getContent(document).contains(HEADING_TEXT)); 52 | assertTrue(TextDocumentTestUtil.getContent(document).contains(LONG_TEXT)); 53 | assertTrue(TextDocumentTestUtil.getContent(document).contains(SHORT_TEXT)); 54 | } 55 | 56 | public void testHeadingBeforeBoilerplate() throws Exception { 57 | TextDocument document = new TestTextDocumentBuilder() 58 | .addContentBlock(HEADING_TEXT, DefaultLabels.HEADING) 59 | .addNonContentBlock(LONG_TEXT) 60 | .addContentBlock(SHORT_TEXT) 61 | .build(); 62 | 63 | assertTrue(new HeadingFusion().process(document)); 64 | assertEquals(3, document.getTextBlocks().size()); 65 | assertFalse(document.getTextBlocks().get(0).isContent()); 66 | } 67 | 68 | public void testTitleNotFused() throws Exception { 69 | TextDocument document = new TestTextDocumentBuilder() 70 | .addContentBlock(HEADING_TEXT, DefaultLabels.HEADING, DefaultLabels.TITLE) 71 | .addContentBlock(LONG_TEXT) 72 | .addContentBlock(SHORT_TEXT) 73 | .build(); 74 | 75 | assertFalse(new HeadingFusion().process(document)); 76 | 77 | assertEquals(3, document.getTextBlocks().size()); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/JavaScriptTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | package org.chromium.distiller; 6 | 7 | public class JavaScriptTest extends DomDistillerJsTestCase { 8 | public void testParseFloat() { 9 | assertEquals(1.0, JavaScript.parseFloat("1.0"), 1e-10); 10 | assertEquals(1.0, JavaScript.parseFloat("1.0f"), 1e-10); 11 | assertEquals(0.0, JavaScript.parseFloat("0"), 1e-10); 12 | assertEquals(3.14, JavaScript.parseFloat("3.14"), 1e-10); 13 | assertEquals(3.14159265359, JavaScript.parseFloat("3.14159265359"), 1e-10); 14 | assertTrue(Double.isNaN(JavaScript.parseFloat(""))); 15 | assertTrue(Double.isNaN(JavaScript.parseFloat("sdfg1"))); 16 | } 17 | 18 | public void testParseInt() { 19 | assertEquals(1, JavaScript.parseInt("1")); 20 | assertEquals(0, JavaScript.parseInt("0")); 21 | assertEquals(3, JavaScript.parseInt("3.14")); 22 | assertEquals(0, JavaScript.parseInt("")); 23 | assertEquals(0, JavaScript.parseInt("f1")); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/JsTestCase.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller; 6 | 7 | /** 8 | * Base test case for all JS tests. 9 | */ 10 | public class JsTestCase extends Assert { 11 | public void setUp() throws Exception {} 12 | public void tearDown() throws Exception {} 13 | 14 | protected void disableAssertConsoleTrace() { 15 | setDumpTraceOnFailure(false); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/JsTestSuiteBuilder.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
package org.chromium.distiller;

/**
 * Factory for the suite object that holds the JS tests to run.
 */
public interface JsTestSuiteBuilder {
    /** Builds and returns the test suite. */
    JsTestSuiteBase build();
}
--------------------------------------------------------------------------------
/javatests/org/chromium/distiller/MarkupParserTest.java:
--------------------------------------------------------------------------------
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.distiller;


/**
 * Tests for {@link MarkupParser}.
 */
public class MarkupParserTest extends DomDistillerJsTestCase {

    public void testNullOpenGraphProtocolParser() {
        // To have a null OpenGraphProtocolParser, don't create its required
        // meta tags.  Instead, create tags that IEReadingViewParser will
        // recognize and legitimize as title.
        final String title = "Testing null OpenGraphProtocolParser.";
        mHead.appendChild(TestUtil.createTitle(title));
        createMeta("title", title);
        mBody.appendChild(TestUtil.createHeading(1, title));

        String parsedTitle = new MarkupParser(mRoot).getTitle();
        assertEquals(title, parsedTitle);
    }

    // TODO(kuan): write more tests if or when we determine:
    // - which parser takes precedence
    // - how we merge the different values retrieved from the different parsers.

    /** Appends a {@code <meta>} element with the given name/content pair to the test head. */
    private void createMeta(String name, String content) {
        mHead.appendChild(TestUtil.createMetaName(name, content));
    }
}
--------------------------------------------------------------------------------
/javatests/org/chromium/distiller/NodeDirectionalityTest.java:
--------------------------------------------------------------------------------
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 | 5 | package org.chromium.distiller; 6 | 7 | import com.google.gwt.dom.client.Element; 8 | import com.google.gwt.dom.client.Node; 9 | 10 | public class NodeDirectionalityTest extends DomDistillerJsTestCase { 11 | 12 | private static final String CONTENT_TEXT = "Lorem Ipsum Lorem Ipsum Lorem Ipsum."; 13 | private static final String TITLE_TEXT = "I am the document title"; 14 | 15 | public void testDirAttributeLtrAddedToTree() { 16 | Element div = TestUtil.createDiv(0); 17 | div.appendChild(TestUtil.createSpan(CONTENT_TEXT)); 18 | div.appendChild(TestUtil.createSpan(CONTENT_TEXT)); 19 | mBody.appendChild(div); 20 | 21 | NodeTree tree = new NodeTree(div); 22 | tree.addChild(div.getChild(0)); 23 | tree.addChild(div.getChild(1)); 24 | 25 | Node cloned = tree.cloneSubtreeRetainDirection(); 26 | 27 | for (int i = 0; i < cloned.getChildCount(); i++) { 28 | Node n = cloned.getChild(i); 29 | assertEquals("ltr",Element.as(n).getAttribute("dir")); 30 | } 31 | } 32 | 33 | public void testDirAttributeRtlAddedToTree() { 34 | Element div = TestUtil.createDiv(0); 35 | div.getStyle().setProperty("direction","rtl"); 36 | div.appendChild(TestUtil.createSpan(CONTENT_TEXT)); 37 | div.appendChild(TestUtil.createSpan(CONTENT_TEXT)); 38 | mBody.appendChild(div); 39 | 40 | NodeTree tree = new NodeTree(div); 41 | tree.addChild(div.getChild(0)); 42 | tree.addChild(div.getChild(1)); 43 | 44 | Node cloned = tree.cloneSubtreeRetainDirection(); 45 | 46 | for (int i = 0; i < cloned.getChildCount(); i++) { 47 | Node n = cloned.getChild(i); 48 | assertEquals("rtl",Element.as(n).getAttribute("dir")); 49 | } 50 | } 51 | 52 | public void testMixedDirAttributeAddedToTree() { 53 | Element div = TestUtil.createDiv(0); 54 | div.getStyle().setProperty("direction","ltr"); 55 | 56 | Element child1 = TestUtil.createDiv(1); 57 | child1.getStyle().setProperty("direction","rtl"); 58 | 59 | child1.appendChild(TestUtil.createSpan(CONTENT_TEXT)); 60 | 61 | div.appendChild(child1); 62 | 63 | 
mBody.appendChild(div); 64 | 65 | // construct node tree 66 | NodeTree tree = new NodeTree(div); 67 | tree.addChild(child1); 68 | tree.getChildren().get(0).addChild(child1.getChild(0)); 69 | 70 | Node cloned = tree.cloneSubtreeRetainDirection(); 71 | 72 | // make sure span element got the "dir" attribute correctly - based on parent 73 | assertEquals("rtl",Element.as(cloned.getChild(0).getChild(0)).getDir()); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/NodeListExpanderTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller; 6 | 7 | import com.google.gwt.dom.client.Element; 8 | import com.google.gwt.dom.client.Node; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | public class NodeListExpanderTest extends DomDistillerJsTestCase { 14 | public void testNodeListExpander() { 15 | List divs = TestUtil.createDivTree(); 16 | List leafNodes = new ArrayList(); 17 | leafNodes.add(divs.get(3)); 18 | leafNodes.add(divs.get(4)); 19 | leafNodes.add(divs.get(5)); 20 | leafNodes.add(divs.get(14)); 21 | NodeTree subtree = NodeListExpander.expand(leafNodes); 22 | 23 | // This is TestUtil.expectedDivTreeHtml with the nodes that should not be included 24 | // commented out. 25 | assertEquals( 26 | "
" + 27 | "
" + 28 | "
" + 29 | "
" + 30 | "
" + 31 | "
" + 32 | "
" + 33 | //"
" + 34 | //"
" + 35 | "
" + 36 | "
" + 37 | "
" + 38 | //"
" + 39 | //"
" + 40 | //"
" + 41 | //"
" + 42 | "
" + 43 | //"
" + 44 | "
" + 45 | "
" + 46 | "
" + 47 | "
", 48 | TestUtil.getElementAsString(Element.as(subtree.cloneSubtree()))); 49 | } 50 | 51 | public void testNodeListExpanderPruneTopChain() { 52 | List divs = TestUtil.createDivTree(); 53 | List leafNodes = new ArrayList(); 54 | leafNodes.add(divs.get(2)); 55 | leafNodes.add(divs.get(3)); 56 | NodeTree subtree = NodeListExpander.expand(leafNodes); 57 | 58 | // This is TestUtil.expectedDivTreeHtml with the nodes that should not be included 59 | // commented out. 60 | assertEquals("
", 61 | TestUtil.getElementAsString(Element.as(subtree.cloneSubtree()))); 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/OrderedNodeMatcherTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller; 6 | 7 | import com.google.gwt.dom.client.Node; 8 | 9 | import java.util.Arrays; 10 | import java.util.List; 11 | 12 | public class OrderedNodeMatcherTest extends DomDistillerJsTestCase { 13 | public void testOrderedNodeMatcher() { 14 | List matchNodes = Arrays.asList( 15 | TestUtil.createDiv(0), 16 | TestUtil.createDiv(1), 17 | TestUtil.createDiv(2), 18 | TestUtil.createDiv(3)); 19 | OrderedNodeMatcher matcher = new OrderedNodeMatcher(matchNodes); 20 | 21 | for (int i = 0; i < matchNodes.size(); ++i) { 22 | assertFalse(matcher.isFinished()); 23 | for (int j = 0; j < matchNodes.size(); ++j) { 24 | if (j != i) { 25 | assertFalse(matcher.match(matchNodes.get(j))); 26 | } 27 | } 28 | assertTrue(matcher.match(matchNodes.get(i))); 29 | } 30 | assertTrue(matcher.isFinished()); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/PageParamContentInfo.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller; 6 | 7 | /** 8 | * Helper class to create content for PageParameterDetectorTest.java and PageParamInfoTest.java. 
*/
class PageParamContentInfo {

    /** Kind of content a {@link PageParamContentInfo} describes. */
    enum Type {
        UNRELATED_TERMS,
        NUMBER_IN_PLAIN_TEXT,
        NUMERIC_OUTLINK,
    }

    Type mType;
    // Empty for every type except NUMERIC_OUTLINK.
    String mTargetUrl;
    // -1 for UNRELATED_TERMS.
    int mNumber;

    private PageParamContentInfo(Type type, String targetUrl, int number) {
        this.mType = type;
        this.mTargetUrl = targetUrl;
        this.mNumber = number;
    }

    /** Content that carries no number and no link. */
    static PageParamContentInfo UnrelatedTerms() {
        return new PageParamContentInfo(Type.UNRELATED_TERMS, "", -1);
    }

    /** A number that appears as plain text, i.e. without a target URL. */
    static PageParamContentInfo NumberInPlainText(int number) {
        return new PageParamContentInfo(Type.NUMBER_IN_PLAIN_TEXT, "", number);
    }

    /** A number that appears as the text of a link to {@code targetUrl}. */
    static PageParamContentInfo NumericOutlink(String targetUrl, int number) {
        return new PageParamContentInfo(Type.NUMERIC_OUTLINK, targetUrl, number);
    }

}
--------------------------------------------------------------------------------
/javatests/org/chromium/distiller/ParsedUrlTest.java:
--------------------------------------------------------------------------------
// Copyright 2015 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package org.chromium.distiller;

/**
 * Tests for {@link ParsedUrl}: component getters, credential and hash setters, and
 * query-value replacement.
 */
public class ParsedUrlTest extends JsTestCase {

    private static final String VALID_URL =
            "http://fooUser:fooPwd@www.foo.com/path0/path1/;pathParams?qA=B&qC=D";

    public void testAllGet() {
        ParsedUrl parsed = ParsedUrl.create(VALID_URL);
        assertTrue(parsed != null);

        // Scalar components.
        assertEquals("www.foo.com", parsed.getHost());
        assertEquals("http://www.foo.com", parsed.getOrigin());
        assertEquals("/path0/path1/;pathParams", parsed.getPath());
        assertEquals("path0/path1", parsed.getTrimmedPath());
        assertEquals("?qA=B&qC=D", parsed.getQuery());
        assertEquals("fooUser", parsed.getUsername());
        assertEquals("fooPwd", parsed.getPassword());

        // Path split into its components.
        String[] expectedComponents = {"path0", "path1"};
        String[] components = parsed.getPathComponents();
        assertEquals(2, components.length);
        for (int i = 0; i < expectedComponents.length; i++) {
            assertEquals(expectedComponents[i], components[i]);
        }

        // Query split into name/value pairs.
        String[][] expectedParams = {{"qA", "B"}, {"qC", "D"}};
        String[][] params = parsed.getQueryParams();
        assertEquals(2, params.length);
        for (int i = 0; i < expectedParams.length; i++) {
            assertEquals(expectedParams[i][0], params[i][0]);
            assertEquals(expectedParams[i][1], params[i][1]);
        }
    }

    public void testInvalid() {
        ParsedUrl parsed = ParsedUrl.create("abc");
        assertEquals(null, parsed);
    }

    public void testSetUsernameAndPassword() {
        ParsedUrl parsed = ParsedUrl.create(VALID_URL);
        assertTrue(parsed != null);
        assertEquals("fooUser", parsed.getUsername());
        assertEquals("fooPwd", parsed.getPassword());

        // Replacing the credentials is reflected in the getters and the serialized URL.
        parsed.setUsername("newFooUser");
        parsed.setPassword("newFooPwd");
        assertEquals("newFooUser", parsed.getUsername());
        assertEquals("newFooPwd", parsed.getPassword());
        assertEquals("http://newFooUser:newFooPwd@www.foo.com/path0/path1/;pathParams?qA=B&qC=D",
                parsed.toString());

        // Clearing the credentials removes them from the serialized URL.
        parsed.setUsername("");
        parsed.setPassword("");
        assertEquals("", parsed.getUsername());
        assertEquals("", parsed.getPassword());
        assertEquals("http://www.foo.com/path0/path1/;pathParams?qA=B&qC=D", parsed.toString());
    }

    public void testSetHash() {
        ParsedUrl parsed = ParsedUrl.create(VALID_URL + "#jumpToFoo");
        assertTrue(parsed != null);
        assertEquals("#jumpToFoo", parsed.getHash());

        parsed.setHash("dontJumpToFoo");
        assertEquals("#dontJumpToFoo", parsed.getHash());
        assertEquals(VALID_URL + "#dontJumpToFoo", parsed.toString());

        parsed.setHash("");
        assertEquals("", parsed.getHash());
        assertEquals(VALID_URL, parsed.toString());
    }

    public void testReplaceQueryValue() {
        final String originalQuery = "?qA=B&qC=D";
        ParsedUrl parsed = ParsedUrl.create(VALID_URL);
        assertTrue(parsed != null);
        assertEquals(originalQuery, parsed.getQuery());
        assertEquals("http://fooUser:fooPwd@www.foo.com/path0/path1/;pathParams?qA=E&qC=D",
                parsed.replaceQueryValue(true, "qA", "B", "E"));
        // Original query shouldn't change.
        assertEquals(originalQuery, parsed.getQuery());
    }

}
--------------------------------------------------------------------------------
/javatests/org/chromium/distiller/SimpleTest.java:
--------------------------------------------------------------------------------
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.distiller;

/**
 * Trivial always-passing test covering both {@code assertTrue} overloads.
 */
public class SimpleTest extends JsTestCase {
    public void testSuccess() {
        assertTrue(true);
        assertTrue("Failure message", true);
    }
}
--------------------------------------------------------------------------------
/javatests/org/chromium/distiller/TerminatingBlocksFinderTest.java:
--------------------------------------------------------------------------------
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 | 5 | package org.chromium.distiller; 6 | 7 | import org.chromium.distiller.document.TextBlock; 8 | import org.chromium.distiller.filters.english.TerminatingBlocksFinder; 9 | 10 | public class TerminatingBlocksFinderTest extends DomDistillerJsTestCase { 11 | private String[] positiveExamples = { 12 | // Startswith cases. 13 | "comments foo", "© reuters", "© reuters foo bar", "please rate this", 14 | "please rate this foo", "post a comment", "post a comment foo", "123 comments", 15 | "9 comments foo", "1346213423 users responded in", "1346213423 users responded in foo", 16 | 17 | // Contains cases. 18 | "foo what you think... bar", "what you think...", "foo what you think...", 19 | "add your comment", "foo add your comment", "add comment bar", "reader views bar", 20 | "have your say bar", "foo reader comments", "foo rätta artikeln", 21 | 22 | // Equals cases. 23 | "thanks for your comments - this feedback is now closed", 24 | 25 | // Check some case insensitivity. 26 | "Thanks for your comments - this feedback is now closed", "Add Comment Bar", 27 | "READER VIEWS BAR", "Comments FOO", 28 | }; 29 | 30 | private String[] negativeExamples = { 31 | // Startswith cases. 32 | "lcomments foo", "xd© reuters", "not please rate this", "xx post a comment", 33 | "users responded in", "123users responded in foo", 34 | 35 | // Contains cases. 36 | "what you think..", "addyour comment", "ad comment", "readerviews", 37 | 38 | // Equals cases. 39 | "thanks for your comments - this feedback is now closed foo", 40 | "foo thanks for your comments - this feedback is now closed", 41 | 42 | // Long case. 
43 | "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15", 44 | }; 45 | 46 | private final TestTextBlockBuilder builder = new TestTextBlockBuilder(); 47 | 48 | public void testPositives() { 49 | for (String ex : positiveExamples) { 50 | TextBlock tb = builder.createForText(ex); 51 | assertTrue("TerminatingBlocksFinder.isTerminating(createTextBlock(\"" + ex 52 | + "\"))=false" 53 | + ", expected true", 54 | TerminatingBlocksFinder.isTerminating(tb)); 55 | } 56 | } 57 | 58 | public void testNegatives() { 59 | for (String ex : negativeExamples) { 60 | TextBlock tb = builder.createForText(ex); 61 | assertFalse("TerminatingBlocksFinder.isTerminating(createTextBlock(\"" + ex 62 | + "\"))=true" 63 | + ", expected false", 64 | TerminatingBlocksFinder.isTerminating(tb)); 65 | } 66 | } 67 | 68 | public void testCommentLink() { 69 | assertTrue(TerminatingBlocksFinder.isTerminating(builder.createForAnchorText("Comment"))); 70 | assertFalse(TerminatingBlocksFinder.isTerminating(builder.createForText("Comment"))); 71 | assertFalse(TerminatingBlocksFinder.isTerminating(builder.createForAnchorText("comment"))); 72 | assertFalse(TerminatingBlocksFinder.isTerminating(builder.createForAnchorText("foobar"))); 73 | } 74 | 75 | public void testShareLink() { 76 | assertTrue(TerminatingBlocksFinder.isTerminating(builder.createForText("Shares"))); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/TestLogger.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | package org.chromium.distiller; 6 | 7 | public class TestLogger { 8 | public static final int RESULTS = -1; 9 | public static final int ERROR = 0; 10 | public static final int WARNING = 1; 11 | public static final int INFO = 2; 12 | public static final int DEBUG = 3; 13 | 14 | private int logLevel = WARNING; 15 | 16 | private StringBuilder logBuffer = new StringBuilder(); 17 | 18 | public void log(int logLevel, String message) { 19 | if (logLevel <= this.logLevel) { 20 | logBuffer.append(message + "\n"); 21 | LogUtil.logToConsole(message); 22 | } 23 | } 24 | 25 | public String getLog() { 26 | return LogUtil.getAndClearLog(); 27 | } 28 | 29 | public static class NullLogger extends TestLogger { 30 | @Override 31 | public void log(int logLevel, String message) { 32 | } 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/TestTextBlockBuilder.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | package org.chromium.distiller; 6 | 7 | import org.chromium.distiller.document.TextBlock; 8 | import org.chromium.distiller.webdocument.TestWebTextBuilder; 9 | import org.chromium.distiller.webdocument.WebElement; 10 | import org.chromium.distiller.webdocument.WebText; 11 | 12 | import java.util.ArrayList; 13 | 14 | class TestTextBlockBuilder { 15 | private ArrayList elements = new ArrayList(); 16 | private TestWebTextBuilder webTextBuilder = new TestWebTextBuilder(); 17 | 18 | public TextBlock createForText(String text) { 19 | WebText wt = webTextBuilder.createForText(text); 20 | elements.add(wt); 21 | return new TextBlock(elements, elements.size() - 1); 22 | } 23 | 24 | public TextBlock createForAnchorText(String text) { 25 | WebText wt = webTextBuilder.createForAnchorText(text); 26 | elements.add(wt); 27 | return new TextBlock(elements, elements.size() - 1); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/TestTextDocumentBuilder.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
package org.chromium.distiller;

import org.chromium.distiller.document.TextBlock;
import org.chromium.distiller.document.TextDocument;
import org.chromium.distiller.webdocument.DomConverter;
import org.chromium.distiller.webdocument.TestWebTextBuilder;
import org.chromium.distiller.webdocument.WebDocumentBuilder;
import org.chromium.distiller.webdocument.WebElement;
import org.chromium.distiller.webdocument.WebText;

import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.Node;

import java.util.ArrayList;

/**
 * Fluent test helper that assembles a {@link TextDocument} block by block, or derives one
 * from a DOM subtree via {@link #fromPage}.
 */
public class TestTextDocumentBuilder {
    // Parameterized (rather than raw) so addBlock() can return a TextBlock without a cast.
    private ArrayList<TextBlock> textBlocks;
    private ArrayList<WebElement> elements;
    private TestWebTextBuilder webTextBuilder;

    public TestTextDocumentBuilder() {
        textBlocks = new ArrayList<>();
        elements = new ArrayList<>();
        webTextBuilder = new TestWebTextBuilder();
    }

    /**
     * Creates a labelled {@link WebText} for {@code text}, appends it to the element list and
     * records a block that refers to it.
     *
     * @return the newly added block.
     */
    private TextBlock addBlock(String text, String... labels) {
        WebText webText = webTextBuilder.createForText(text);
        for (String label : labels) {
            webText.addLabel(label);
        }
        elements.add(webText);
        TextBlock block = new TextBlock(elements, elements.size() - 1);
        textBlocks.add(block);
        return block;
    }

    /** Adds a block marked as content. Returns {@code this} for chaining. */
    public TestTextDocumentBuilder addContentBlock(String text, String... labels) {
        addBlock(text, labels).setIsContent(true);
        return this;
    }

    /** Adds a block marked as non-content. Returns {@code this} for chaining. */
    public TestTextDocumentBuilder addNonContentBlock(String text, String... labels) {
        addBlock(text, labels).setIsContent(false);
        return this;
    }

    /** Returns a document made of the blocks added so far. */
    public TextDocument build() {
        return new TextDocument(textBlocks);
    }

    /**
     * Walks {@code docElement} with a {@link DomConverter} and returns the resulting text
     * document view.
     *
     * An element that is neither the document body nor inside it is appended to the body
     * for the duration of the walk and removed afterwards, even if the walk throws.
     *
     * @param docElement root of the subtree to convert.
     * @return the text document built from the subtree.
     */
    public static TextDocument fromPage(Element docElement) {
        WebDocumentBuilder builder = new WebDocumentBuilder();
        DomWalker walker = new DomWalker(new DomConverter(builder));

        Node body = Document.get().getBody();
        // The previous condition required body.equals(docElement), which can never be the
        // right moment to attach: it would mean appending the body to itself.
        boolean attachTemporarily =
                !body.equals(docElement) && !JavaScript.contains(body, docElement);
        if (attachTemporarily) {
            body.appendChild(docElement);
        }
        try {
            walker.walk(docElement);
        } finally {
            if (attachTemporarily) {
                body.removeChild(docElement);
            }
        }

        return builder.toWebDocument().createTextDocumentView();
    }
}
--------------------------------------------------------------------------------
/javatests/org/chromium/distiller/TestUtilTest.java:
--------------------------------------------------------------------------------
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.distiller;

import com.google.gwt.dom.client.Element;

import java.util.List;

/**
 * Checks that {@link TestUtil#createDivTree} serializes to
 * {@link TestUtil#expectedDivTreeHtml}.
 */
public class TestUtilTest extends DomDistillerJsTestCase {
    public void testCreateDivTree() {
        List<Element> divs = TestUtil.createDivTree();
        // Index 0 is serialized as the whole tree.
        assertEquals(
                TestUtil.expectedDivTreeHtml,
                TestUtil.getElementAsString(divs.get(0)));

    }
}
--------------------------------------------------------------------------------
/javatests/org/chromium/distiller/TextDocumentStatisticsTest.java:
--------------------------------------------------------------------------------
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 | 5 | package org.chromium.distiller; 6 | 7 | import org.chromium.distiller.document.TextDocument; 8 | import org.chromium.distiller.document.TextDocumentStatistics; 9 | 10 | /** 11 | * Tests for {@link TextDocumentStatistics}. 12 | */ 13 | public class TextDocumentStatisticsTest extends DomDistillerJsTestCase { 14 | private static final String THREE_WORDS = "I love statistics"; 15 | public void testOnlyContent() { 16 | TextDocument document = new TestTextDocumentBuilder() 17 | .addContentBlock(THREE_WORDS) 18 | .addContentBlock(THREE_WORDS) 19 | .addContentBlock(THREE_WORDS) 20 | .build(); 21 | assertEquals(9, TextDocumentStatistics.countWordsInContent(document)); 22 | } 23 | 24 | public void testOnlyNonContent() { 25 | TextDocument document = new TestTextDocumentBuilder() 26 | .addNonContentBlock(THREE_WORDS) 27 | .addNonContentBlock(THREE_WORDS) 28 | .addNonContentBlock(THREE_WORDS) 29 | .build(); 30 | assertEquals(0, TextDocumentStatistics.countWordsInContent(document)); 31 | } 32 | 33 | public void testMixedContent() { 34 | TextDocument document = new TestTextDocumentBuilder() 35 | .addContentBlock(THREE_WORDS) 36 | .addNonContentBlock(THREE_WORDS) 37 | .addContentBlock(THREE_WORDS) 38 | .addNonContentBlock(THREE_WORDS) 39 | .build(); 40 | assertEquals(6, TextDocumentStatistics.countWordsInContent(document)); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/TreeCloneBuilderTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | package org.chromium.distiller; 6 | 7 | import com.google.gwt.dom.client.Document; 8 | import com.google.gwt.dom.client.Element; 9 | import com.google.gwt.dom.client.Node; 10 | import com.google.gwt.dom.client.Text; 11 | 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | public class TreeCloneBuilderTest extends DomDistillerJsTestCase { 16 | public void testFullBuilder() { 17 | List divs = TestUtil.createDivTree(); 18 | List leafNodes = new ArrayList(); 19 | leafNodes.add(divs.get(3)); 20 | leafNodes.add(divs.get(4)); 21 | leafNodes.add(divs.get(5)); 22 | leafNodes.add(divs.get(14)); 23 | Node root = TreeCloneBuilder.buildTreeClone(leafNodes); 24 | 25 | // This is TestUtil.expectedDivTreeHtml with the nodes that should not be included 26 | // commented out. 27 | assertEquals( 28 | "
" + 29 | "
" + 30 | "
" + 31 | "
" + 32 | "
" + 33 | "
" + 34 | "
" + 35 | //"
" + 36 | //"
" + 37 | "
" + 38 | "
" + 39 | "
" + 40 | //"
" + 41 | //"
" + 42 | //"
" + 43 | //"
" + 44 | "
" + 45 | //"
" + 46 | "
" + 47 | "
" + 48 | "
" + 49 | "
", 50 | TestUtil.removeAllDirAttributes(Element.as(root).getString())); 51 | } 52 | 53 | public void testSingleNodeList() { 54 | List leaf = new ArrayList(); 55 | leaf.add(Document.get().createTextNode("some content")); 56 | 57 | Node root = TreeCloneBuilder.buildTreeClone(leaf); 58 | 59 | assertEquals(0, root.getChildCount()); 60 | assertEquals(Text.as(leaf.get(0)).getData(), Text.as(root).getData()); 61 | } 62 | 63 | public void testCloneElement() { 64 | Element element = Document.get().createDivElement(); 65 | Node clone = TreeCloneBuilder.cloneNode(element); 66 | 67 | assertEquals("
", Element.as(clone).getString()); 68 | } 69 | 70 | public void testCloneTextNode() { 71 | Node n = Document.get().createTextNode("some content"); 72 | Node clone = TreeCloneBuilder.cloneNode(n); 73 | 74 | Element container = Document.get().createDivElement(); 75 | container.appendChild(clone); 76 | 77 | assertEquals("some content", container.getInnerHTML()); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/document/TextDocumentTestUtil.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.document; 6 | 7 | public class TextDocumentTestUtil { 8 | public static String getContent(TextDocument document) { 9 | return getText(document, true); 10 | } 11 | 12 | public static String getText(TextDocument document, boolean contentOnly) { 13 | String s = ""; 14 | for (TextBlock tb : document.getTextBlocks()) { 15 | if (!contentOnly || tb.isContent()) { 16 | s += tb.getText() + "\n"; 17 | } 18 | } 19 | return s; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/webdocument/FakeWebDocumentBuilder.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
package org.chromium.distiller.webdocument;

import org.chromium.distiller.DomUtil;

import com.google.gwt.core.client.JsArray;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.Node;
import com.google.gwt.dom.client.Text;

import java.util.Stack;

/**
 * A simple "WebDocumentBuilder" that just creates an html-like string from the calls.
 */
public class FakeWebDocumentBuilder implements WebDocumentBuilderInterface {

    private final StringBuilder documentStringBuilder;
    // Currently open elements, innermost on top; needed to name the closing tag.
    private final Stack<Element> elements;

    FakeWebDocumentBuilder() {
        documentStringBuilder = new StringBuilder();
        elements = new Stack<Element>();
    }

    /** Returns everything recorded so far. */
    String getDocumentString() {
        return documentStringBuilder.toString();
    }

    @Override
    public void dataTable(Element element) {
        // NOTE(review): this literal is empty in the text under review, which looks like a
        // tag-shaped marker lost in extraction -- confirm against the checked-in file.
        documentStringBuilder.append("");
    }

    @Override
    public void skipElement(Element element) {}

    /** Records an opening tag with all of the element's attributes. */
    @Override
    public void startElement(Element element) {
        elements.push(element);
        documentStringBuilder.append("<");
        documentStringBuilder.append(element.getTagName());
        JsArray<Node> attributes = DomUtil.getAttributes(element);
        for (int i = 0; i < attributes.length(); i++) {
            Node attribute = attributes.get(i);
            documentStringBuilder.append(" ");
            documentStringBuilder.append(attribute.getNodeName());
            documentStringBuilder.append("=\"");
            documentStringBuilder.append(attribute.getNodeValue());
            documentStringBuilder.append("\"");
        }
        documentStringBuilder.append(">");
    }

    /** Records the closing tag of the most recently started element. */
    @Override
    public void endElement() {
        Element el = elements.pop();
        // Mirror startElement(): the popped element supplies the tag name to close.
        documentStringBuilder.append("</" + el.getTagName() + ">");
    }

    @Override
    public void textNode(Text textNode) {
        documentStringBuilder.append(textNode.getData());
    }

    @Override
    public void lineBreak(Node node) {
        documentStringBuilder.append("\n");
    }

    @Override
    public void embed(WebElement embed) {}

    @Override
    public void tag(WebTag tag) {}
}
--------------------------------------------------------------------------------
/javatests/org/chromium/distiller/webdocument/TestWebDocumentBuilder.java:
--------------------------------------------------------------------------------
// Copyright 2015 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.distiller.webdocument;

import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import org.chromium.distiller.TestUtil;

/**
 * Test helper that populates a {@link WebDocument} with texts, tables, images and tags.
 * Each add method returns the element it added so tests can inspect or modify it.
 */
public class TestWebDocumentBuilder {
    private WebDocument document = new WebDocument();
    private TestWebTextBuilder webTextBuilder = new TestWebTextBuilder();

    /** Adds plain (non-anchor) text. */
    public WebText addText(String text) {
        WebText wt = webTextBuilder.createForText(text);
        document.addText(wt);
        return wt;
    }

    /** Adds text created through {@code createNestedText(text, 5)}. */
    public WebText addNestedText(String text) {
        WebText wt = webTextBuilder.createNestedText(text, 5);
        document.addText(wt);
        return wt;
    }

    /** Adds anchor text. */
    public WebText addAnchorText(String text) {
        WebText wt = webTextBuilder.createForAnchorText(text);
        document.addText(wt);
        return wt;
    }

    /** Adds a table with the given inner HTML; the table is attached to the body. */
    public WebTable addTable(String innerHtml) {
        Element table = Document.get().createTableElement();
        table.setInnerHTML(innerHtml);
        Document.get().getBody().appendChild(table);
        WebTable wt = new WebTable(table);
        document.addTable(wt);
        return wt;
    }

    /** Adds a 100x100 image that is not attached to the page. */
    public WebImage addImage() {
        Element image = TestUtil.createImage();
        WebImage wi = new WebImage(image, 100, 100, "http://www.example.com/foo.jpg");
        document.addEmbed(wi);
        return wi;
    }

    /** Adds a 600x400 image that is attached to the body. */
    public WebImage addLeadImage() {
        Element image = TestUtil.createImage();
        image.setAttribute("width", "600");
        image.setAttribute("height", "400");
        Document.get().getBody().appendChild(image);
        WebImage wi = new WebImage(image, 600, 400, "http://www.example.com/lead.bmp");
        document.addEmbed(wi);
        return wi;
    }

    /** Adds an opening {@code OL} tag. */
    public WebTag addTagStart() {
        WebTag webTag = new WebTag("OL", WebTag.TagType.START);
        document.addTag(webTag);
        return webTag;
    }

    /** Adds a closing {@code OL} tag. */
    public WebTag addTagEnd() {
        WebTag webTag = new WebTag("OL", WebTag.TagType.END);
        document.addTag(webTag);
        return webTag;
    }

    /** Returns the document being built (the live instance, not a copy). */
    public WebDocument build() {
        return document;
    }
}
--------------------------------------------------------------------------------
/javatests/org/chromium/distiller/webdocument/TestWebTextBuilder.java:
--------------------------------------------------------------------------------
// Copyright 2015 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.distiller.webdocument;

import org.chromium.distiller.StringUtil;

import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.Node;

import java.util.ArrayList;

/**
 * Test helper that creates {@link WebText}s backed by freshly created DOM text nodes.
 * All text nodes are kept in one shared list; each WebText refers to the node added for it.
 */
public class TestWebTextBuilder {
    private final ArrayList<Node> nodes = new ArrayList<>();

    /** Creates text with no linked words. */
    public WebText createForText(String text) {
        return create(text, false);
    }

    /** Creates text whose words all count as linked. */
    public WebText createForAnchorText(String text) {
        return create(text, true);
    }

    private WebText create(String text, boolean isAnchor) {
        nodes.add(Document.get().createTextNode(text));
        int numWords = StringUtil.countWords(text);
        return newWebTextForLastNode(text, numWords, isAnchor ? numWords : 0);
    }

    /**
     * Creates non-anchor text whose node sits inside a chain of {@code levels} nested divs
     * (one outer div plus {@code levels - 1} descendants).
     */
    public WebText createNestedText(String text, int levels) {
        Element div = Document.get().createDivElement();
        Element innermost = div;
        for (int i = 0; i < levels - 1; i++) {
            innermost.appendChild(Document.get().createDivElement());
            innermost = innermost.getFirstChildElement();
        }
        innermost.appendChild(Document.get().createTextNode(text));
        nodes.add(innermost.getFirstChild());

        return newWebTextForLastNode(text, StringUtil.countWords(text), 0);
    }

    /**
     * Builds a WebText over the most recently added node; the argument order is taken
     * verbatim from the original call sites.
     */
    private WebText newWebTextForLastNode(String text, int numWords, int numLinkedWords) {
        int idx = nodes.size() - 1;
        return new WebText(text, nodes, idx, idx + 1, idx, idx, numWords, numLinkedWords,
                0, idx);
    }
}
--------------------------------------------------------------------------------
/javatests/org/chromium/distiller/webdocument/WebImageTest.java:
--------------------------------------------------------------------------------
// Copyright 2016 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 | 5 | package org.chromium.distiller.webdocument; 6 | 7 | import org.chromium.distiller.DomDistillerJsTestCase; 8 | 9 | import java.util.List; 10 | 11 | import com.google.gwt.dom.client.Document; 12 | import com.google.gwt.dom.client.Element; 13 | import com.google.gwt.dom.client.ImageElement; 14 | import org.chromium.distiller.DomUtil; 15 | 16 | public class WebImageTest extends DomDistillerJsTestCase { 17 | public void testGetSrcList() { 18 | mHead.setInnerHTML(""); 19 | 20 | ImageElement img = Document.get().createImageElement(); 21 | img.setSrc("image"); 22 | img.setAttribute("srcset", 23 | "image200 200w, image400 400w"); 24 | WebImage wi = new WebImage(img, 1, 1, img.getSrc()); 25 | List urls = wi.getUrlList(); 26 | assertEquals(3, urls.size()); 27 | assertEquals("http://example.com/image", urls.get(0)); 28 | assertEquals("http://example.com/image200", urls.get(1)); 29 | assertEquals("http://example.com/image400", urls.get(2)); 30 | } 31 | 32 | public void testGetSrcListInPicture() { 33 | mHead.setInnerHTML(""); 34 | 35 | String html = 36 | "" + 37 | "" + 38 | "" + 39 | "" + 40 | ""; 41 | Element container = Document.get().createDivElement(); 42 | container.setInnerHTML(html); 43 | WebImage wi = new WebImage(container.getFirstChildElement(), 1, 1, ""); 44 | List urls = wi.getUrlList(); 45 | assertEquals(4, urls.size()); 46 | assertEquals("http://example.com/image200", urls.get(0)); 47 | assertEquals("http://example.org/image400", urls.get(1)); 48 | assertEquals("http://example.com/image100", urls.get(2)); 49 | assertEquals("http://example.org/image300", urls.get(3)); 50 | } 51 | 52 | public void testGenerateOutput() { 53 | mHead.setInnerHTML(""); 54 | 55 | String html = 56 | "" + 57 | "" + 58 | "" + 59 | ""; 60 | Element container = Document.get().createDivElement(); 61 | container.setInnerHTML(html); 62 | WebImage wi = new WebImage(container.getFirstChildElement(), 0, 0, ""); 63 | assertEquals("", 64 | wi.generateOutput(false)); 65 | } 66 | } 67 | 
-------------------------------------------------------------------------------- /javatests/org/chromium/distiller/webdocument/WebTableTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument; 6 | 7 | import org.chromium.distiller.DomDistillerJsTestCase; 8 | import org.chromium.distiller.TestUtil; 9 | 10 | import java.util.List; 11 | 12 | import com.google.gwt.dom.client.Document; 13 | import com.google.gwt.dom.client.Element; 14 | 15 | public class WebTableTest extends DomDistillerJsTestCase { 16 | public void testGetImageUrlList() { 17 | mHead.setInnerHTML(""); 18 | Element table = Document.get().createTableElement(); 19 | 20 | String html = 21 | "" + 22 | "" + 23 | "" + 25 | "" + 26 | "" + 27 | "" + 28 | "" + 29 | "" + 30 | "" + 31 | "" + 32 | ""; 33 | 34 | table.setInnerHTML(html); 35 | mBody.appendChild(table); 36 | 37 | WebTable webTable = new WebTable(table); 38 | List urls = webTable.getImageUrlList(); 39 | assertEquals(5, urls.size()); 40 | assertEquals("http://example.com/table.png", urls.get(0)); 41 | assertEquals("http://example.com/image100", urls.get(1)); 42 | assertEquals("http://example.org/image300", urls.get(2)); 43 | assertEquals("http://example.com/image200", urls.get(3)); 44 | assertEquals("http://example.org/image400", urls.get(4)); 45 | } 46 | 47 | public void testGenerateOutput() { 48 | Element table = Document.get().createTableElement(); 49 | String html = "" + 50 | "" + 51 | "row1col1" + 52 | "" + 53 | "" + 54 | "" + 55 | ""; 56 | table.setInnerHTML(html); 57 | mBody.appendChild(table); 58 | 59 | WebTable webTable = new WebTable(table); 60 | String got = webTable.generateOutput(false); 61 | 62 | // Output should be the same as the input in this case. 63 | assertEquals("" + html + "
", TestUtil.removeAllDirAttributes(got)); 64 | 65 | // Test getImageUrlList() as well. 66 | List imgUrls = webTable.getImageUrlList(); 67 | assertEquals(1, imgUrls.size()); 68 | assertEquals("http://example.com/table.png", imgUrls.get(0)); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/webdocument/WebTagTest.java: -------------------------------------------------------------------------------- 1 | package org.chromium.distiller.webdocument; 2 | 3 | import org.chromium.distiller.DomDistillerJsTestCase; 4 | 5 | public class WebTagTest extends DomDistillerJsTestCase { 6 | 7 | public void testOLGenerateOutput() { 8 | WebTag olStartWebTag = new WebTag("ol", WebTag.TagType.START); 9 | WebTag olEndWebTag = new WebTag("ol", WebTag.TagType.END); 10 | String startResult = olStartWebTag.generateOutput(false); 11 | String endResult = olEndWebTag.generateOutput(false); 12 | assertEquals(startResult, "
    "); 13 | assertEquals(endResult, "
"); 14 | } 15 | 16 | public void testGenerateOutput() { 17 | WebTag startWebTag = new WebTag("anytext", WebTag.TagType.START); 18 | WebTag endWebTag = new WebTag("anytext", WebTag.TagType.END); 19 | String startResult = startWebTag.generateOutput(false); 20 | String endResult = endWebTag.generateOutput(false); 21 | assertEquals(startResult, ""); 22 | assertEquals(endResult, ""); 23 | } 24 | 25 | public void testCanBeNested() { 26 | assertTrue(WebTag.canBeNested("LI")); 27 | assertTrue(WebTag.canBeNested("UL")); 28 | assertTrue(WebTag.canBeNested("OL")); 29 | assertTrue(WebTag.canBeNested("BLOCKQUOTE")); 30 | assertTrue(WebTag.canBeNested("PRE")); 31 | assertFalse(WebTag.canBeNested("SPAN")); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/webdocument/WebVideoTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | package org.chromium.distiller.webdocument; 6 | 7 | import org.chromium.distiller.DomDistillerJsTestCase; 8 | 9 | import com.google.gwt.dom.client.Document; 10 | import com.google.gwt.dom.client.Element; 11 | 12 | public class WebVideoTest extends DomDistillerJsTestCase { 13 | public void testGenerateOutput() { 14 | Element video = Document.get().createVideoElement(); 15 | // This should be stripped. 16 | video.setAttribute("onfocus", "new XMLHttpRequest();"); 17 | 18 | Element child = Document.get().createElement("source"); 19 | child.setAttribute("src", "http://example.com/foo.ogg"); 20 | video.appendChild(child); 21 | 22 | child = Document.get().createElement("track"); 23 | child.setAttribute("src", "http://example.com/foo.vtt"); 24 | // This should be stripped. 
25 | child.setAttribute("onclick", "alert(1)"); 26 | video.appendChild(child); 27 | 28 | String want = ""; 32 | WebVideo webVideo = new WebVideo(video, 400, 300); 33 | 34 | String got = webVideo.generateOutput(false); 35 | 36 | // Output should be the same as the input in this case. 37 | assertEquals(want, got); 38 | } 39 | 40 | public void testGenerateOutputInvalidChildren() { 41 | Element video = Document.get().createVideoElement(); 42 | Element child = Document.get().createElement("source"); 43 | child.setAttribute("src", "http://example.com/foo.ogg"); 44 | video.appendChild(child); 45 | 46 | child = Document.get().createElement("track"); 47 | child.setAttribute("src", "http://example.com/foo.vtt"); 48 | video.appendChild(child); 49 | 50 | child = Document.get().createDivElement(); 51 | child.setInnerText("We do not use custom error messages!"); 52 | video.appendChild(child); 53 | 54 | String want = ""; 58 | WebVideo webVideo = new WebVideo(video, 400, 300); 59 | 60 | String got = webVideo.generateOutput(false); 61 | 62 | // Output should ignore anything other than "track" and "source" tags. 63 | assertEquals(want, got); 64 | } 65 | 66 | public void testPosterEmpty() { 67 | Element video = Document.get().createVideoElement(); 68 | 69 | String want = ""; 70 | WebVideo webVideo = new WebVideo(video, 400, 300); 71 | 72 | 73 | String got = webVideo.generateOutput(false); 74 | 75 | assertEquals(want, got); 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /javatests/org/chromium/distiller/webdocument/filters/RelevantElementsTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | package org.chromium.distiller.webdocument.filters; 6 | 7 | import org.chromium.distiller.DomDistillerJsTestCase; 8 | import org.chromium.distiller.webdocument.TestWebDocumentBuilder; 9 | import org.chromium.distiller.webdocument.WebDocument; 10 | import org.chromium.distiller.webdocument.WebElement; 11 | import org.chromium.distiller.webdocument.WebImage; 12 | import org.chromium.distiller.webdocument.WebTable; 13 | 14 | 15 | public class RelevantElementsTest extends DomDistillerJsTestCase { 16 | public void testEmptyDocument() { 17 | WebDocument document = new WebDocument(); 18 | assertFalse(RelevantElements.process(document)); 19 | assertTrue(document.getElements().isEmpty()); 20 | } 21 | 22 | public void testNoContent() { 23 | TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); 24 | builder.addText("text 1"); 25 | builder.addText("text 2"); 26 | builder.addTable("t1"); 27 | WebDocument document = builder.build(); 28 | assertFalse(RelevantElements.process(document)); 29 | for (WebElement e : document.getElements()) { 30 | assertFalse(e.getIsContent()); 31 | } 32 | } 33 | 34 | public void testRelevantTable() { 35 | TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); 36 | builder.addText("text 1").setIsContent(true); 37 | WebTable wt = builder.addTable("t1"); 38 | WebDocument document = builder.build(); 39 | assertTrue(RelevantElements.process(document)); 40 | assertTrue(wt.getIsContent()); 41 | } 42 | 43 | public void testNonRelevantTable() { 44 | TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); 45 | builder.addText("text 1").setIsContent(true); 46 | builder.addText("text 1"); 47 | WebTable wt = builder.addTable("t1"); 48 | WebDocument document = builder.build(); 49 | assertFalse(RelevantElements.process(document)); 50 | assertFalse(wt.getIsContent()); 51 | } 52 | 53 | public void testRelevantImage() { 54 | TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); 55 | builder.addText("text 1").setIsContent(true); 
56 | WebImage wi = builder.addImage(); 57 | WebDocument document = builder.build(); 58 | assertTrue(RelevantElements.process(document)); 59 | assertTrue(wi.getIsContent()); 60 | } 61 | 62 | public void testNonRelevantImage() { 63 | TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); 64 | WebImage wi = builder.addImage(); 65 | builder.addText("text 1").setIsContent(true); 66 | WebDocument document = builder.build(); 67 | assertFalse(RelevantElements.process(document)); 68 | assertFalse(wi.getIsContent()); 69 | } 70 | 71 | public void testImageAfterNonContent() { 72 | TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); 73 | builder.addText("text 1").setIsContent(true); 74 | builder.addText("text 2").setIsContent(false); 75 | WebImage wi = builder.addImage(); 76 | WebDocument document = builder.build(); 77 | assertFalse(RelevantElements.process(document)); 78 | assertFalse(wi.getIsContent()); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /land-external-contributor-cl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # A helpful script to land contributions from external developers. 3 | # The script requires one parameter, which is the issue number to land on behalf 4 | # of the author. 5 | if [[ -z "$1" ]]; then 6 | echo "First argument must be the issue number." 7 | echo "Example: $(basename $0) 123456" 8 | exit 1 9 | fi 10 | ( 11 | get_author_from_first_comment() { 12 | # Need to strip colors first from the result of |git cl comments|. 13 | git cl comments | \ 14 | sed -r "s/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g" | \ 15 | grep "@" | \ 16 | head -n 1 | \ 17 | awk '{print $3;}' 18 | } 19 | 20 | set -e 21 | issue_number=$1 22 | # Ensure origin/master is up to date, and checkout a branch which tracks it. 
23 | git remote update 24 | git checkout -tb land-issue-${issue_number} origin/master 25 | 26 | # Setup the branch correctly with the last patch from the review. 27 | git cl patch ${issue_number} 28 | git cl issue ${issue_number} 29 | 30 | # Print the CL description to help identify if this is the correct CL. 31 | echo "The following is the description on that issue:" 32 | echo "######" 33 | GIT_EDITOR=cat git cl description | \ 34 | grep -v "^#" | \ 35 | sed -r "s/Loaded authentication cookies from.*$//g" 36 | echo "######" 37 | 38 | # Get author e-mail by inspecting the first comment, and ask user to verify. 39 | author_email=$(get_author_from_first_comment) 40 | echo "Author e-mail: Enter to use, or enter correct e-mail [${author_email}]:" 41 | read email_override 42 | if [[ ! -z "${email_override}" ]]; then 43 | author_email="${email_override}" 44 | fi 45 | 46 | # Get author name from the username in the e-mail, and ask user to verify. 47 | author_name=$(echo ${author_email} | awk '{split($0,a,"@"); print a[1]};') 48 | echo "Author name: Enter to use, or enter correct name. [${author_name}]:" 49 | read name_override 50 | if [[ ! -z "${name_override}" ]]; then 51 | author_name="${name_override}" 52 | fi 53 | 54 | # Land the current branch. 55 | git cl land -c "${author_name} <${author_email}>" 56 | ) 57 | -------------------------------------------------------------------------------- /proto/dom_distiller.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | syntax = "proto2"; 6 | 7 | package dom_distiller.proto; 8 | option optimize_for = LITE_RUNTIME; 9 | option java_package = "org.chromium.distiller.proto"; 10 | option java_outer_classname = "DomDistillerProtos"; 11 | 12 | message DistilledContent { 13 | optional string html = 1; 14 | } 15 | 16 | message PaginationInfo { 17 | optional string next_page = 1; 18 | optional string prev_page = 2; 19 | optional string canonical_page = 3; 20 | } 21 | 22 | message MarkupArticle { 23 | optional string published_time = 1; 24 | optional string modified_time = 2; 25 | optional string expiration_time = 3; 26 | optional string section = 4; 27 | repeated string authors = 5; 28 | } 29 | 30 | message MarkupImage { 31 | optional string url = 1; 32 | optional string secure_url = 2; 33 | optional string type = 3; 34 | optional string caption = 4; 35 | optional int32 width = 5; 36 | optional int32 height = 6; 37 | } 38 | 39 | message MarkupInfo { 40 | optional string title = 1; 41 | optional string type = 2; 42 | optional string url = 3; 43 | optional string description = 4; 44 | optional string publisher = 5; 45 | optional string copyright = 6; 46 | optional string author = 7; 47 | optional MarkupArticle article = 8; 48 | repeated MarkupImage images = 9; 49 | } 50 | 51 | message TimingEntry { 52 | optional string name = 1; 53 | optional double time = 2; 54 | } 55 | 56 | message TimingInfo { 57 | optional double markup_parsing_time = 1; 58 | optional double document_construction_time = 2; 59 | optional double article_processing_time = 3; 60 | optional double formatting_time = 4; 61 | optional double total_time = 5; 62 | 63 | // A place to hold arbitrary breakdowns of time. The perf scoring/server 64 | // should display these entries with appropriate names. 
65 | repeated TimingEntry other_times = 6; 66 | } 67 | 68 | message DebugInfo { 69 | optional string log = 1; 70 | } 71 | 72 | message StatisticsInfo { 73 | optional int32 word_count = 1; 74 | } 75 | 76 | message DomDistillerResult { 77 | optional string title = 1; 78 | optional DistilledContent distilled_content = 2; 79 | optional PaginationInfo pagination_info = 3; 80 | optional MarkupInfo markup_info = 5; 81 | optional TimingInfo timing_info = 6; 82 | optional DebugInfo debug_info = 7; 83 | optional StatisticsInfo statistics_info = 8; 84 | optional string text_direction = 9; 85 | 86 | // Represents an image found in the content of a page. 87 | message ContentImage { 88 | optional string url = 1; 89 | } 90 | 91 | repeated ContentImage content_images = 10; 92 | } 93 | 94 | message DomDistillerOptions { 95 | // Whether to extract only the text (or to include the containing html). 96 | optional bool extract_text_only = 1; 97 | 98 | // How much debug output to dump to window.console. 99 | // (0): Logs nothing 100 | // (1): Text Node data for each stage of processing 101 | // (2): (1) and some node visibility information 102 | // (3): (2) and extracted paging information 103 | optional int32 debug_level = 2; 104 | 105 | // The original URL of the page, which is used in the heuristics in 106 | // detecting next/prev page links. 107 | optional string original_url = 3; 108 | 109 | // Which algorithm to use for next page detection: 110 | // "next" : detect anchors with "next" text 111 | // "pagenum" : detect anchors with numeric page numbers 112 | optional string pagination_algo = 4; 113 | } 114 | -------------------------------------------------------------------------------- /protoc_plugins/README: -------------------------------------------------------------------------------- 1 | # Copyright 2014 The Chromium Authors 2 | # Use of this source code is governed by a BSD-style license that can be 3 | # found in the LICENSE file. 
4 | 5 | These protoc plugins use a simple json encoding. 6 | 7 | An instance of the following protobuf: 8 | 9 | message Foo { 10 | message Bar { 11 | repeated string rabbits = 1; 12 | } 13 | optional string cat = 1; 14 | repeated int32 dog = 2; 15 | optional Bar rabbit_den = 3; 16 | } 17 | 18 | could be encoded something like: 19 | 20 | { 21 | "1": "kitty", 22 | "2": [4, 16, 9], 23 | "3": { "1": ["thumper", "oreo", "daisy"] } 24 | } 25 | 26 | 27 | Only a limited part of the protocol buffer IDL is supported. 28 | 29 | Supported field types: 30 | float, double, int32, bool, string, message, and enum 31 | 32 | Supported field rules: 33 | optional, repeated 34 | 35 | Unsupported features: 36 | default values 37 | imports 38 | extensions 39 | services 40 | non-file-level options 41 | 42 | -------------------------------------------------------------------------------- /protoc_plugins/json_values_converter_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2016 The Chromium Authors 3 | # Use of this source code is governed by a BSD-style license that can be 4 | # found in the LICENSE file. 5 | 6 | """Tests for json_values_converter.py. 7 | 8 | It tests json_values_converter.py. 
import argparse
import os
import sys


def CompareFiles(file1, file2):
    """Return True when both files have identical text content.

    Args:
        file1: Path of the first file to read.
        file2: Path of the second file to read.

    Returns:
        True if the full contents of the two files compare equal.
    """
    # Context managers guarantee both handles are closed even when a read
    # fails; the previous bare open().read() calls leaked them.
    with open(file1, 'r') as first, open(file2, 'r') as second:
        return first.read() == second.read()


def TouchStamp(stamp_path):
    """Create or update the stamp file at stamp_path.

    Missing parent directories are created first. The file is opened in
    append mode so existing content is preserved, then its access and
    modification times are set to the current time.

    Args:
        stamp_path: Path of the stamp file to touch.
    """
    dir_name = os.path.dirname(stamp_path)
    # dirname() is '' for a bare file name; os.makedirs('') raises, so only
    # create a directory when the path actually names one.
    if dir_name and not os.path.isdir(dir_name):
        os.makedirs(dir_name)

    with open(stamp_path, 'a'):
        os.utime(stamp_path, None)


def main():
    """Compare consecutive pairs of files given on the command line.

    The positional arguments are consumed as (expected, actual) pairs. When
    every pair matches and --stamp was given, the stamp file is touched.

    Returns:
        0 when every pair matched, 1 otherwise (suitable for sys.exit()).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--stamp',
                        help='Path to touch on success.')
    parser.add_argument('files', nargs='+',
                        help='Files to compare.')

    args = parser.parse_args()

    passed = True
    # NOTE: zip() stops at the shorter slice, so a trailing unpaired file is
    # ignored rather than reported.
    for expected, actual in zip(args.files[::2], args.files[1::2]):
        passed = passed and CompareFiles(expected, actual)

    if passed and args.stamp:
        TouchStamp(args.stamp)

    return 0 if passed else 1


if __name__ == '__main__':
    sys.exit(main())
import sys

import plugin_protos
# NOTE(review): RegisterTypesForFile is not part of the standard-library
# `types` module, so this presumably resolves to a project-local types.py
# next to this file -- confirm before changing the import style.
import types


def Debug(data):
    """Write str(data) followed by a newline to stderr and flush it."""
    sys.stderr.write(str(data))
    sys.stderr.write('\n')
    sys.stderr.flush()


def TitleCase(s):
    """Convert an underscore_separated name to TitleCase.

    Only the first character of each '_'-separated segment is upper-cased;
    the rest of the segment is kept as is. Empty segments (from an empty
    string, or from leading, trailing, or doubled underscores) contribute
    nothing instead of raising IndexError.

    Args:
        s: The underscore-separated string.

    Returns:
        The concatenated, capitalized segments.
    """
    # p[:1] is '' for an empty segment, whereas p[0] would raise IndexError.
    return ''.join(p[:1].upper() + p[1:] for p in s.split('_'))


def Indented(s, indent=2):
    """Return s with every line prefixed by `indent` spaces.

    Trailing newlines of s are removed before splitting, so the result never
    ends with a newline.

    Args:
        s: The (possibly multi-line) text to indent.
        indent: Number of spaces to put in front of each line.
    """
    return '\n'.join((' ' * indent) + p for p in s.rstrip('\n').split('\n'))


# Maps a proto file name (as returned by proto_file.Filename()) to the
# registered proto file object.
proto_path_to_file_map = {}


def RegisterProtoFile(proto_file):
    """Record proto_file under its file name and register its types."""
    proto_path_to_file_map[proto_file.Filename()] = proto_file
    types.RegisterTypesForFile(proto_file)


def GetProtoFileForFilename(filename):
    """Return the proto file previously registered under filename.

    Raises:
        KeyError: If no file was registered under filename.
    """
    proto_file = proto_path_to_file_map[filename]
    assert proto_file
    return proto_file


def ReadRequestFromStdin():
    """Read all of stdin and parse it into a plugin request."""
    data = sys.stdin.read()
    return plugin_protos.PluginRequestFromString(data)
import contextlib


class CodeWriter(object):
    """Helper class for code indentation.

    Collects output lines at the current indentation level, plus a separate
    list of error messages.
    """

    def __init__(self):
        self.indent = 0   # current indentation, in spaces
        self.value = []   # output lines, already indented
        self.errors = []  # messages recorded through AddError()

    def GetErrors(self):
        """Return the list of error messages recorded so far."""
        return self.errors

    @contextlib.contextmanager
    def AddIndent(self, indent=2):
        """Context manager that indents by `indent` spaces inside the block.

        The previous indentation is restored when the block exits, including
        when it exits through an exception.
        """
        self.indent += indent
        try:
            yield 0
        finally:
            # Without the finally clause an exception raised inside the
            # `with` body would leave the writer permanently over-indented.
            self.indent -= indent

    def IncreaseIndent(self, indent=2):
        """Increase the current indentation by `indent` spaces."""
        self.indent += indent

    def DecreaseIndent(self, indent=2):
        """Decrease the current indentation by `indent` spaces."""
        self.indent -= indent

    def Output(self, fmt, **kwargs):
        """Format `fmt` with kwargs and append the result line by line.

        Trailing newlines of the formatted text are dropped, every line is
        prefixed with the current indentation, and trailing whitespace on
        each line (including indentation on empty lines) is removed.
        """
        text = fmt.format(**kwargs).rstrip('\n')
        prefix = ' ' * self.indent
        self.value.extend((prefix + line).rstrip() for line in text.split('\n'))

    def AddError(self, fmt, **kwargs):
        """Record an error message built from `fmt` and kwargs."""
        self.errors.append(fmt.format(**kwargs))

    def GetValue(self):
        """Return all output lines joined by newlines, newline-terminated."""
        return '\n'.join(self.value) + '\n'

    def WriteCStyleHeader(self):
        """Emit the generated-file banner as C-style line comments."""
        self.Output("// GENERATED FILE")
        self.Output("// This file generated by DomDistillerJs protoc plugin.")
13 | """ 14 | 15 | import argparse 16 | import os 17 | import sys 18 | import time 19 | import urllib 20 | 21 | try: 22 | from selenium import webdriver 23 | except: 24 | print 'ERROR:' 25 | print 'Couldn\'t import webdriver. Please run `sudo ./install-build-deps.sh`.' 26 | sys.exit(1) 27 | 28 | def main(argv): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('--filter', help='See gtest_filter syntax.') 31 | parser.add_argument('--repeat', type=int, default=1, help='Number of times to repeat the tests.') 32 | parser.add_argument('--debug_level', help='Verbosity level of debug messages.') 33 | parser.add_argument('--no_console_log', 34 | action='store_true', help='Disable the console log output.') 35 | parser.add_argument('--shuffle', type=int, help='Set to 1 to run test cases in random order.') 36 | parser.add_argument('--no_sandbox', type=int, help='Set to 1 to add --no-sandbox option to Chrome.') 37 | options = parser.parse_args(argv) 38 | 39 | params = {} 40 | if options.filter: 41 | params['filter'] = options.filter 42 | 43 | if options.debug_level: 44 | params['debug_level'] = int(options.debug_level) 45 | 46 | if options.no_console_log: 47 | params['console_log'] = '0' 48 | 49 | if options.shuffle: 50 | params['shuffle'] = options.shuffle 51 | 52 | image_loaded = "return window.image_loaded" 53 | test_runner = "return org.chromium.distiller.JsTestEntry.run()" 54 | test_html = os.path.abspath(os.path.join(os.path.dirname(__file__), "war", "test.html")) 55 | test_html += "?" + urllib.urlencode(params) 56 | 57 | chrome_options = webdriver.ChromeOptions() 58 | 59 | # Travis-CI uses OpenVZ containers which are incompatible with the sandbox technology. 60 | # See https://code.google.com/p/chromium/issues/detail?id=31077 for more information. 61 | # Ref: https://github.com/travis-ci/travis-ci/issues/938#issuecomment-16336150 62 | # Drone.io also has issues running newer versions of Chrome. 
63 | # Ref: http://crbug.com/495254 64 | if options.no_sandbox: 65 | chrome_options.add_argument("--no-sandbox") 66 | 67 | driver = webdriver.Chrome(chrome_options=chrome_options) 68 | for i in range(options.repeat): 69 | driver.get("file://" + test_html) 70 | while not driver.execute_script(image_loaded): 71 | print "Wait for image loading..." 72 | time.sleep(0.1) 73 | 74 | start = time.time() 75 | result = driver.execute_script(test_runner) 76 | end = time.time() 77 | 78 | if not result['success'] or options.repeat == i+1: 79 | print result['log'].encode('utf-8') 80 | print 'Tests run: %d, Failures: %d, Skipped: %d, Time elapsed: %0.3f sec' % (result['numTests'], 81 | result['failed'], result['skipped'], end - start) 82 | if not result['success']: 83 | driver.quit() 84 | if options.repeat > 1: 85 | print 'Failed at run #%d/%d' % (i+1, options.repeat) 86 | return 1 87 | driver.quit() 88 | if options.repeat > 1: 89 | print 'Passed %d runs' % (options.repeat) 90 | return 0 91 | 92 | if __name__ == '__main__': 93 | sys.exit(main(sys.argv[1:])) 94 | 95 | -------------------------------------------------------------------------------- /test/proto/test.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 
4 | 5 | syntax = "proto2"; 6 | 7 | package dom_distiller.test.proto; 8 | option java_package = "org.chromium.distiller.test.proto"; 9 | option java_outer_classname = "TestProtos"; 10 | 11 | message SimpleMessage { 12 | optional bool value = 1; 13 | } 14 | 15 | message PrimitiveTypes { 16 | optional bool type_bool = 1; 17 | 18 | optional int32 type_int32 = 2; 19 | 20 | optional float type_float = 4; 21 | optional double type_double = 5; 22 | 23 | optional string type_string = 6; 24 | } 25 | 26 | message Enum { 27 | enum Values { 28 | ONE = 1; 29 | TWO = 2; 30 | THREE = 3; 31 | } 32 | 33 | optional Values value = 1; 34 | } 35 | 36 | message MessageField { 37 | optional SimpleMessage simple_message = 1; 38 | } 39 | 40 | message RepeatedTypes { 41 | repeated int32 repeated_int32 = 1; 42 | repeated SimpleMessage repeated_simple_message = 2; 43 | } 44 | 45 | message OuterMessage { 46 | message InnerMessage { 47 | optional bool value = 1; 48 | } 49 | optional InnerMessage inner_message = 2; 50 | } 51 | 52 | message QualifiedMessageField { 53 | optional .dom_distiller.test.proto.SimpleMessage simple_message = 1; 54 | } 55 | -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Google Web Toolkit 2.7.0 5 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 |
Google Web Toolkit
Version 2.7.0
(Git revision a6da588)
49 | 50 |
51 | 52 |
53 | Copyright © 2009 Google Inc. 54 | All rights reserved. 55 | All other product, service names, brands, or trademarks, are the property of their respective owners. 56 |
57 | 58 |
59 | This product includes software developed by 60 | 99 | For source availability and license information see COPYING.html. 100 | 101 |
102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/about.txt: -------------------------------------------------------------------------------- 1 | Google Web Toolkit 2.7.0 2 | (git revision a6da588) 3 | Copyright (c) Google, Inc. 2009. All rights reserved. 4 | Visit Google Code (http://code.google.com/webtoolkit/). 5 | 6 | This product includes software developed by: 7 | - The Apache Software Foundation (http://www.apache.org/). 8 | - Tomcat (http://tomcat.apache.org/) with modifications 9 | - Tapestry (http://tapestry.apache.org/) 10 | - The Eclipse Foundation (http://www.eclipse.org/). 11 | - Java Development Tools (http://www.eclipse.org/jdt/) 12 | - Standard Widget Toolkit (http://www.eclipse.org/swt/) with modifications 13 | - Mort Bay Consulting (http://www.mortbay.com/) 14 | - Jetty 6.1.11 (http://mortbay.org/jetty/) 15 | - The Mozilla Foundation (http://www.mozilla.org/). 16 | - Mozilla 1.7.12 (http://www.mozilla.org/releases/mozilla1.7.12/) 17 | - Rhino (http://www.mozilla.org/rhino/) with modifications 18 | - ObjectWeb (http://www.objectweb.org/) 19 | - ASM (http://asm.objectweb.org/) with modifications 20 | - The OpenQA Project (http://openqa.org/) 21 | - Selenium-RC (http://selenium-rc.openqa.org/) 22 | - The Protobuf Project (http://code.google.com/p/protobuf/) with modifications 23 | 24 | For source availability and license information see COPYING. 
25 | -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/gwt-api-checker.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/gwt-api-checker.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/gwt-codeserver.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/gwt-codeserver.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/gwt-dev.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/gwt-dev.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/gwt-elemental.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/gwt-elemental.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/gwt-ll.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/gwt-ll.dll -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/gwt-servlet-deps.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/gwt-servlet-deps.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/gwt-servlet.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/gwt-servlet.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/gwt-user.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/gwt-user.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/i18nCreator: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | HOMEDIR=`dirname $0`; 3 | java -cp $HOMEDIR/gwt-user.jar:$HOMEDIR/gwt-dev.jar com.google.gwt.i18n.tools.I18NCreator "$@"; 4 | -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/i18nCreator.cmd: -------------------------------------------------------------------------------- 1 | @java -cp "%~dp0\gwt-user.jar;%~dp0\gwt-dev.jar" com.google.gwt.i18n.tools.I18NCreator %* 2 | -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/release_notes.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Google Web Toolkit Release Notes 5 | 6 | 19 | 20 | 21 |

Google Web Toolkit Release Notes

22 |

23 | Release notes for the 24 | Latest Version 25 | and Older Versions 26 | can be found on the GWT project hosting website. 27 |

28 | 29 | 30 | -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/requestfactory-apt-src.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/requestfactory-apt-src.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/requestfactory-apt.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/requestfactory-apt.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/requestfactory-client+src.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/requestfactory-client+src.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/requestfactory-client-src.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/requestfactory-client-src.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/requestfactory-client.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/requestfactory-client.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/requestfactory-server+src.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/requestfactory-server+src.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/requestfactory-server-src.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/requestfactory-server-src.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/requestfactory-server.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/requestfactory-server.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/validation-api-1.0.0.GA-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/validation-api-1.0.0.GA-sources.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/validation-api-1.0.0.GA.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/gwt-2.7.0/validation-api-1.0.0.GA.jar -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/webAppCreator: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | HOMEDIR=`dirname $0`; 3 | java -cp $HOMEDIR/gwt-user.jar:$HOMEDIR/gwt-dev.jar 
com.google.gwt.user.tools.WebAppCreator "$@"; 4 | -------------------------------------------------------------------------------- /third_party/gwt-2.7.0/webAppCreator.cmd: -------------------------------------------------------------------------------- 1 | @java -cp "%~dp0\gwt-user.jar;%~dp0\gwt-dev.jar" com.google.gwt.user.tools.WebAppCreator %* 2 | -------------------------------------------------------------------------------- /third_party/junit/junit-4.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/third_party/junit/junit-4.11.jar -------------------------------------------------------------------------------- /third_party/protobuf/COPYING.txt: -------------------------------------------------------------------------------- 1 | Copyright 2008, Google Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following disclaimer 12 | in the documentation and/or other materials provided with the 13 | distribution. 14 | * Neither the name of Google Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | Code generated by the Protocol Buffer compiler is owned by the owner 31 | of the input file used when generating it. This code is not 32 | standalone and requires a support library to be linked with it. This 33 | support library is itself covered by the above license. 34 | -------------------------------------------------------------------------------- /tools/UnicodePatternGenerator.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | import java.util.regex.Pattern; 6 | import java.util.regex.Matcher; 7 | 8 | /** 9 | * This class generates unicode patterns (of the form \u0000-\u0005\u0009\u000c-\u0010) for a 10 | * couple of character matching routines used in boilerpipe (e.g. unicode character classes, etc.). 11 | * 12 | * It only supports Unicode's Basic Multilingual Plane (i.e. code points \u0000 to \uFFFF). 
/**
 * Command-line helper that prints regex character-class bodies for the character predicates
 * used by boilerpipe. Each body is a sequence of four-digit lowercase hex escapes, with a dash
 * joining the first and last character of every multi-character run.
 *
 * Only UTF-16 code units 0x0000 through 0xFFFF (the Basic Multilingual Plane) are scanned.
 */
class UnicodePatternGenerator {
    /** Exclusive upper bound of the scan: one past the last BMP code unit. */
    private static final int BMP_LIMIT = 1 << 16;

    private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern
            .compile("[\\p{L}\\p{Nd}\\p{Nl}\\p{No}]");

    /** Predicate over a single UTF-16 code unit. */
    private static interface Checker {
        boolean check(char s);
    }

    /** Checker that accepts a character when the pattern finds a match in it. */
    private static class PatternChecker implements Checker {
        private final Pattern p;

        PatternChecker(Pattern p) { this.p = p; }

        public boolean check(char c) {
            return p.matcher(Character.toString(c)).find();
        }
    }

    /**
     * Prints the generated range for word characters and then for whitespace, each preceded by
     * SUCCESS, or prints FAILURE when a range does not round-trip.
     */
    public static void main(String[] args) {
        printVerified(createRange(PAT_VALID_WORD_CHARACTER));
        printVerified(createRange(new Checker() {
            public boolean check(char c) {
                return Character.isWhitespace(c);
            }
        }));
    }

    /**
     * Re-derives the range from a character class built out of {@code range} and reports whether
     * both agree: SUCCESS followed by the range, or FAILURE.
     */
    private static void printVerified(String range) {
        String verify = createRange(Pattern.compile("[" + range + "]"));
        if (range.equals(verify)) {
            System.out.println("SUCCESS");
            System.out.println(range);
        } else {
            System.out.println("FAILURE");
        }
    }

    /**
     * Formats {@code i} as a backslash, the letter u, and at least four lowercase hex digits.
     *
     * @param i the value to format
     * @return the escape text, e.g. value 65 yields the digits 0041 after the prefix
     */
    public static String toCodePoint(Integer i) {
        return String.format("\\u%04x", i);
    }

    /**
     * Builds the range string for every BMP character in which {@code p} finds a match.
     *
     * @param p pattern applied to each single-character string
     * @return the concatenated runs; empty when nothing matches
     */
    public static String createRange(Pattern p) {
        return createRange(new PatternChecker(p));
    }

    /**
     * Builds the range string for every BMP character accepted by {@code c}.
     *
     * Consecutive accepted characters are collapsed into one run. A run of length one is written
     * as a single escape; longer runs are written as first-dash-last.
     *
     * @param c predicate evaluated for each value from 0 up to 0xFFFF inclusive
     * @return the concatenated runs; empty when nothing is accepted
     */
    public static String createRange(Checker c) {
        StringBuilder range = new StringBuilder();
        int start = -1;  // First character of the run in progress, or -1 when outside a run.
        for (int i = 0; i < BMP_LIMIT; ++i) {
            if (c.check((char) i)) {
                if (start < 0) {
                    start = i;
                }
            } else if (start >= 0) {
                appendRun(range, start, i - 1);
                start = -1;
            }
        }
        // A run that reaches the last scanned value is never followed by a rejected character,
        // so it has to be flushed here; otherwise it would be dropped from the result.
        if (start >= 0) {
            appendRun(range, start, BMP_LIMIT - 1);
        }
        return range.toString();
    }

    /** Appends one run, using the dash form only when it spans more than one character. */
    private static void appendRun(StringBuilder out, int first, int last) {
        out.append(toCodePoint(first));
        if (first != last) {
            out.append('-').append(toCodePoint(last));
        }
    }
}
-------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | 10 | Dom Distiller Project 11 | 12 | 13 | 14 |

15 | This is just a block of arbitrary text to provide content for proper extraction. 16 | To test extraction of specific HTML content, copy the interesting segment and paste it between the body HTML tags here. 17 | Then build with "ant devmode", click "Launch Default Browser", and watch the results in the new tab opened in the browser. 18 | This is just an arbitrary link to test that it shows up in extracted content. 19 |

20 |

21 |

22 | This is just an arbitrary link to test that it's ignored by content extraction. 23 |

24 | 25 | 26 | -------------------------------------------------------------------------------- /war/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chromium/dom-distiller/2a180397710719913340a12804affc65b789275e/war/favicon.ico -------------------------------------------------------------------------------- /war/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | hello body 8 | 9 | 10 | -------------------------------------------------------------------------------- /war/wrapped_domdistiller_template.js: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. 4 | 5 | // Creates a DomDistiller, applies it to the content of the page, and returns 6 | // a DomDistillerResults as a JavaScript object/dictionary. 7 | (function(options) { 8 | try { 9 | // The generated domdistiller.js accesses the window object only explicitly 10 | // via the window name. This creates a new object with the normal window 11 | // object as its prototype and initializes the domdistiller.js with that new 12 | // context so that it does not change the real window object. 
13 | function initialize(window) { 14 | $$DISTILLER_JAVASCRIPT 15 | } 16 | var context = Object.create(window); 17 | context.setTimeout = function() {}; 18 | context.clearTimeout = function() {}; 19 | initialize(context); 20 | 21 | var distiller = context.org.chromium.distiller.DomDistiller; 22 | var res = distiller.applyWithOptions(options); 23 | return res; 24 | } catch (e) { 25 | window.console.error("Error during distillation: " + e); 26 | if (e.stack != undefined) window.console.error(e.stack); 27 | } 28 | return undefined; 29 | // The OPTIONS placeholder will be replaced with the DomDistillerOptions at 30 | // runtime. 31 | })($$OPTIONS) 32 | --------------------------------------------------------------------------------