├── README.md └── yacy_indexer.user.js /README.md: -------------------------------------------------------------------------------- 1 | YaCyIndexerGreasemonkey 2 | ======================= 3 | 4 | This is a Greasemonkey script to index visited websites with the [YaCy P2P search engine](http://yacy.net/). It has the advantage over YaCy's proxy server that it can index pages with cookies, pages which use HTTPS encryption, and (in many cases) pages with query strings, without leaking private data. It can also crawl visited pages with a depth other than 0 (something YaCy's proxy does not support). 5 | 6 | Installation (Basic) 7 | -------------------- 8 | 9 | If you haven't installed [YaCy](http://yacy.net/en/index.html) yet, do that first. 10 | 11 | You should also install Greasemonkey in Firefox, or a compatible add-on depending on your browser. 12 | 13 | Finally, just view the yacy_indexer.user.js file in GitHub and click the "Raw" link. Greasemonkey should prompt you to install the script. 14 | 15 | If you want to use the default settings (crawl depth 1), you're done! Otherwise, see the next section. 16 | 17 | Installation (Custom Settings) 18 | ------------------------------ 19 | 20 | If you'd like to customize the settings, perform the Basic Installation above, and then use the Edit User Script feature in your browser. In Firefox, this is Tools -> Greasemonkey -> Manage User Scripts -> YaCyIndexer -> Options -> Edit This User Script. 21 | 22 | The following settings are customizable: 23 | 24 | ### var paramYaCyLocation = 'http://localhost:8090'; 25 | If your YaCy installation is on a different host or port from the default, change this setting. 26 | 27 | ### var paramEnableQueryString = false; 28 | Set this to true if you wish to index pages with a query string (something after a question mark in the URL). Note that this is a potential privacy leak. 
// ==UserScript==
// @name           YaCyIndexer
// @namespace      https://veclabs.posterous.com/
// @description    Indexes visited pages with YaCy.
// @version        0.1
// @match          *://*/*
// @grant          GM_xmlhttpRequest
// @grant          GM.xmlHttpRequest
// @grant          GM_log
// ==/UserScript==

// User parameters:

// Base URL of the YaCy installation. Change this if YaCy runs on a different
// host or port from the default.
var paramYaCyLocation = 'http://localhost:8090';

// Index pages with query strings (possible privacy leak). The default (false)
// removes the query string before the page is handed to YaCy.
var paramEnableQueryString = false;

// Crawl depth for each visited page.
// 0 = only the visited page; 1 = all links on visited page; higher values will
// index deeper but use exponentially more bandwidth.
var paramDepth = 1;

// Pages already indexed since this time won't be re-indexed; together these
// control how frequently YaCy re-indexes previously indexed content.
var paramAgeNum = 7;
var paramAgeUnit = 'day'; // units for above

/**
 * Builds the URL that is handed to YaCy as the crawl start point.
 *
 * The fragment ("#...") is never included: it is not sent to web servers, and
 * it can carry private data. The query string is included only on request.
 *
 * @param {{protocol: string, host: string, pathname: string, search: string}} loc
 *        The page location (normally window.location).
 * @param {boolean} enableQueryString Whether to keep the "?..." part.
 * @returns {string} The URL to crawl.
 */
function getCrawlingURL(loc, enableQueryString) {
    var url = [loc.protocol, '//', loc.host, loc.pathname].join('');
    if (enableQueryString) {
        url += loc.search || '';
    }
    return url;
}

/**
 * Returns the YaCy crawl-start arguments as an ordered list of
 * [name, value] pairs.
 *
 * YaCy Arguments -- Don't change these unless you've read the YaCy API docs:
 * http://www.yacy-websuche.de/wiki/index.php/Dev:API#Managing_crawl_jobs
 *
 * @param {string} crawlingURL The URL YaCy should start crawling from.
 * @returns {Array<Array>} Ordered [name, value] pairs.
 */
function getYaCyArguments(crawlingURL) {
    return [
        ['crawlingstart', ''],
        ['crawlingMode', 'url'],
        ['crawlingURL', crawlingURL],
        ['bookmarkTitle', ''],
        ['crawlingDepth', paramDepth],
        ['directDocByURL', 'off'],
        ['crawlingDepthExtension', ''],
        ['range', 'wide'],
        ['mustmatch', '.*'],
        ['mustnotmatch', ''],
        ['ipMustmatch', '.*'],
        // Optional value that excludes private IP ranges
        // (http://stackoverflow.com/questions/2814002/private-ip-address-identifier-in-regular-expression):
        // '(^127\.)|(^10\.)|(^172\.1[6-9]\.)|(^172\.2[0-9]\.)|(^172\.3[0-1]\.)|(^192\.168\.)'
        ['ipMustnotmatch', ''],
        ['indexmustmatch', '.*'],
        ['indexmustnotmatch', ''],
        ['deleteold', 'off'],
        ['recrawl', 'reload'],
        ['reloadIfOlderNumber', paramAgeNum],
        ['reloadIfOlderUnit', paramAgeUnit],
        ['countryMustMatchSwitch', 'false'],
        ['crawlingDomMaxCheck', 'off'],
        ['crawlingQ', paramEnableQueryString ? 'on' : 'off'],
        ['storeHTCache', 'off'],
        ['cachePolicy', 'iffresh'],
        ['indexText', 'on'],
        ['indexMedia', 'on'],
        ['crawlOrder', 'off'],
        ['collection', 'user']
    ];
}

/**
 * Builds the full Crawler_p.html request URL.
 *
 * @param {string} yacyLocation Base URL of YaCy; trailing slashes are ignored
 *        so that 'http://host:8090/' does not yield '//Crawler_p.html'.
 * @param {Array<Array>} args Ordered [name, value] pairs; both parts are
 *        percent-encoded with encodeURIComponent.
 * @returns {string} The request URL.
 */
function buildYaCyURL(yacyLocation, args) {
    var pairs = [];
    for (var i = 0; i < args.length; i++) {
        pairs.push(encodeURIComponent(args[i][0]) + '=' + encodeURIComponent(args[i][1]));
    }
    return String(yacyLocation).replace(/\/+$/, '') + '/Crawler_p.html?' + pairs.join('&');
}

/**
 * Finds the cross-origin request function offered by the userscript manager.
 *
 * The legacy GM_xmlhttpRequest is preferred; GM.xmlHttpRequest is the
 * fallback for managers that only provide the "GM." namespace.
 *
 * @returns {?function(Object)} The request function, or null if none exists.
 */
function getRequestFunction() {
    if (typeof GM_xmlhttpRequest === 'function') {
        return GM_xmlhttpRequest;
    }
    if (typeof GM !== 'undefined' && GM !== null && typeof GM.xmlHttpRequest === 'function') {
        return function (details) {
            return GM.xmlHttpRequest(details);
        };
    }
    return null;
}

/**
 * Asks YaCy to crawl the page currently being viewed.
 * Failures are logged to the console instead of being silently dropped.
 */
function indexCurrentPage() {
    var sendRequest = getRequestFunction();
    if (sendRequest === null) {
        console.warn('YaCyIndexer: no GM_xmlhttpRequest / GM.xmlHttpRequest API available; page not indexed.');
        return;
    }

    var crawlingURL = getCrawlingURL(window.location, paramEnableQueryString);
    var yacyURL = buildYaCyURL(paramYaCyLocation, getYaCyArguments(crawlingURL));

    sendRequest({
        method: 'GET',
        url: yacyURL,
        onload: function (response) {
            // A non-2xx status means YaCy answered but did not accept the request.
            if (response.status < 200 || response.status >= 300) {
                console.warn('YaCyIndexer: YaCy answered with HTTP status ' + response.status + ' for ' + crawlingURL);
            }
        },
        onerror: function () {
            console.warn('YaCyIndexer: could not reach YaCy at ' + paramYaCyLocation);
        }
    });
}

indexCurrentPage();