%s..." % (description)
46 | if not description:
47 |     hrefs = "
34 | {% if results == '0' %}
36 | Hmmm... Darksearch couldn't find anything. Try being less specific.
37 | {% else %}
38 | Darksearch found the following {{results}} results across {{pageTotal}} pages in {{dur}} seconds...
39 | {% endif %}
54 | {{engineList}}
86 | Note: We do not host any media on Darksearch and take no liability for the content you may find.
     We cannot guarantee the safety of any .onion link. Contribute to this project on GitHub
--------------------------------------------------------------------------------
/darksearch/tools/__init__.py:
--------------------------------------------------------------------------------
1 | from .elas import DarkElastic
--------------------------------------------------------------------------------
/darksearch/tools/elas.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import os
4 | import pandas as pd
5 | import json
6 | from elasticsearch import Elasticsearch
7 | import requests
8 | import re
9 | es = Elasticsearch()
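# With no arguments, Elasticsearch() connects to the default local node at
# http://localhost:9200, the same host that delete_all() targets below.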
10 |
11 | class DarkElastic(object):
12 |
13 | def __init__(self):
14 | self.size = 0
15 |
16 | def pandas_to_json(self, jsonPath):
17 | """
18 | Read the log file into a DataFrame, convert it to JSON, and save the JSON.
19 | """
20 | self.jsonPath = jsonPath
21 | self.logPath = os.getcwd()+'/../logs/process2.csv'
22 | with open(self.logPath) as logs:
23 | searchIndex = pd.read_csv(
24 | logs,
25 | header=None,
26 | sep='\t',
27 | names=[
28 | "DATES",
29 | "URLS",
30 | "NAMES",
31 | "SIZE",
32 | "LANG",
33 | "TITLE",
34 | "CONTENT"
35 | ]
36 | )
37 | self.size = len(searchIndex.index)
38 | searchIndex = searchIndex.to_json(orient='index')
39 | # If you want to use a JSON file rather than converting
40 | # with open(self.jsonPath) as searchIndex:
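# (a sketch of that alternative: json.load on the opened file would replace
#  the to_json/json.loads round trip below)
#     searchIndex = json.load(searchIndex)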
41 | searchIndex = json.loads(searchIndex)
42 | self.searchIndex = searchIndex
43 | self.save_json(searchIndex)
44 |
45 | def save_json(self, dataframe):
46 | with open(self.jsonPath, "w") as outfile:
47 | json.dump(dataframe, outfile, indent=4)
48 | print('Dataframe converted to JSON.')
49 |
50 | def ingest_items(self):
51 | for i in range(0, self.size):
52 | doc = self.searchIndex[str(i)]
53 | res = es.index(
54 | index="dark",
55 | doc_type='html',
56 | id=i,
57 | body=doc
58 | )
59 | print('Ingested document %d...' % i)
60 | return (res['created'])
61 |
62 | def get_items(self, i):
63 | res = es.get(
64 | index="dark",
65 | doc_type='html',
66 | id=i
67 | )
68 | return (res['_source'])
69 |
70 | def search_index(self, myIndex, myQuery, start=0, end=10):
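# Note: 'start' maps to the Elasticsearch 'from' offset and 'end' to 'size'
# (number of hits per page), so 'end' is a page length, not an absolute index.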
71 | res = es.search(
72 | index=myIndex,
73 | body={
74 | "from": start,
75 | "size": end,
76 | 'query': {
77 | "query_string": {
78 | "default_field": "CONTENT",
79 | "query": myQuery
80 | }
81 | },
82 | "sort": {
83 | "_score": {
84 | "order": "desc"
85 | }
86 | }
87 | }
88 | )
89 | self.briefList = []
90 | self.namesList = []
91 | self.datesList = []
92 | self.titleList = []
93 | hitList = ("Got %d Hits:" % res['hits']['total'])
94 | for hit in res['hits']['hits']:
95 | print("%(DATES)s: %(URLS)s" % hit['_source'])
96 | content = hit['_source']['CONTENT']
97 | names = hit['_source']['NAMES']
98 | dates = hit['_source']['DATES']
99 | title = hit['_source']['TITLE']
100 | brief = self.get_brief(myQuery, content, 20)
101 | self.briefList.append(brief)
102 | self.namesList.append(names)
103 | self.datesList.append(dates)
104 | self.titleList.append(title)
105 | self.size = res['hits']['total']
106 | return hitList
107 |
108 | def delete_duplicates(self, i):
109 | pass
110 |
111 | def delete_all(self, index='dark'):
112 | """
113 | Runs $ curl -XDELETE 'http://localhost:9200/your_index/'
114 | """
115 | r = requests.delete('http://localhost:9200/%s' % (index))
116 | print('Index %s deleted.' % index)
117 |
118 | def get_brief(self, query, content, n):
119 | """
120 | Obtain the brief description that shows up in search
121 | """
122 | query = query.lower()
123 | # Strips quotes
124 | query = query.replace('\"', "")
125 | queryList = query.split()
126 | queryList.sort(key=len)
127 | content = content.lower().split()
128 | try:
129 | pos = content.index(query)
130 | except ValueError:
131 | pos = 0
132 | if ((pos - n) < 0):
133 | start = 0
134 | end = pos + n + abs((pos - n))
135 | else:
136 | start = pos - n
137 | end = pos + n
138 | # Find Nearest period to end sentence...
139 | # try:
140 | # endSentence = content.index(".")
141 | # if endSentence < (start+40):
142 | # end = endSentence
143 | # except:
144 | # pass
145 | content = content[start:end]
146 | if len(content) >= 500:
147 | content = content[0:400]
148 | for query in queryList:
149 | wrap = '<mark>' + query + '</mark>'  # '<mark>' is an assumed highlight tag for the matched term
150 | try:
151 | content[content.index(query)] = wrap
152 | except ValueError:
153 | pass
154 | brief = " ".join(content)
155 | return brief
156 |
157 | def runSetup(self, jsonPath):
158 | self.pandas_to_json(jsonPath)
159 | self.save_json(self.searchIndex)
160 |
161 | def check_cat(self, description):
162 | return 'tor'
163 |
164 | def free_mem(self):
165 | del self.briefList
166 | del self.namesList
167 | del self.datesList
168 | del self.titleList
169 |
170 | if __name__ == '__main__':
171 | test = DarkElastic()
172 | test.runSetup("../logs/process2.json")
173 | # Build your index.
174 | test.ingest_items()
175 | es.indices.refresh(index='dark')
176 | print(test.search_index('dark', 'cocaine', 15, 10))
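# Further usage sketch (the query string is only illustrative): search_index
# fills per-hit lists on the instance rather than returning them.
# test.search_index('dark', 'bitcoin', 0, 10)
# for title, brief in zip(test.titleList, test.briefList):
#     print('%s: %s' % (title, brief))
# test.free_mem()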
177 |
--------------------------------------------------------------------------------
/darksearch/tools/tk.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import tika
3 | from tika import parser
4 | import re
5 | from tika import language, translate
6 | import os
7 | import csv
8 | import pandas as pd
9 |
10 | class Tikify(object):
11 |
12 | '''
13 | DataBase Ingestion Script
14 | - You have Data in html files in '/data'
15 | - You have 'logs/scrape.log' with the time scraped and the size.
16 | - Create table in Postgres with
17 | - Date, .onion, name(.html), tikify(text), size, language, type, title, sentiment, etc
18 | -
19 | '''
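# Note: the docstring above describes a Postgres table, but as written the
# script appends tab-separated rows (date, url, name, size, lang, title,
# content) to ../logs/process2.csv, which elas.py then indexes.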
20 |
21 | def __init__(self, fileName):
22 | parsed = parser.from_file(fileName)
23 | metadata = parsed["metadata"]
24 | # Return re.sub('[\s+]', '', content)
25 | # TODO: Delete... Very Redundant..
26 | content = parsed["content"]
27 | content = content.replace('\n', '')
28 | content = content.replace('\t', '')
29 | content = content.replace('\'', '')
30 | content = content.replace('\"', '')
31 | rx = re.compile('\W+')
32 | content = rx.sub(' ', content).strip()
33 | self.content = content
34 | # Title...
35 | try:
36 | title = metadata['title']
37 | except KeyError:
38 | title = 'Untitled'
39 | title = title.replace('\n', '')
40 | title = title.replace('\t', '')
41 | title = title.replace('\'', '')
42 | title = title.replace('\"', '')
43 | title = rx.sub(' ', title).strip()
44 | self.title = title
45 | # self.type = self.metadata['Content-Type-Hint']
46 | # self.name = self.metadata['resourceName']
47 | # lanFix = re.sub('[\s+]', '', content)
48 | self.lang = language.from_file(fileName)
49 |
50 | def toEnglish(self, language='en'):
51 | self.eng = translate.from_buffer(self.content.encode('UTF-8'), self.lang, language)  # from_buffer, since self.content is text rather than a file path
52 |
53 | def analyze(self, translate):
54 | pass
55 |
56 |
57 | if __name__ == "__main__":
58 | dataPath = os.getcwd()+'/../data/'
59 | logPath = os.getcwd()+'/../logs/scrape.log'
60 | print('Started...')
61 | with open(logPath) as logs:
62 | print('Reading csv...')
63 | logs = pd.read_csv(
64 | logs,
65 | header=None,
66 | sep=',',
67 | skipinitialspace=True,
68 | names=[
69 | "DATES",
70 | "URLS",
71 | "NAMES",
72 | "SIZE",
73 | ]
74 |
75 | )
76 | # Columns = [DATE,URL,NAME,SIZE,LANG,CONTENT')]
77 | # with open("logs/process.csv", "a") as log:
78 | # log.write('DATE,URL,NAME,SIZE,LANG,CONTENT\n')
79 | for i in range(0, len(logs)):
80 | date = str(logs['DATES'][i].strip())
81 | url = str(logs['URLS'][i].strip())
82 | name = str(logs['NAMES'][i].strip())
83 | size = str(logs['SIZE'][i])
84 | try:
85 | output = Tikify(dataPath + name)
86 | content = unicode(output.content)
87 | title = str(output.title)
88 | lang = str(output.lang)
89 | with open("../logs/process2.csv", "a") as log:
90 | log.write(('%s\t%s\t%s\t%s\t%s\t%s\t%s\n') % (date, url, name, size, lang, title, content))
91 | print('Appended line %d...' % i)
92 | except Exception:
93 | continue
94 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | elasticsearch==2.2.0
2 | Flask==0.10.1
3 | Flask-API==0.6.5
4 | Flask-Limiter==0.9.1
5 | itsdangerous==0.24
6 | Jinja2==2.8
7 | limits==1.1
8 | MarkupSafe==0.23
9 | numpy==1.10.4
10 | pandas==0.17.1
11 | Pympler==0.4.2
12 | python-dateutil==2.4.2
13 | pytz==2015.7
14 | requests==2.9.1
15 | six==1.10.0
16 | urllib3==1.14
17 | Werkzeug==0.11.4
18 | wheel==0.24.0
19 |
--------------------------------------------------------------------------------