├── src
│   └── searchengines
│       ├── Bing.fs
│       ├── Aol.fs
│       ├── Google.fs
│       └── Yahoo.fs
├── MagicBox.fsproj
├── README.md
├── MagicBox.sln
├── .gitattributes
└── .gitignore
/src/searchengines/Bing.fs:
--------------------------------------------------------------------------------
1 | namespace Magicbox.SearchEngines
2 |
3 | open FSharp.Data
4 |
5 | module Bing =
6 |     /// Loads the Bing results page at `url` and returns one (link text, href)
7 |     /// pair for every `<a>` that is a direct child of an `<h2>`.
8 |     /// Anchors without an `href` attribute are skipped.
9 |     let Search (url: string) =
10 |         let page = HtmlDocument.Load url
11 |
12 |         // Pair an anchor's text with its href; None when the attribute is missing.
13 |         let toResult (anchor: HtmlNode) =
14 |             match anchor.TryGetAttribute("href") with
15 |             | Some href -> Some(anchor.InnerText(), href.Value())
16 |             | None -> None
17 |
18 |         page.CssSelect("h2 > a")
19 |         |> List.choose toResult
20 |         |> List.toSeq
21 |
--------------------------------------------------------------------------------
/src/searchengines/Aol.fs:
--------------------------------------------------------------------------------
1 | namespace Magicbox.SearchEngines
2 |
3 | open FSharp.Data
4 |
5 | module Aol =
6 |     /// Loads the AOL results page at `url` and returns (link text, target URL) pairs.
7 |     /// Candidates are the `<a>` direct children of `<h3>` elements that carry an `href`;
8 |     /// they are narrowed with `Yahoo.filterHrefs` and each href is then rewritten
9 |     /// with `Yahoo.sanitizeUrl`.
10 |     let Search (url: string) =
11 |         let page = HtmlDocument.Load url
12 |
13 |         // (link text, raw href) for every matching anchor that has an href.
14 |         let candidates =
15 |             page.CssSelect("h3 > a")
16 |             |> List.choose (fun anchor ->
17 |                 match anchor.TryGetAttribute("href") with
18 |                 | Some href -> Some(anchor.InnerText(), href.Value())
19 |                 | None -> None)
20 |
21 |         Yahoo.filterHrefs candidates
22 |         |> List.map (fun (title, href) -> title, Yahoo.sanitizeUrl href)
23 |         |> List.toSeq
24 |
--------------------------------------------------------------------------------
/MagicBox.fsproj:
--------------------------------------------------------------------------------
1 |
2 |
3 | net6.0
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MagicBox
2 |
3 | ## Features:
4 |
5 | ### **1) Scrape Search engines (Raw HTML, Search API Scraping)**
6 |
7 | **Features:** Proxies, Pluggable Downloaders, Extractors
8 |
9 | 1. Google - All TLDs/Countries
10 | 2. Yahoo
11 | 3. Bing
12 | 4. Yandex
13 | 5. DuckDuckGo
14 | 6. Ask
15 | 7. AOL
16 | 8. Rediff
17 | 9. IXQuick
18 | 10. YouTube
19 | 11. Similar Site Search
20 | 12. Biglobe: biglobe.co.jp
21 | 13. goo: search.goo.ne.jp
22 | 14. Orange: orange.com
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/src/searchengines/Google.fs:
--------------------------------------------------------------------------------
1 | namespace Magicbox.SearchEngines
2 |
3 | open FSharp.Data
4 |
5 | module Google =
6 |
7 |     /// Loads the Google results page at `url` and returns (link text, target) pairs.
8 |     ///
9 |     /// Only anchors whose `href` starts with "/url?" are kept, and links labelled
10 |     /// "Cached" or "Similar" are dropped. The target is the href with everything
11 |     /// from "&sa=" onward removed and the "/url?q=" text stripped.
12 |     let Search (url: string) =
13 |         let doc = HtmlDocument.Load url
14 |
15 |         // Cut the href at "&sa=" when present. IndexOf returns -1 when the marker
16 |         // is missing, and handing that to Substring as a length would throw, so
17 |         // such hrefs are kept whole instead.
18 |         let targetOf (href: string) =
19 |             let cut = href.IndexOf("&sa=", System.StringComparison.Ordinal)
20 |             let head = if cut >= 0 then href.Substring(0, cut) else href
21 |             head.Replace("/url?q=", "")
22 |
23 |         // True for redirect-style result links, excluding the helper links.
24 |         let isResultLink (name: string, href: string) =
25 |             name <> "Cached"
26 |             && name <> "Similar"
27 |             && href.StartsWith("/url?", System.StringComparison.Ordinal)
28 |
29 |         doc.Descendants [ "a" ]
30 |         |> Seq.choose (fun x ->
31 |             x.TryGetAttribute("href")
32 |             |> Option.map (fun a -> x.InnerText(), a.Value()))
33 |         |> Seq.toList
34 |         |> List.filter isResultLink
35 |         |> List.map (fun (name, href) -> name, targetOf href)
36 |         |> List.toSeq
37 |
--------------------------------------------------------------------------------
/MagicBox.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.30804.86
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{6EC3EE1D-3C4E-46DD-8F32-0CC8E7565705}") = "MagicBox", "MagicBox.fsproj", "{76914E93-5528-44BF-BD1B-4FC1ADB23298}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {76914E93-5528-44BF-BD1B-4FC1ADB23298}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {76914E93-5528-44BF-BD1B-4FC1ADB23298}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | {76914E93-5528-44BF-BD1B-4FC1ADB23298}.Release|Any CPU.ActiveCfg = Release|Any CPU
17 | {76914E93-5528-44BF-BD1B-4FC1ADB23298}.Release|Any CPU.Build.0 = Release|Any CPU
18 | EndGlobalSection
19 | GlobalSection(SolutionProperties) = preSolution
20 | HideSolutionNode = FALSE
21 | EndGlobalSection
22 | GlobalSection(ExtensibilityGlobals) = postSolution
23 | SolutionGuid = {E32D07F3-E29F-465A-8154-B7CE50655C6C}
24 | EndGlobalSection
25 | EndGlobal
26 |
--------------------------------------------------------------------------------
/src/searchengines/Yahoo.fs:
--------------------------------------------------------------------------------
1 | namespace Magicbox.SearchEngines
2 |
3 | open FSharp.Data
4 | open System.Text
5 |
6 | module Yahoo =
7 |     /// Returns the part of `source` between the first occurrence of `token1` and the
8 |     /// first occurrence of `token2` that follows it. Returns "" when either token is
9 |     /// absent or when nothing lies between them.
10 |     let subStringBetween (source: string) (token1: string) (token2: string) =
11 |         let start = source.IndexOf(token1, System.StringComparison.Ordinal)
12 |
13 |         // Test the raw index before adding token1.Length: -1 + Length is positive for
14 |         // any token of two or more characters and would be mistaken for a real match.
15 |         if start < 0 then
16 |             ""
17 |         else
18 |             let pos1 = start + token1.Length
19 |             // Look for token2 only after token1 so an earlier stray token2 is ignored.
20 |             let pos2 = source.IndexOf(token2, pos1, System.StringComparison.Ordinal)
21 |
22 |             if pos2 > pos1 then
23 |                 source.Substring(pos1, pos2 - pos1)
24 |             else
25 |                 ""
26 |
27 |     /// Pulls the destination out of a redirect href shaped like ".../RU=<target>/RK..."
28 |     /// and percent-decodes it. Returns "" when the href has no such segment.
29 |     let sanitizeUrl (url: string) =
30 |         let encoded = subStringBetween url "/RU=" "/RK"
31 |         // Decode every escape (any case), not only %3a, %2f and %3f.
32 |         System.Uri.UnescapeDataString(encoded)
33 |
34 |     /// Keeps the (name, href) pairs whose href holds a non-empty "/RU=" ... "/RK"
35 |     /// segment, i.e. exactly those for which `sanitizeUrl` yields a destination.
36 |     let filterHrefs (urls: (string * string) list) =
37 |         urls
38 |         |> List.filter (fun (_, href) -> subStringBetween href "/RU=" "/RK" <> "")
39 |
40 |     /// Loads the Yahoo results page at `url` and returns (link text, destination)
41 |     /// pairs for every anchor whose href is a "/RU=" ... "/RK" redirect.
42 |     let Search (url: string) =
43 |         let doc = HtmlDocument.Load url
44 |
45 |         doc.Descendants [ "a" ]
46 |         |> Seq.choose (fun x ->
47 |             x.TryGetAttribute("href")
48 |             |> Option.map (fun a -> x.InnerText(), a.Value()))
49 |         |> Seq.toList
50 |         |> filterHrefs
51 |         |> List.map (fun (name, href) -> name, sanitizeUrl href)
52 |         |> List.toSeq
53 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is needed for earlier builds of msysgit that do not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Build results
17 | [Dd]ebug/
18 | [Dd]ebugPublic/
19 | [Rr]elease/
20 | [Rr]eleases/
21 | x64/
22 | x86/
23 | [Aa][Rr][Mm]/
24 | [Aa][Rr][Mm]64/
25 | bld/
26 | [Bb]in/
27 | [Oo]bj/
28 | [Ll]og/
29 |
30 | # Visual Studio 2015/2017 cache/options directory
31 | .vs/
32 | # Uncomment if you have tasks that create the project's static files in wwwroot
33 | #wwwroot/
34 |
35 | # Visual Studio 2017 auto generated files
36 | Generated\ Files/
37 |
38 | # MSTest test Results
39 | [Tt]est[Rr]esult*/
40 | [Bb]uild[Ll]og.*
41 |
42 | # NUNIT
43 | *.VisualState.xml
44 | TestResult.xml
45 |
46 | # Build Results of an ATL Project
47 | [Dd]ebugPS/
48 | [Rr]eleasePS/
49 | dlldata.c
50 |
51 | # Benchmark Results
52 | BenchmarkDotNet.Artifacts/
53 |
54 | # .NET Core
55 | project.lock.json
56 | project.fragment.lock.json
57 | artifacts/
58 |
59 | # StyleCop
60 | StyleCopReport.xml
61 |
62 | # Files built by Visual Studio
63 | *_i.c
64 | *_p.c
65 | *_h.h
66 | *.ilk
67 | *.meta
68 | *.obj
69 | *.iobj
70 | *.pch
71 | *.pdb
72 | *.ipdb
73 | *.pgc
74 | *.pgd
75 | *.rsp
76 | *.sbr
77 | *.tlb
78 | *.tli
79 | *.tlh
80 | *.tmp
81 | *.tmp_proj
82 | *_wpftmp.csproj
83 | *.log
84 | *.vspscc
85 | *.vssscc
86 | .builds
87 | *.pidb
88 | *.svclog
89 | *.scc
90 |
91 | # Chutzpah Test files
92 | _Chutzpah*
93 |
94 | # Visual C++ cache files
95 | ipch/
96 | *.aps
97 | *.ncb
98 | *.opendb
99 | *.opensdf
100 | *.sdf
101 | *.cachefile
102 | *.VC.db
103 | *.VC.VC.opendb
104 |
105 | # Visual Studio profiler
106 | *.psess
107 | *.vsp
108 | *.vspx
109 | *.sap
110 |
111 | # Visual Studio Trace Files
112 | *.e2e
113 |
114 | # TFS 2012 Local Workspace
115 | $tf/
116 |
117 | # Guidance Automation Toolkit
118 | *.gpState
119 |
120 | # ReSharper is a .NET coding add-in
121 | _ReSharper*/
122 | *.[Rr]e[Ss]harper
123 | *.DotSettings.user
124 |
125 | # JustCode is a .NET coding add-in
126 | .JustCode
127 |
128 | # TeamCity is a build add-in
129 | _TeamCity*
130 |
131 | # DotCover is a Code Coverage Tool
132 | *.dotCover
133 |
134 | # AxoCover is a Code Coverage Tool
135 | .axoCover/*
136 | !.axoCover/settings.json
137 |
138 | # Visual Studio code coverage results
139 | *.coverage
140 | *.coveragexml
141 |
142 | # NCrunch
143 | _NCrunch_*
144 | .*crunch*.local.xml
145 | nCrunchTemp_*
146 |
147 | # MightyMoose
148 | *.mm.*
149 | AutoTest.Net/
150 |
151 | # Web workbench (sass)
152 | .sass-cache/
153 |
154 | # Installshield output folder
155 | [Ee]xpress/
156 |
157 | # DocProject is a documentation generator add-in
158 | DocProject/buildhelp/
159 | DocProject/Help/*.HxT
160 | DocProject/Help/*.HxC
161 | DocProject/Help/*.hhc
162 | DocProject/Help/*.hhk
163 | DocProject/Help/*.hhp
164 | DocProject/Help/Html2
165 | DocProject/Help/html
166 |
167 | # Click-Once directory
168 | publish/
169 |
170 | # Publish Web Output
171 | *.[Pp]ublish.xml
172 | *.azurePubxml
173 | # Note: Comment the next line if you want to checkin your web deploy settings,
174 | # but database connection strings (with potential passwords) will be unencrypted
175 | *.pubxml
176 | *.publishproj
177 |
178 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
179 | # checkin your Azure Web App publish settings, but sensitive information contained
180 | # in these scripts will be unencrypted
181 | PublishScripts/
182 |
183 | # NuGet Packages
184 | *.nupkg
185 | # The packages folder can be ignored because of Package Restore
186 | **/[Pp]ackages/*
187 | # except build/, which is used as an MSBuild target.
188 | !**/[Pp]ackages/build/
189 | # Uncomment if necessary however generally it will be regenerated when needed
190 | #!**/[Pp]ackages/repositories.config
191 | # NuGet v3's project.json files produces more ignorable files
192 | *.nuget.props
193 | *.nuget.targets
194 |
195 | # Microsoft Azure Build Output
196 | csx/
197 | *.build.csdef
198 |
199 | # Microsoft Azure Emulator
200 | ecf/
201 | rcf/
202 |
203 | # Windows Store app package directories and files
204 | AppPackages/
205 | BundleArtifacts/
206 | Package.StoreAssociation.xml
207 | _pkginfo.txt
208 | *.appx
209 |
210 | # Visual Studio cache files
211 | # files ending in .cache can be ignored
212 | *.[Cc]ache
213 | # but keep track of directories ending in .cache
214 | !?*.[Cc]ache/
215 |
216 | # Others
217 | ClientBin/
218 | ~$*
219 | *~
220 | *.dbmdl
221 | *.dbproj.schemaview
222 | *.jfm
223 | *.pfx
224 | *.publishsettings
225 | orleans.codegen.cs
226 |
227 | # Including strong name files can present a security risk
228 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
229 | #*.snk
230 |
231 | # Since there are multiple workflows, uncomment next line to ignore bower_components
232 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
233 | #bower_components/
234 |
235 | # RIA/Silverlight projects
236 | Generated_Code/
237 |
238 | # Backup & report files from converting an old project file
239 | # to a newer Visual Studio version. Backup files are not needed,
240 | # because we have git ;-)
241 | _UpgradeReport_Files/
242 | Backup*/
243 | UpgradeLog*.XML
244 | UpgradeLog*.htm
245 | ServiceFabricBackup/
246 | *.rptproj.bak
247 |
248 | # SQL Server files
249 | *.mdf
250 | *.ldf
251 | *.ndf
252 |
253 | # Business Intelligence projects
254 | *.rdl.data
255 | *.bim.layout
256 | *.bim_*.settings
257 | *.rptproj.rsuser
258 | *- Backup*.rdl
259 |
260 | # Microsoft Fakes
261 | FakesAssemblies/
262 |
263 | # GhostDoc plugin setting file
264 | *.GhostDoc.xml
265 |
266 | # Node.js Tools for Visual Studio
267 | .ntvs_analysis.dat
268 | node_modules/
269 |
270 | # Visual Studio 6 build log
271 | *.plg
272 |
273 | # Visual Studio 6 workspace options file
274 | *.opt
275 |
276 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
277 | *.vbw
278 |
279 | # Visual Studio LightSwitch build output
280 | **/*.HTMLClient/GeneratedArtifacts
281 | **/*.DesktopClient/GeneratedArtifacts
282 | **/*.DesktopClient/ModelManifest.xml
283 | **/*.Server/GeneratedArtifacts
284 | **/*.Server/ModelManifest.xml
285 | _Pvt_Extensions
286 |
287 | # Paket dependency manager
288 | .paket/paket.exe
289 | paket-files/
290 |
291 | # FAKE - F# Make
292 | .fake/
293 |
294 | # JetBrains Rider
295 | .idea/
296 | *.sln.iml
297 |
298 | # CodeRush personal settings
299 | .cr/personal
300 |
301 | # Python Tools for Visual Studio (PTVS)
302 | __pycache__/
303 | *.pyc
304 |
305 | # Cake - Uncomment if you are using it
306 | # tools/**
307 | # !tools/packages.config
308 |
309 | # Tabs Studio
310 | *.tss
311 |
312 | # Telerik's JustMock configuration file
313 | *.jmconfig
314 |
315 | # BizTalk build output
316 | *.btp.cs
317 | *.btm.cs
318 | *.odx.cs
319 | *.xsd.cs
320 |
321 | # OpenCover UI analysis results
322 | OpenCover/
323 |
324 | # Azure Stream Analytics local run output
325 | ASALocalRun/
326 |
327 | # MSBuild Binary and Structured Log
328 | *.binlog
329 |
330 | # NVidia Nsight GPU debugger configuration file
331 | *.nvuser
332 |
333 | # MFractors (Xamarin productivity tool) working folder
334 | .mfractor/
335 |
336 | # Local History for Visual Studio
337 | .localhistory/
338 |
339 | # BeatPulse healthcheck temp database
340 | healthchecksdb
341 | .ionide
--------------------------------------------------------------------------------