├── .gitignore ├── LICENSE ├── README.md ├── cli.go ├── crawler └── crawler.go ├── data ├── banned-domains.txt ├── banned-suffixes.txt ├── boring-domains.txt ├── boring-words.txt ├── crawled.txt ├── heuristics.txt ├── preview-query-list.txt ├── webring.txt └── wordlist.txt ├── database └── database.go ├── docs ├── files.md └── querying.md ├── go.mod ├── go.sum ├── html ├── about.html ├── assets │ ├── NotoSerif-Bold.ttf │ ├── NotoSerif-Bold.woff2 │ ├── NotoSerif-Italic.ttf │ ├── NotoSerif-Italic.woff2 │ ├── NotoSerif-Regular.ttf │ ├── NotoSerif.woff2 │ ├── favicon.ico │ ├── favicon.png │ ├── favicon.svg │ ├── inter-ui-web │ │ ├── Inter-UI-Italic.woff │ │ ├── Inter-UI-Italic.woff2 │ │ ├── Inter-UI-Regular.woff │ │ ├── Inter-UI-Regular.woff2 │ │ ├── LICENSE.txt │ │ └── inter-ui.css │ ├── logo.svg │ ├── old_css │ │ ├── base.css │ │ └── style.css │ ├── opensearch.xml │ └── style.css ├── footer.html ├── head.html ├── index.html ├── list.html ├── nav.html ├── robots.txt ├── search.html └── webring.html ├── ingest └── ingest.go ├── lieu.toml ├── release.sh ├── server └── server.go ├── types └── types.go └── util └── util.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Lieu 2 | data/ 3 | searchengine.db 4 | 5 | #~top ignores~ 6 | node_modules/ 7 | *.vim 8 | *bundle*.js 9 | *.sw[a-z] 10 | config.conf 11 | config.js 12 | *.pdf 13 | archives 14 | builds 15 | dist 16 | 17 | ################# 18 | ## Eclipse 19 | ################# 20 | *.pydevproject 21 | .project 22 | .metadata 23 | bin/ 24 | tmp/ 25 | *.tmp 26 | *.bak 27 | *.swp 28 | *~.nib 29 | local.properties 30 | .classpath 31 | .settings/ 32 | .loadpath 33 | 34 | # External tool builders 35 | .externalToolBuilders/ 36 | 37 | # Locally stored "Eclipse launch configurations" 38 | *.launch 39 | 40 | # CDT-specific 41 | .cproject 42 | 43 | # PDT-specific 44 | .buildpath 45 | 46 | 47 | ################# 48 | ## Visual Studio 49 | ################# 50 | 51 | ## Ignore Visual Studio temporary files, build results, and 52 | ## files generated by popular Visual Studio add-ons. 
53 | 54 | # User-specific files 55 | *.suo 56 | *.user 57 | *.sln.docstates 58 | 59 | # Build results 60 | 61 | [Dd]ebug/ 62 | [Rr]elease/ 63 | x64/ 64 | build/ 65 | [Bb]in/ 66 | [Oo]bj/ 67 | 68 | # MSTest test Results 69 | [Tt]est[Rr]esult*/ 70 | [Bb]uild[Ll]og.* 71 | 72 | *_i.c 73 | *_p.c 74 | *.ilk 75 | *.meta 76 | *.obj 77 | *.pch 78 | *.pdb 79 | *.pgc 80 | *.pgd 81 | *.rsp 82 | *.sbr 83 | *.tlb 84 | *.tli 85 | *.tlh 86 | *.tmp 87 | *.tmp_proj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.log 94 | *.scc 95 | 96 | # Visual C++ cache files 97 | ipch/ 98 | *.aps 99 | *.ncb 100 | *.opensdf 101 | *.sdf 102 | *.cachefile 103 | 104 | # Visual Studio profiler 105 | *.psess 106 | *.vsp 107 | *.vspx 108 | 109 | # Guidance Automation Toolkit 110 | *.gpState 111 | 112 | # ReSharper is a .NET coding add-in 113 | _ReSharper*/ 114 | *.[Rr]e[Ss]harper 115 | 116 | # TeamCity is a build add-in 117 | _TeamCity* 118 | 119 | # DotCover is a Code Coverage Tool 120 | *.dotCover 121 | 122 | # NCrunch 123 | *.ncrunch* 124 | .*crunch*.local.xml 125 | 126 | # Installshield output folder 127 | [Ee]xpress/ 128 | 129 | # DocProject is a documentation generator add-in 130 | DocProject/buildhelp/ 131 | DocProject/Help/*.HxT 132 | DocProject/Help/*.HxC 133 | DocProject/Help/*.hhc 134 | DocProject/Help/*.hhk 135 | DocProject/Help/*.hhp 136 | DocProject/Help/Html2 137 | DocProject/Help/html 138 | 139 | # Click-Once directory 140 | publish/ 141 | 142 | # Publish Web Output 143 | *.Publish.xml 144 | *.pubxml 145 | 146 | # Windows Azure Build Output 147 | csx 148 | *.build.csdef 149 | 150 | # Windows Store app package directory 151 | AppPackages/ 152 | 153 | # Others 154 | sql/ 155 | *.Cache 156 | ClientBin/ 157 | [Ss]tyle[Cc]op.* 158 | ~$* 159 | *~ 160 | *.dbmdl 161 | *.[Pp]ublish.xml 162 | *.pfx 163 | *.publishsettings 164 | 165 | # RIA/Silverlight projects 166 | Generated_Code/ 167 | 168 | # Backup & report files from converting an old project file to a newer 169 | # Visual Studio version. Backup files are not needed, because we have git ;-) 170 | _UpgradeReport_Files/ 171 | Backup*/ 172 | UpgradeLog*.XML 173 | UpgradeLog*.htm 174 | 175 | # SQL Server files 176 | App_Data/*.mdf 177 | App_Data/*.ldf 178 | 179 | ############# 180 | ## Windows detritus 181 | ############# 182 | 183 | # Windows image file caches 184 | Thumbs.db 185 | ehthumbs.db 186 | 187 | # Folder config file 188 | Desktop.ini 189 | 190 | # Recycle Bin used on file shares 191 | $RECYCLE.BIN/ 192 | 193 | # Mac crap 194 | .DS_Store 195 | 196 | 197 | ############# 198 | ## Python 199 | ############# 200 | 201 | *.py[co] 202 | 203 | # Packages 204 | *.egg 205 | *.egg-info 206 | dist/ 207 | build/ 208 | eggs/ 209 | parts/ 210 | var/ 211 | sdist/ 212 | develop-eggs/ 213 | .installed.cfg 214 | 215 | # Installer logs 216 | pip-log.txt 217 | 218 | # Unit test / coverage reports 219 | .coverage 220 | .tox 221 | 222 | #Translations 223 | *.mo 224 | 225 | #Mr Developer 226 | .mr.developer.cfg 227 | lieu 228 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 
7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 
76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 
134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 
197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. 
This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. 
But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 
375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. 
You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. 
"Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. 
This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 
611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 631 | 632 | 633 | Copyright (C) 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published 637 | by the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see . 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | . 662 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Lieu 2 | 3 | _an alternative search engine_ 4 | 5 | Created in response to the environs of apathy concerning the use of hypertext 6 | search and discovery. In Lieu, the internet is not what is made searchable, but 7 | instead one's own neighbourhood. Put differently, Lieu is a neighbourhood search 8 | engine, a way for personal webrings to increase serendipitous connexions. 
9 | 10 | ![lieu screenshot](https://user-images.githubusercontent.com/3862362/107115659-75624d80-686e-11eb-81c8-0c6bdec07082.png) 11 | 12 | 13 | ## Goals 14 | 15 | * Enable serendipitous discovery 16 | * Support personal communities 17 | * Be reusable, easily 18 | 19 | ## Usage 20 | 21 | ### How to search 22 | 23 | For the full search syntax (including how to use `site:` and `-site:`), see the [search syntax and API documentation](docs/querying.md). For more tips, read the [appendix](https://cblgh.org/lieu/). 24 | 25 | ### Getting Lieu running 26 | 27 | ``` 28 | $ lieu help 29 | Lieu: neighbourhood search engine 30 | 31 | Commands 32 | - precrawl (scrapes config's general.url for a list of links:
<li> elements containing an anchor tag) 33 | - crawl (start crawler, crawls all urls in config's crawler.webring file) 34 | - ingest (ingest crawled data, generates database) 35 | - search (interactive cli for searching the database) 36 | - host (hosts search engine over http) 37 | 38 | Example: 39 | lieu precrawl > data/webring.txt 40 | lieu crawl > data/crawled.txt 41 | lieu ingest 42 | lieu host 43 | ``` 44 | 45 | Lieu's crawl & precrawl commands output to [standard 46 | output](https://en.wikipedia.org/wiki/Standard_streams#Standard_output_(stdout)), 47 | for easy inspection of the data. You typically want to redirect their output to 48 | the files Lieu reads from, as defined in the config file. See below for a 49 | typical workflow. 50 | 51 | 52 | ### Workflow 53 | 54 | * Edit the config 55 | * Add domains to crawl in `config.crawler.webring` 56 | * **If you have a webpage with links you want to crawl:** 57 | * Set the config's `url` field to that page 58 | * Populate the list of domains to crawl with `precrawl`: `lieu precrawl > data/webring.txt` 59 | * Crawl: `lieu crawl > data/crawled.txt` 60 | * Create database: `lieu ingest` 61 | * Host engine: `lieu host` 62 | 63 | After ingesting the data with `lieu ingest`, you can also use lieu to search the 64 | corpus in the terminal with `lieu search`. 65 | 66 | ## Theming 67 | 68 | Tweak the `theme` values of the config, specified below. 69 | 70 | ## Config 71 | 72 | The config file is written in [TOML](https://toml.io/en/). 73 | 74 | ```toml 75 | [general] 76 | name = "Merveilles Webring" 77 | # used by the precrawl command and linked to in /about route 78 | url = "https://webring.xxiivv.com" 79 | # used by the precrawl command to populate the Crawler.Webring file; 80 | # takes simple html selectors. might be a bit wonky :) 81 | webringSelector = "li > a[href]:first-of-type" 82 | port = 10001 83 | 84 | [theme] 85 | # colors specified in hex (or valid css names) which determine the theme of the lieu instance 86 | # NOTE: If (and only if) all three values are set, lieu uses those to generate the file html/assets/theme.css at startup. 87 | # You can also write directly to that file instead of adding this section to your configuration file 88 | foreground = "#ffffff" 89 | background = "#000000" 90 | links = "#ffffff" 91 | 92 | [data] 93 | # the source file should contain the crawl command's output 94 | source = "data/crawled.txt" 95 | # location & name of the sqlite database 96 | database = "data/searchengine.db" 97 | # contains words and phrases disqualifying scraped paragraphs from being presented in search results 98 | heuristics = "data/heuristics.txt" 99 | # aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word 100 | wordlist = "data/wordlist.txt" 101 | 102 | [crawler] 103 | # manually curated list of domains, or the output of the precrawl command 104 | webring = "data/webring.txt" 105 | # domains that are banned from being crawled but might originally be part of the webring 106 | bannedDomains = "data/banned-domains.txt" 107 | # file suffixes that are banned from being crawled 108 | bannedSuffixes = "data/banned-suffixes.txt" 109 | # phrases and words which won't be scraped (e.g. if contained in a link) 110 | boringWords = "data/boring-words.txt" 111 | # domains that won't be output as outgoing links 112 | boringDomains = "data/boring-domains.txt" 113 | # queries to search for finding preview text 114 | previewQueryList = "data/preview-query-list.txt" 115 | ``` 116 | 117 | For your own use, the following config fields should be customized (see the example below): 118 | 119 | * `name` 120 | * `url` 121 | * `port` 122 | * `source` 123 | * `webring` 124 | * `bannedDomains` 125 | 126 | The following config-defined files can stay as-is unless you have specific requirements: 127 | 128 | * `database` 129 | * `heuristics` 130 | * `wordlist` 131 | * `bannedSuffixes` 132 | * `previewQueryList` 133 |
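As an illustration, a minimal `lieu.toml` for your own instance might override just those fields and keep the rest of the example config above as-is — the name, url, and port below are placeholders, not values from the project:

```toml
[general]
name = "My Webring"             # the name of your instance
url = "https://example.org"     # page precrawl scrapes for links; linked from the /about route
port = 10001                    # port the search engine is hosted on

[data]
source = "data/crawled.txt"     # redirect the crawl command's output here

[crawler]
webring = "data/webring.txt"    # manually curated domain list, or the precrawl output
bannedDomains = "data/banned-domains.txt"   # domains excluded from crawling
```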
134 | For a full rundown of the files and their various jobs, see the [files 135 | description](docs/files.md). 136 | 137 | ## Developing 138 | Build a binary: 139 | ```sh 140 | # this project has an experimental fulltext-search feature, so we need to include sqlite's fts engine (fts5) 141 | go build --tags fts5 142 | # or using go run 143 | go run --tags fts5 . 144 | ``` 145 | 146 | Create new release binaries: 147 | ```sh 148 | ./release.sh 149 | ``` 150 | 151 | ### License 152 | 153 | Source code is licensed `AGPL-3.0-or-later`; Inter is available under the `SIL OPEN FONT 154 | LICENSE Version 1.1`; Noto Serif is licensed under the `Apache License, Version 2.0`. 155 | -------------------------------------------------------------------------------- /cli.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "lieu/crawler" 7 | "lieu/database" 8 | "lieu/ingest" 9 | "lieu/server" 10 | "lieu/util" 11 | "os" 12 | "strings" 13 | ) 14 | 15 | const help = `Lieu: neighbourhood search engine 16 | 17 | Commands 18 | - precrawl (scrapes config's general.url for a list of links:
  • elements containing an anchor tag) 19 | - crawl (start crawler, crawls all urls in config's crawler.webring file. outputs to stdout) 20 | - ingest (ingest crawled data, generates database) 21 | - search (interactive cli for searching the database) 22 | - host (hosts search engine over http) 23 | 24 | Example: 25 | lieu precrawl > data/webring.txt 26 | lieu crawl > data/source.txt 27 | lieu ingest 28 | lieu host 29 | 30 | See the configuration file lieu.toml or 31 | https://github.com/cblgh/lieu for more information. 32 | ` 33 | 34 | func main() { 35 | exists := util.CheckFileExists("lieu.toml") 36 | if !exists { 37 | fmt.Println("lieu: can't find config, saving an example config in the working directory") 38 | util.WriteMockConfig() 39 | fmt.Println("lieu: lieu.toml written to disk") 40 | util.Exit() 41 | } 42 | config := util.ReadConfig() 43 | 44 | var cmd string 45 | if len(os.Args) > 1 { 46 | cmd = os.Args[1] 47 | } else { 48 | cmd = "help" 49 | } 50 | 51 | switch cmd { 52 | case "help": 53 | fmt.Println(help) 54 | case "precrawl": 55 | if config.General.URL == "https://example.com/" { 56 | fmt.Println("lieu: the url is not set (example.com)") 57 | util.Exit() 58 | } 59 | crawler.Precrawl(config) 60 | case "crawl": 61 | exists := util.CheckFileExists(config.Crawler.Webring) 62 | if !exists { 63 | fmt.Printf("lieu: webring file %s does not exist\n", config.Crawler.Webring) 64 | util.Exit() 65 | } 66 | sourceLen := len(util.ReadList(config.Crawler.Webring, "\n")) 67 | if sourceLen == 0 { 68 | fmt.Printf("lieu: nothing to crawl; the webring file %s is empty\n", config.Crawler.Webring) 69 | util.Exit() 70 | } 71 | crawler.Crawl(config) 72 | case "ingest": 73 | exists := util.CheckFileExists(config.Data.Source) 74 | if !exists { 75 | fmt.Printf("lieu: data source %s does not exist\n", config.Data.Source) 76 | fmt.Println("lieu: try running `lieu crawl`") 77 | util.Exit() 78 | } 79 | sourceLen := len(util.ReadList(config.Data.Source, "\n")) 80 | if sourceLen == 0 { 81 | fmt.Printf("lieu: nothing to ingest; data source %s is empty\n", config.Data.Source) 82 | fmt.Println("lieu: try running `lieu crawl`") 83 | util.Exit() 84 | } 85 | fmt.Println("lieu: creating a new database & initiating ingestion") 86 | ingest.Ingest(config) 87 | case "search": 88 | exists := util.CheckFileExists(config.Data.Database) 89 | if !exists { 90 | util.DatabaseDoesNotExist(config.Data.Database) 91 | } 92 | interactiveMode(config.Data.Database) 93 | case "random": 94 | exists := util.CheckFileExists(config.Data.Database) 95 | if !exists { 96 | util.DatabaseDoesNotExist(config.Data.Database) 97 | } 98 | db := database.InitDB(config.Data.Database) 99 | fmt.Println(database.GetRandomPage(db)) 100 | case "host": 101 | exists := util.CheckFileExists(config.Data.Database) 102 | if !exists { 103 | util.DatabaseDoesNotExist(config.Data.Database) 104 | } 105 | open := util.CheckPortOpen(config.General.Port) 106 | if !open { 107 | fmt.Printf("lieu: port %d is not open; try another one\n", config.General.Port) 108 | util.Exit() 109 | } 110 | server.Serve(config) 111 | default: 112 | fmt.Println("Lieu: no such command, currently. 
Try `lieu help`") 113 | } 114 | } 115 | 116 | func interactiveMode(databasePath string) { 117 | db := database.InitDB(databasePath) 118 | reader := bufio.NewReader(os.Stdin) 119 | for { 120 | fmt.Printf("> ") 121 | input, err := reader.ReadString('\n') 122 | util.Check(err) 123 | input = strings.TrimSuffix(input, "\n") 124 | pages := database.SearchWordsByScore(db, util.Inflect(strings.Fields(input))) 125 | for _, pageData := range pages { 126 | fmt.Println(pageData.URL) 127 | if len(pageData.About) > 0 { 128 | fmt.Println(pageData.About) 129 | } 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /crawler/crawler.go: -------------------------------------------------------------------------------- 1 | package crawler 2 | 3 | import ( 4 | "fmt" 5 | "lieu/types" 6 | "lieu/util" 7 | "log" 8 | "net/http" 9 | "net/url" 10 | "regexp" 11 | "strings" 12 | "time" 13 | 14 | "github.com/PuerkitoBio/goquery" 15 | "github.com/gocolly/colly/v2" 16 | "github.com/gocolly/colly/v2/queue" 17 | ) 18 | 19 | // the following domains are excluded from crawling & indexing, typically because they have a lot of microblog pages 20 | // (very spammy) 21 | func getBannedDomains(path string) []string { 22 | return util.ReadList(path, "\n") 23 | } 24 | 25 | func getBannedSuffixes(path string) []string { 26 | return util.ReadList(path, "\n") 27 | } 28 | 29 | func getBoringWords(path string) []string { 30 | return util.ReadList(path, "\n") 31 | } 32 | 33 | func getBoringDomains(path string) []string { 34 | return util.ReadList(path, "\n") 35 | } 36 | 37 | func getAboutHeuristics(path string) []string { 38 | return util.ReadList(path, "\n") 39 | } 40 | 41 | func getPreviewQueries(path string) []string { 42 | previewQueries := util.ReadList(path, "\n") 43 | if len(previewQueries) > 0 { 44 | return previewQueries 45 | } else { 46 | return []string{"main p", "article p", "section p", "p"} 47 | } 48 | } 49 | 50 | func find(list []string, query string) bool { 51 | for _, item := range list { 52 | if item == query { 53 | return true 54 | } 55 | } 56 | return false 57 | } 58 | 59 | func getLink(target string) string { 60 | // remove anchor links 61 | if strings.Contains(target, "#") { 62 | target = strings.Split(target, "#")[0] 63 | } 64 | if strings.Contains(target, "?") { 65 | target = strings.Split(target, "?")[0] 66 | } 67 | target = strings.TrimSpace(target) 68 | // remove trailing / 69 | return strings.TrimSuffix(target, "/") 70 | } 71 | 72 | func getWebringLinks(path string) []string { 73 | var links []string 74 | candidates := util.ReadList(path, "\n") 75 | for _, l := range candidates { 76 | u, err := url.Parse(l) 77 | if err != nil { 78 | continue 79 | } 80 | if u.Scheme == "" { 81 | u.Scheme = "https" 82 | } 83 | links = append(links, u.String()) 84 | } 85 | return links 86 | } 87 | 88 | func getDomains(links []string) ([]string, []string) { 89 | var domains []string 90 | // sites which should have stricter crawling enforced (e.g. applicable for shared sites like tilde sites) 91 | // pathsites are sites that are passed in which contain path, 92 | // e.g. 
https://example.com/site/lupin -> only children pages of /site/lupin/ will be crawled 93 | var pathsites []string 94 | for _, l := range links { 95 | u, err := url.Parse(l) 96 | if err != nil { 97 | continue 98 | } 99 | domains = append(domains, u.Hostname()) 100 | if len(u.Path) > 0 && (u.Path != "/" || u.Path != "index.html") { 101 | pathsites = append(pathsites, l) 102 | } 103 | } 104 | return domains, pathsites 105 | } 106 | 107 | func findSuffix(suffixes []string, query string) bool { 108 | for _, suffix := range suffixes { 109 | if strings.HasSuffix(strings.ToLower(query), suffix) { 110 | return true 111 | } 112 | } 113 | return false 114 | } 115 | 116 | func cleanText(s string) string { 117 | s = strings.TrimSpace(s) 118 | s = strings.ReplaceAll(s, "\n", " ") 119 | whitespace := regexp.MustCompile(`\p{Z}+`) 120 | s = whitespace.ReplaceAllString(s, " ") 121 | return s 122 | } 123 | 124 | func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []string) { 125 | c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) { 126 | fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL) 127 | }) 128 | 129 | c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) { 130 | desc := cleanText(e.Attr("content")) 131 | if len(desc) > 0 && len(desc) < 1500 { 132 | fmt.Println("desc", desc, e.Request.URL) 133 | } 134 | }) 135 | 136 | c.OnHTML("meta[property=\"og:description\"]", func(e *colly.HTMLElement) { 137 | ogDesc := cleanText(e.Attr("content")) 138 | if len(ogDesc) > 0 && len(ogDesc) < 1500 { 139 | fmt.Println("og-desc", ogDesc, e.Request.URL) 140 | } 141 | }) 142 | 143 | c.OnHTML("html[lang]", func(e *colly.HTMLElement) { 144 | lang := cleanText(e.Attr("lang")) 145 | if len(lang) > 0 && len(lang) < 100 { 146 | fmt.Println("lang", lang, e.Request.URL) 147 | } 148 | }) 149 | 150 | // get page title 151 | c.OnHTML("title", func(e *colly.HTMLElement) { 152 | fmt.Println("title", cleanText(e.Text), e.Request.URL) 153 | }) 154 | 155 | c.OnHTML("body", func(e *colly.HTMLElement) { 156 | QueryLoop: 157 | for i := 0; i < len(previewQueries); i++ { 158 | // After the fourth paragraph we're probably too far in to get something interesting for a preview 159 | elements := e.DOM.Find(previewQueries[i]) 160 | for j := 0; j < 4 && j < elements.Length(); j++ { 161 | element_text := elements.Slice(j, j+1).Text() 162 | paragraph := cleanText(element_text) 163 | if len(paragraph) < 1500 && len(paragraph) > 20 { 164 | if !util.Contains(heuristics, strings.ToLower(paragraph)) { 165 | fmt.Println("para", paragraph, e.Request.URL) 166 | break QueryLoop 167 | } 168 | } 169 | } 170 | } 171 | paragraph := cleanText(e.DOM.Find("p").First().Text()) 172 | if len(paragraph) < 1500 && len(paragraph) > 0 { 173 | fmt.Println("para-just-p", paragraph, e.Request.URL) 174 | } 175 | 176 | // get all relevant page headings 177 | collectHeadingText("h1", e) 178 | collectHeadingText("h2", e) 179 | collectHeadingText("h3", e) 180 | }) 181 | } 182 | 183 | func collectHeadingText(heading string, e *colly.HTMLElement) { 184 | for _, headingText := range e.ChildTexts(heading) { 185 | if len(headingText) < 500 { 186 | fmt.Println(heading, cleanText(headingText), e.Request.URL) 187 | } 188 | } 189 | } 190 | 191 | func SetupDefaultProxy(config types.Config) error { 192 | // no proxy configured, go back 193 | if config.General.Proxy == "" { 194 | return nil 195 | } 196 | proxyURL, err := url.Parse(config.General.Proxy) 197 | if err != nil { 198 | return err 199 | } 200 | 201 | httpClient := 
&http.Client{ 202 | Transport: &http.Transport{ 203 | Proxy: http.ProxyURL(proxyURL), 204 | }, 205 | } 206 | 207 | http.DefaultClient = httpClient 208 | return nil 209 | } 210 | 211 | func Precrawl(config types.Config) { 212 | // setup proxy 213 | err := SetupDefaultProxy(config) 214 | if err != nil { 215 | log.Fatal(err) 216 | } 217 | 218 | res, err := http.Get(config.General.URL) 219 | util.Check(err) 220 | defer res.Body.Close() 221 | 222 | if res.StatusCode != 200 { 223 | log.Fatal("status not 200") 224 | } 225 | 226 | doc, err := goquery.NewDocumentFromReader(res.Body) 227 | util.Check(err) 228 | 229 | items := make([]string, 0) 230 | s := doc.Find("html") 231 | query := config.General.WebringSelector 232 | if query == "" { 233 | query = "li > a[href]:first-of-type" 234 | } 235 | util.QuerySelector(query, s, &items) 236 | 237 | BANNED := getBannedDomains(config.Crawler.BannedDomains) 238 | for _, item := range items { 239 | link := getLink(item) 240 | u, err := url.Parse(link) 241 | // invalid link 242 | if err != nil { 243 | continue 244 | } 245 | domain := u.Hostname() 246 | if find(BANNED, domain) { 247 | continue 248 | } 249 | fmt.Println(link) 250 | } 251 | } 252 | 253 | func Crawl(config types.Config) { 254 | // setup proxy 255 | err := SetupDefaultProxy(config) 256 | if err != nil { 257 | log.Fatal(err) 258 | } 259 | SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes) 260 | links := getWebringLinks(config.Crawler.Webring) 261 | domains, pathsites := getDomains(links) 262 | initialDomain := config.General.URL 263 | 264 | // TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains 265 | // instantiate default collector 266 | c := colly.NewCollector( 267 | colly.MaxDepth(3), 268 | ) 269 | if config.General.Proxy != "" { 270 | c.SetProxy(config.General.Proxy) 271 | } 272 | 273 | q, _ := queue.New( 274 | 5, /* threads */ 275 | &queue.InMemoryQueueStorage{MaxSize: 100000}, 276 | ) 277 | 278 | for _, link := range links { 279 | q.AddURL(link) 280 | } 281 | 282 | c.UserAgent = "Lieu" 283 | c.AllowedDomains = domains 284 | c.AllowURLRevisit = false 285 | c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains) 286 | c.IgnoreRobotsTxt = false 287 | 288 | delay, _ := time.ParseDuration("200ms") 289 | c.Limit(&colly.LimitRule{DomainGlob: "*", Delay: delay, Parallelism: 3}) 290 | 291 | boringDomains := getBoringDomains(config.Crawler.BoringDomains) 292 | boringWords := getBoringWords(config.Crawler.BoringWords) 293 | previewQueries := getPreviewQueries(config.Crawler.PreviewQueries) 294 | heuristics := getAboutHeuristics(config.Data.Heuristics) 295 | 296 | // on every a element which has an href attribute, call callback 297 | c.OnHTML("a[href]", func(e *colly.HTMLElement) { 298 | 299 | if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 { 300 | return 301 | } 302 | 303 | link := getLink(e.Attr("href")) 304 | if findSuffix(SUFFIXES, link) { 305 | return 306 | } 307 | 308 | link = e.Request.AbsoluteURL(link) 309 | u, err := url.Parse(link) 310 | if err != nil { 311 | return 312 | } 313 | 314 | outgoingDomain := u.Hostname() 315 | currentDomain := e.Request.URL.Hostname() 316 | 317 | // log which site links to what 318 | if !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) { 319 | if !find(domains, outgoingDomain) { 320 | fmt.Println("non-webring-link", link, e.Request.URL) 321 | // solidarity! 
someone in the webring linked to someone else in it 322 | } else if outgoingDomain != currentDomain && outgoingDomain != initialDomain && currentDomain != initialDomain { 323 | fmt.Println("webring-link", link, e.Request.URL) 324 | } 325 | } 326 | 327 | // rule-based crawling 328 | var pathsite string 329 | for _, s := range pathsites { 330 | if strings.Contains(s, outgoingDomain) { 331 | pathsite = s 332 | break 333 | } 334 | } 335 | // the visited site was a so called »pathsite», a site with restrictions on which pages can be crawled (most often due to 336 | // existing on a shared domain) 337 | if pathsite != "" { 338 | // make sure we're only crawling descendents of the original path 339 | if strings.HasPrefix(link, pathsite) { 340 | q.AddURL(link) 341 | } 342 | } else { 343 | // visits links from AllowedDomains 344 | q.AddURL(link) 345 | } 346 | }) 347 | 348 | handleIndexing(c, previewQueries, heuristics) 349 | 350 | // start scraping 351 | q.Run(c) 352 | } 353 | -------------------------------------------------------------------------------- /data/banned-domains.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/data/banned-domains.txt -------------------------------------------------------------------------------- /data/banned-suffixes.txt: -------------------------------------------------------------------------------- 1 | .xml 2 | .pdf 3 | .rss 4 | .jpg 5 | .png 6 | .gif 7 | .avi 8 | .webm 9 | .mp4 10 | .ogg 11 | .mp3 12 | .zip 13 | .exe 14 | .txt 15 | .asc 16 | .key 17 | .csv 18 | -------------------------------------------------------------------------------- /data/boring-domains.txt: -------------------------------------------------------------------------------- 1 | instagram.com 2 | twitter.com 3 | linkedin.com 4 | facebook.com 5 | getpoole.com 6 | jekyllrb.com 7 | twitter.com 8 | amazon.com 9 | google.com 10 | microsoft.com 11 | youtube.com 12 | github.io 13 | meetup.com 14 | ebay.com 15 | t.co 16 | a.co 17 | wsj.com 18 | creativecommons.org 19 | patreon.com 20 | -------------------------------------------------------------------------------- /data/boring-words.txt: -------------------------------------------------------------------------------- 1 | bitcoin 2 | javascript: 3 | mailto: 4 | subscribe 5 | -------------------------------------------------------------------------------- /data/crawled.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/heuristics.txt: -------------------------------------------------------------------------------- 1 | incoming 2 | tagged 3 | edited 4 | updated 5 | last update 6 | last edit 7 | © 8 | (c) 9 | all rights reserved 10 | licensed under 11 | subscribe 12 | | 13 | • 14 | generated by 15 | powered by 16 | this post was 17 | click here for 18 | click here to 19 | published on: 20 | published: 21 | posted: 22 | share this article 23 | estimated read time 24 | -------------------------------------------------------------------------------- /data/preview-query-list.txt: -------------------------------------------------------------------------------- 1 | header p.p-summary 2 | main p.p-summary 3 | main p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p) 4 | article p.p-summary 5 | article p:not(.post-meta):not(.alternate):not(header p):not(footer 
p):not(nav p):not(aside p):not(.sidebar p) 6 | p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p) 7 | header ~ p:not(.post-meta):not(.alternate):not(footer p):not(aside p):not(.sidebar p) 8 | h1 ~ p:not(.post-meta):not(.alternate) 9 | p:not(.post-meta):not(.alternate):not(footer p):not(aside p):not(.sidebar p) 10 | -------------------------------------------------------------------------------- /data/webring.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/wordlist.txt: -------------------------------------------------------------------------------- 1 | understandings|understanding|conversations|disappearing|informations|grandmothers|grandfathers|questionings|conversation|information|approaching|understands|immediately|positioning|grandmother|travellings|questioners|recognizing|recognizers|televisions|rememberers|expressions|discovering|disappeared|interesting|grandfather|straightest|controllers|controlling|considering|remembered|cigarettes|companying|completely|spreadings|considered|continuing|controlled|stationing|controller|straighter|stretching|businesses|somebodies|soldiering|countering|darknesses|situations|directions|disappears|younglings|suggesting|afternoons|breathings|distancing|screenings|schoolings|especially|everything|everywhere|explaining|explainers|expression|branchings|revealings|repeatings|surprising|rememberer|somewheres|television|themselves|recognizer|recognizes|recognized|belongings|finishings|travelling|questioner|beginnings|travelings|questioned|followings|pretending|forgetting|forgetters|forwarding|positioned|travellers|gatherings|perfecting|understand|understood|weightings|approaches|officering|numberings|happenings|mentioning|letterings|husbanding|imaginings|approached|apartments|whispering|interested|discovered|spinnings|clearings|climbings|spendings|clothings|colorings|soundings|truckings|somewhere|troubling|companies|companied|beautiful|computers|confusing|considers|travelers|youngling|continues|continued|traveller|traveling|yellowing|apartment|beginning|wheelings|travelled|sometimes|something|appearing|cornering|believing|countered|believers|countries|soldiered|coverings|creatures|crossings|accepting|daughters|belonging|situation|silvering|different|silencing|touchings|bettering|tomorrows|disappear|thinkings|boardings|discovers|admitting|wrappings|distances|distanced|sightings|shrugging|doctoring|showering|shoulders|shoppings|shootings|dressings|sheetings|shadowing|settlings|servicing|seriously|seconding|searching|weighting|screening|screaming|schooling|teachings|bothering|everybody|botherers|bottoming|excepting|expecting|explained|direction|explainer|surprised|surprises|waterings|branching|revealing|returning|surfacing|familiars|repeating|fathering|reminding|supposing|breasting|attacking|remembers|breathing|remaining|breathers|brightest|brownings|suggested|recognize|fightings|attention|figurings|receiving|reasoning|realizing|fingering|buildings|finishing|stupidest|stuffings|watchings|flashings|strongest|strikings|flighting|flowering|promisers|promising|following|bathrooms|prettiest|pretended|stretched|foreheads|foresting|stretches|forgotten|pressings|forgetter|strangest|preparing|forwarded|strangers|possibles|positions|afternoon|straights|pocketing|gardening|pleasings|wondering|gathering|picturing|personals|perfected|stomaches|stomached|carefully|stationed|catchings|parenting|pai
ntings|orderings|groupings|wintering|officered|offerings|centering|numbering|neighbors|certainly|happening|narrowing|narrowest|mountains|mothering|mirroring|middlings|messaging|standings|mentioned|mattering|marriages|histories|machining|hospitals|listening|lightings|springing|lettering|husbanded|spreaders|whispered|imagining|imaginers|spreading|important|languages|answering|cigarette|interests|spiriting|cleanings|knockings|soundest|coatings|sounders|sounding|colleges|coloring|colorful|wouldn't|training|colorers|sorriest|worrying|belonged|approach|touchers|touching|computer|whatever|toppings|confused|confuses|workings|consider|bettered|teething|tonights|tonguers|tonguing|continue|arriving|tomorrow|controls|together|blacking|blackest|throwers|throwing|coolings|someones|blockers|somebody|thirties|soldiers|cornered|weighted|counting|thoughts|counters|thinking|thinners|thinning|coursing|covering|thinnest|craziest|snapping|creating|creature|thickest|boarding|crossing|smokings|crowding|smelling|smallest|cuttings|slipping|slightly|dancings|sleepers|slamming|wordings|darkness|daughter|boatings|skinning|weddings|thanking|sittings|deciding|deciders|singling|singings|despites|simplest|terrible|silvered|tellings|wearings|youngest|watering|silences|teachers|bookings|agreeing|teaching|discover|attacked|bothered|botherer|watching|swingers|bottling|distance|silenced|signings|bottomed|sighting|shutting|shrugged|wondered|swinging|doctored|sweetest|showered|showings|doorways|shouting|shoulder|wronging|shortest|surprise|dragging|shopping|shooters|drawings|actually|shooting|dreaming|dressing|avoiding|shitting|shirting|shipping|drinking|drinkers|braining|sheeting|sharpest|drivings|sharpers|dropping|droppers|shadowed|surfaced|settling|washings|settings|services|serviced|earliest|backings|earthing|servings|branches|branched|seconded|seatings|surfaces|searched|searches|walkings|screened|waitings|screamed|supposed|emptiest|emptying|breaking|breakers|schooled|enjoying|enjoyers|entering|runnings|breasted|rounders|rounding|supposes|everyone|visitors|visiting|breathed|excepted|roofings|exciting|breathes|expected|rollings|bankings|breather|explains|villages|bridging|viewings|brighter|ringings|righting|suitings|bringing|revealed|bringers|returned|failings|repliers|replying|repeated|brothers|familiar|wintered|families|suggests|farthest|furthest|browning|fathered|removing|building|reminded|bathroom|allowing|suddenly|allowers|feedings|builders|burnings|feelings|remained|refusing|stupider|windings|although|stuffing|studying|business|angriest|fighting|fighters|students|figuring|received|twenties|receives|fillings|reasoned|findings|stronger|turnings|realizes|realized|readiest|fingered|readying|striking|trusters|finishes|trusting|finished|readings|reachers|reaching|quieters|quietest|quieting|fittings|quickest|writings|beaching|trucking|callings|stranger|flashing|beatings|answered|flattest|flatting|flighted|straight|troubled|flowered|pullings|storming|promiser|couldn't|promised|promises|couldn’t|followed|stoppers|problems|probably|prettier|stopping|pretends|stomachs|troubles|pressers|tripping|forehead|stickers|forested|pressing|whispers|carrying|sticking|carriers|stepping|stealers|forwards|stealing|becoming|prepares|prepared|powering|freeings|stations|possible|position|freshest|beddings|wrapping|fronting|catching|fuckings|policing|funniest|pointers|pointing|catchers|pocketed|gardened|ceilings|pleasing|gathered|starting|centered|platings|plastics|planning|pictured|pictures|traveler|pickings|personal|glancing|yourself|chancing|perfe
cts|changing|peopling|partying|partings|parented|grabbing|grabbers|changers|checking|starring|bedrooms|checkers|pairings|standing|painting|outsides|greatest|cheeking|greening|greenest|grouping|ordering|anything|openings|guarding|wheeling|officers|guessing|spreader|offering|children|anywhere|numbered|choicest|noticers|noticing|hallways|nothings|hangings|nobodies|admitted|neighbor|choosing|choosers|happened|neckings|happiest|narrowed|narrower|spotting|churches|mouthing|traveled|mountain|mothered|accepted|mornings|mirrored|headings|spirited|hearings|heatings|circling|middling|messaged|messages|heaviest|wouldn’t|spinners|mentions|helpings|cleanest|memories|meetings|meanings|appeared|mattered|marrieds|marrying|marriage|yellowed|markings|cleaning|managing|cleaners|holdings|machined|machines|lunching|luckiest|lowering|longings|clearest|hospital|lockings|littlest|clearing|listened|housings|lightest|lighting|lighters|spinning|hundreds|hurrying|believes|spenders|believed|husbands|lettered|lettings|leadings|ignoring|laughing|ignorers|imagines|yellower|imagined|climbers|imaginer|spending|closings|specials|speakers|language|believer|clothing|clouding|speaking|interest|spacings|landings|knowings|southest|jacketed|knocking|kitchens|kissings|killings|keepings|dresses|biggest|sticker|careful|shirted|warmers|shipped|birding|drinker|carries|sheeted|warming|carried|carrier|driving|sharper|tonight|drivers|casings|sharers|sharing|stepped|dropped|dropper|whisper|shapers|shaping|shakers|shaking|tonguer|shadows|stealer|several|tongued|staying|settles|settled|dusting|setting|tongues|catting|backing|catches|earlier|warmest|earthed|service|serving|warring|wanters|catcher|serious|eastest|sensing|senders|easiest|sending|sellers|selling|seeming|seeings|tiniest|seconds|station|causing|seating|edgings|stating|timings|efforts|causers|screens|blacker|ceiling|screams|centers|wanting|walling|walkers|certain|emptied|empties|emptier|thrower|endings|started|schools|scarers|scaring|sayings|engines|savings|sanding|enjoyed|starers|saddest|enjoyer|staring|enoughs|rushing|bagging|runners|entered|running|chances|entires|chancer|rubbing|rowings|rounder|chanced|rounded|starred|rooming|changed|changes|blocked|angrier|exactly|changer|blocker|excepts|checked|excited|walking|excites|roofing|through|expects|blooded|checker|cheeked|throats|explain|wakings|springs|thought|waiting|blowing|rolling|rocking|risings|ringing|baggers|animals|righter|righted|ridings|richest|facings|reveals|blowers|choicer|choices|returns|voicing|worries|resting|chooses|failing|spreads|replier|failers|falling|spotted|replies|replied|chooser|thinned|fallers|thinner|balling|boarded|repeats|visitor|farther|further|circles|another|removed|fastest|removes|fathers|thicker|circled|visited|reminds|fearing|spirits|classes|banking|boating|cleaned|feeding|spinner|thanked|village|worried|feeling|cleaner|remains|cleared|refuses|refused|workers|reddest|telling|yellows|spender|working|clearer|clearly|climbed|tearing|fighter|teaming|figured|figures|booking|viewing|climber|usually|closest|receive|filling|teacher|reasons|closing|finally|closers|anybody|finding|anymore|realize|special|finders|booting|realest|clothed|readier|readies|readied|fingers|teaches|tallest|speaker|readers|talkers|clouded|talking|reading|firings|spacing|takings|reacher|reached|coating|reaches|raising|raining|fishing|quietly|fittest|fitting|systems|whether|bothers|wrapped|fitters|quieted|quieter|quickly|coffees|quicker|fixings|coldest|sounded|sounder|actings|anyways|college|flashed|flashes|bottles|flatter|flatted|col
ored|bottled|wording|turning|sorting|flights|colorer|putting|pushers|pushing|flowers|pullers|swinger|wonders|sorrier|pulling|proving|comings|bottoms|promise|truster|boxings|company|follows|younger|sweeter|yelling|problem|without|beached|footing|confuse|beaches|brained|bearing|pretend|trucked|forcing|presser|wishing|trouble|forests|appears|beating|airings|forever|surface|control|forgets|accepts|pressed|wronged|winters|forming|presses|prepare|beaters|breaker|wheeled|because|forward|coolers|cooling|allowed|powered|pourers|freeing|pouring|tripped|coolest|breasts|someone|fresher|suppose|somehow|friends|breaths|copping|fronted|becomes|porches|poppers|popping|poorest|treeing|fucking|fullest|pooling|breathe|polices|funnier|funnies|policed|bedding|corners|futures|pointer|pointed|gamings|counted|soldier|pockets|wetting|pleased|gardens|wetters|wettest|pleases|counter|sunning|players|westest|country|gathers|bridges|playing|plating|bridged|plastic|couples|softest|getting|planned|getters|placing|gifting|pinking|pilings|piecing|picture|coursed|courses|summers|picking|snowing|phoning|bedroom|glances|glanced|winging|snapped|glassed|glasses|perhaps|covered|crazies|crazier|perfect|peopled|persons|peoples|suiting|pausing|passing|goldest|partied|windows|parties|parting|creates|grabbed|smokers|created|grabber|brought|weights|bringer|arrives|crosser|crosses|grasses|parents|palming|graying|pairing|crossed|painted|arrived|greying|smoking|paining|outside|brother|greater|smilers|outings|greened|greener|crowded|travels|smiling|ordered|grounds|offings|smelled|openers|browner|grouped|opening|smaller|growing|okaying|officer|guarded|slowest|slowing|cupping|slipped|guessed|guesses|cutting|offices|gunning|offered|browned|allower|nursing|numbing|suggest|cutters|numbers|sliders|halving|sliding|noticer|wedding|notices|noticed|nothing|writers|hallway|handing|sleeper|normals|noising|hanging|nodding|dancing|wearing|writing|slammed|hangers|darkest|skinned|happens|trained|needing|builder|beliefs|happier|necking|nearest|hardest|nearing|burning|believe|winding|hatting|narrows|stupids|sitting|mouthed|deadest|watered|sisters|mothers|singled|winning|morning|mooning|moments|heading|missing|decides|decided|decider|mirrors|minutes|hearing|minings|already|minding|middled|heating|burners|singles|middles|deepest|stuffed|heaters|singing|simpler|heavier|heavies|belongs|message|despite|mention|simples|studies|studied|silvers|helping|helpers|members|meeting|willing|meanest|attacks|herself|meaning|dinners|student|hidings|matters|marries|married|busying|busiest|silence|against|highest|wildest|hilling|marking|mapping|manages|managed|himself|history|tracked|strikes|manning|hitting|makings|hitters|whiting|towards|watched|holding|toucher|machine|holders|lunches|lunched|watches|luckier|stretch|streets|lowered|loudest|lookers|looking|longing|calling|longest|locking|bending|washing|signing|hottest|littler|benders|strange|sighted|listens|linings|likings|housing|beneath|sighing|sicking|however|lighted|sickest|lighter|calming|lifters|hundred|calmest|hurried|hurries|lifting|touched|doesn't|doesn’t|hurting|touches|showers|husband|doctors|letters|cameras|letting|tossing|leaving|dogging|leaning|leafing|leaders|leading|whitest|layered|ignored|showing|ignores|stories|ignorer|shoving|laughed|lasting|largest|imaging|doorway|besting|imagine|shouted|stormed|downing|storing|topping|avoided|dragged|shorter|betters|stopper|landers|insides|instead|written|drawing|shopped|stopped|between|landing|shooter|knowing|jackets|dreamed|carding|toothed|knocked|knifing|kitchen|joinin
g|teethed|stomach|joiners|kissing|kindest|killers|killing|shoeing|kidding|jumping|kickers|kicking|jumpers|keepers|dressed|keeping|enough|checks|kicked|jumper|kicker|kidded|jumped|killed|joking|killer|kinder|joiner|kisses|kissed|joined|knives|knifes|knifed|jacket|knocks|itself|ladies|landed|lander|inside|larger|images|lasted|imaged|laughs|ignore|aboves|laying|accept|layers|across|yellow|leaded|leader|leaved|leaned|learns|leaves|yelled|lesser|letter|living|lifted|lifter|humans|hugest|lights|wrongs|houses|liking|likers|lining|housed|acting|listen|hotels|little|hotter|locals|locked|horses|longer|longed|looked|hoping|looker|losing|adding|louder|loving|lovers|lowing|lowest|writer|lowers|homing|holing|holder|making|hitter|makers|manned|manage|writes|admits|mapped|marked|hilled|higher|afraid|hiding|hidden|matter|ageing|helper|member|helped|hellos|heater|metals|middle|heated|mights|minded|hearts|mining|minute|headed|mirror|misses|missed|moment|moneys|monies|months|mooned|mostly|having|mother|worlds|hating|mouths|moving|movers|movies|worker|myself|naming|namers|narrow|hatted|hardly|nearer|neared|nearly|harder|necked|needed|happen|hanger|newest|nicest|nights|worked|nobody|nodded|handed|noises|noised|worded|normal|norths|nosing|agrees|noting|notice|halves|halved|number|guying|numbed|nurses|nursed|agreed|wooden|offing|gunned|offers|office|guards|wonder|okayed|okay'd|okay’d|ok'ing|ok’ing|oldest|womens|opened|opener|groups|womans|within|ground|orders|others|outing|wished|greens|greats|owning|wishes|owners|paging|pained|paints|greyed|greyer|paired|palest|grayed|palmed|papers|grayer|parent|parted|passed|golder|passes|pauses|paused|paying|person|people|wipers|goings|glance|phones|phoned|picked|giving|givens|pieces|pieced|piling|gifted|pinked|pinker|places|placed|getter|gotten|plated|plates|gently|played|gather|player|please|gating|garden|pocket|gamers|points|pointy|gaming|future|wiping|fuller|police|pooled|poorer|fucked|popped|popper|fronts|friend|freers|poured|pourer|freest|powers|formed|forget|forgot|forest|forces|forced|footed|pretty|follow|fliers|flyers|proven|airing|proves|proved|prover|pulled|flying|puller|flower|pushes|pushed|floors|pusher|flight|fixers|fixing|quicks|winter|fitted|quiets|fitter|winged|radios|rained|raises|raised|fishes|rather|fished|firsts|firing|reader|finish|finger|fining|finest|realer|finder|really|finals|reason|filled|figure|fought|fights|fields|fewest|redder|refuse|remain|feeing|remind|feared|father|faster|remove|repeat|family|faller|fallen|failer|failed|rested|fading|return|reveal|riches|richer|riding|ridden|window|riders|rights|facing|allows|ringed|rising|rivers|extras|rocked|rolled|expect|roofed|excite|except|rooves|roomed|events|rounds|rowing|evened|rubbed|almost|entire|runner|enters|keying|rushed|rushes|sadder|safest|sanded|enjoys|saving|engine|savers|winded|saying|enders|scared|scares|scarer|scenes|ending|school|scream|either|eights|screen|egging|effort|edging|seated|second|eaters|seeing|seemed|eating|seller|sender|senses|sensed|easier|easily|earths|serves|served|willed|dusted|settle|during|driers|sevens|sexing|shadow|shakes|shaken|dryers|shaker|always|shaped|driest|shapes|shaper|drying|shares|shared|sharer|sharps|driver|drives|driven|sheets|droves|drinks|shirts|drunks|shoots|shorts|dozens|should|downed|shouts|shoved|shoves|showed|wilder|shower|dogged|doctor|shrugs|didn’t|sicker|sicked|didn't|siding|sighed|doings|sights|signed|dinner|silent|silver|dyings|widest|simple|simply|deeper|single|decide|deaths|sister|deader|sizing|darker|wholes|dances|danced|slides|slider|cutter|sl
ower|slowed|slowly|smalls|cupped|smells|smelly|crying|smiles|smiled|smiler|crowds|smokes|smoked|smoker|covers|snowed|whited|softer|course|softly|couple|counts|corner|whiter|copped|cooled|cooler|coming|whites|sorted|colors|colder|sounds|coffee|coated|spaces|clouds|spaced|spoken|speaks|clothe|closed|closes|closer|spends|climbs|clears|cleans|spirit|cities|circle|church|choose|spread|chosen|choice|chests|sprung|sprang|stages|stairs|cheeks|stands|keeper|change|chance|stared|stares|starer|chairs|starts|center|causer|caused|states|stated|causes|caught|catted|stayed|steals|stolen|casing|sticks|caring|carded|stones|animal|cannot|stored|stores|storms|calmer|calmed|called|street|buyers|bought|strike|struck|buying|anyone|strong|busier|busied|busing|burner|stuffs|burned|stupid|builds|browns|suites|suited|brings|summer|bright|sunned|bridge|breath|breast|breaks|broken|surest|branch|brains|anyway|boxing|wheels|sweets|swings|bottom|bottle|system|bother|tables|taking|takers|talked|talker|boring|taller|booted|taught|booked|teamed|teared|boning|appear|bodies|thanks|boated|thicks|boards|bluest|things|thinks|blower|thirds|thirty|though|threes|throat|bloods|thrown|throws|blocks|blacks|tinier|biters|tiring|todays|biting|toning|tongue|arming|birded|bigger|wetter|toothy|beyond|better|topped|tossed|bested|tosses|beside|bender|toward|bended|tracks|belong|trains|belief|travel|behind|begins|before|bedded|became|become|beater|beaten|trucks|truest|aren’t|aren't|trusts|truths|trying|turned|twenty|around|uncles|weight|wasn’t|wasn't|arrive|unless|upping|wedded|viewed|barely|visits|banked|balled|voices|voiced|waited|bagger|waking|walked|bagged|walker|walled|asking|wanted|wanter|warred|waring|backed|warmed|warmer|babies|washed|washes|avoids|attack|waters|asleep|watery|waving|wavers|seems|party|minds|eaten|sells|sends|known|sense|hours|pasts|paths|easts|pause|mined|layer|payed|serve|earth|early|wills|aired|heard|hears|dusts|kills|goers|hotel|seven|dried|sexed|going|drier|dries|dryer|glass|heads|shake|leads|shook|gives|shape|picks|above|locks|money|drops|share|given|wrong|girls|month|sharp|piece|wilds|sheet|drove|drive|moons|lands|piles|ships|drink|piled|drank|drunk|shirt|pinks|shits|dress|shoes|mores|shoot|longs|shots|drawn|draws|drags|shops|haves|horse|short|gifts|dozen|place|downs|shout|hopes|shove|hoped|plans|wiper|doors|shown|shows|wiped|plate|world|mouth|doers|joins|shrug|shuts|leafs|moved|plays|moves|sicks|don’t|pleas|sided|sides|sighs|don't|gated|sight|looks|gates|wives|mover|signs|doing|dirts|knees|movie|gamer|gamed|dying|since|desks|sings|singe|deeps|point|acted|musts|yells|funny|wider|loses|sixes|whose|names|sizes|sized|skins|keyed|skies|pools|slams|darks|named|slept|namer|leave|dance|slide|hated|young|whole|fucks|who’s|slips|who's|slows|front|porch|loved|hates|small|fresh|cries|cried|smell|white|nears|loves|smile|freer|pours|lover|freed|power|smoke|frees|yeses|crowd|cross|jokes|fours|snaps|crazy|forms|cover|homed|snows|among|necks|happy|least|press|force|homes|count|needs|wipes|years|cools|foots|joked|never|songs|comes|sorry|flier|color|sorts|souls|lower|newer|flyer|colds|sound|flown|south|works|coats|space|nicer|prove|lucky|spoke|night|speak|cloud|hurts|yards|pulls|holed|flies|close|spent|spend|words|holes|hangs|clear|lunch|spins|clean|class|liars|floor|holds|spots|alive|noise|flats|chose|flash|nones|child|fixer|fixed|fixes|chest|cheek|mains|stage|hands|makes|stair|quick|stood|check|fiver|stand|fives|north|wrote|stare|lying|quiet|noses|quite|start|chair|nosed|lived|rains|notes|state|large|cause|raise|catch|noted|mak
er|stays|halls|angry|stole|steal|reach|first|cased|cases|steps|lives|fires|stuck|carry|stick|cares|still|cared|fired|cards|added|stone|halve|stops|can’t|ready|hairy|store|hairs|can't|storm|numbs|story|could|finer|knife|fines|calms|fined|calls|hurry|while|buyer|finds|nurse|found|which|lifts|admit|final|fills|lasts|keeps|where|buses|bused|study|offed|stuff|fight|woods|burnt|burns|field|human|built|wings|offer|brown|allow|guyed|suite|suits|bring|marks|fewer|feels|hills|wines|later|feeds|agree|guess|surer|fears|broke|break|guard|brain|highs|often|marry|ahead|knock|boxes|sweet|boxed|okays|swing|swung|falls|reply|hides|fails|huger|table|takes|taken|laugh|taker|rests|house|talks|bored|women|faded|fades|wheel|facts|wraps|boots|teach|faces|teams|older|tears|bones|maybe|faced|areas|boned|opens|tells|rides|grows|thank|their|boats|thens|there|these|thick|rider|after|board|right|bluer|thins|blues|blued|grown|thing|again|rings|think|blows|blown|third|would|means|those|risen|three|rises|blood|eying|heres|throw|threw|roses|group|river|black|tying|times|timed|roads|rocks|order|meant|green|tired|tires|extra|meets|today|rolls|biter|bitey|other|toned|tones|light|bites|worry|birds|roofs|armed|outer|rooms|outed|every|tooth|teeth|round|image|bests|event|liked|evens|rowed|likes|touch|bends|windy|bents|towns|winds|great|below|overs|owned|liker|train|enter|wound|begun|helps|began|begin|owner|beers|kinds|wests|paged|trees|treed|tripe|trips|pages|alone|hello|beats|enjoy|bears|truck|beach|safer|trues|truer|trued|safes|hells|sames|truth|pains|wells|sands|tried|tries|greys|turns|isn’t|isn't|heavy|twice|saves|uncle|saved|under|kicks|saver|paint|lines|grays|until|weeks|upped|pairs|using|asked|usual|scare|being|ender|metal|views|paled|banks|visit|pales|paler|voice|scene|heats|waits|balls|ended|empty|woken|palms|wakes|waked|lined|knows|pants|worse|paper|walls|worst|wants|eight|heart|along|backs|egged|jumps|warms|grass|might|edges|grabs|seats|avoid|parts|edged|aunts|watch|about|eater|won’t|water|won't|waved|waves|goods|waver|golds|wears|ears|grab|fits|each|sets|knee|lots|part|dust|noes|fish|stay|good|rain|cats|work|wild|laid|hang|gold|pass|step|loud|case|help|your|past|nods|home|care|path|hell|love|fire|gods|lift|card|stop|pays|keys|cars|paid|fine|none|real|into|drop|heat|wish|cans|kids|find|goer|goes|went|calm|just|lead|gone|call|fill|nose|ship|huge|acts|lows|buys|some|note|kind|shit|shat|mind|ices|busy|pick|hand|shod|shoe|gave|reds|shot|hall|fews|ours|feel|burn|drew|such|draw|shop|give|felt|wing|suit|drag|hear|feed|mine|girl|feds|iced|down|when|fees|half|suns|able|word|fear|nows|door|fast|sure|leaf|pile|jobs|show|wine|boys|dogs|yell|hair|guys|kept|doer|fall|fell|head|shut|gift|hole|rest|numb|kick|lean|take|both|sick|fail|fade|took|miss|side|sigh|held|talk|last|plan|bore|hold|done|tall|teas|fact|boot|like|wife|rich|sign|wood|team|does|main|offs|tear|tore|torn|rode|dirt|gets|bone|joke|ride|make|told|play|died|tell|dies|tens|area|body|than|boat|line|guns|desk|that|what|kiss|them|they|gate|sang|then|plea|kill|face|sing|sung|eyes|thin|blue|deep|made|rung|ring|sirs|wide|he’s|rang|moon|blow|eyed|sits|more|whys|dead|blew|days|this|left|grew|he's|size|rise|rose|whom|have|skin|most|late|grow|slam|road|game|tied|ties|arms|dark|rock|okay|ages|mens|roll|mans|tiny|slid|dads|airs|ok'd|tire|wets|ok’d|i’ll|roof|slip|full|cuts|pool|slow|tone|bite|lips|cups|bits|room|olds|poor|bird|adds|ever|knew|hate|fuck|pops|even|tops|wipe|hits|once|west|hour|rows|rubs|toss|best|ones|only|from|runs|bend|bent|onto|open|move|town|free|pour|legs|rush|jump|sna
p|many|hill|less|snow|keep|safe|much|soft|join|beer|i'll|beds|four|tree|same|sand|form|cops|must|year|cool|trip|lets|beat|mark|born|bear|with|come|save|know|true|sons|lock|song|soon|laws|came|outs|name|well|been|says|said|sort|feet|soul|high|yeah|were|hide|foot|turn|cold|wind|yard|twos|coat|over|hats|owns|ends|lady|aged|arts|else|long|flew|hurt|page|week|upon|lays|used|uses|hard|eggs|wins|very|mays|seas|pain|near|view|bars|weds|pull|edge|wrap|lies|bank|spin|ball|grey|seat|spun|lied|neck|push|wait|hope|bags|city|look|wake|spot|saws|woke|wear|pink|liar|eats|need|sees|seen|puts|seem|wall|want|pair|gray|sell|will|flat|back|pale|sold|asks|wars|land|send|mean|warm|baby|sent|also|wash|away|here|easy|hung|sens|hers|aunt|palm|worn|meet|wore|east|live|news|five|wave|next|lost|lose|nice|ways|far|few|war|bad|bag|bar|wed|use|ups|art|was|two|try|are|bed|top|arm|wet|big|too|bit|tie|the|ten|tvs|tea|box|boy|sun|bus|but|buy|any|can|car|cat|and|son|cop|sos|cry|cup|cut|who|dad|sky|day|six|why|sit|sat|sir|die|did|dog|she|dry|set|ear|ate|eat|see|saw|win|won|egg|end|say|sad|ran|run|rub|row|eye|rid|ask|fed|fee|red|way|fit|fix|all|put|fly|for|pop|fun|get|got|god|pay|own|out|our|air|ors|one|old|ohs|gun|key|off|guy|now|not|nor|nod|nos|ago|new|hat|had|has|her|met|hey|may|hid|him|add|his|men|hit|mad|low|lot|hot|lip|how|lit|lie|kid|i'm|let|i’m|leg|i'd|i’d|ice|led|act|lay|law|ins|yes|yet|you|its|job|no|at|by|my|on|ha|do|ok|he|oh|is|tv|me|us|as|hi|go|if|of|am|up|to|we|so|in|or|it|be|an|i|a 2 | -------------------------------------------------------------------------------- /database/database.go: -------------------------------------------------------------------------------- 1 | package database 2 | 3 | /* example query 4 | SELECT p.url 5 | FROM inv_index index 6 | INNER JOIN pages p ON p.id = index.pageid 7 | WHERE i.word = "project"; 8 | 9 | select url from inv_index where word="esoteric" group by url order by sum(score) desc limit 15; 10 | 11 | select url from inv_index where word = "" group by url order by sum(score) desc; 12 | */ 13 | 14 | import ( 15 | "database/sql" 16 | "fmt" 17 | "lieu/types" 18 | "lieu/util" 19 | "log" 20 | "net/url" 21 | "strings" 22 | "regexp" 23 | 24 | _ "github.com/mattn/go-sqlite3" 25 | ) 26 | 27 | var languageCodeSanityRegex = regexp.MustCompile("^[a-zA-Z\\-0-9]+$") 28 | 29 | func InitDB(filepath string) *sql.DB { 30 | db, err := sql.Open("sqlite3", filepath) 31 | if err != nil { 32 | log.Fatalln(err) 33 | } 34 | if db == nil { 35 | log.Fatalln("db is nil") 36 | } 37 | createTables(db) 38 | return db 39 | } 40 | 41 | func createTables(db *sql.DB) { 42 | // create the table if it doesn't exist 43 | queries := []string{` 44 | CREATE TABLE IF NOT EXISTS domains ( 45 | id INTEGER PRIMARY KEY AUTOINCREMENT, 46 | domain TEXT NOT NULL UNIQUE 47 | ); 48 | `, 49 | ` 50 | CREATE TABLE IF NOT EXISTS stats ( 51 | id INTEGER PRIMARY KEY AUTOINCREMENT, 52 | last_crawl TEXT 53 | ); 54 | `, 55 | ` 56 | CREATE TABLE IF NOT EXISTS pages ( 57 | id INTEGER PRIMARY KEY AUTOINCREMENT, 58 | url TEXT NOT NULL UNIQUE, 59 | title TEXT, 60 | about TEXT, 61 | lang TEXT, 62 | domain TEXT NOT NULL, 63 | FOREIGN KEY(domain) REFERENCES domains(domain) 64 | ); 65 | `, 66 | ` 67 | CREATE TABLE IF NOT EXISTS external_pages ( 68 | id INTEGER PRIMARY KEY AUTOINCREMENT, 69 | url TEXT NOT NULL UNIQUE, 70 | domain TEXT NOT NULL, 71 | title TEXT 72 | ); 73 | `, 74 | ` 75 | CREATE TABLE IF NOT EXISTS inv_index ( 76 | word TEXT NOT NULL, 77 | score INTEGER NOT NULL, 78 | url TEXT NOT NULL, 79 | FOREIGN KEY(url) REFERENCES 
pages(url) 80 | )`, 81 | `CREATE VIRTUAL TABLE IF NOT EXISTS external_links USING fts5 (url, tokenize="trigram")`, 82 | } 83 | 84 | for _, query := range queries { 85 | if _, err := db.Exec(query); err != nil { 86 | log.Fatalln(fmt.Errorf("failed to execute %s (%w)", query, err)) 87 | } 88 | } 89 | } 90 | 91 | /* TODO: filters 92 | lang:en|fr|en|<..> 93 | nosite:excluded-domain.com 94 | 95 | "word1 word2 word3" strict query 96 | 97 | query params: 98 | &order=score, &order=count 99 | */ 100 | 101 | var emptyStringArray = []string{} 102 | 103 | func SearchWordsByScore(db *sql.DB, words []string) []types.PageData { 104 | return SearchWords(db, words, true, emptyStringArray, emptyStringArray, emptyStringArray) 105 | } 106 | 107 | func SearchWordsBySite(db *sql.DB, words []string, domain string) []types.PageData { 108 | // search words by site is same as search words by score, but adds a domain condition 109 | return SearchWords(db, words, true, []string{domain}, emptyStringArray, emptyStringArray) 110 | } 111 | 112 | func SearchWordsByCount(db *sql.DB, words []string) []types.PageData { 113 | return SearchWords(db, words, false, emptyStringArray, emptyStringArray, emptyStringArray) 114 | } 115 | 116 | func FulltextSearchWords(db *sql.DB, phrase string) []types.PageData { 117 | query := fmt.Sprintf(`SELECT url from external_links WHERE url MATCH ? GROUP BY url ORDER BY RANDOM() LIMIT 30`) 118 | 119 | stmt, err := db.Prepare(query) 120 | util.Check(err) 121 | defer stmt.Close() 122 | 123 | rows, err := stmt.Query(phrase) 124 | util.Check(err) 125 | defer rows.Close() 126 | 127 | var pageData types.PageData 128 | var pages []types.PageData 129 | for rows.Next() { 130 | if err := rows.Scan(&pageData.URL); err != nil { 131 | log.Fatalln(err) 132 | } 133 | pageData.Title = pageData.URL 134 | pages = append(pages, pageData) 135 | } 136 | return pages 137 | } 138 | 139 | func UpdateCrawlDate(db *sql.DB, date string) { 140 | stmt := `INSERT OR IGNORE INTO stats(last_crawl) VALUES (?)` 141 | _, err := db.Exec(stmt, date) 142 | if err != nil { 143 | util.Check(fmt.Errorf("failed to update crawl date (%w)", err)) 144 | } 145 | } 146 | 147 | func GetLastCrawl(db *sql.DB) string { 148 | rows, err := db.Query("SELECT last_crawl FROM stats WHERE last_crawl IS NOT NULL ORDER BY id DESC LIMIT 1") 149 | util.Check(err) 150 | defer rows.Close() 151 | 152 | var date string 153 | for rows.Next() { 154 | err = rows.Scan(&date) 155 | if err != nil { 156 | util.Check(fmt.Errorf("failed to get last crawl (%w)", err)) 157 | } 158 | } 159 | return date 160 | } 161 | 162 | func GetDomainCount(db *sql.DB) int { 163 | return countQuery(db, "domains") 164 | } 165 | 166 | func GetPageCount(db *sql.DB) int { 167 | return countQuery(db, "pages") 168 | } 169 | 170 | func GetWordCount(db *sql.DB) int { 171 | return countQuery(db, "inv_index") 172 | } 173 | 174 | func GetRandomDomain(db *sql.DB) string { 175 | rows, err := db.Query("SELECT domain FROM domains ORDER BY RANDOM() LIMIT 1;") 176 | util.Check(err) 177 | defer rows.Close() 178 | 179 | var domain string 180 | for rows.Next() { 181 | err = rows.Scan(&domain) 182 | util.Check(err) 183 | } 184 | return domain 185 | } 186 | 187 | func GetRandomExternalLink(db *sql.DB) string { 188 | rows, err := db.Query("SELECT url FROM external_links ORDER BY RANDOM() LIMIT 1;") 189 | util.Check(err) 190 | defer rows.Close() 191 | 192 | var link string 193 | for rows.Next() { 194 | err = rows.Scan(&link) 195 | util.Check(err) 196 | } 197 | return link 198 | } 199 | 200 | func 
GetRandomPage(db *sql.DB) string { 201 | domain := GetRandomDomain(db) 202 | stmt, err := db.Prepare("SELECT url FROM pages WHERE domain = ? ORDER BY RANDOM() LIMIT 1;") 203 | defer stmt.Close() 204 | util.Check(err) 205 | 206 | rows, err := stmt.Query(domain) 207 | defer rows.Close() 208 | 209 | var link string 210 | for rows.Next() { 211 | err = rows.Scan(&link) 212 | util.Check(err) 213 | } 214 | return link 215 | } 216 | 217 | func countQuery(db *sql.DB, table string) int { 218 | rows, err := db.Query(fmt.Sprintf("SELECT COUNT(*) FROM %s;", table)) 219 | util.Check(err) 220 | defer rows.Close() 221 | 222 | var count int 223 | for rows.Next() { 224 | err = rows.Scan(&count) 225 | util.Check(err) 226 | } 227 | return count 228 | } 229 | 230 | func SearchWords(db *sql.DB, words []string, searchByScore bool, domain []string, nodomain []string, language []string) []types.PageData { 231 | var args []interface{} 232 | 233 | wordlist := []string{"1"} 234 | if len(words) > 0 && words[0] != "" { 235 | wordlist = make([]string, 0) 236 | for _, word := range words { 237 | wordlist = append(wordlist, "word = ?") 238 | args = append(args, strings.ToLower(word)) 239 | } 240 | } 241 | 242 | // the domains conditional defaults to just 'true' i.e. no domain condition 243 | domains := []string{"1"} 244 | if len(domain) > 0 && domain[0] != "" { 245 | domains = make([]string, 0) // we've got at least one domain! clear domains default 246 | for _, d := range domain { 247 | domains = append(domains, "domain = ?") 248 | args = append(args, d) 249 | } 250 | } 251 | 252 | nodomains := []string{"1"} 253 | if len(nodomain) > 0 && nodomain[0] != "" { 254 | nodomains = make([]string, 0) 255 | for _, d := range nodomain { 256 | nodomains = append(nodomains, "domain != ?") 257 | args = append(args, d) 258 | } 259 | } 260 | 261 | //This needs some wildcard support … 262 | languages := []string{"1"} 263 | if len(language) > 0 && language[0] != "" { 264 | languages = make([]string, 0) 265 | for _, d := range language { 266 | // Do a little check to avoid the database being DOSed 267 | if languageCodeSanityRegex.MatchString(d) { 268 | languages = append(languages, "lang LIKE ?") 269 | args = append(args, d+"%") 270 | } 271 | } 272 | } 273 | 274 | orderType := "SUM(score)" 275 | if !searchByScore { 276 | orderType = "COUNT(*)" 277 | } 278 | 279 | query := fmt.Sprintf(` 280 | SELECT p.url, p.about, p.title 281 | FROM inv_index inv INNER JOIN pages p ON inv.url = p.url 282 | WHERE (%s) 283 | AND (%s) 284 | AND (%s) 285 | AND (%s) 286 | GROUP BY inv.url 287 | ORDER BY %s 288 | DESC 289 | LIMIT 15 290 | `, strings.Join(wordlist, " OR "), strings.Join(domains, " OR "), strings.Join(nodomains, " AND "), strings.Join(languages, " OR "), orderType) 291 | 292 | stmt, err := db.Prepare(query) 293 | util.Check(err) 294 | defer stmt.Close() 295 | 296 | rows, err := stmt.Query(args...) 
297 | util.Check(err) 298 | defer rows.Close() 299 | 300 | var pageData types.PageData 301 | var pages []types.PageData 302 | for rows.Next() { 303 | if err := rows.Scan(&pageData.URL, &pageData.About, &pageData.Title); err != nil { 304 | log.Fatalln(err) 305 | } 306 | pages = append(pages, pageData) 307 | } 308 | return pages 309 | } 310 | 311 | func InsertManyDomains(db *sql.DB, pages []types.PageData) { 312 | if len(pages) == 0 { 313 | return 314 | } 315 | values := make([]string, 0, len(pages)) 316 | args := make([]interface{}, 0, len(pages)) 317 | 318 | for _, b := range pages { 319 | values = append(values, "(?)") 320 | u, err := url.Parse(b.URL) 321 | util.Check(err) 322 | args = append(args, u.Hostname()) 323 | } 324 | 325 | stmt := fmt.Sprintf(`INSERT OR IGNORE INTO domains(domain) VALUES %s`, strings.Join(values, ",")) 326 | _, err := db.Exec(stmt, args...) 327 | util.Check(err) 328 | } 329 | 330 | func InsertManyPages(db *sql.DB, pages []types.PageData) { 331 | if len(pages) == 0 { 332 | return 333 | } 334 | values := make([]string, 0, len(pages)) 335 | args := make([]interface{}, 0, len(pages)) 336 | 337 | for _, b := range pages { 338 | // url, title, lang, about, domain 339 | values = append(values, "(?, ?, ?, ?, ?)") 340 | u, err := url.Parse(b.URL) 341 | util.Check(err) 342 | args = append(args, b.URL, b.Title, b.Lang, b.About, u.Hostname()) 343 | } 344 | 345 | stmt := fmt.Sprintf(`INSERT OR IGNORE INTO pages(url, title, lang, about, domain) VALUES %s`, strings.Join(values, ",")) 346 | _, err := db.Exec(stmt, args...) 347 | util.Check(err) 348 | } 349 | 350 | func InsertManyWords(db *sql.DB, batch []types.SearchFragment) { 351 | if len(batch) == 0 { 352 | return 353 | } 354 | 355 | values := make([]string, 0, len(batch)) 356 | args := make([]interface{}, 0, len(batch)) 357 | 358 | for _, b := range batch { 359 | pageurl := strings.TrimSuffix(b.URL, "/") 360 | values = append(values, "(?, ?, ?)") 361 | args = append(args, b.Word, pageurl, b.Score) 362 | } 363 | 364 | stmt := fmt.Sprintf(`INSERT OR IGNORE INTO inv_index(word, url, score) VALUES %s`, strings.Join(values, ",")) 365 | _, err := db.Exec(stmt, args...) 366 | util.Check(err) 367 | } 368 | 369 | func InsertManyExternalLinks(db *sql.DB, externalLinks []string) { 370 | if len(externalLinks) == 0 { 371 | return 372 | } 373 | 374 | values := make([]string, 0, len(externalLinks)) 375 | args := make([]interface{}, 0, len(externalLinks)) 376 | 377 | for _, externalLink := range externalLinks { 378 | values = append(values, "(?)") 379 | args = append(args, externalLink) 380 | } 381 | 382 | stmt := fmt.Sprintf(`INSERT OR IGNORE INTO external_links(url) VALUES %s`, strings.Join(values, ",")) 383 | _, err := db.Exec(stmt, args...) 384 | util.Check(err) 385 | } 386 | -------------------------------------------------------------------------------- /docs/files.md: -------------------------------------------------------------------------------- 1 | # Files 2 | _what the purposes are of all those damn files_ 3 | 4 | Lieu is based on a few files, which in turn configure various behaviours in the 5 | **crawler** (visits urls & extracts relevant elements) and the **ingester** 6 | (converts the crawled source data into database fields). The basic reason is to 7 | minimize hardcoded assumptions in the source, furthering Lieu's reuse. 8 | 9 | Below, I will refer to the files by their config defined names. Here's the 10 | config example from the [README](../README.md), again. 
11 | 12 | ```toml 13 | [general] 14 | name = "Merveilles Webring" 15 | # used by the precrawl command and linked to in /about route 16 | url = "https://webring.xxiivv.com" 17 | port = 10001 18 | 19 | [data] 20 | # the source file should contain the crawl command's output 21 | source = "data/crawled.txt" 22 | # location & name of the sqlite database 23 | database = "data/searchengine.db" 24 | # contains words and phrases disqualifying scraped paragraphs from being presented in search results 25 | heuristics = "data/heuristics.txt" 26 | # aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word 27 | wordlist = "data/wordlist.txt" 28 | 29 | [crawler] 30 | # manually curated list of domains, or the output of the precrawl command 31 | webring = "data/webring.txt" 32 | # domains that are banned from being crawled but might originally be part of the webring 33 | bannedDomains = "data/banned-domains.txt" 34 | # file suffixes that are banned from being crawled 35 | bannedSuffixes = "data/banned-suffixes.txt" 36 | # phrases and words which won't be scraped (e.g. if a contained in a link) 37 | boringWords = "data/boring-words.txt" 38 | # domains that won't be output as outgoing links 39 | boringDomains = "data/boring-domains.txt" 40 | # queries to search for finding preview text 41 | previewQueryList = "data/preview-query-list.txt" 42 | ``` 43 | 44 | ## HTML 45 | Before we start, a final note on some other types of files in use. The HTML 46 | templates, used when presenting the search engine in the browser, are all 47 | available in the [`html`](../html) folder. The includes—currently only css 48 | & font files—are available in [`html/assets`](../html/assets). 49 | 50 | ## `[crawler]` 51 | #### `webring` 52 | Defines which domains will be crawled for pages. At current writing, no domains 53 | outside of this file will be crawled. 54 | 55 | You can populate the `webring` file manually or by precrawling an existing 56 | webpage that contains all of the domains you want to crawl: 57 | 58 | lieu precrawl > data/webring.txt 59 | 60 | #### `bannedDomains` 61 | A list of domains that will not be crawled. This means that if they are present 62 | in the `webring` file, they will be skipped over as candidates for crawling. 63 | 64 | The rationale is that some of the domains of a webring may be unsuitable for ingestion 65 | into the database. I typically find this is the case for domains that include 66 | microblogs with 100s or 1000s of one line pages—needlessly gunking up the search 67 | results without providing anything of interest outside the individual creating 68 | the logs. 69 | 70 | #### `bannedSuffixes` 71 | Eliminates html links that end with suffixes present in this file. Typically I want 72 | to avoid crawling links to media formats such as `.mp4`, and other types of 73 | non-html documents, really. 74 | 75 | It's fine to leave this file intact with its defaults. 76 | 77 | #### `boringWords` 78 | This file is a bit more specific. It contains words which, if present in a link, 79 | will prevent the link from being logged. The reason is cause it suggests the 80 | link target is boring—irrelevant for this application of the search engine. 81 | 82 | This can be `javascript:` script links, or other types of content that is less 83 | relevant to the focus of the search engine & webring. 84 | 85 | Link data of this type is as yet unused in Lieu's ingestion. 
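Taken together, the `[crawler]` lists above boil down to a simple filter over candidate links. Below is a minimal, self-contained Go sketch of that filtering logic — an illustration of the behaviour described in this section, not Lieu's actual code path (which lives in `crawler/crawler.go`); `readList` and `keepLink` are hypothetical helpers, and the file paths are the defaults from the config example.

```go
package main

import (
	"fmt"
	"io/ioutil"
	"net/url"
	"strings"
)

// readList is a stand-in for Lieu's util.ReadList: it splits a
// newline-separated list file into its entries.
func readList(path string) []string {
	data, err := ioutil.ReadFile(path)
	if err != nil {
		return nil
	}
	return strings.Split(strings.TrimSpace(string(data)), "\n")
}

// keepLink is a hypothetical helper showing how the three lists are
// consulted for a candidate link:
//   - a banned domain means the link is never crawled or indexed,
//   - a banned suffix means the link is skipped by the crawler,
//   - a boring word means the link is not logged as an outgoing link.
func keepLink(link string, bannedDomains, bannedSuffixes, boringWords []string) bool {
	u, err := url.Parse(link)
	if err != nil {
		return false
	}
	for _, domain := range bannedDomains {
		if u.Hostname() == domain {
			return false
		}
	}
	for _, suffix := range bannedSuffixes {
		if strings.HasSuffix(strings.ToLower(link), suffix) {
			return false
		}
	}
	for _, word := range boringWords {
		if strings.Contains(link, word) {
			return false
		}
	}
	return true
}

func main() {
	banned := readList("data/banned-domains.txt")
	suffixes := readList("data/banned-suffixes.txt")
	boring := readList("data/boring-words.txt")

	// example.org is just a placeholder; the second link is rejected
	// because .mp4 is one of the default banned suffixes.
	fmt.Println(keepLink("https://example.org/notes.html", banned, suffixes, boring))
	fmt.Println(keepLink("https://example.org/talk.mp4", banned, suffixes, boring))
}
```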
86 | 87 | #### `boringDomains` 88 | Like `boringWords` except it contains a list of domains which are banned from 89 | having their links be logged, typically because they are deemed less relevant 90 | for the focus of the search engine. 91 | 92 | Link data of this type is as yet unused in Lieu's ingestion. 93 | 94 | ## `[data]` 95 | #### `source` 96 | Contains the linewise data that was produced by the crawler. The first word 97 | identifies the type of data and the last word identifies the page the data 98 | originated from. 99 | 100 | Example: 101 | ``` 102 | h2 Prelude https://cblgh.org/articles/four-nights-in-tornio.html 103 | ``` 104 | 105 | * An `
h2
    ` tag was scraped, 106 | * its contents were `Prelude`, and 107 | * the originating article was https://cblgh.org/articles/four-nights-in-tornio.html 108 | 109 | #### `database` 110 | The location the sqlite3 database will be created & read from. 111 | 112 | #### `heuristics` 113 | Heuristics contains a list of words or phrases which disqualify scraped 114 | paragraphs from being used as descriptive text Lieu's search results. Typically 115 | excluded are e.g. paragraphs which contain copyright symbols—as that indicates we 116 | have scraped the bottom-most paragraph, i.e. the page was likely a short stub, 117 | with a better content description elsewhere. 118 | 119 | #### `wordlist` 120 | Also known as [stopwords](https://en.wikipedia.org/wiki/Stop_word)—words which 121 | are stopped from entering the search index. The default wordlist consists of the 122 | 1000 or so most common English words, albeit curated slightly to still allow for 123 | interesting concepts and verbs—such as `reading` and `books`, for example. 124 | 125 | #### `previewQueryList` 126 | A list of css selectors—one per line—used to fetch preview paragraphs. The first paragraph 127 | found passing a check against the `heuristics` file makes it into the search index. For 128 | each selector in `previewQueryList`, Lieu tries the first four paragraphs—as found by the 129 | selector—before trying to find a new set of paragraphs using the file's next selector. 130 | 131 | To get good results, one usually wants to tune this list to getting the first "real" paragraph 132 | after common page headers, or finding a summary paragraph. The default has been, at the time of 133 | writing, tuned for use with the [Fediring](https://fediring.net). 134 | 135 | Depending on the structure of the websites you are indexing, this will get you 70-90% of the 136 | way in terms of accurate link descriptions. For the rest of the way, fine-tune `heuristics.txt` 137 | and reach out the creators of the websites you are indexing; they often appreciate the 138 | feedback. 139 | 140 | #### OpenSearch metadata 141 | If you are running your own instance of Lieu, you might want to look into changing the URL 142 | defined in the file `opensearch.xml`, which specifies [OpenSearch 143 | metadata](https://en.wikipedia.org/wiki/OpenSearch). This file allows a Lieu instance to be 144 | added to any browser supporting OpenSearch as one of the search engines that can be used for 145 | browser searches. 146 | 147 | See [html/assets/opensearch.xml](../html/assets/opensearch.xml). 148 | -------------------------------------------------------------------------------- /docs/querying.md: -------------------------------------------------------------------------------- 1 | # Querying Lieu 2 | 3 | ## Search Syntax 4 | 5 | * `cat dog` - search for pages about cats or dogs, most probably both 6 | * `fox site:example.org` - search example.org (if indexed) for term "fox" 7 | * `fox -site:example.org` - search all indexed sites except `example.org` for term "fox" 8 | * `emoji lang:de` - search pages that claim to mainly contain German content for the term "emoji" 9 | 10 | When searching, capitalisation and inflection do not matter, as search terms are: 11 | 12 | * Converted to lowercase using the go standard library 13 | * Passed through [jinzhu's inflection library](https://github.com/jinzhu/inflection) for 14 | converting to a possible singular form (intended to work with English nouns) 15 | 16 | ## Search API 17 | 18 | Lieu currently only renders its results to HTML. 
A query can be passed to the `/` endpoint using a `GET` request. 19 | 20 | It supports two URL parameters: 21 | * `q` - used for the search query 22 | * `site` - accepts one domain name and will have the same effect as the `site:` syntax. 23 | You can use this to make your webring's search engine double as a searchbox on your website. 24 | 25 | ### Examples 26 | To search `example.org` for the term "ssh" using `https://search.webring.example`: 27 | 28 | ``` 29 | https://search.webring.example/?q=ssh&site=example.org 30 | ``` 31 | 32 | Adding a form element, to use Lieu as a search engine, to the HTML at example.org: 33 | 34 | ``` 35 | <form method="GET" action="https://search.webring.example/">
    36 |   <input type="search" name="q" placeholder="Search the webring"> 37 |   <input type="hidden" name="site" value="example.org"> 38 |   <input type="submit" value="Search"> 39 | </form> 40 |
    41 | ``` 42 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module lieu 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.5.1 7 | github.com/gocolly/colly/v2 v2.1.0 8 | github.com/jinzhu/inflection v1.0.0 9 | github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b 10 | github.com/mattn/go-sqlite3 v1.14.6 11 | ) 12 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= 2 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 3 | github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= 4 | github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= 5 | github.com/anaskhan96/soup v1.2.4 h1:or+sKs9QbzJGZVTYFmTs2VBateEywoq00a6K14z331E= 6 | github.com/anaskhan96/soup v1.2.4/go.mod h1:6YnEp9A2yywlYdM4EgDz9NEHclocMepEtku7wg6Cq3s= 7 | github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= 8 | github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= 9 | github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= 10 | github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M= 11 | github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= 12 | github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4= 13 | github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM= 14 | github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= 15 | github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk= 16 | github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= 17 | github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= 18 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= 19 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 20 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 21 | github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= 22 | github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= 23 | github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= 24 | github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= 25 | github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= 26 | github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= 27 | github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs= 28 | github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0= 29 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= 30 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= 31 | 
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 32 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= 33 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 34 | github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 35 | github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 36 | github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= 37 | github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= 38 | github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= 39 | github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= 40 | github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= 41 | github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= 42 | github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0= 43 | github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= 44 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= 45 | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 46 | github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 47 | github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 48 | github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg= 49 | github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= 50 | github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= 51 | github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= 52 | github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= 53 | github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b h1:UmqyLHqfYJjkiuA2hddGeovwAGOCBm5gOTVKuxtvoMo= 54 | github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b/go.mod h1:wLcNqnyr6riTbnFObg4o2/GemTCso9AnsUdLsMsdspw= 55 | github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg= 56 | github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= 57 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 58 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 59 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 60 | github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= 61 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= 62 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= 63 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 64 | github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= 65 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 66 | github.com/stretchr/testify v1.6.1/go.mod 
h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 67 | github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA= 68 | github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= 69 | github.com/yuin/goldmark v1.2.1 h1:ruQGxdhGHe7FWOJPT0mKs5+pD2Xs1Bm/kdGlHO04FmM= 70 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 71 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 72 | golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 73 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 74 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 75 | golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 76 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= 77 | golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= 78 | golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= 79 | golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4= 80 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 81 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 82 | golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 83 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 84 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 85 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 86 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 87 | golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= 88 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 89 | golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 90 | golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 91 | golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= 92 | golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM= 93 | golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= 94 | golang.org/x/net v0.0.0-20201021035429-f5854403a974 h1:IX6qOQeG5uLjB/hjjwjedwfjND0hgjPMMyO1RoIXQNI= 95 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 96 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= 97 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 98 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 99 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 100 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 101 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 102 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 103 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 104 | golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 105 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 106 | golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4 h1:myAQVi0cGEoqQVR5POX+8RR2mrocKqNN1hmeMqhX27k= 107 | golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 108 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 109 | golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= 110 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= 111 | golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k= 112 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 113 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 114 | golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 115 | golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= 116 | golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= 117 | golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= 118 | golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= 119 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 120 | golang.org/x/tools v0.0.0-20210114065538-d78b04bdf963/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= 121 | golang.org/x/tools v0.1.0 h1:po9/4sTYwZU9lPhi1tOrb4hCv3qrhiQ77LZfGa2OjwY= 122 | golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= 123 | golang.org/x/tour v0.0.0-20210317163553-0a3a62c5e5c0 h1:u0bliLHgSO64Pb0xbhtwNIHspZc11X8M1bJqBkYl4Co= 124 | golang.org/x/tour v0.0.0-20210317163553-0a3a62c5e5c0/go.mod h1:bWzMdWN2SiLomDzvESYfljDnNu60fUM2ATO8j09tZ5Y= 125 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 126 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 127 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 128 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= 129 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 130 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= 131 | google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= 132 | google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc= 133 | 
google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= 134 | google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= 135 | google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= 136 | google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= 137 | google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= 138 | google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= 139 | google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= 140 | google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= 141 | google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= 142 | google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= 143 | google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= 144 | google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= 145 | google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 146 | google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 147 | google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 148 | google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA= 149 | google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= 150 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 151 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 152 | honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 153 | honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 154 | -------------------------------------------------------------------------------- /html/about.html: -------------------------------------------------------------------------------- 1 | {{ template "head" . }} 2 | {{ template "nav" . }} 3 |
    4 |
    5 |

    About

    6 |

    7 | Lieu—an alternative search engine. Created in response to the environs of 8 | apathy concerning the use of hypertext search and discovery. In Lieu, the 9 | internet is not what is made searchable, but instead one's own neighbourhood. Put differently, 10 | Lieu is a neighbourhood search engine, a way for personal webrings to increase 11 | serendipitous connexions. 12 |

    13 |

14 | This instance indexes {{ .Data.WebringName }}—{{ .Data.DomainCount }} domains, 15 | {{ .Data.PageCount }} pages, {{ .Data.TermCount }} search terms. {{ if ne .Data.LastCrawl "" }} Index 16 | updated {{ .Data.LastCrawl }}. {{ end }} 17 | Some domains of the webring have been filtered out for a better search experience; 18 | see the filtered list. 19 | Visit a random page. 20 |

    21 |

    Lieu was created by cblgh at the onset of 2021.

    22 |

For Lieu's AGPL-licensed source code, see the repository.

    23 |
    24 |
    25 | {{ template "footer" . }} 26 | -------------------------------------------------------------------------------- /html/assets/NotoSerif-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/NotoSerif-Bold.ttf -------------------------------------------------------------------------------- /html/assets/NotoSerif-Bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/NotoSerif-Bold.woff2 -------------------------------------------------------------------------------- /html/assets/NotoSerif-Italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/NotoSerif-Italic.ttf -------------------------------------------------------------------------------- /html/assets/NotoSerif-Italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/NotoSerif-Italic.woff2 -------------------------------------------------------------------------------- /html/assets/NotoSerif-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/NotoSerif-Regular.ttf -------------------------------------------------------------------------------- /html/assets/NotoSerif.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/NotoSerif.woff2 -------------------------------------------------------------------------------- /html/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/favicon.ico -------------------------------------------------------------------------------- /html/assets/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/favicon.png -------------------------------------------------------------------------------- /html/assets/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /html/assets/inter-ui-web/Inter-UI-Italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/inter-ui-web/Inter-UI-Italic.woff -------------------------------------------------------------------------------- /html/assets/inter-ui-web/Inter-UI-Italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/inter-ui-web/Inter-UI-Italic.woff2 -------------------------------------------------------------------------------- /html/assets/inter-ui-web/Inter-UI-Regular.woff: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/inter-ui-web/Inter-UI-Regular.woff -------------------------------------------------------------------------------- /html/assets/inter-ui-web/Inter-UI-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cblgh/lieu/bcddc10517ee57ca6edd81b2a7409b8255fc308b/html/assets/inter-ui-web/Inter-UI-Regular.woff2 -------------------------------------------------------------------------------- /html/assets/inter-ui-web/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-2018 The Inter UI Project Authors (me@rsms.me) 2 | 3 | This Font Software is licensed under the SIL Open Font License, Version 1.1. 4 | This license is copied below, and is also available with a FAQ at: 5 | http://scripts.sil.org/OFL 6 | 7 | ----------------------------------------------------------- 8 | SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 9 | ----------------------------------------------------------- 10 | 11 | PREAMBLE 12 | The goals of the Open Font License (OFL) are to stimulate worldwide 13 | development of collaborative font projects, to support the font creation 14 | efforts of academic and linguistic communities, and to provide a free and 15 | open framework in which fonts may be shared and improved in partnership 16 | with others. 17 | 18 | The OFL allows the licensed fonts to be used, studied, modified and 19 | redistributed freely as long as they are not sold by themselves. The 20 | fonts, including any derivative works, can be bundled, embedded, 21 | redistributed and/or sold with any software provided that any reserved 22 | names are not used by derivative works. The fonts and derivatives, 23 | however, cannot be released under any other type of license. The 24 | requirement for fonts to remain under this license does not apply 25 | to any document created using the fonts or their derivatives. 26 | 27 | DEFINITIONS 28 | "Font Software" refers to the set of files released by the Copyright 29 | Holder(s) under this license and clearly marked as such. This may 30 | include source files, build scripts and documentation. 31 | 32 | "Reserved Font Name" refers to any names specified as such after the 33 | copyright statement(s). 34 | 35 | "Original Version" refers to the collection of Font Software components as 36 | distributed by the Copyright Holder(s). 37 | 38 | "Modified Version" refers to any derivative made by adding to, deleting, 39 | or substituting -- in part or in whole -- any of the components of the 40 | Original Version, by changing formats or by porting the Font Software to a 41 | new environment. 42 | 43 | "Author" refers to any designer, engineer, programmer, technical 44 | writer or other person who contributed to the Font Software. 45 | 46 | PERMISSION AND CONDITIONS 47 | Permission is hereby granted, free of charge, to any person obtaining 48 | a copy of the Font Software, to use, study, copy, merge, embed, modify, 49 | redistribute, and sell modified and unmodified copies of the Font 50 | Software, subject to the following conditions: 51 | 52 | 1) Neither the Font Software nor any of its individual components, 53 | in Original or Modified Versions, may be sold by itself. 
54 | 55 | 2) Original or Modified Versions of the Font Software may be bundled, 56 | redistributed and/or sold with any software, provided that each copy 57 | contains the above copyright notice and this license. These can be 58 | included either as stand-alone text files, human-readable headers or 59 | in the appropriate machine-readable metadata fields within text or 60 | binary files as long as those fields can be easily viewed by the user. 61 | 62 | 3) No Modified Version of the Font Software may use the Reserved Font 63 | Name(s) unless explicit written permission is granted by the corresponding 64 | Copyright Holder. This restriction only applies to the primary font name as 65 | presented to the users. 66 | 67 | 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font 68 | Software shall not be used to promote, endorse or advertise any 69 | Modified Version, except to acknowledge the contribution(s) of the 70 | Copyright Holder(s) and the Author(s) or with their explicit written 71 | permission. 72 | 73 | 5) The Font Software, modified or unmodified, in part or in whole, 74 | must be distributed entirely under this license, and must not be 75 | distributed under any other license. The requirement for fonts to 76 | remain under this license does not apply to any document created 77 | using the Font Software. 78 | 79 | TERMINATION 80 | This license becomes null and void if any of the above conditions are 81 | not met. 82 | 83 | DISCLAIMER 84 | THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 85 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF 86 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 87 | OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE 88 | COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 89 | INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL 90 | DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 91 | FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM 92 | OTHER DEALINGS IN THE FONT SOFTWARE. 93 | -------------------------------------------------------------------------------- /html/assets/inter-ui-web/inter-ui.css: -------------------------------------------------------------------------------- 1 | @font-face { 2 | font-family: 'Inter UI'; 3 | font-style: normal; 4 | src: url("Inter-UI-Regular.woff2") format("woff2"), 5 | url("Inter-UI-Regular.woff") format("woff"); 6 | font-display: swap; 7 | } 8 | 9 | @font-face { 10 | font-family: 'Inter UI'; 11 | font-style: italic; 12 | src: url("Inter-UI-Italic.woff2") format("woff2"), 13 | url("Inter-UI-Italic.woff") format("woff"); 14 | font-display: swap; 15 | } -------------------------------------------------------------------------------- /html/assets/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /html/assets/old_css/base.css: -------------------------------------------------------------------------------- 1 | li { 2 | list-style-type: circle; 3 | } 4 | 5 | ul { 6 | padding-left: 1rem; 7 | } 8 | 9 | /* Font-size 62.5% allows using rems as pixels. For ex: 1.6rem = 16px. 
Way es */ 10 | html { 11 | font-size: 62.5%; 12 | } 13 | 14 | body { 15 | font-family: "Inter UI", sans-serif; 16 | background: var(--secondary); 17 | color: var(--primary); 18 | font-size: 1.6rem; 19 | } 20 | 21 | h1 { 22 | font-family: "Noto Serif"; 23 | font-weight: 400; 24 | font-size: 3rem; 25 | } 26 | 27 | h1>a, 28 | h1>a:hover { 29 | border-bottom: none; 30 | } 31 | 32 | a { 33 | cursor: pointer; 34 | color: var(--primary); 35 | text-decoration: none; 36 | border-bottom: 0.1rem solid var(--primary); 37 | word-wrap: break-word; 38 | } 39 | 40 | a:hover { 41 | border-bottom-style: dotted; 42 | } 43 | 44 | p { 45 | hyphens: auto; 46 | /* margin-bottom: 1.5rem; */ 47 | } 48 | 49 | .entry { 50 | -webkit-column-break-inside: avoid; 51 | -moz-column-break-inside: avoid; 52 | -moz-page-break-inside: avoid; 53 | page-break-inside: avoid; 54 | break-inside: avoid-column; 55 | } 56 | 57 | .search-container { 58 | display: grid; 59 | height: 2.5rem; 60 | align-items: center; 61 | grid-template-columns: 16rem 3rem; 62 | grid-auto-flow: column; 63 | grid-column-gap: .5rem; 64 | } 65 | 66 | .search-box { 67 | font-size: 1rem; 68 | border-radius: 0.1rem; 69 | padding: .5rem; 70 | padding-left: 0.75rem; 71 | border: 0; 72 | color: var(--secondary); 73 | background: var(--primary); 74 | } 75 | 76 | .search-button { 77 | background-image: url("logo.svg"); 78 | cursor: pointer; 79 | border: 0; 80 | transition: opacity 150ms; 81 | height: 2rem; 82 | width: auto; 83 | background-size: cover; 84 | background-position: center; 85 | } 86 | 87 | .search-button:hover { 88 | opacity: 0.5; 89 | transition: opacity 150ms; 90 | } 91 | 92 | .about-link { 93 | position: absolute; 94 | top: 1rem; 95 | right: 1rem; 96 | font-style: normal; 97 | } 98 | 99 | .lieu-link { 100 | font-size: 3rem; 101 | font-family: "Noto Serif"; 102 | font-weight: 400; 103 | text-decoration: none; 104 | } 105 | 106 | @media only screen and (min-device-width : 320px) and (max-device-width : 720px) { 107 | html { 108 | padding-left: 0.75rem; 109 | padding-right: 0.75rem; 110 | font-size: 30pt; 111 | max-width: 100vw !important; 112 | } 113 | 114 | #results { 115 | display: grid; 116 | } 117 | } 118 | 119 | @media only screen and (min-device-width : 320px) and (max-device-width : 374px) { 120 | html { 121 | font-size: 40pt; 122 | } 123 | } 124 | 125 | /* 126 | @media(prefers-color-scheme: light) { 127 | :root { 128 | --primary: #000; 129 | --secondary: #fefefe; 130 | } 131 | */ -------------------------------------------------------------------------------- /html/assets/old_css/style.css: -------------------------------------------------------------------------------- 1 | @import url("reset.css"); 2 | @import url("base.css"); 3 | 4 | h1 { 5 | font-size: 3rem; 6 | margin-bottom: 0rem; 7 | } 8 | 9 | h2 { 10 | font-family: "Noto Serif"; 11 | font-weight: 400; 12 | font-size: 1.5rem; 13 | margin-top: 0; 14 | margin-bottom: 1rem; 15 | } 16 | 17 | .lieu-container h2 { 18 | font-style: italic; 19 | } 20 | 21 | header { 22 | clear: both; 23 | display: grid; 24 | grid-auto-flow: column; 25 | grid-template-columns: max-content max-content 1fr; 26 | grid-column-gap: 1rem; 27 | align-items: start; 28 | } 29 | 30 | header h2 a, 31 | header h2 a:hover { 32 | border-bottom: none; 33 | } 34 | 35 | header ul { 36 | justify-self: end; 37 | margin-top: 0.5rem; 38 | grid-column-start: 3; 39 | } 40 | 41 | header ul li { 42 | margin-left: 1.5rem; 43 | display: inline-block; 44 | } 45 | 46 | header ul li:first-of-type { 47 | margin-left: 0; 48 | } 49 | 50 | main { 
51 | display: grid; 52 | justify-items: left; 53 | align-items: left; 54 | margin-top: 1rem; 55 | } 56 | 57 | main#results { 58 | display: block; 59 | margin-top: 4rem; 60 | columns: 2; 61 | max-width: 1200px; 62 | } 63 | 64 | main#about { 65 | max-width: 600px; 66 | } 67 | 68 | .lieu { 69 | font-family: "Noto Serif"; 70 | font-weight: 400; 71 | } 72 | 73 | .search-container { 74 | grid-template-columns: 19rem 3rem; 75 | } 76 | 77 | .lieu-container { 78 | display: grid; 79 | justify-items: center; 80 | align-items: center; 81 | margin-top: 5rem; 82 | width: 100%; 83 | } 84 | 85 | .entry { 86 | -webkit-column-break-inside: avoid; 87 | -moz-column-break-inside: avoid; 88 | -moz-page-break-inside: avoid; 89 | page-break-inside: avoid; 90 | break-inside: avoid-column; 91 | margin-bottom: 1rem; 92 | } 93 | 94 | .entry p { 95 | color: var(--primary); 96 | opacity: 0.45; 97 | } 98 | 99 | .link { 100 | font-style: italic; 101 | } 102 | 103 | @media only screen and (min-device-width : 320px) and (max-device-width : 720px) { 104 | main { 105 | columns: 1 !important; 106 | } 107 | } -------------------------------------------------------------------------------- /html/assets/opensearch.xml: -------------------------------------------------------------------------------- 1 | 2 | Lieu 3 | Lieu - the search for the new—endless 4 | UTF-8 5 | https://lieu.cblgh.org/assets/favicon.ico 6 | 7 | 8 | -------------------------------------------------------------------------------- /html/assets/style.css: -------------------------------------------------------------------------------- 1 | /* BASE OR CONFIG */ 2 | 3 | :root { 4 | --primary: #fefefe; 5 | --secondary: #000; 6 | --link: #fefefe; 7 | /* alt colorscheme: 1 */ 8 | /* --primary: red; */ 9 | /* --secondary: #fefefe; */ 10 | /* alt colorscheme: 2 */ 11 | /* --primary: #F35363; */ 12 | /* --secondary: black; */ 13 | } 14 | 15 | /* Font-size 62.5% allows using rems as pixels. For ex: 1.6rem = 16px. 
Way es */ 16 | html { 17 | font-size: 62.5%; 18 | } 19 | 20 | /* defaults for old browsers */ 21 | body { 22 | color: white; 23 | background: black; 24 | } 25 | 26 | input[type="search"] { 27 | color: black; 28 | border: black solid 0.1rem; 29 | } 30 | /* end oldie defaults */ 31 | 32 | body { 33 | font-family: "Inter UI", sans-serif; 34 | background: var(--secondary); 35 | color: var(--primary); 36 | font-size: 1.6rem; 37 | } 38 | 39 | a { 40 | cursor: pointer; 41 | color: var(--link); 42 | text-decoration: none; 43 | border-bottom: 0.1rem solid var(--link); 44 | word-wrap: break-word; 45 | } 46 | 47 | a:hover { 48 | border-bottom-style: dotted; 49 | } 50 | 51 | p { 52 | hyphens: auto; 53 | } 54 | 55 | h1 { 56 | font-family: "Noto Serif"; 57 | font-weight: 400; 58 | font-size: 4.8rem; 59 | line-height: 1; 60 | } 61 | 62 | h2 { 63 | font-family: "Noto Serif"; 64 | font-weight: 400; 65 | font-size: 2.4rem; 66 | line-height: 1; 67 | } 68 | 69 | input[type="search"] { 70 | font-size: 1.6rem; 71 | border-radius: 0.16rem; 72 | padding: 0.8rem 0.8rem 0.8rem 1.2rem; 73 | border: 0; 74 | color: var(--primary); 75 | background: var(--secondary); 76 | border: var(--primary) solid 0.1rem; 77 | width: 100%; 78 | max-width: 30rem; 79 | } 80 | 81 | ul { 82 | padding: 0; 83 | } 84 | 85 | 86 | /* COMPOSITION */ 87 | 88 | header { 89 | display: flex; 90 | justify-content: space-between; 91 | align-items: center; 92 | height: auto; 93 | width: 100%; 94 | } 95 | 96 | main { 97 | height: calc(100vh - 10rem); 98 | padding-left: 3.2rem; 99 | padding-right: 3.2rem; 100 | } 101 | 102 | nav li { 103 | display: inline-block; 104 | } 105 | 106 | 107 | /* BLOCKS */ 108 | 109 | /* Block elements are grouped by "family" */ 110 | 111 | .header-home { 112 | padding: 2.1rem 3.2rem; 113 | } 114 | 115 | .header-home_link, .header-home_link:hover { 116 | font-family: "Noto Serif"; 117 | font-weight: 400; 118 | font-size: 2.4rem; 119 | border-bottom: none; 120 | } 121 | 122 | .header-home_navigation *+* { 123 | margin-left: 1.6rem; 124 | } 125 | 126 | 127 | /* Search block */ 128 | 129 | .search { 130 | display: flex; 131 | flex-direction: column; 132 | align-items: flex-start; 133 | } 134 | 135 | .search__input { 136 | display: flex; 137 | width: 100%; 138 | } 139 | 140 | .search__button { 141 | cursor: pointer; 142 | border: 0; 143 | transition: opacity 150ms; 144 | height: 4rem; 145 | min-width: 4.8rem; 146 | padding: 0; 147 | display: flex; 148 | align-items: center; 149 | background-color: var(--secondary); 150 | margin: 0 5px; 151 | } 152 | 153 | .search__button svg { 154 | width: 100%; 155 | height: auto; 156 | } 157 | 158 | /* Search Results */ 159 | .result-nav-list { 160 | display: grid; 161 | grid-auto-flow: column; 162 | justify-content: start; 163 | grid-column-gap: 0.75rem; 164 | padding-bottom: 0; 165 | font-size: 1.8rem; 166 | } 167 | 168 | .result__current { 169 | /* font-weight: bold; */ 170 | color: var(--link); 171 | text-decoration-line: underline; 172 | } 173 | 174 | /* Entries */ 175 | 176 | .entry { 177 | -webkit-column-break-inside: avoid; 178 | -moz-column-break-inside: avoid; 179 | -moz-page-break-inside: avoid; 180 | page-break-inside: avoid; 181 | break-inside: avoid-column; 182 | } 183 | 184 | .entry>*+* { 185 | margin-top: 0.8rem; 186 | } 187 | 188 | .entry__link { 189 | font-style: italic; 190 | } 191 | 192 | .entry__text { 193 | color: var(--primary); 194 | line-height: 1.2; 195 | } 196 | 197 | /* UTILITY CLASSES */ 198 | 199 | .italic-text { 200 | font-style: italic; 201 | } 202 | 203 | 
.flow>*+* { 204 | margin-top: 1.6rem; 205 | } 206 | 207 | .flow2>*+* { 208 | margin-top: calc(1.6rem * 2); 209 | } 210 | 211 | .flex-grow { 212 | flex-grow: 1; 213 | } 214 | 215 | .grid-items-center { 216 | display: grid; 217 | place-items: center; 218 | } 219 | 220 | .width-63ch { 221 | max-width: 63ch; 222 | } 223 | 224 | .width-126ch { 225 | max-width: 126ch; 226 | } 227 | 228 | .two-columns { 229 | columns: 2; 230 | } 231 | 232 | @media (max-width: 700px) { 233 | .two-columns { 234 | columns: 1; 235 | } 236 | } 237 | 238 | .visually-hidden { 239 | clip: rect(0 0 0 0); 240 | clip-path: inset(50%); 241 | height: 1px; 242 | overflow: hidden; 243 | position: absolute; 244 | white-space: nowrap; 245 | width: 1px; 246 | } 247 | 248 | .translateY-75 { 249 | transform: translateY(-75%); 250 | } 251 | 252 | @import url('inter-ui-web/inter-ui.css'); 253 | 254 | 255 | @font-face { 256 | font-family: 'Noto Serif'; 257 | src: url('NotoSerif.woff2') format('woff2'), url("NotoSerif-Regular.ttf") format("ttf"); 258 | font-weight: 400; 259 | font-style: normal; 260 | font-display: swap; 261 | } 262 | 263 | @font-face { 264 | font-family: 'Noto Serif'; 265 | src: url('NotoSerif-Italic.woff2') format('woff2'), url("NotoSerif-Italic.ttf") format('ttf'); 266 | font-weight: 400; 267 | font-style: italic; 268 | font-display: swap; 269 | } 270 | 271 | @font-face { 272 | font-family: 'Noto Serif'; 273 | src: url('NotoSerif-Bold.woff2') format('woff2'), url("NotoSerif-Bold.ttf") format('ttf'); 274 | font-weight: bold; 275 | font-style: normal; 276 | font-display: swap; 277 | } 278 | -------------------------------------------------------------------------------- /html/footer.html: -------------------------------------------------------------------------------- 1 | {{ define "footer" }} 2 | 3 | 4 | {{ end }} 5 | -------------------------------------------------------------------------------- /html/head.html: -------------------------------------------------------------------------------- 1 | {{ define "head" }} 2 | 3 | 4 | 5 | 6 | 7 | 8 | Lieu — webring search engine 9 | 10 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | {{ end }} 41 | -------------------------------------------------------------------------------- /html/index.html: -------------------------------------------------------------------------------- 1 | {{ template "head" . }} 2 |
    3 | Lieu 4 | 10 |
    11 |
    12 |
    13 |
    14 |

    15 | {{ .SiteName }} 16 |

    17 |

    18 | {{ .Data.Tagline }} 19 |

    20 | 29 |
    30 |
    31 |
    32 | {{ template "footer" . }} 33 | -------------------------------------------------------------------------------- /html/list.html: -------------------------------------------------------------------------------- 1 | {{ template "head" . }} 2 | {{ template "nav" . }} 3 |
    4 |
    5 |
      6 | {{ range .Data.URLs }} 7 |
    • 8 | {{ .Title }} 9 |
    • 10 | {{ end }} 11 |
    12 |
    13 |
    14 | {{ template "footer" . }} 15 | -------------------------------------------------------------------------------- /html/nav.html: -------------------------------------------------------------------------------- 1 | {{ define "nav" }} 2 |
    3 | {{ .SiteName }} 4 | 10 |
    11 | {{ end }} 12 | -------------------------------------------------------------------------------- /html/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /*? 3 | -------------------------------------------------------------------------------- /html/search.html: -------------------------------------------------------------------------------- 1 | {{ template "head" . }} 2 | {{ template "nav" . }} 3 |
    4 |

    {{ .Data.Title }} {{ if ne .Data.Site "" }} for {{ .Data.Site }} {{ end }}

    5 | 6 | 18 | {{ if ne .Data.Site "" }} 19 | 20 | 27 | {{ else }} 28 | 40 | {{ end }} 41 |
    42 |
      43 | {{ range $index, $a := .Data.Pages }} 44 |
    • 45 | {{ .Title }} 46 |

      {{ .About }}

      47 |
    • 48 | {{ end }} 49 |
    50 |
    51 | {{ template "footer" . }} 52 | -------------------------------------------------------------------------------- /html/webring.html: -------------------------------------------------------------------------------- 1 | {{ template "head" . }} 2 | {{ template "nav" . }} 3 |
    4 |
    5 |
      6 | {{ range .Data.Domains }} 7 |
    • 8 | {{ .Title }} 9 |

      {{ .About }}

      10 |
    • 11 | {{ end }} 12 |
    13 |
    14 |
    15 | {{ template "footer" . }} 16 | -------------------------------------------------------------------------------- /ingest/ingest.go: -------------------------------------------------------------------------------- 1 | package ingest 2 | 3 | import ( 4 | "bufio" 5 | "database/sql" 6 | "fmt" 7 | "lieu/database" 8 | "lieu/types" 9 | "lieu/util" 10 | "log" 11 | "net/url" 12 | "os" 13 | "regexp" 14 | "strings" 15 | "time" 16 | 17 | "github.com/jinzhu/inflection" 18 | ) 19 | 20 | func partitionSentence(s string) []string { 21 | punctuation := regexp.MustCompile(`\p{P}`) 22 | whitespace := regexp.MustCompile(`\p{Z}`) 23 | invisible := regexp.MustCompile(`\p{C}`) 24 | symbols := regexp.MustCompile(`\p{S}`) 25 | 26 | s = punctuation.ReplaceAllString(s, " ") 27 | s = whitespace.ReplaceAllString(s, " ") 28 | s = invisible.ReplaceAllString(s, " ") 29 | s = symbols.ReplaceAllString(s, " ") 30 | s = strings.ReplaceAll(s, "|", " ") 31 | s = strings.ReplaceAll(s, "/", " ") 32 | return strings.Fields(s) 33 | } 34 | 35 | func filterCommonWords(words, wordlist []string) []string { 36 | var filtered []string 37 | for _, word := range words { 38 | // ingested word was too common, skip it 39 | if len(word) == 1 || find(wordlist, word) { 40 | continue 41 | } 42 | filtered = append(filtered, inflection.Singular(word)) 43 | } 44 | return filtered 45 | } 46 | 47 | func find(slice []string, sought string) bool { 48 | for _, item := range slice { 49 | if item == sought { 50 | return true 51 | } 52 | } 53 | return false 54 | } 55 | 56 | func performAboutHeuristic(heuristicPath, phrase string) bool { 57 | disallowed := util.ReadList(heuristicPath, "\n") 58 | ok := !util.Contains(disallowed, phrase) 59 | return ok && len(phrase) > 20 60 | } 61 | 62 | func Ingest(config types.Config) { 63 | if _, err := os.Stat(config.Data.Database); err == nil || os.IsExist(err) { 64 | err = os.Remove(config.Data.Database) 65 | util.Check(err) 66 | } 67 | 68 | db := database.InitDB(config.Data.Database) 69 | date := time.Now().Format("2006-01-02") 70 | database.UpdateCrawlDate(db, date) 71 | 72 | wordlist := util.ReadList(config.Data.Wordlist, "|") 73 | 74 | buf, err := os.Open(config.Data.Source) 75 | util.Check(err) 76 | 77 | defer func() { 78 | err = buf.Close() 79 | util.Check(err) 80 | }() 81 | 82 | pages := make(map[string]types.PageData) 83 | var count int 84 | var batchsize = 100 85 | batch := make([]types.SearchFragment, 0, 0) 86 | var externalLinks []string 87 | 88 | scanner := bufio.NewScanner(buf) 89 | for scanner.Scan() { 90 | line := scanner.Text() 91 | firstSpace := strings.Index(line, " ") 92 | lastSpace := strings.LastIndex(line, " ") 93 | 94 | if len(line) == 0 || firstSpace == -1 { 95 | continue 96 | } 97 | 98 | pageurl := strings.TrimSuffix(strings.TrimSpace(line[lastSpace:len(line)]), "/") 99 | if !strings.HasPrefix(pageurl, "http") { 100 | continue 101 | } 102 | 103 | var page types.PageData 104 | if data, exists := pages[pageurl]; exists { 105 | page = data 106 | } else { 107 | page.URL = pageurl 108 | } 109 | 110 | token := line[0:firstSpace] 111 | rawdata := strings.TrimSpace(line[firstSpace:lastSpace]) 112 | payload := strings.ToLower(rawdata) 113 | 114 | var processed []string 115 | score := 1 116 | switch token { 117 | case "title": 118 | if len(page.About) == 0 { 119 | page.About = rawdata 120 | page.AboutSource = token 121 | } 122 | score = 5 123 | page.Title = rawdata 124 | processed = partitionSentence(payload) 125 | case "h1": 126 | if len(page.About) == 0 { 127 | page.About = rawdata 128 | 
page.AboutSource = token 129 | } 130 | fallthrough 131 | case "h2": 132 | fallthrough 133 | case "h3": 134 | score = 15 135 | processed = partitionSentence(payload) 136 | case "desc": 137 | if len(page.About) < 30 && len(rawdata) < 100 && len(rawdata) > len(page.About) { 138 | page.About = rawdata 139 | page.AboutSource = token 140 | } 141 | processed = partitionSentence(payload) 142 | case "og-desc": 143 | page.About = rawdata 144 | page.AboutSource = token 145 | processed = partitionSentence(payload) 146 | case "para": 147 | if page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7 { 148 | if performAboutHeuristic(config.Data.Heuristics, payload) { 149 | page.About = rawdata 150 | page.AboutSource = token 151 | } 152 | } 153 | processed = partitionSentence(payload) 154 | case "lang": 155 | page.Lang = rawdata 156 | case "keywords": 157 | processed = strings.Split(strings.ReplaceAll(payload, ", ", ","), ",") 158 | case "non-webring-link": 159 | externalLinks = append(externalLinks, rawdata) 160 | default: 161 | continue 162 | } 163 | 164 | pages[pageurl] = page 165 | processed = filterCommonWords(processed, wordlist) 166 | count += len(processed) 167 | 168 | for _, word := range processed { 169 | batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: score}) 170 | } 171 | if token == "title" { 172 | // only extract path segments once per url. 173 | // we do it here because every page is virtually guaranteed to have a title attr & 174 | // it only appears once 175 | for _, word := range extractPathSegments(strings.ToLower(pageurl)) { 176 | batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: 2}) 177 | } 178 | } 179 | 180 | if len(pages) > batchsize { 181 | ingestBatch(db, batch, pages, externalLinks) 182 | externalLinks = make([]string, 0, 0) 183 | batch = make([]types.SearchFragment, 0, 0) 184 | // TODO: make sure we don't partially insert any page data 185 | pages = make(map[string]types.PageData) 186 | } 187 | } 188 | ingestBatch(db, batch, pages, externalLinks) 189 | fmt.Printf("ingested %d words\n", count) 190 | 191 | err = scanner.Err() 192 | util.Check(err) 193 | } 194 | 195 | func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]types.PageData, links []string) { 196 | pages := make([]types.PageData, len(pageMap)) 197 | i := 0 198 | for k := range pageMap { 199 | pages[i] = pageMap[k] 200 | i++ 201 | } 202 | // TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from 203 | log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links), ")") 204 | database.InsertManyDomains(db, pages) 205 | database.InsertManyPages(db, pages) 206 | for i := 0; i < len(batch); i += 3000 { 207 | end_i := i + 3000 208 | if end_i > len(batch) { 209 | end_i = len(batch) 210 | } 211 | database.InsertManyWords(db, batch[i:end_i]) 212 | } 213 | database.InsertManyExternalLinks(db, links) 214 | log.Println("finished ingesting batch") 215 | } 216 | 217 | func extractPathSegments(pageurl string) []string { 218 | u, err := url.Parse(pageurl) 219 | util.Check(err) 220 | if len(u.Path) == 0 { 221 | return make([]string, 0, 0) 222 | } 223 | s := u.Path 224 | s = strings.TrimSuffix(s, ".html") 225 | s = strings.TrimSuffix(s, ".htm") 226 | s = strings.ReplaceAll(s, "/", " ") 227 | s = strings.ReplaceAll(s, "-", " ") 228 | s = strings.ReplaceAll(s, "_", " ") 229 | s = strings.ToLower(s) 230 | return strings.Fields(s) 231 | } 232 | 
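The Ingest function above implies the on-disk format of the crawl output it consumes: each line is read as `<token> <payload> <url>`, where the token is the first space-delimited field (title, h1, h2, h3, desc, og-desc, para, lang, keywords, or non-webring-link), the payload is everything up to the last space, and the page URL is the final field. A minimal sketch of lines this parser would accept; the URLs and text below are invented for illustration, not taken from a real crawl:

```
title Lieu webring search engine https://example.org/
h1 a neighbourhood search engine https://example.org/about
para lieu only indexes the pages that belong to the webring https://example.org/about
lang en https://example.org/about
keywords search,webring,hypertext https://example.org/about
non-webring-link https://elsewhere.example/ https://example.org/links
```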
-------------------------------------------------------------------------------- /lieu.toml: -------------------------------------------------------------------------------- 1 | [general] 2 | name = "Sweet Webring" 3 | tagline = "the search for the new—endless" 4 | placeholder = "Search" 5 | # used by the precrawl command and linked to in /about route 6 | url = "https://example.com/" 7 | port = 10001 8 | 9 | [data] 10 | # the source file should contain the crawl command's output 11 | source = "data/crawled.txt" 12 | # location & name of the sqlite database 13 | database = "data/searchengine.db" 14 | # contains words and phrases disqualifying scraped paragraphs from being presented in search results 15 | heuristics = "data/heuristics.txt" 16 | # aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word 17 | wordlist = "data/wordlist.txt" 18 | 19 | [crawler] 20 | # manually curated list of domains, or the output of the precrawl command 21 | webring = "data/webring.txt" 22 | # domains that are banned from being crawled but might originally be part of the webring 23 | bannedDomains = "data/banned-domains.txt" 24 | # file suffixes that are banned from being crawled 25 | bannedSuffixes = "data/banned-suffixes.txt" 26 | # phrases and words which won't be scraped (e.g. if a contained in a link) 27 | boringWords = "data/boring-words.txt" 28 | # domains that won't be output as outgoing links 29 | boringDomains = "data/boring-domains.txt" 30 | # queries to search for finding preview text 31 | previewQueryList = "data/preview-query-list.txt" 32 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for os in linux darwin openbsd 4 | do 5 | executable="lieu" 6 | if [ $os = "windows" ]; then 7 | executable="lieu.exe" 8 | fi 9 | env GOOS="$os" go build -tags fts5 -ldflags "-s -w" 10 | tar czf "lieu-$os.tar.gz" README.md html/ data/ lieu.toml "$executable" 11 | echo "lieu-$os.tar.gz" 12 | rm -f "$executable" 13 | done 14 | 15 | -------------------------------------------------------------------------------- /server/server.go: -------------------------------------------------------------------------------- 1 | package server 2 | 3 | import ( 4 | "database/sql" 5 | "errors" 6 | "fmt" 7 | "net/http" 8 | "net/url" 9 | "os" 10 | "strings" 11 | "syscall" 12 | 13 | "html/template" 14 | "lieu/database" 15 | "lieu/types" 16 | "lieu/util" 17 | ) 18 | 19 | type RequestHandler struct { 20 | config types.Config 21 | db *sql.DB 22 | } 23 | 24 | type TemplateView struct { 25 | SiteName string 26 | Data interface{} 27 | } 28 | 29 | type SearchData struct { 30 | Query string 31 | Title string 32 | Site string 33 | Pages []types.PageData 34 | IsInternal bool 35 | } 36 | 37 | type IndexData struct { 38 | Tagline string 39 | Placeholder string 40 | } 41 | 42 | type ListData struct { 43 | Title string 44 | URLs []types.PageData 45 | } 46 | 47 | type AboutData struct { 48 | DomainCount int 49 | WebringName string 50 | LastCrawl string 51 | PageCount string 52 | TermCount string 53 | FilteredLink string 54 | RingLink string 55 | } 56 | 57 | var templates = template.Must(template.ParseFiles( 58 | "html/head.html", "html/nav.html", "html/footer.html", 59 | "html/about.html", "html/index.html", "html/list.html", "html/search.html", "html/webring.html")) 60 | 61 | const useURLTitles = true 62 | 63 | func (h RequestHandler) searchRoute(res http.ResponseWriter, req 
*http.Request) { 64 | var query string 65 | var domain string 66 | view := &TemplateView{} 67 | 68 | var domains = []string{} 69 | var nodomains = []string{} 70 | var langs = []string{} 71 | var queryFields = []string{} 72 | 73 | if req.Method == http.MethodGet{ 74 | params := req.URL.Query() 75 | if words, exists := params["q"]; exists && words[0] != "" { 76 | query = words[0] 77 | queryFields = strings.Fields(query) 78 | } 79 | 80 | // how to use: https://gist.github.com/cblgh/29991ba0a9e65cccbe14f4afd7c975f1 81 | if parts, exists := params["site"]; exists && parts[0] != "" { 82 | // make sure we only have the domain, and no protocol prefix 83 | domain = strings.TrimPrefix(parts[0], "https://") 84 | domain = strings.TrimPrefix(domain, "http://") 85 | domain = strings.TrimSuffix(domain, "/") 86 | domains = append(domains, domain) 87 | } 88 | 89 | // don't process if there are too many fields 90 | if len(queryFields) <= 100 { 91 | var newQueryFields []string; 92 | for _, word := range queryFields { 93 | // This could be more efficient by splitting arrays, but I'm going with the more readable version for now 94 | if strings.HasPrefix(word, "site:") { 95 | domains = append(domains, strings.TrimPrefix(word, "site:")) 96 | } else if strings.HasPrefix(word, "-site:") { 97 | nodomains = append(nodomains, strings.TrimPrefix(word, "-site:")) 98 | } else if strings.HasPrefix(word, "lang:") { 99 | langs = append(langs, strings.TrimPrefix(word, "lang:")) 100 | } else { 101 | newQueryFields = append(newQueryFields, word) 102 | } 103 | } 104 | queryFields = newQueryFields; 105 | } 106 | 107 | } 108 | 109 | if len(queryFields) == 0 || len(queryFields) > 100 || len(query) >= 8192 { 110 | view.Data = IndexData{Tagline: h.config.General.Tagline, Placeholder: h.config.General.Placeholder} 111 | h.renderView(res, "index", view) 112 | return 113 | } 114 | 115 | var pages = database.SearchWords(h.db, util.Inflect(queryFields), true, domains, nodomains, langs) 116 | 117 | if useURLTitles { 118 | for i, pageData := range pages { 119 | prettyURL, err := url.QueryUnescape(strings.TrimPrefix(strings.TrimPrefix(pageData.URL, "http://"), "https://")) 120 | util.Check(err) 121 | pageData.Title = prettyURL 122 | pages[i] = pageData 123 | } 124 | } 125 | 126 | view.Data = SearchData{ 127 | Title: "Results", 128 | Query: query, 129 | Site: domain, 130 | Pages: pages, 131 | IsInternal: true, 132 | } 133 | h.renderView(res, "search", view) 134 | } 135 | 136 | func (h RequestHandler) externalSearchRoute(res http.ResponseWriter, req *http.Request) { 137 | var query string 138 | view := &TemplateView{} 139 | 140 | if req.Method == http.MethodGet { 141 | params := req.URL.Query() 142 | if words, exists := params["q"]; exists && words[0] != "" { 143 | query = words[0] 144 | } 145 | } 146 | 147 | pages := database.FulltextSearchWords(h.db, query) 148 | 149 | if useURLTitles { 150 | for i, pageData := range pages { 151 | prettyURL, err := url.QueryUnescape(strings.TrimPrefix(strings.TrimPrefix(pageData.URL, "http://"), "https://")) 152 | util.Check(err) 153 | pageData.Title = prettyURL 154 | pages[i] = pageData 155 | } 156 | } 157 | 158 | view.Data = SearchData{ 159 | Title: "External Results", 160 | Query: query, 161 | Pages: pages, 162 | IsInternal: false, 163 | } 164 | h.renderView(res, "search", view) 165 | } 166 | 167 | func (h RequestHandler) aboutRoute(res http.ResponseWriter, req *http.Request) { 168 | view := &TemplateView{} 169 | 170 | pageCount := util.Humanize(database.GetPageCount(h.db)) 171 | wordCount := 
util.Humanize(database.GetWordCount(h.db)) 172 | domainCount := database.GetDomainCount(h.db) 173 | lastCrawl := database.GetLastCrawl(h.db) 174 | 175 | view.Data = AboutData{ 176 | WebringName: h.config.General.Name, 177 | DomainCount: domainCount, 178 | PageCount: pageCount, 179 | TermCount: wordCount, 180 | LastCrawl: lastCrawl, 181 | FilteredLink: "/filtered", 182 | RingLink: h.config.General.URL, 183 | } 184 | h.renderView(res, "about", view) 185 | } 186 | 187 | func (h RequestHandler) filteredRoute(res http.ResponseWriter, req *http.Request) { 188 | view := &TemplateView{} 189 | 190 | var URLs []types.PageData 191 | for _, domain := range util.ReadList(h.config.Crawler.BannedDomains, "\n") { 192 | u, err := url.Parse(domain) 193 | if err != nil { 194 | continue 195 | } 196 | u.Scheme = "https" 197 | p := types.PageData{Title: domain, URL: u.String()} 198 | URLs = append(URLs, p) 199 | } 200 | 201 | view.Data = ListData{ 202 | Title: "Filtered Domains", 203 | URLs: URLs, 204 | } 205 | h.renderView(res, "list", view) 206 | } 207 | 208 | func (h RequestHandler) randomRoute(res http.ResponseWriter, req *http.Request) { 209 | link := database.GetRandomPage(h.db) 210 | http.Redirect(res, req, link, http.StatusSeeOther) 211 | } 212 | 213 | func (h RequestHandler) randomExternalRoute(res http.ResponseWriter, req *http.Request) { 214 | link := database.GetRandomExternalLink(h.db) 215 | http.Redirect(res, req, link, http.StatusSeeOther) 216 | } 217 | 218 | func (h RequestHandler) webringRoute(res http.ResponseWriter, req *http.Request) { 219 | http.Redirect(res, req, h.config.General.URL, http.StatusSeeOther) 220 | } 221 | 222 | func (h RequestHandler) renderView(res http.ResponseWriter, tmpl string, view *TemplateView) { 223 | view.SiteName = h.config.General.Name 224 | var errTemp error 225 | if _, exists := os.LookupEnv("LIEU_DEV"); exists { 226 | var templates = template.Must(template.ParseFiles( 227 | "html/head.html", "html/nav.html", "html/footer.html", 228 | "html/about.html", "html/index.html", "html/list.html", "html/search.html", "html/webring.html")) 229 | errTemp = templates.ExecuteTemplate(res, tmpl+".html", view) 230 | } else { 231 | errTemp = templates.ExecuteTemplate(res, tmpl+".html", view) 232 | } 233 | if errors.Is(errTemp, syscall.EPIPE) { 234 | fmt.Println("had a broken pipe, continuing") 235 | } else { 236 | util.Check(errTemp) 237 | } 238 | } 239 | 240 | func WriteTheme(config types.Config) { 241 | theme := config.Theme 242 | // no theme is set, use the default 243 | if theme.Foreground == "" || theme.Background == "" || theme.Links =="" { 244 | return 245 | } 246 | colors := fmt.Sprintf(`/*This file will be automatically regenerated by lieu on startup if the theme colors are set in the configuration file*/ 247 | :root { 248 | --primary: %s; 249 | --secondary: %s; 250 | --link: %s; 251 | }`, theme.Foreground, theme.Background, theme.Links) 252 | err := os.WriteFile("html/assets/theme.css", []byte(colors), 0644) 253 | util.Check(err) 254 | } 255 | 256 | func Serve(config types.Config) { 257 | WriteTheme(config) 258 | db := database.InitDB(config.Data.Database) 259 | handler := RequestHandler{config: config, db: db} 260 | 261 | http.HandleFunc("/about", handler.aboutRoute) 262 | http.HandleFunc("/", handler.searchRoute) 263 | http.HandleFunc("/outgoing", handler.externalSearchRoute) 264 | http.HandleFunc("/random/outgoing", handler.randomExternalRoute) 265 | http.HandleFunc("/random", handler.randomRoute) 266 | http.HandleFunc("/webring", handler.webringRoute) 267 | 
http.HandleFunc("/filtered", handler.filteredRoute) 268 | 269 | fileserver := http.FileServer(http.Dir("html/")) 270 | http.Handle("/assets/", fileserver) 271 | http.Handle("/robots.txt", fileserver) 272 | 273 | portstr := fmt.Sprintf(":%d", config.General.Port) 274 | fmt.Println("Listening on port: ", portstr) 275 | 276 | http.ListenAndServe(portstr, nil) 277 | } 278 | -------------------------------------------------------------------------------- /types/types.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | type SearchFragment struct { 4 | Word string 5 | URL string 6 | Score int 7 | } 8 | 9 | type PageData struct { 10 | URL string 11 | Title string 12 | About string 13 | Lang string 14 | AboutSource string 15 | } 16 | 17 | type Config struct { 18 | General struct { 19 | Name string `json:name` 20 | Tagline string `json:tagline` 21 | Placeholder string `json:placeholder` 22 | URL string `json:url` 23 | WebringSelector string `json:"webringSelector"` 24 | Port int `json:port` 25 | Proxy string `json:proxy` 26 | } `json:general` 27 | Theme struct { 28 | Foreground string `json:"foreground"` 29 | Background string `json:"background"` 30 | Links string `json:"links"` 31 | } `json:"theme"` 32 | Data struct { 33 | Source string `json:source` 34 | Database string `json:database` 35 | Heuristics string `json:heuristics` 36 | Wordlist string `json:wordlist` 37 | } `json:data` 38 | Crawler struct { 39 | Webring string `json:webring` 40 | BannedDomains string `json:bannedDomains` 41 | BannedSuffixes string `json:bannedSuffixes` 42 | BoringWords string `json:boringWords` 43 | BoringDomains string `json:boringDomains` 44 | PreviewQueries string `json:"previewQueryList"` 45 | } `json:crawler` 46 | } 47 | -------------------------------------------------------------------------------- /util/util.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "github.com/PuerkitoBio/goquery" 8 | "io/ioutil" 9 | "log" 10 | "net" 11 | "os" 12 | "regexp" 13 | "strings" 14 | 15 | "lieu/types" 16 | 17 | "github.com/jinzhu/inflection" 18 | "github.com/komkom/toml" 19 | ) 20 | 21 | func Inflect(words []string) []string { 22 | var inflected []string 23 | for _, word := range words { 24 | inflected = append(inflected, inflection.Singular(word)) 25 | } 26 | return inflected 27 | } 28 | 29 | func Check(err error) { 30 | if err != nil { 31 | log.Fatalln(err) 32 | } 33 | } 34 | 35 | // document.querySelectorAll-type functionality. limited functionality as of now (no classes or id support atm, i think!!) 
36 | func QuerySelector(query string, current *goquery.Selection, results *[]string) {
37 | 	var op, operand string
38 | 
39 | 	attrPattern := regexp.MustCompile(`(\w+)\[(\w+)\](.+)?`)
40 | 	attrValuePattern := regexp.MustCompile(`\[(\w+)\]`)
41 | 
42 | 	if len(query) == 0 {
43 | 		return
44 | 	}
45 | 
46 | 	fields := strings.Fields(query)
47 | 	part := fields[0]
48 | 	query = strings.Join(fields[1:], " ")
49 | 	if part == ">" {
50 | 		op = "subchild"
51 | 	} else if attrPattern.MatchString(part) {
52 | 		op = "element"
53 | 		matches := attrPattern.FindStringSubmatch(part)
54 | 		operand = matches[1]
55 | 		var optional string
56 | 		if len(matches) == 4 {
57 | 			optional = matches[3]
58 | 		}
59 | 		query = strings.TrimSpace(fmt.Sprintf("[%s]%s %s", matches[2], optional, query))
60 | 	} else if attrValuePattern.MatchString(part) {
61 | 		op = "attr"
62 | 		operand = attrValuePattern.FindStringSubmatch(part)[1]
63 | 	} else if len(query) == 0 {
64 | 		op = "final"
65 | 	} else {
66 | 		op = "element"
67 | 		operand = part
68 | 	}
69 | 
70 | 	switch op {
71 | 	case "element": // e.g. [el]; bla > [el]; but also [el] > bla
72 | 		current = current.Find(operand)
73 | 		if strings.HasSuffix(query, "first-of-type") {
74 | 			break
75 | 		}
76 | 		fallthrough
77 | 	case "subchild": // [preceding] > [future]
78 | 		// recurse querySelector on all [preceding] element types
79 | 		current.Each(func(j int, s *goquery.Selection) {
80 | 			QuerySelector(query, s, results)
81 | 		})
82 | 		return
83 | 	case "attr": // x[attr]
84 | 		// extract the attribute
85 | 		if str, exists := current.Attr(operand); exists {
86 | 			*results = append(*results, str)
87 | 		}
88 | 		return
89 | 	case "final": // no more in query, and we did not end on an attr: get text
90 | 		*results = append(*results, current.Text())
91 | 	}
92 | 	QuerySelector(query, current, results)
93 | }
94 | 
95 | func DatabaseDoesNotExist(filepath string) {
96 | 	fmt.Printf("lieu: database %s does not exist\n", filepath)
97 | 	fmt.Println("lieu: try running `lieu ingest` if you have already crawled source data")
98 | 	Exit()
99 | }
100 | 
101 | func CheckFileExists(path string) bool {
102 | 	_, err := os.Stat(path)
103 | 	if err == nil {
104 | 		return true
105 | 	}
106 | 	return !os.IsNotExist(err)
107 | }
108 | 
109 | func Humanize(n int) string {
110 | 	if n > 1000000 {
111 | 		return fmt.Sprintf("%dm", n/1000000)
112 | 	} else if n > 1000 {
113 | 		return fmt.Sprintf("%dk", n/1000)
114 | 	}
115 | 
116 | 	return fmt.Sprintf("%d", n)
117 | }
118 | 
119 | func Contains(arr []string, query string) bool {
120 | 	for _, item := range arr {
121 | 		if strings.Contains(query, item) {
122 | 			return true
123 | 		}
124 | 	}
125 | 	return false
126 | }
127 | 
128 | func ReadList(filepath, sep string) []string {
129 | 	data, err := ioutil.ReadFile(filepath)
130 | 	if err != nil || len(data) == 0 {
131 | 		return []string{}
132 | 	}
133 | 	return strings.Split(strings.TrimSuffix(string(data), sep), sep)
134 | }
135 | 
136 | func CheckPortOpen(port int) bool {
137 | 	tcpaddr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("localhost:%d", port))
138 | 	if err != nil {
139 | 		return false
140 | 	}
141 | 
142 | 	l, err := net.ListenTCP("tcp", tcpaddr)
143 | 	if err != nil {
144 | 		return false
145 | 	}
146 | 	defer l.Close()
147 | 
148 | 	return true
149 | }
150 | 
151 | func ReadConfig() types.Config {
152 | 	data, err := ioutil.ReadFile("lieu.toml")
153 | 	Check(err)
154 | 
155 | 	var conf types.Config
156 | 	decoder := json.NewDecoder(toml.New(bytes.NewBuffer(data)))
157 | 
158 | 	err = decoder.Decode(&conf)
159 | 	Check(err)
160 | 
161 | 	return conf
162 | }
163 | 
164 | func WriteMockConfig() {
165 | 	conf := []byte(`[general]
166 | name = "Sweet Webring"
167 | # used by the precrawl command and linked to in the /about route
168 | url = "https://example.com/"
169 | webringSelector = "li > a"
170 | port = 10001
171 | 
172 | [theme]
173 | # colors specified in hex (or valid css names) which determine the theme of the lieu instance
174 | foreground = "#ffffff"
175 | background = "#000000"
176 | links = "#ffffff"
177 | 
178 | [data]
179 | # the source file should contain the crawl command's output
180 | source = "data/crawled.txt"
181 | # location & name of the sqlite database
182 | database = "data/searchengine.db"
183 | # contains words and phrases disqualifying scraped paragraphs from being presented in search results
184 | heuristics = "data/heuristics.txt"
185 | # aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
186 | wordlist = "data/wordlist.txt"
187 | 
188 | [crawler]
189 | # manually curated list of domains, or the output of the precrawl command
190 | webring = "data/webring.txt"
191 | # domains that are banned from being crawled but might originally be part of the webring
192 | bannedDomains = "data/banned-domains.txt"
193 | # file suffixes that are banned from being crawled
194 | bannedSuffixes = "data/banned-suffixes.txt"
195 | # phrases and words which won't be scraped (e.g. if contained in a link)
196 | boringWords = "data/boring-words.txt"
197 | # domains that won't be output as outgoing links
198 | boringDomains = "data/boring-domains.txt"
199 | # queries used to find preview text
200 | previewQueryList = "data/preview-query-list.txt"
201 | `)
202 | 	err := ioutil.WriteFile("lieu.toml", conf, 0644)
203 | 	Check(err)
204 | }
205 | 
206 | func Exit() {
207 | 	os.Exit(0)
208 | }
209 | 
210 | func DeduplicateSlice(slice []string) []string {
211 | 	keys := make(map[string]bool)
212 | 	list := []string{}
213 | 	for _, entry := range slice {
214 | 		if _, found := keys[entry]; !found {
215 | 			keys[entry] = true
216 | 			list = append(list, entry)
217 | 		}
218 | 	}
219 | 	return list
220 | }
221 | 
--------------------------------------------------------------------------------
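Usage sketch (not part of the repository files above): a minimal example of how util.QuerySelector might be driven from a goquery document. The HTML literal, the main wrapper, and the file name are illustrative assumptions; the query string reuses the mock config's webringSelector value ("li > a") with an extra "[href]" step so that attribute values, rather than element text, are collected.

// usage_sketch.go -- illustrative only, not a file in this repository.
// Assumes it is built inside the lieu module so that "lieu/util" resolves.
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"

	"lieu/util"
)

func main() {
	// hypothetical webring index page
	page := `<ul>
	<li><a href="https://example.org">example.org</a></li>
	<li><a href="https://example.net">example.net</a></li>
</ul>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(page))
	util.Check(err)

	// "li > a [href]": find all <li> elements, then their <a> descendants,
	// then collect each anchor's href attribute into results
	var results []string
	util.QuerySelector("li > a [href]", doc.Selection, &results)

	for _, link := range util.DeduplicateSlice(results) {
		fmt.Println(link) // prints https://example.org and https://example.net
	}
}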