├── License ├── README.md └── wikipedia2text /License: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006,2007,2008 C.Brabandt 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY The AUTHOR ``AS IS'' AND ANY EXPRESS 14 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 | SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # wikipedia2text 2 | > A Shell script to query the Wikipedia. 3 | 4 | This script fetches Wikipedia articles (currently supports around 30 5 | Wikipedia languages) and displays them as plain text in a pager or 6 | just sends the text to standard out. 
Alternatively it opens the 7 | Wikipedia article in a (possibly GUI) web browser or just shows the 8 | URL of the appropriate Wikipedia article. 9 | 10 | ## Installation 11 | Copy wikipedia2text into a directory on your `$PATH`. 12 | 13 | Alternatively, on Debian or derivatives, you can install it using your 14 | package manager: `sudo apt-get install wikipedia2text` 15 | 16 | ## License & Copyright 17 | © Christian Brabandt, License: BSD 18 | -------------------------------------------------------------------------------- /wikipedia2text: -------------------------------------------------------------------------------- 1 | #!/bin/bash -- 2 | # 3 | # Shell script to query the Wikipedia. 4 | # 5 | # It can be used to output Wikipedia articles to the console, but can also 6 | # just open the article in any browser. 7 | # 8 | # Author: Christian Brabandt 9 | # License: BSD 10 | VERSION=0.14 11 | 12 | set -e 13 | 14 | function display_help(){ #{{{1 15 | cat << EOF 16 | NAME 17 | 18 | This script uses text-browser to query and render Wikipedia 19 | articles. The output will be printed to standard out. 
20 | 21 | SYNOPSIS 22 | `basename $0` [-BCnNoOpPsSuU] [-b prog] [-c patt] [-i patt] [-l lang] 23 | [-X browseroptions] query 24 | `basename $0` -o [-b prog] [-l lang] query 25 | `basename $0` [-h] 26 | `basename $0` -v|-r 27 | 28 | -n do not colorize -N simple colorization (alias -C) 29 | -p display using a pager -P don't use pager 30 | -o open Wikipedia article -O don't open in browser 31 | -s display only a summary -S display whole article 32 | -u output the query URL -U open URL in browser 33 | -v display version -h display help 34 | -t show available sections 35 | 36 | -r open Random Page 37 | -d debug mode 38 | -i patt colorize pattern (case insensitive) 39 | -I patt colorize pattern (case-sensitive, alias -c) 40 | -b prog use prog as browser (by default to invoke 41 | elinks, links2, links, lynx or w3m, if found) 42 | -l lang use language (currently supported are: af, als, ca, cs, da, 43 | de, en, eo, es, fi, fr, hu, ia, is, it, la, lb, nds, nl, nn, 44 | no, pl, pt, rm, ro, simple, sk, sl, sv, tr) 45 | -T custom print custom section (anything in html h2 tag) 46 | -W url use url as base-URL for wikipedia (e.g. use a different 47 | Wiki, Querying this URL will happen by appending the search 48 | term. 49 | -X "options" pass through options to browser, e.g., "-width 180" 50 | (warnings: must be in quotes; browser specific, not checked) 51 | 52 | Query can be any term to search for at Wikipedia. Special 53 | characters will be taken care of. Note that only one query term is 54 | supported, however this term can consist of one or more words. 55 | 56 | Configuration can also be controlled by creating a runcontrol file 57 | .`basename $0`rc your home directory. 58 | 59 | Note that when requesting to open the article in a browser, other 60 | parameters will be ignored. The same holds for the options -h and 61 | -v. 
62 | EOF 63 | } 64 | function getVersion(){ #{{{1 65 | cat <&2 71 | exit 3 72 | } 73 | function colorize(){ #{{{1 74 | if [ "${IGNCASE}" = "true" ]; then 75 | OUTPUT=$(echo -e "`cat`"|sed -s "s|\(${PATT}\)|\\\033\[0;31m\1\\\033\[0m|gi") 76 | else 77 | OUTPUT=$(echo -e "`cat`"|sed -s "s|\(${PATT}\)|\\\033\[0;31m\1\\\033\[0m|g") 78 | fi 79 | echo -e "${OUTPUT}" 80 | } 81 | function uri_decode(){ #{{{1 82 | echo -e "$*" |perl -MURI::Escape -lne 's/ /_/g;s/"//g;print uri_escape($_);' 83 | } 84 | function localize(){ #{{{1 85 | # Per default we use the english localized version of 86 | # Wikipedia 87 | LOCAL=$(echo ${LOCAL:="en"}) 88 | if [ "${LOCAL}" = "de" ]; then 89 | MARKER='^\s*Kategorien\?:' 90 | MARKER2='Bearbeiten' 91 | RANDOMP='Spezial:Zufällige Seite' 92 | elif [ "${LOCAL}" = "en" -o "${LOCAL}" = "simple" ]; then 93 | MARKER='^\s*Categories:\|^\s*Category:' 94 | MARKER2='edit' 95 | RANDOMP='Special:Random' 96 | elif [ "${LOCAL}" = "fr" ]; then 97 | MARKER='^\s*Catégories :' 98 | MARKER2='modifier' 99 | RANDOMP='Special:Random' 100 | elif [ "${LOCAL}" = "nl" ]; then 101 | MARKER='^\s*Categorie:' 102 | MARKER2='bewerk' 103 | RANDOMP='Speciaal:Willekeurig' 104 | elif [ "${LOCAL}" = "sv" ]; then 105 | MARKER='^\s*Kategorier:' 106 | MARKER2='redigera' 107 | RANDOMP='Special:Random' 108 | elif [ "${LOCAL}" = "es" ]; then 109 | MARKER='^\s*Categorías:' 110 | MARKER2='editar' 111 | RANDOMP='Especial:Random' 112 | elif [ "${LOCAL}" = "pt" ]; then 113 | MARKER='^\s*Categorias:' 114 | MARKER2='editar' 115 | RANDOMP='Especial:Random' 116 | elif [ "${LOCAL}" = "pl" ]; then 117 | MARKER='^\s*Kategorie:' 118 | MARKER2='Edytuj' 119 | RANDOMP='Specjalna:Losowa_strona' 120 | elif [ "${LOCAL}" = "it" ]; then 121 | MARKER='^\s*Categorie:' 122 | MARKER2='modifica' 123 | RANDOMP='Speciale:PaginaCasuale' 124 | elif [ "${LOCAL}" = "da" ]; then 125 | MARKER='^\s*Kategori:' 126 | MARKER2='\(redigér\|rediger\)' 127 | RANDOMP='Speciel:Tilfældig_side' 128 | elif [ "${LOCAL}" = "eo" ]; 
then 129 | MARKER='^\s*Kategorio:' 130 | MARKER2='redaktu' 131 | RANDOMP='Speciala:Random' 132 | elif [ "${LOCAL}" = "no" ]; then 133 | MARKER='^\s*Kategorier:' 134 | MARKER2='rediger' 135 | RANDOMP='Spesial:Tilfeldig_side' 136 | elif [ "${LOCAL}" = "nn" ]; then 137 | MARKER='^\s*Kategoriar:' 138 | MARKER2='endre' 139 | RANDOMP='Special:Random' 140 | elif [ "${LOCAL}" = "fi" ]; then 141 | MARKER='^\s*Luokat:' 142 | MARKER2='muokkaa' 143 | RANDOMP='Toiminnot:Satunnainen_sivu' 144 | elif [ "${LOCAL}" = "ca" ]; then 145 | MARKER='^\s*Categoria:' 146 | MARKER2='edita' 147 | RANDOMP='Especial:Random' 148 | elif [ "${LOCAL}" = "ro" ]; then 149 | MARKER='^\s*Categorii:' 150 | MARKER2='modifica' 151 | RANDOMP='Special:Random' 152 | elif [ "${LOCAL}" = "cs" ]; then 153 | MARKER='^\s*Kategorie:' 154 | MARKER2='editovat' 155 | RANDOMP='Speciální:Random' 156 | elif [ "${LOCAL}" = "sk" ]; then 157 | MARKER='^\s*Kategórie:' 158 | MARKER2='úprava' 159 | RANDOMP='Špeciálne:Random' 160 | elif [ "${LOCAL}" = "sl" ]; then 161 | MARKER='^\s*Kategorije:' 162 | MARKER2='spremeni' 163 | RANDOMP='Posebno:Random' 164 | elif [ "${LOCAL}" = "lb" ]; then 165 | MARKER='^\s*Kategorie:' 166 | MARKER2='Änneren' 167 | RANDOMP='Special:Random' 168 | elif [ "${LOCAL}" = "la" ]; then 169 | MARKER='^\s*Categoriae:' 170 | MARKER2='recensere' 171 | RANDOMP='Specialis:Random' 172 | elif [ "${LOCAL}" = "rm" ]; then 173 | MARKER='Views' 174 | MARKER2='edit' 175 | RANDOMP='Special:Random' 176 | elif [ "${LOCAL}" = "ia" ]; then 177 | MARKER='Views' 178 | MARKER2='modificar' 179 | RANDOMP='Special:Random' 180 | elif [ "${LOCAL}" = "is" ]; then 181 | MARKER='^\s*Flokkar:' 182 | MARKER2='breyta' 183 | RANDOMP='Kerfissíða:Random' 184 | elif [ "${LOCAL}" = "hu" ]; then 185 | MARKER='^\s*Kategóriák:' 186 | MARKER2='szerkesztés' 187 | RANDOMP='Speciális:Lap_találomra' 188 | elif [ "${LOCAL}" = "tr" ]; then 189 | MARKER='^\s*Sayfa kategorisi:' 190 | MARKER2='degistir' 191 | RANDOMP='Özel:Random' 192 | elif [ 
"${LOCAL}" = "af" ]; then 193 | MARKER='^\s*Kategorieë: ' 194 | MARKER2='wysig' 195 | RANDOMP='Spesiaal:Random' 196 | elif [ "${LOCAL}" = "nds" ]; then 197 | MARKER='^\s*Kategorien:' 198 | MARKER2='Ännern' 199 | RANDOMP='Spezial:Random' 200 | elif [ "${LOCAL}" = "als" ]; then 201 | MARKER='^\s*Kategorie:' 202 | MARKER2='ändere' 203 | RANDOMP='Spezial:Zufällige_Seite' 204 | else 205 | MARKER='\(Views\|References\|Visible links\)' 206 | RANDOMP='Special:Random' 207 | fi 208 | } 209 | function stripOutput(){ #{{{1 210 | # Now comes the magic: Strip everything from Marker to end, 211 | # cause this is only the linkdump 212 | SED='sed -e "s|\^\?\\[[0-9]*\\]||g" -e "s|\\[IMG\\]||g" -e "/${MARKER}/,$ D" ' 213 | if [ -n "${MARKER2}" ]; then 214 | echo "`cat`"| eval ${SED} -e '"s#\[${MARKER2}\]##g"' 215 | else 216 | echo "`cat`"| eval ${SED} 217 | fi 218 | } 219 | function openurl(){ #{{{1 220 | "${BROWSER}" "${URL}" 221 | } 222 | function summary() { #{{{1 223 | TMPFILE="/tmp/wiki-sum_$$.html" 224 | if [ "${COLOR}" = "true" ]; then 225 | summaryCommand="curl -s -L ${URL} | grep \/table -A400 | grep -v \/table | grep \
$TMPFILE && w3m -dump $TMPFILE | stripOutput | colorize && rm $TMPFILE" 227 | else 228 | summaryCommand="curl -s -L ${URL} | grep \/table -A400 | grep -v \/table | grep \
$TMPFILE && w3m -dump $TMPFILE | stripOutput && rm $TMPFILE" 230 | fi 231 | eval ${summaryCommand} 232 | } 233 | function print_sections() { #{{{1 234 | TMPFILE="/tmp/wiki-sections_$$.html" 235 | Command="curl -s -L ${URL} | grep '\( $TMPFILE && w3m -dump $TMPFILE | stripOutput && rm $TMPFILE" 237 | eval ${Command} 238 | } 239 | function print_section_detail() { #{{{1 240 | TMPFILE="/tmp/wiki-section_$$.html" 241 | Command="curl -s -L ${URL} | 242 | sed -n -e '/\(<\/\?html\)\|\(<\/\?body\)\|\(/p\" | 243 | sed -e 's/^.*

$TMPFILE && w3m -dump $TMPFILE | stripOutput" 244 | if [ "${COLOR}" = "true" ]; then 245 | eval "${Command} | colorize" 246 | else 247 | eval "${Command}" 248 | fi 249 | rm $TMPFILE 250 | } 251 | function getInfo(){ #{{{1 252 | getInfoCommand="${BROWSER} ${BROWSEROPTIONS} -dump ${URL} | stripOutput" 253 | if [ "${COLOR}" = "true" ]; then 254 | getInfoCommand="${getInfoCommand} | colorize" 255 | fi 256 | eval ${getInfoCommand} 257 | } 258 | 259 | # First read in the Run configuration File, if one is found #{{{1 260 | if [ -r ~/.`basename $0`rc ]; then 261 | source ~/.`basename $0`rc 262 | ABROWSER=${BROWSER} 263 | fi 264 | 265 | # Process commandline parameters {{{1 266 | while getopts "BCdnNoOpPsStuvhrUl:b:c:i:I:B:T:W:X:-help" ARGS 267 | do 268 | case ${ARGS} in 269 | b) ABROWSER=${OPTARG} ;; 270 | d) DEBUG="true" ;; 271 | B) ABROWSER='' ;; 272 | c) IGNCASE="false";COLOR="true"; PATT=${OPTARG} ;; 273 | C) COLOR="true" ;; 274 | i) IGNCASE="true";COLOR="true"; PATT=${OPTARG} ;; 275 | I) IGNCASE="false";COLOR="true"; PATT=${OPTARG} ;; 276 | l) LOCAL=${OPTARG} ;; 277 | n) COLOR="false" ;; 278 | N) COLOR="true" ;; 279 | o) USEBROWSER="true" ;; 280 | O) USEBROWSER="false" ;; 281 | p) PAGER="true" ;; 282 | P) PAGER="false" ;; 283 | r) RAND="true" ;; 284 | s) SHORT="true" ;; 285 | S) SHORT="false" ;; 286 | t) SECTION="show" ;; 287 | T) SECTION=$OPTARG ;; 288 | u) OUTPUTURL="true" ;; 289 | U) OPENURL="true";; 290 | v) getVersion; exit 0 ;; 291 | W) WURL=${OPTARG} ;; 292 | X) BROWSEROPTIONS=${OPTARG} ;; 293 | h) display_help; exit 0 ;; 294 | -help) display_help; exit 0 ;; 295 | *) display_help; exit 1 ;; 296 | esac 297 | done 298 | 299 | shift `expr ${OPTIND} - 1` 300 | 301 | # Init some variables {{{1 302 | localize 303 | 304 | # Setting Up some Variables, to determine, what actually to do 305 | if [ -z "$1" -a -z "${RAND}" ]; then 306 | display_help 307 | exit 1; 308 | fi 309 | 310 | IGNCASE=$(echo ${IGNCASE:="false"}) 311 | PAGER=$(echo ${PAGER:="false"}) 312 | 
# ---------------------------------------------------------------------------
# Main part of the script: fill in the remaining defaults, resolve pager and
# browser, build the article URL and dispatch on the requested action.
#
# Reads:  PAGER COLOR PATT ABROWSER BROWSER BROWSEROPTIONS LOCAL RANDOMP WURL
#         DEBUG IGNCASE and the action switches set by the option parser.
# Writes: OPENURL RAND PAGER COLOR PATT URL BROWSER USEBROWSER SHORT SECTION
#         OUTPUTURL ARGUMENT LOCAL WURL DEBUG
# ---------------------------------------------------------------------------

# ':=' assigns only when the variable is unset or empty, so values that were
# already set earlier in the script are kept.
: "${OPENURL:=false}"
: "${RAND:=false}"

# PAGER=true only means "use some pager": resolve it to a real program.
# Any other value that is not "false" is used as the pager command as-is.
if [[ "${PAGER}" == "true" ]]; then
  { PAGER=$(command -v less) || PAGER=$(command -v more); } \
    || errorExit "No Pager found!"
fi

# Run less with -Rr (presumably so the colour escapes written by colorize are
# displayed rather than shown as control characters).
PAGER=${PAGER/less/less -Rr}
: "${COLOR:=false}"

# Colorizing without an explicit pattern highlights the query itself.
if [[ "${COLOR}" == "true" && -z "${PATT}" ]]; then
  PATT="$*"
fi

# -U: the positional arguments are taken as the complete URL.
if [[ "${OPENURL}" == "true" ]]; then
  URL="$*"
fi

# Check for Alternative Browser.
# Precedence: ABROWSER, then a preset BROWSER, then the first text browser
# found on PATH.
if [[ -n "${ABROWSER}" ]]; then
  BROWSER=$(command -v "${ABROWSER}") || errorExit "${ABROWSER} not found"
elif [[ -n "${BROWSER}" ]]; then
  BROWSER=$(command -v "${BROWSER}") || errorExit "${BROWSER} not found"
else
  for candidate in w3m elinks links2 lynx links.main links; do
    # A failed lookup leaves BROWSER empty; inside 'if' it does not trip set -e.
    if BROWSER=$(command -v "${candidate}"); then
      break
    fi
  done
  [[ -n "${BROWSER}" ]] || errorExit "No Browser found"
fi

# Open page in Browser?
: "${USEBROWSER:=false}"

# Output only a summary?
: "${SHORT:=false}"

# custom Section
# Kept verbatim: the old 'echo ${SECTION}' word-split and globbed the title.
: "${SECTION:=}"

# Output only the URL?
: "${OUTPUTURL:=false}"

# Now we do some input sanitizing. {{{1
requested_locale=${LOCAL}
LOCAL=$(printf '%s' "${LOCAL}" | tr '[:upper:]' '[:lower:]')
# localize already ran with the language exactly as typed. If lower-casing
# changed it (e.g. "-l DE"), run it again so MARKER/MARKER2/RANDOMP match the
# Wikipedia edition that is actually queried.
if [[ "${LOCAL}" != "${requested_locale}" ]]; then
  localize
fi

ARGUMENT="$(uri_decode "$*")"
# Random page?
if [[ "${RAND}" == "true" ]]; then
  ARGUMENT="$(uri_decode "${RANDOMP}")"
fi
if [[ -z "${URL}" ]]; then
  URL="http://${LOCAL}.wikipedia.org/wiki/${ARGUMENT}"
fi

# -W: query a different wiki. This overrides any URL built above.
if [[ -n "${WURL}" ]]; then
  WURL=${WURL%/}
  case "${WURL}" in
    # Keep an explicit scheme; only http:// used to be recognised, which
    # turned "https://host" into "http://https://host".
    http://* | https://*) URL="${WURL}/wiki/${ARGUMENT}" ;;
    *) URL="http://${WURL}/wiki/${ARGUMENT}" ;;
  esac
  # unset $LOCAL to force using an english-locale
  # this is used to strip the tags [edit], eg.
  # NOTE(review): localize has already run at this point, so this assignment
  # does not change the markers in use - confirm the intended behaviour.
  LOCAL="en"
fi

# Debug mode? {{{1
if [[ "${DEBUG:=false}" == "true" ]]; then
  # Values are passed as arguments, never as the format string: URL contains
  # percent-escapes that printf would otherwise treat as conversions.
  printf 'PAGER: %s Browser: %s Local: %s COLOR: %s PATT: %s IGNCASE: %s URL: %s Summary: %s\n' \
    "${PAGER}" "${BROWSER}" "${LOCAL}" "${COLOR}" "${PATT}" "${IGNCASE}" \
    "${URL}" "${SHORT}"
fi

# Depending on some Variables, we do some different things here {{{1
# The first matching action wins and ends the script.
if [[ "${USEBROWSER}" == "true" ]]; then
  openurl
  exit 0
fi

if [[ "${SHORT}" == "true" ]]; then
  summary
  exit 0
fi

if [[ "${SECTION}" == "show" ]]; then
  print_sections
  exit 0
elif [[ -n "${SECTION}" ]]; then
  print_section_detail "${SECTION}"
  exit 0
fi

if [[ "${OUTPUTURL}" == "true" ]]; then
  if [[ "${COLOR}" == "false" ]]; then
    echo "${URL}"
    echo "${BROWSER}" "${BROWSEROPTIONS}" -dump "${URL}"
  else
    echo -e "\033[0;34m${URL}\033[0m"
  fi
  exit 0
fi

if [[ "${PAGER}" != "false" ]]; then
  # PAGER may carry options ("less -Rr"), so it is split into words on purpose.
  # shellcheck disable=SC2086
  getInfo | ${PAGER}
else
  getInfo
fi
# Vim Modeline {{{1
# vim: ft=sh et sts=-1 sw=0 ts=2