├── EDITSETUP ├── LICENSE ├── CHANGELOG ├── README.md └── wikiget.awk /EDITSETUP: -------------------------------------------------------------------------------- 1 | The edit (-E) feature requires an account with bot-flag permissions and OAuth owner-only consumer key/secrets 2 | 3 | . If you already have a bot account and OAuth owner-only consumer keys/secrets skip to the last step. 4 | 5 | . Obtain bot-flag from an administrator. Usually this is done through WP:BRFA 6 | 7 | . Login to https://meta.wikimedia.org with the bot userid 8 | 9 | . If the userid is new and doesn't have Confirmed user permissions, apply for it in two places: 10 | 11 | enwiki: Wikipedia:Requests for permissions/Confirmed 12 | meta : https://meta.wikimedia.org/wiki/Steward_requests/Miscellaneous 13 | 14 | . OAuth owner-only consumer registration: 15 | 16 | https://meta.wikimedia.org/wiki/Special:OAuthConsumerRegistration/propose 17 | . Check the box "This consumer is for use only by.." 18 | . IP Ranges and Public RSA Key should be left empty 19 | . Check the box for edit permissions 20 | If you ever need to add/change permissions, the old key/secrets are discarded and new ones registered 21 | 22 | . Add the provided Keys and Secrets to wikiget in the "Program cfg" section 23 | 24 | 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016-2025 by User:GreenC (at en.wikipedia.org) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | # Changes in reverse chronological order 2 | # 3 | # 4 | # 1.30 Aug 17 - MAJOR BUG: Agent string was not correctly defined. This caused API calls 5 | # to fail or partly. Ensure the G["agent"] string is correctly defined 6 | # with your contact information. See near the top of the program. 
7 | # 1.22 Jul 08 2025 - improve memory management in getallpages() 8 | # 1.21 Dec 06 2024 - fix regex statements generating warnings under newer awk versions 9 | # - add secure location to store OAuth secrets 10 | # 1.20 Dec 16 2023 - minor changes 11 | # 1.19 Dec 19 - add -G page purge for mass purges 12 | # 1.18 May 29 2021 - fix bug in -n with multiple namespaces 13 | # 1.17 Oct 31 - add -i and -j option for use with -r 14 | # 1.16 Jun 30 - fix bug in join2() 15 | # 1.15 Apr 08 2020 - add optional -i and -j functions used with -u 16 | # - add random seed 17 | # - fix bug in getcontinue() when page name is "0", "0.0" etc. 18 | # 1.14 Jun 08 - fix bug in -R (page move) 19 | # 1.13 Jan 13 2019 - fix bug in -A when -k = 0 20 | # 1.12 Dec 21 - add -B 21 | # 1.11 Dec 11 - add -k for -A 22 | # 1.10 Dec 07 - fix bug in getcontinue() to use query_json() 23 | # add -A (get all page titles) 24 | # change &utf8=1 to &formatversion=2 in all API calls 25 | # fix -x / rewrite xlinks() and getxlinks() 26 | # 1.04 Jul 15 - fix -P STDIN (strip newlines) 27 | # 1.03 Jun 16 - add spamblacklist error to printResult() 28 | # 1.02 Jun 15 - fix removefile(), urlencodeawk(), printResult() 29 | # 1.01 Jun 14 - new version of urlElement() + new subs() and splits() 30 | # 1.00 Jun 12 - add edit functions -E, -R and -I 31 | # add OAuth 32 | # add new functions (randomnumber,splitx,etc) 33 | # add new JSON library 34 | # update library functions 35 | # code spacing 36 | # 37 | # 0.76 Jun 07 - fix API url (define only once) 38 | # fix bug in files_verify() 39 | # add asplit() and _defaults 40 | # 0.75 May 01 - fix -u (remove User: prefix from entity) 41 | # 0.74 Apr 26 - add to convertxml() (line-feed) 42 | # 0.73 Mar 23 - fix bug in -w -p 43 | # 0.72 Mar 15 - add -F forward-links 44 | # 0.71 Mar 06 - add -z project option 45 | # extended help display changed to ~80 column 46 | # add regex example to help 47 | # 0.70 Feb 21 2018 - add -y debug option 48 | # add -n option when using -b 49 | # help display changed to ~80 column 50 | # 0.62 Oct 03 2017 - -a max's out at 10000 51 | # 0.61 Apr 14 - fix -r (rccontinue) 52 | # 0.60 Mar 17 - add -r option 53 | # 0.51 Mar 13 2017 - add -n option when using -x 54 | # 0.50 Dec 15 2016 - add -x (external link search) 55 | # 0.47 Nov 30 - fix &utf8=1 in API for usercontributions and categories 56 | # 0.46 Nov 30 - fix -g (for +50 results) 57 | # add stdErr() 58 | # 0.45 Nov 29 - fix -g (search title or article) 59 | # 0.40 Nov 28 - add -a (search and sub-options) 60 | # add apierror() (check for API errors) 61 | # 0.30 Nov 27 - converted -c and -u to JSON 62 | # 0.20 Nov 26 - add -t (backlink types) 63 | # add -q (category link types) 64 | # 0.10 Nov 24 2016 - initial release 65 | 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Wikiget 2 | =================== 3 | Wikiget is a Unix command-line tool to retrieve lists of article titles from Wikipedia, search Wikipedia, edit Wikipedia and more. 4 | 5 | Features: 6 | 7 | * A list of target article titles is often needed for bot makers. For example all articles in a category, articles that use a 8 | template (backlinks), or articles edited by a username (user contributions). Wget provides a simple front-end to common API requests. 9 | 10 | * Search Wikipedia from the command-line with the option for regex and snippits output. 11 | 12 | * Editing Wikipedia couldn't be easier with the -E option. 
See EDITSETUP for authentication. 13 | 14 | Wikiget options and examples: 15 | 16 | Wikiget - command-line access to some Wikimedia API functions 17 | 18 | Usage: 19 | 20 | Backlinks: 21 | -b Backlinks for article, template, userpage, etc.. 22 | -t (option) 1-3 letter string of types of backlinks: 23 | n(ormal)t(ranscluded)f(ile). Default: "ntf". 24 | See -h for more info 25 | -n (option) Pipe-separated numeric value(s) of namespace(s) 26 | Only list pages in this namespace. Default: 0 27 | See -h for NS codes and examples 28 | 29 | Forward-links: 30 | -F Forward-links for article, template, userpage, etc.. 31 | 32 | Redirects: 33 | -B Redirects for article, template, userpage, etc.. 34 | -n (option) Pipe-separated numeric value(s) of namespace(s) 35 | Only list redirects in this namespace. Default: 0 36 | See -h for NS codes and examples 37 | 38 | User contributions: 39 | -u Username without User: prefix 40 | -s Start time in YMD format (-s 20150101). Required with -u 41 | -e End time in YMD format (-e 20151231). If same as -s, 42 | does 24hr range. Required with -u 43 | -i (option) Edit comment must include regex match 44 | -j (option) Edit comment must exclude regex match 45 | -n (option) Pipe-separated numeric value(s) of namespace 46 | Only list pages in this namespace. Default: 0 47 | See -h for NS codes and examples 48 | 49 | Recent changes: 50 | -r Recent changes (past 30 days) aka Special:RecentChanges 51 | Either -o or -t required 52 | -o Only list changes made by this user 53 | -k Only list changes tagged with this tag 54 | -i (option) Edit comment must include regex match 55 | -j (option) Edit comment must exclude regex match 56 | -n (option) Pipe-separated numeric value(s) of namespace 57 | Only list pages in this namespace. Default: 0 58 | See -h for NS codes and examples 59 | 60 | Category list: 61 | -c List articles in a category 62 | -q (option) 1-3 letter string of types of links: 63 | p(age)s(ubcat)f(ile). Default: "p" 64 | 65 | Search-result list: 66 | -a List of articles containing a search string 67 | See docs https://www.mediawiki.org/wiki/Help:CirrusSearch 68 | -d (option) Include search-result snippet in output (def: title) 69 | -g (option) Search in "title" or "text" (def: "text") 70 | -n (option) Pipe-separated numeric value(s) of namespace 71 | Only list pages in this namespace. Default: 0 72 | See -h for NS codes and examples 73 | -i (option) Max number of results to return. Default: 10000 74 | 10k max limit imposed by search engine 75 | -j (option) Show number of search results 76 | 77 | External links list: 78 | -x List articles containing domain name (Special:Linksearch) 79 | Works with domain-name only. To search for a full URI use 80 | regex. eg. -a "insource:/http:\/\/gq.com\/home.htm/" 81 | To include subdomains use wildcards: "-x *.domain.com" 82 | -n (option) Pipe-separated numeric value(s) of namespace 83 | Only list pages in this namespace. Default: 0 84 | See -h for NS codes and examples 85 | 86 | Print wiki text: 87 | -w
Print wiki text of article 88 | -p (option) Plain-text version (strip wiki markup) 89 | -f (option) Don't follow redirects (print redirect page) 90 | 91 | All pages: 92 | -A Print a list of page titles on the wiki (possibly very large) 93 | -t <# type> 1=All, 2=Skip redirects, 3=Only redirects. Default: 2 94 | -k <#> Number of pages to return. 0 is all. Default: 10 95 | -n (option) Pipe-separated numeric value(s) of namespace 96 | Only list pages in this namespace. Default: 0 97 | See -h for NS codes and examples 98 | 99 | Edit page: 100 | -E Edit a page with this title. Requires -S and -P 101 | -S <summary> Edit summary 102 | -P <filename> Page content filename. If "STDIN" read from stdin 103 | See EDITSETUP for authentication configuration 104 | 105 | -R <page> Move from page name. Requires -T 106 | -T <page> Move to page name 107 | 108 | -G <page> Purge page 109 | -I Show OAuth userinfo 110 | 111 | Global options: 112 | -l <language> Wiki language code (default: en) 113 | See https://en.wikipedia.org/wiki/List_of_Wikipedias 114 | -z <project> Wiki project (default: wikipedia) 115 | https://en.wikipedia.org/wiki/Wikipedia:Wikimedia_sister_projects 116 | -m <#> API maxlag value (default: 5) 117 | See https://www.mediawiki.org/wiki/API:Etiquette#Use_maxlag_parameter 118 | -y Print debugging to stderr (show URLs sent to API) 119 | -V Version and copyright 120 | -h Help with examples 121 | 122 | Examples: 123 | 124 | Backlinks: 125 | for a User: showing all link types ("ntf") 126 | wikiget -b "User:Jimbo Wales" 127 | for a User: showing normal and transcluded links 128 | wikiget -b "User:Jimbo Wales" -t nt 129 | for a Template: showing transcluded links 130 | wikiget -b "Template:Gutenberg author" -t t 131 | for a File: showing file links 132 | wikiget -b "File:Justforyoucritter.jpg" -t f 133 | for article "Paris (Idaho)" on the French Wiki 134 | wikiget -b "Paris (Idaho)" -l fr 135 | 136 | User contributions: 137 | show all edits from 9/10-9/12 on 2001 138 | wikiget -u "Jimbo Wales" -s 20010910 -e 20010912 139 | show all edits during the 24hrs of 9/11 140 | wikiget -u "Jimbo Wales" -s 20010911 -e 20010911 141 | show all edits when the edit-comment starts with 'A' 142 | wikiget -u "Jimbo Wales" -s 20010911 -e 20010911 -i "^A" 143 | articles only 144 | wikiget -u "Jimbo Wales" -s 20010911 -e 20010930 -n 0 145 | talk pages only 146 | wikiget -u "Jimbo Wales" -s 20010911 -e 20010930 -n 1 147 | talk and articles only 148 | wikiget -u "Jimbo Wales" -s 20010911 -e 20010930 -n "0|1" 149 | 150 | -n codes: https://www.mediawiki.org/wiki/Extension_default_namespaces 151 | 152 | Recent changes: 153 | show edits for prior 30 days by IABot made under someone else's name 154 | (ie. 
OAuth) with an edit summary including this target word 155 | wikiget -k "OAuth CID: 1804" -r -i "Bluelinking" 156 | 157 | CID list: https://en.wikipedia.org/wiki/Special:Tags 158 | 159 | Category list: 160 | pages in a category 161 | wikiget -c "Category:1900 births" 162 | subcats in a category 163 | wikiget -c "Category:Dead people" -q s 164 | subcats and pages in a category 165 | wikiget -c "Category:Dead people" -q sp 166 | 167 | Search-result list: 168 | article titles containing a search 169 | wikiget -a "Jethro Tull" -g title 170 | first 50 articles containing a search 171 | wikiget -a John -i 50 172 | include snippet of text containing the search string 173 | wikiget -a John -i 50 -d 174 | search talk and articles only 175 | wikiget -a "Barleycorn" -n "0|1" 176 | regex search, include debug output 177 | wikiget -a "insource:/ia[^.]*[.]us[.]/" -y 178 | subpages of User:GreenC 179 | wikiget -a "user: subpageof:GreenC" 180 | 181 | search docs: https://www.mediawiki.org/wiki/Help:CirrusSearch 182 | -n codes: https://www.mediawiki.org/wiki/Extension_default_namespaces 183 | 184 | External link list: 185 | list articles containing a URL with this domain 186 | wikiget -x "news.yahoo.com" 187 | list articles in NS 1 containing a URL with this domain 188 | wikiget -x "*.yahoo.com" -n 1 189 | 190 | All pages: 191 | all page titles excluding redirects w/debug tracking progress 192 | wikiget -A -t 2 -y > list.txt 193 | first 50 page titles including redirects 194 | wikiget -A -t 1 -k 50 > list.txt 195 | 196 | Print wiki text: 197 | wiki text of article "Paris" on the English Wiki 198 | wikiget -w "Paris" 199 | plain text of article "China" on the French Wiki 200 | wikiget -w "China" -p -l fr 201 | wiki text of article on Wikinews 202 | wikiget -w "Healthy cloned monkeys born in Shanghai" -z wikinews 203 | 204 | Edit page: 205 | Edit "Paris" by uploading new content from the local file paris.ws 206 | wikiget -E "Paris" -S "Fix spelling" -P "/home/paris.ws" 207 | Input via stdin 208 | cat /home/paris.ws | wikiget -E "Paris" -S "Fix spelling" -P STDIN 209 | 210 | 211 | Installation 212 | ============= 213 | Download wikiget.awk 214 | 215 | Set executable: chmod 750 wikiget.awk 216 | 217 | Optionally create a symlink: ln -s wikiget.awk wikiget 218 | 219 | Change hashbang (first line) to location of GNU Awk 4+ - use 'which gawk' to see where it is on your system. 220 | 221 | Change the agent "Contact" line to your Wikipedia Username (near the top of the program). It's vital to have correct contact information per WMF bot policy. API calls may fail with missing Agent information. 222 | 223 | Requires one of the following to be in the path: wget, curl or lynx (use 'which wget' to see where it is on your system) 224 | 225 | Usage 226 | ========== 227 | The advantage of working in Unix is access to other tools. Some examples follow. 228 | 229 | A search-replace bot: 230 | 231 | wikiget -w "Wikipedia" | sed 's/Wikipedia/Wikipodium/g' | wikiget -E "Wikipedia" -S "Change to Wikipodium" -P STDIN 232 | 233 | Expand: download the wikisource (-w) for article "Wikipedia". Search/replace (sed) all occurances of 'Wikipedia' with 'Wikipodium'. Upload result (-E) with (-S) edit summary taking input from STDIN. This can be added to a for-loop that operates on a list of articles. 234 | 235 | This unix pipe method is for light and quick work, for a production bot a script would invoke wikiget with -P <filename> and check its output for an error ie. a result other than "Success" or "No change" then make a retry. 
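For example, a production script might wrap the edit in a small retry loop. A minimal sketch (the article name and file path below are placeholders; "Success" and "No change" are the result strings described above):

    # retry the edit up to 3 times unless wikiget reports a good result
    for attempt in 1 2 3; do
      result=$(wikiget -E "Paris" -S "Fix spelling" -P "/home/paris.ws")
      case "$result" in
        "Success"*|"No change"*) break ;;
      esac
      sleep 5   # brief pause before retrying
    done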
In about 5% of uploads the WMF servers fail and a retry is needed, up to 3 are usually enough. Retries are not built-in to Wikiget as it depends on the calling application how to handle error results. 236 | 237 | To find the intersection of two categories (articles that exist in both), download the category lists using the -c option, then use grep to find the intersection: 238 | 239 | grep -xF -f list1 list2 240 | 241 | Or to find the names unique to list2 242 | 243 | grep -vxF -f list1 list2 244 | 245 | Credits 246 | ================== 247 | by User:GreenC (en.wikipedia.org) 248 | 249 | MIT License 250 | 251 | Wikiget is part of the BotWikiAwk framework of tools and libraries for building and running bots on Wikipedia 252 | 253 | https://github.com/greencardamom/BotWikiAwk 254 | 255 | -------------------------------------------------------------------------------- /wikiget.awk: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/awk -bE 2 | 3 | # 4 | # Wikiget - command-line access to Wikimedia API read/write functions 5 | # https://github.com/greencardamom/Wikiget 6 | # 7 | 8 | # The MIT License (MIT) 9 | # 10 | # Copyright (c) 2016-2025 by User:GreenC (at en.wikipedia.org) 11 | # 12 | # Permission is hereby granted, free of charge, to any person obtaining a copy 13 | # of this software and associated documentation files (the "Software"), to deal 14 | # in the Software without restriction, including without limitation the rights 15 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | # copies of the Software, and to permit persons to whom the Software is 17 | # furnished to do so, subject to the following conditions: 18 | # 19 | # The above copyright notice and this permission notice shall be included in 20 | # all copies or substantial portions of the Software. 21 | # 22 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 28 | # THE SOFTWARE. 
29 | 30 | # Code Table of Contents 31 | # 32 | # [[ _______ Global system vars ____________________________________________ ]] 33 | # [[ _______ Command line parsing and argument processing __________________ ]] 34 | # [[ _______ Setup _________________________________________________________ ]] 35 | # [[ _______ Core read-only functions ______________________________________ ]] 36 | # ___ Categories (-c) 37 | # ___ External links list (-x) 38 | # ___ Recent changes (-r) 39 | # ___ User Contributions (-u) 40 | # ___ Forward links (-F) 41 | # ___ Backlinks (-b) 42 | # ___ Print wiki text (-w) 43 | # ___ Search list (-a) 44 | # [[ _______ Utilities ______________________________________________________ ]] 45 | # [[ _______ Library ________________________________________________________ ]] 46 | # [[ _______ JSON ___________________________________________________________ ]] 47 | # [[ _______ Edit ___________________________________________________________ ]] 48 | 49 | 50 | # [[ ________ Global system vars _____________________________________________ ]] 51 | 52 | BEGIN { # Program cfg 53 | 54 | _defaults = "contact = User:MY_NAME @ MYSITE.wikipedia.org \ 55 | program = Wikiget \ 56 | version = 1.31 \ 57 | copyright = 2016-2025 \ 58 | maxlag = 5 \ 59 | lang = en \ 60 | project = wikipedia" 61 | 62 | asplit(G, _defaults, "[ ]*[=][ ]*", "[ ]{9,}") 63 | G["agent"] = G["program"] " " G["version"] " " G["contact"] 64 | 65 | setup("wget curl lynx") # Use one of wget, curl or lynx - searches PATH in this order 66 | # They do the same, need at least one available in PATH 67 | # For edit (-E) wget is required 68 | Optind = Opterr = 1 69 | 70 | # randomnumber() seed 71 | _cliff_seed = "0.00" splitx(sprintf("%f", systime() * 0.000001), ".", 2) 72 | 73 | # Optional OAuth consumer keys. See EDITSETUP for more info. 
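    # These are used by the edit-related functions (-E, -R, -G, -I); the read-only options do not require them.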
74 | 75 | # Create the files below with secure permissions to store your secrets: 76 | 77 | # mkdir /home/user/.config/wikiget 78 | # mkdir /home/user/.config/wikiget/secrets 79 | # chmod 700 /home/user/.config/wikiget/secrets 80 | # chmod 600 mybot.consumerkey 81 | # chmod 600 mybot.consumersecret 82 | # chmod 600 mybot.accesskey 83 | # chmod 600 mybot.accesssecret 84 | 85 | G["consumerKey"] = strip(readfile("/home/user/.config/wikiget/secrets/mybot.consumerkey")) 86 | G["consumerSecret"] = strip(readfile("/home/user/.config/wikiget/secrets/mybot.consumersecret")) 87 | G["accessKey"] = strip(readfile("/home/user/.config/wikiget/secrets/mybot.accesskey")) 88 | G["accessSecret"] = strip(readfile("/home/user/.config/wikiget/secrets/mybot.accesssecret")) 89 | 90 | } 91 | 92 | BEGIN { # Program run 93 | 94 | parsecommandline() 95 | 96 | } 97 | 98 | 99 | # [[ ________ Command line parsing and argument processing ___________________ ]] 100 | 101 | # 102 | # parsecommandline() - parse command-line 103 | # 104 | function parsecommandline(c,opts,Arguments) { 105 | 106 | while ((c = getopt(ARGC, ARGV, "yrhVfjpdo:k:a:g:i:s:e:u:m:b:l:n:w:c:t:q:x:z:F:E:S:P:I:R:T:AB:G:")) != -1) { 107 | opts++ 108 | if (c == "h") { 109 | usage() 110 | usage_extended() 111 | exit 112 | } 113 | 114 | if (c == "b") { # -b <entity> Backlinks for entity ( -b "Template:Project Gutenberg" ) 115 | Arguments["main"] = verifyval(Optarg) 116 | Arguments["main_c"] = "b" 117 | } 118 | if (c == "t") # -t <types> Types of backlinks ( -t "ntf" ) 119 | Arguments["bltypes"] = verifyval(Optarg) 120 | 121 | if (c == "F") { # -F <entity> Forward-links for entity ( -F "Example" ) 122 | Arguments["main"] = verifyval(Optarg) 123 | Arguments["main_c"] = "F" 124 | } 125 | 126 | if (c == "B") { # -B <entity> Redirects for entity ( -B "Example" ) 127 | Arguments["main"] = verifyval(Optarg) 128 | Arguments["main_c"] = "B" 129 | } 130 | 131 | if (c == "c") { # -b <entity> List articles in a category ( -c "Category:1900 births" ) 132 | Arguments["main"] = verifyval(Optarg) 133 | Arguments["main_c"] = "c" 134 | } 135 | if (c == "q") # -q <types> Types of links in a category ( -t "psf" ) 136 | Arguments["cattypes"] = verifyval(Optarg) 137 | 138 | if (c == "a") { # -a <search> List articles in search results ( -a "John Smith" ) 139 | Arguments["main"] = verifyval(Optarg) 140 | Arguments["main_c"] = "a" 141 | } 142 | if (c == "d") # -d Include search snippet in results (optional with -a ) 143 | Arguments["snippet"] = "true" 144 | if (c == "j") # -j Show number of search results (optional with -a) 145 | Arguments["numsearch"] = "true" 146 | if (c == "i") # -i <max> Max number of search results (optional with -a) 147 | Arguments["maxsearch"] = verifyval(Optarg) 148 | if (c == "g") # -g <type> Target search (optional with -a) 149 | Arguments["searchtarget"] = verifyval(Optarg) 150 | 151 | if (c == "u") { # -u <username> User contributions ( -u "User:Green Cardamom") 152 | Arguments["main"] = verifyval(Optarg) 153 | Arguments["main_c"] = "u" 154 | } 155 | if (c == "s") # -s <time> Start time for -u (required w/ -u) 156 | Arguments["starttime"] = verifyval(Optarg) 157 | if (c == "e") # -e <time> End time for -u (required w/ -u) 158 | Arguments["endtime"] = verifyval(Optarg) 159 | if (c == "i") # -i <regex> Edit comment must include this regex match 160 | Arguments["inccomments"] = verifyval(Optarg) 161 | if (c == "j") # -j <regex> Edit comment must exclude this regex match 162 | Arguments["exccomments"] = verifyval(Optarg) 163 | 164 | if (c == "n") # -n 
<namespace> Namespace for -u, -a and -x (option) 165 | Arguments["namespace"] = verifyval(Optarg) 166 | 167 | if (c == "r") # -r Recent changes 168 | Arguments["main_c"] = "r" 169 | if (c == "o") # -o <username> Username for recent changes 170 | Arguments["username"] = verifyval(Optarg) 171 | if (c == "k") # -k <tag> Tag for recent changes 172 | Arguments["tags"] = verifyval(Optarg) 173 | 174 | 175 | if (c == "A") { # -A Dump a list of all article titles on Wikipedia (no redirects) 176 | Arguments["main_c"] = "A" 177 | } 178 | if (c == "t") # -t <type> Filter redirects 179 | Arguments["redirtype"] = verifyval(Optarg) 180 | if (c == "k") # -k <#> Number of pages to return 181 | Arguments["maxpages"] = verifyval(Optarg) 182 | 183 | if (c == "w") { # -w <article> Print wiki text 184 | Arguments["main"] = verifyval(Optarg) 185 | Arguments["main_c"] = "w" 186 | } 187 | if (c == "f") # -f Don't follow redirect (return source of redirect page) 188 | Arguments["followredirect"] = "false" 189 | if (c == "p") # -p Plain text (strip wiki markup) 190 | Arguments["plaintext"] = "true" 191 | 192 | if (c == "x") { # -x <URL> List articles containing an external link 193 | Arguments["main"] = verifyval(Optarg) 194 | Arguments["main_c"] = "x" 195 | } 196 | 197 | 198 | if (c == "E") { # -E <title> Edit a page with this title. Requires -S and -P 199 | Arguments["main_c"] = "E" 200 | Arguments["title"] = verifyval(Optarg) 201 | } 202 | if (c == "S") # -S <summary> Edit summary 203 | Arguments["summary"] = verifyval(Optarg) 204 | if (c == "P") # -P <filename> Page content filename 205 | Arguments["page"] = verifyval(Optarg) 206 | 207 | if (c == "R") { # -R <page> Move from page name 208 | Arguments["main_c"] = "R" 209 | Arguments["movefrom"] = verifyval(Optarg) 210 | } 211 | if (c == "T") # -T <page> Move to page name 212 | Arguments["moveto"] = verifyval(Optarg) 213 | 214 | if (c == "G") { # -G <page> Purge page 215 | Arguments["main_c"] = "G" 216 | Arguments["title"] = verifyval(Optarg) 217 | } 218 | 219 | if (c == "I") # -I User info 220 | Arguments["main_c"] = "I" 221 | 222 | if (c == "m") # -m <maxlag> Maxlag setting when using API, default set in BEGIN{} section 223 | Arguments["maxlag"] = verifyval(Optarg) 224 | if (c == "l") # -l <lang> Language code, default set in BEGIN{} section 225 | Arguments["lang"] = verifyval(Optarg) 226 | if (c == "z") # -z <project> Project name, default set in BEGIN{} section 227 | Arguments["project"] = verifyval(Optarg) 228 | if (c == "y") # -y Show debugging info to stderr 229 | Arguments["debug"] = 1 230 | if (c == "V") { # -V Version and copyright info. 
231 | version() 232 | exit 233 | } 234 | } 235 | if (opts < 1) 236 | usage(1) 237 | 238 | processarguments(Arguments) 239 | } 240 | 241 | # 242 | # processarguments() - process arguments 243 | # 244 | function processarguments(Arguments, c,a,i) { 245 | 246 | if (length(Arguments["lang"]) > 0) # Check options, set defaults 247 | G["lang"] = Arguments["lang"] 248 | # default set in BEGIN{} 249 | 250 | if (length(Arguments["project"]) > 0) # Check options, set defaults 251 | G["project"] = Arguments["project"] 252 | # default set in BEGIN{} 253 | 254 | if (isanumber(Arguments["maxlag"])) 255 | G["maxlag"] = Arguments["maxlag"] 256 | # default set in BEGIN{} 257 | 258 | if (isanumber(Arguments["maxpages"])) 259 | G["maxpages"] = Arguments["maxpages"] 260 | else 261 | G["maxpages"] = 10 262 | 263 | if (isanumber(Arguments["maxsearch"])) 264 | G["maxsearch"] = Arguments["maxsearch"] 265 | else 266 | G["maxsearch"] = 10000 267 | 268 | if (isanumber(Arguments["namespace"]) || Arguments["namespace"] ~ "[|]") 269 | G["namespace"] = Arguments["namespace"] 270 | else 271 | G["namespace"] = "0" 272 | 273 | if (Arguments["followredirect"] == "false") 274 | G["followredirect"] = "false" 275 | else 276 | G["followredirect"] = "true" 277 | 278 | if (Arguments["plaintext"] == "true") 279 | G["plaintext"] = "true" 280 | else 281 | G["plaintext"] = "false" 282 | 283 | if (Arguments["snippet"] == "true") 284 | G["snippet"] = "true" 285 | else 286 | G["snippet"] = "false" 287 | 288 | if (Arguments["redirtype"] !~ /1|2|3/) 289 | G["redirtype"] = "2" 290 | else 291 | G["redirtype"] = Arguments["redirtype"] 292 | 293 | if (Arguments["numsearch"] == "true") 294 | G["numsearch"] = "true" 295 | else 296 | G["numsearch"] = "false" 297 | 298 | if (Arguments["searchtarget"] !~ /^text$|^title$/) 299 | G["searchtarget"] = "text" 300 | else 301 | G["searchtarget"] = Arguments["searchtarget"] 302 | 303 | if (length(Arguments["bltypes"]) > 0 && Arguments["main_c"] == "b") { 304 | if (Arguments["bltypes"] !~ /[^ntf]/) { # ie. contains only those letters 305 | c = split(Arguments["bltypes"], a, "") 306 | while (i++ < c) 307 | G["bltypes"] = G["bltypes"] a[i] 308 | } 309 | else { 310 | stdErr("Invalid \"-t\" value(s)") 311 | exit 312 | } 313 | } 314 | else 315 | G["bltypes"] = "ntf" 316 | 317 | if (length(Arguments["cattypes"]) > 0) { 318 | if (Arguments["cattypes"] !~ /[^psf]/) { 319 | c = split(Arguments["cattypes"], a, "") 320 | while (i++ < c) 321 | G["cattypes"] = G["cattypes"] a[i] 322 | } 323 | else { 324 | stdErr("Invalid \"-q\" value(s)") 325 | exit 326 | } 327 | } 328 | else 329 | G["cattypes"] = "p" 330 | 331 | if(! empty(Arguments["inccomments"])) 332 | G["inccomments"] = Arguments["inccomments"] 333 | if(! empty(Arguments["exccomments"])) 334 | G["exccomments"] = Arguments["exccomments"] 335 | 336 | if (Arguments["debug"]) # Enable debugging 337 | G["debug"] = 1 338 | 339 | G["apiURL"] = "https://" G["lang"] "." G["project"] ".org/w/api.php?" 
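    # With the default settings (lang=en, project=wikipedia) this resolves to
    # https://en.wikipedia.org/w/api.php?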
340 | 341 | # ________________ program entry points _______________________ # 342 | 343 | if (Arguments["main_c"] == "E") { # edit page 344 | if (empty(Arguments["summary"]) || empty(Arguments["page"])) { 345 | stdErr("Missing -S and/or -P") 346 | usage(1) 347 | } 348 | editPage(Arguments["title"], Arguments["summary"], Arguments["page"]) 349 | } 350 | else if (Arguments["main_c"] == "I") { # OAuth userinfo 351 | userInfo() 352 | } 353 | else if (Arguments["main_c"] == "G") { # purge page 354 | if (empty(Arguments["title"])) { 355 | stdErr("Missing page title") 356 | usage(1) 357 | } 358 | purgePage(Arguments["title"]) 359 | } 360 | else if (Arguments["main_c"] == "R") { # move page 361 | if (empty(Arguments["summary"])) { 362 | stdErr("Missing -S (reason for move)") 363 | usage(1) 364 | } 365 | if (empty(Arguments["moveto"])) 366 | usage(1) 367 | movePage(Arguments["movefrom"], Arguments["moveto"], Arguments["summary"]) 368 | } 369 | else if (Arguments["main_c"] == "A") { 370 | allPages(G["redirtype"]) 371 | } 372 | else if (Arguments["main_c"] == "b") { # backlinks 373 | if ( entity_exists(Arguments["main"]) ) { 374 | if ( ! backlinks(Arguments["main"]) ) 375 | stdErr("No backlinks for " Arguments["main"]) 376 | } 377 | } 378 | else if (Arguments["main_c"] == "F") { # forward-links 379 | forlinks(Arguments["main"]) 380 | } 381 | else if (Arguments["main_c"] == "B") { # redirects 382 | redirects(Arguments["main"]) 383 | } 384 | else if (Arguments["main_c"] == "c") { # categories 385 | category(Arguments["main"]) 386 | } 387 | else if (Arguments["main_c"] == "x") { # external links 388 | xlinks(Arguments["main"]) 389 | } 390 | else if (Arguments["main_c"] == "a") { # search results 391 | search(Arguments["main"]) 392 | } 393 | 394 | else if (Arguments["main_c"] == "u") { # user contributions 395 | if (! isanumber(Arguments["starttime"]) || ! isanumber(Arguments["endtime"])) { 396 | stdErr("Invalid start time (-s) or end time (-e)\n") 397 | usage(1) 398 | } 399 | Arguments["starttime"] = Arguments["starttime"] "000000" 400 | Arguments["endtime"] = Arguments["endtime"] "235959" 401 | if (! ucontribs(Arguments["main"],Arguments["starttime"],Arguments["endtime"]) ) 402 | stdErr("No user and/or edits found.") 403 | } 404 | 405 | else if (Arguments["main_c"] == "r") { # recent changes 406 | if ((length(Arguments["username"]) == 0 && length(Arguments["tags"]) == 0) || (length(Arguments["username"]) > 0 && length(Arguments["tags"]) > 0)) { 407 | stdErr("Recent changes requires either -f or -k\n") 408 | usage(1) 409 | } 410 | if (! rechanges(Arguments["username"],Arguments["tags"]) ) 411 | stdErr("No recent changes found.") 412 | } 413 | 414 | else if (Arguments["main_c"] == "w") { # wiki text 415 | if (entity_exists(Arguments["main"]) ) { 416 | if (G["plaintext"] == "true") 417 | print wikitextplain(Arguments["main"]) 418 | else 419 | print wikitext(Arguments["main"]) 420 | } 421 | else { 422 | stdErr("Unable to find " Arguments["main"]) 423 | exit 424 | } 425 | } 426 | else 427 | usage(1) 428 | } 429 | 430 | # 431 | # usage() 432 | # 433 | function usage(die) { 434 | print "" 435 | print G["program"] " - command-line access to some Wikimedia API functions" 436 | print "" 437 | print "Usage:" 438 | print "" 439 | print " Backlinks:" 440 | print " -b <name> Backlinks for article, template, userpage, etc.." 441 | print " -t <types> (option) 1-3 letter string of types of backlinks:" 442 | print " n(ormal)t(ranscluded)f(ile). Default: \"ntf\"." 
443 | print " See -h for more info " 444 | print " -n <namespace> (option) Pipe-separated numeric value(s) of namespace(s)" 445 | print " Only list pages in this namespace. Default: 0" 446 | print " See -h for NS codes and examples" 447 | print "" 448 | print " Forward-links:" 449 | print " -F <name> Forward-links for article, template, userpage, etc.." 450 | print "" 451 | print " Redirects:" 452 | print " -B <name> Redirects for article, template, userpage, etc.." 453 | print " -n <namespace> (option) Pipe-separated numeric value(s) of namespace(s)" 454 | print " Only list redirects in this namespace. Default: 0" 455 | print " See -h for NS codes and examples" 456 | print "" 457 | print " User contributions:" 458 | print " -u <username> Username without User: prefix" 459 | print " -s <starttime> Start time in YMD format (-s 20150101). Required with -u" 460 | print " -e <endtime> End time in YMD format (-e 20151231). If same as -s," 461 | print " does 24hr range. Required with -u" 462 | print " -i <regex> (option) Edit comment must include regex match" 463 | print " -j <regex> (option) Edit comment must exclude regex match" 464 | print " -n <namespace> (option) Pipe-separated numeric value(s) of namespace" 465 | print " Only list pages in this namespace. Default: 0" 466 | print " See -h for NS codes and examples" 467 | print "" 468 | print " Recent changes:" 469 | print " -r Recent changes (past 30 days) aka Special:RecentChanges" 470 | print " Either -o or -t required" 471 | print " -o <username> Only list changes made by this user" 472 | print " -k <tag> Only list changes tagged with this tag" 473 | print " -i <regex> (option) Edit comment must include regex match" 474 | print " -j <regex> (option) Edit comment must exclude regex match" 475 | print " -n <namespace> (option) Pipe-separated numeric value(s) of namespace" 476 | print " Only list pages in this namespace. Default: 0" 477 | print " See -h for NS codes and examples" 478 | print "" 479 | print " Category list:" 480 | print " -c <category> List articles in a category" 481 | print " -q <types> (option) 1-3 letter string of types of links: " 482 | print " p(age)s(ubcat)f(ile). Default: \"p\"" 483 | print "" 484 | print " Search-result list:" 485 | print " -a <search> List of articles containing a search string" 486 | print " See docs https://www.mediawiki.org/wiki/Help:CirrusSearch" 487 | print " -d (option) Include search-result snippet in output (def: title)" 488 | print " -g <target> (option) Search in \"title\" or \"text\" (def: \"text\")" 489 | print " -n <namespace> (option) Pipe-separated numeric value(s) of namespace" 490 | print " Only list pages in this namespace. Default: 0" 491 | print " See -h for NS codes and examples" 492 | print " -i <maxsize> (option) Max number of results to return. Default: 10000" 493 | print " 10k max limit imposed by search engine" 494 | print " -j (option) Show number of search results" 495 | print "" 496 | print " External links list:" 497 | print " -x <domain name> List articles containing domain name (Special:Linksearch)" 498 | print " Works with domain-name only. To search for a full URI use" 499 | print " regex. eg. -a \"insource:/http:\\/\\/gq.com\\/home.htm/\"" 500 | print " To include subdomains use wildcards: \"-x *.domain.com\"" 501 | print " -n <namespace> (option) Pipe-separated numeric value(s) of namespace" 502 | print " Only list pages in this namespace. 
Default: 0" 503 | print " See -h for NS codes and examples" 504 | print "" 505 | print " Print wiki text:" 506 | print " -w <article> Print wiki text of article" 507 | print " -p (option) Plain-text version (strip wiki markup)" 508 | print " -f (option) Don't follow redirects (print redirect page)" 509 | print "" 510 | print " All pages:" 511 | print " -A Print a list of page titles on the wiki (possibly very large)" 512 | print " -t <# type> 1=All, 2=Skip redirects, 3=Only redirects. Default: 2" 513 | print " -k <#> Number of pages to return. 0 is all. Default: 10" 514 | print " -n <namespace> (option) Pipe-separated numeric value(s) of namespace" 515 | print " Only list pages in this namespace. Default: 0" 516 | print " See -h for NS codes and examples" 517 | print "" 518 | print " Edit page:" 519 | print " -E <title> Edit a page with this title. Requires -S and -P" 520 | print " -S <summary> Edit summary" 521 | print " -P <filename> Page content filename. If \"STDIN\" read from stdin" 522 | print " See EDITSETUP for authentication configuration" 523 | print "" 524 | print " -R <page> Move from page name. Requires -T" 525 | print " -T <page> Move to page name" 526 | print "" 527 | print " -G <page> Purge page" 528 | print " -I Show OAuth userinfo" 529 | print "" 530 | print " Global options:" 531 | print " -l <language> Wiki language code (default: " G["lang"] ")" 532 | print " See https://en.wikipedia.org/wiki/List_of_Wikipedias" 533 | print " -z <project> Wiki project (default: " G["project"] ")" 534 | print " https://en.wikipedia.org/wiki/Wikipedia:Wikimedia_sister_projects" 535 | print " -m <#> API maxlag value (default: " G["maxlag"] ")" 536 | print " See https://www.mediawiki.org/wiki/API:Etiquette#Use_maxlag_parameter" 537 | print " -y Print debugging to stderr (show URLs sent to API)" 538 | print " -V Version and copyright" 539 | print " -h Help with examples" 540 | print "" 541 | if(die) exit 542 | } 543 | function usage_extended() { 544 | print "Examples:" 545 | print "" 546 | print " Backlinks:" 547 | print " for a User: showing all link types (\"ntf\")" 548 | print " wikiget -b \"User:Jimbo Wales\"" 549 | print " for a User: showing normal and transcluded links" 550 | print " wikiget -b \"User:Jimbo Wales\" -t nt" 551 | print " for a Template: showing transcluded links" 552 | print " wikiget -b \"Template:Gutenberg author\" -t t" 553 | print " for a File: showing file links" 554 | print " wikiget -b \"File:Justforyoucritter.jpg\" -t f" 555 | print " for article \"Paris (Idaho)\" on the French Wiki" 556 | print " wikiget -b \"Paris (Idaho)\" -l fr" 557 | print "" 558 | print " User contributions:" 559 | print " show all edits from 9/10-9/12 on 2001" 560 | print " wikiget -u \"Jimbo Wales\" -s 20010910 -e 20010912" 561 | print " show all edits during the 24hrs of 9/11" 562 | print " wikiget -u \"Jimbo Wales\" -s 20010911 -e 20010911" 563 | print " show all edits when the edit-comment starts with 'A' " 564 | print " wikiget -u \"Jimbo Wales\" -s 20010911 -e 20010911 -i \"^A\"" 565 | print " articles only" 566 | print " wikiget -u \"Jimbo Wales\" -s 20010911 -e 20010930 -n 0" 567 | print " talk pages only" 568 | print " wikiget -u \"Jimbo Wales\" -s 20010911 -e 20010930 -n 1" 569 | print " talk and articles only" 570 | print " wikiget -u \"Jimbo Wales\" -s 20010911 -e 20010930 -n \"0|1\"" 571 | print "" 572 | print " -n codes: https://www.mediawiki.org/wiki/Extension_default_namespaces" 573 | print "" 574 | print " Recent changes:" 575 | print " show edits for prior 30 days by 
IABot made under someone else's name" 576 | print " (ie. OAuth) with an edit summary including this target word" 577 | print " wikiget -k \"OAuth CID: 1804\" -r -i \"Bluelinking\"" 578 | print "" 579 | print " CID list: https://en.wikipedia.org/wiki/Special:Tags" 580 | print "" 581 | print " Category list:" 582 | print " pages in a category" 583 | print " wikiget -c \"Category:1900 births\"" 584 | print " subcats in a category" 585 | print " wikiget -c \"Category:Dead people\" -q s" 586 | print " subcats and pages in a category" 587 | print " wikiget -c \"Category:Dead people\" -q sp" 588 | print "" 589 | print " Search-result list:" 590 | print " article titles containing a search" 591 | print " wikiget -a \"Jethro Tull\" -g title" 592 | print " first 50 articles containing a search" 593 | print " wikiget -a John -i 50" 594 | print " include snippet of text containing the search string" 595 | print " wikiget -a John -i 50 -d" 596 | print " search talk and articles only" 597 | print " wikiget -a \"Barleycorn\" -n \"0|1\"" 598 | print " regex search, include debug output" 599 | print " wikiget -a \"insource:/ia[^.]*[.]us[.]/\" -y" 600 | print " subpages of User:GreenC" 601 | print " wikiget -a \"user: subpageof:GreenC\"" 602 | print "" 603 | print " search docs: https://www.mediawiki.org/wiki/Help:CirrusSearch" 604 | print " -n codes: https://www.mediawiki.org/wiki/Extension_default_namespaces" 605 | print "" 606 | print " External link list:" 607 | print " list articles containing a URL with this domain" 608 | print " wikiget -x \"news.yahoo.com\"" 609 | print " list articles in NS 1 containing a URL with this domain" 610 | print " wikiget -x \"*.yahoo.com\" -n 1" 611 | print "" 612 | print " All pages:" 613 | print " all page titles excluding redirects w/debug tracking progress" 614 | print " wikiget -A -t 2 -y > list.txt" 615 | print " first 50 page titles including redirects" 616 | print " wikiget -A -t 1 -k 50 > list.txt" 617 | print "" 618 | print " Print wiki text:" 619 | print " wiki text of article \"Paris\" on the English Wiki" 620 | print " wikiget -w \"Paris\"" 621 | print " plain text of article \"China\" on the French Wiki" 622 | print " wikiget -w \"China\" -p -l fr" 623 | print " wiki text of article on Wikinews" 624 | print " wikiget -w \"Healthy cloned monkeys born in Shanghai\" -z wikinews" 625 | print "" 626 | print " Edit page:" 627 | print " Edit \"Paris\" by uploading new content from the local file paris.ws" 628 | print " wikiget -E \"Paris\" -S \"Fix spelling\" -P \"/home/paris.ws\"" 629 | print " Input via stdin" 630 | print " cat /home/paris.ws | wikiget -E \"Paris\" -S \"Fix spelling\" -P STDIN" 631 | print " Purge page" 632 | print " wikiget -G \"Paris\"" 633 | print "" 634 | 635 | } 636 | function version() { 637 | print G["program"] " " G["version"] 638 | print "Copyright (C) " G["copyright"] " User:GreenC (en.wikipedia.org)" 639 | print 640 | print "The MIT License (MIT)" 641 | print 642 | print "Permission is hereby granted, free of charge, to any person obtaining a copy" 643 | print "of this software and associated documentation files (the "Software"), to deal" 644 | print "in the Software without restriction, including without limitation the rights" 645 | print "to use, copy, modify, merge, publish, distribute, sublicense, and/or sell" 646 | print "copies of the Software, and to permit persons to whom the Software is" 647 | print "furnished to do so, subject to the following conditions:" 648 | print 649 | print "The above copyright notice and this 
permission notice shall be included in" 650 | print "all copies or substantial portions of the Software." 651 | print 652 | print "THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR" 653 | print "IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY," 654 | print "FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE" 655 | print "AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER" 656 | print "LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM," 657 | print "OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN" 658 | print "THE SOFTWARE." 659 | print 660 | } 661 | 662 | # 663 | # Verify an argument has a valid value 664 | # 665 | function verifyval(val) { 666 | 667 | if (val == "" || substr(val,1,1) ~ /^[-]/) { 668 | stdErr("\nCommand line argument has an empty value when it should have something.\n") 669 | usage(1) 670 | } 671 | return val 672 | } 673 | 674 | # 675 | # getopt() 676 | # Credit: GNU awk (/usr/local/share/awk/getopt.awk) 677 | # Pre-define these globaly: Optind = Opterr = 1 678 | # 679 | function getopt(argc, argv, options, thisopt, i) { 680 | 681 | if (length(options) == 0) # no options given 682 | return -1 683 | 684 | if (argv[Optind] == "--") { # all done 685 | Optind++ 686 | _opti = 0 687 | return -1 688 | } else if (argv[Optind] !~ /^-[^:[:space:]]/) { 689 | _opti = 0 690 | return -1 691 | } 692 | if (_opti == 0) 693 | _opti = 2 694 | thisopt = substr(argv[Optind], _opti, 1) 695 | Optopt = thisopt 696 | i = index(options, thisopt) 697 | if (i == 0) { 698 | if (Opterr) 699 | printf("%c -- invalid option\n", thisopt) > "/dev/stderr" 700 | if (_opti >= length(argv[Optind])) { 701 | Optind++ 702 | _opti = 0 703 | } else 704 | _opti++ 705 | return "?" 706 | } 707 | if (substr(options, i + 1, 1) == ":") { 708 | # get option argument 709 | if (length(substr(argv[Optind], _opti + 1)) > 0) { 710 | Optarg = substr(argv[Optind], _opti + 1) 711 | } 712 | else { 713 | Optarg = argv[++Optind] 714 | } 715 | _opti = 0 716 | } else { 717 | Optarg = "" 718 | } 719 | if (_opti == 0 || _opti >= length(argv[Optind])) { 720 | Optind++ 721 | _opti = 0 722 | } else 723 | _opti++ 724 | return thisopt 725 | } 726 | 727 | # [[ ________ Setup __________________________________________________________ ]] 728 | 729 | # 730 | # Check for existence of needed programs and files. 731 | # 732 | function setup(files_system) { 733 | 734 | if (! files_verify("ls") ) { 735 | stdErr("Unable to find 'ls' and/or 'command'. PATH problem?\n") 736 | exit 737 | } 738 | if (! files_verify(files_system) ) 739 | exit 740 | } 741 | 742 | # 743 | # Verify existence of programs in path 744 | # Return 0 if fail. 745 | # 746 | function files_verify(files_system, a, i, missing) { 747 | 748 | missing = 0 749 | split(files_system, a, " ") 750 | for ( i in a ) { 751 | if (! 
sys2var(sprintf("command -v %s",a[i])) ) { 752 | if (a[i] == "wget") G["wget"] = "false" 753 | else if (a[i] == "curl") G["curl"] = "false" 754 | else if (a[i] == "lynx") G["lynx"] = "false" 755 | else { 756 | stdErr("Abort: command not found in PATH: " a[i]) 757 | missing++ 758 | } 759 | } 760 | else if (a[i] == "wget") G["wget"] = "true" 761 | else if (a[i] == "curl") G["curl"] = "true" 762 | else if (a[i] == "lynx") G["lynx"] = "true" 763 | } 764 | 765 | if (G["wget"] == "false" && G["curl"] == "false" && G["lynx"] == "false") { 766 | stdErr("Abort: unable to find wget, curl or lynx in PATH.") 767 | return 0 768 | } 769 | else if (G["wget"] == "true") 770 | G["wta"] = "wget" 771 | else if (G["curl"] == "true") 772 | G["wta"] = "curl" 773 | else if (G["lynx"] == "true") 774 | G["wta"] = "lynx" 775 | 776 | if ( missing ) 777 | return 0 778 | return 1 779 | } 780 | 781 | # [[ ________ Core read-only functions _______________________________________ ]] 782 | 783 | 784 | # ___ Categories (-c) 785 | 786 | # 787 | # MediaWiki API:Categorymembers 788 | # https://www.mediawiki.org/wiki/API:Categorymembers 789 | # 790 | function category(entity, ct, url, results) { 791 | 792 | if (entity !~ /^[Cc]ategory[:]/) 793 | entity = "Category:" entity 794 | 795 | if (G["cattypes"] ~ /p/) 796 | ct = ct " page" 797 | if (G["cattypes"] ~ /s/) 798 | ct = ct " subcat" 799 | if (G["cattypes"] ~ /f/) 800 | ct = ct " file" 801 | ct = strip(ct) 802 | gsub(/[ ]/,"|",ct) 803 | 804 | url = G["apiURL"] "action=query&list=categorymembers&cmtitle=" urlencodeawk(entity) "&cmtype=" urlencodeawk(ct) "&cmprop=title&cmlimit=500&format=json&formatversion=2&maxlag=" G["maxlag"] 805 | 806 | results = uniq(getcategory(url, entity) ) 807 | 808 | if ( length(results) > 0) 809 | print results 810 | return length(results) 811 | } 812 | function getcategory(url, entity, jsonin, jsonout, continuecode) { 813 | 814 | jsonin = http2var(url) 815 | if (apierror(jsonin, "json") > 0) 816 | return "" 817 | jsonout = json2var(jsonin) 818 | continuecode = getcontinue(jsonin, "cmcontinue") 819 | while ( continuecode != "-1-1!!-1-1" ) { 820 | url = G["apiURL"] "action=query&list=categorymembers&cmtitle=" urlencodeawk(entity) "&cmtype=page&cmprop=title&cmlimit=500&format=json&formatversion=2&maxlag=" G["maxlag"] "&continue=-||&cmcontinue=" continuecode 821 | jsonin = http2var(url) 822 | jsonout = jsonout "\n" json2var(jsonin) 823 | continuecode = getcontinue(jsonin, "cmcontinue") 824 | } 825 | return jsonout 826 | } 827 | 828 | # ___ External links list (-x) 829 | 830 | # 831 | # MediaWiki API:Exturlusage 832 | # https://www.mediawiki.org/wiki/API:Exturlusage 833 | # 834 | function xlinks(entity, url,results,a,c,i) { 835 | 836 | if (entity ~ /^https?/ ) 837 | gsub(/^https?[:]\/\//,"",entity) 838 | else if(entity ~ /^\/\// ) 839 | gsub(/^\/\//,"",entity) 840 | 841 | if (entity ~ /^[*]$/) { 842 | entity = "" 843 | } 844 | 845 | c = split("http|https|ftp|ftps|sftp", a, /[|]/) 846 | # iterate for euprotocol=a[i] 847 | for(i = 1; i <= c; i++) { 848 | url = G["apiURL"] "action=query&list=exturlusage&euprotocol=" urlencodeawk(a[i]) "&euexpandurl=&euquery=" urlencodeawk(entity) "&euprop=title&eulimit=500&eunamespace=" urlencodeawk(G["namespace"]) "&format=json&formatversion=2&maxlag=" G["maxlag"] 849 | results = results "\n" getxlinks(url, entity, "http") 850 | } 851 | 852 | results = uniq( results ) 853 | 854 | if ( length(results) > 0) 855 | print results 856 | return length(results) 857 | 858 | } 859 | function getxlinks(url, entity, euprotocol, 
jsonin, jsonout, continuecode) { 860 | 861 | jsonin = http2var(url) 862 | if (apierror(jsonin, "json") > 0) 863 | return "" 864 | jsonout = json2var(jsonin) 865 | continuecode = getcontinue(jsonin,"eucontinue") 866 | 867 | while ( continuecode != "-1-1!!-1-1" ) { 868 | url = G["apiURL"] "action=query&list=exturlusage&euprotocol=" urlencodeawk(euprotocol) "&euexpandurl=&euquery=" urlencodeawk(entity) "&euprop=title&eulimit=500&eunamespace=" urlencodeawk(G["namespace"]) "&format=json&formatversion=2&maxlag=" G["maxlag"] "&continue=" urlencodeawk("-||") "&eucontinue=" urlencodeawk(continuecode, "rawphp") 869 | jsonin = http2var(url) 870 | jsonout = jsonout "\n" json2var(jsonin) 871 | continuecode = getcontinue(jsonin,"eucontinue") 872 | 873 | } 874 | return jsonout 875 | } 876 | 877 | # ___ Recent changes (-r) 878 | 879 | # 880 | # MediaWiki API:RecentChanges 881 | # https://www.mediawiki.org/wiki/API:RecentChanges#cite_note-1 882 | # 883 | function rechanges(username, tag, url, results, entity) { 884 | 885 | if (length(username) > 0) 886 | entity = "&rcuser=" urlencodeawk(username) 887 | else if (length(tag) > 0) 888 | entity = "&rctag=" urlencodeawk(tag) 889 | else 890 | return 0 891 | 892 | url = G["apiURL"] "action=query&list=recentchanges&rcprop=" urlencodeawk("title|parsedcomment") entity "&rclimit=500&rcnamespace=" urlencodeawk(G["namespace"]) "&format=json&formatversion=2&maxlag=" G["maxlag"] 893 | 894 | results = uniq( getrechanges(url, entity) ) 895 | 896 | if ( length(results) > 0) 897 | print results 898 | return length(results) 899 | } 900 | function getrechanges(url, entity, jsonin, jsonout, continuecode) { 901 | 902 | jsonin = http2var(url) 903 | if (apierror(jsonin, "json") > 0) 904 | return "" 905 | jsonout = json2varUcontribs(jsonin) 906 | continuecode = getcontinue(jsonin,"rccontinue") 907 | 908 | while ( continuecode != "-1-1!!-1-1" ) { 909 | url = G["apiURL"] "action=query&list=recentchanges&rcprop=" urlencodeawk("title|parsedcomment") entity "&rclimit=500&continue=" urlencodeawk("-||") "&rccontinue=" urlencodeawk(continuecode) "&rcnamespace=" urlencodeawk(G["namespace"]) "&format=json&formatversion=2&maxlag=" G["maxlag"] 910 | jsonin = http2var(url) 911 | jsonout = jsonout "\n" json2varUcontribs(jsonin) 912 | continuecode = getcontinue(jsonin,"rccontinue") 913 | } 914 | 915 | return jsonout 916 | } 917 | 918 | # ___ User Contributions (-u) 919 | 920 | # 921 | # MediaWiki API:Usercontribs 922 | # https://www.mediawiki.org/wiki/API:Usercontribs 923 | # 924 | function ucontribs(entity,sdate,edate, url, results) { 925 | 926 | # API stopped working with User: prefix sometime in April 2018 927 | sub(/^[Uu]ser[:]/, "", entity) 928 | 929 | url = G["apiURL"] "action=query&list=usercontribs&ucuser=" urlencodeawk(entity) "&uclimit=500&ucstart=" urlencodeawk(sdate) "&ucend=" urlencodeawk(edate) "&ucdir=newer&ucnamespace=" urlencodeawk(G["namespace"]) "&ucprop=" urlencodeawk("title|parsedcomment") "&format=json&formatversion=2&maxlag=" G["maxlag"] 930 | 931 | results = uniq( getucontribs(url, entity, sdate, edate) ) 932 | 933 | if ( length(results) > 0) 934 | print results 935 | return length(results) 936 | } 937 | function getucontribs(url, entity, sdate, edate, jsonin, jsonout, continuecode) { 938 | 939 | jsonin = http2var(url) 940 | if (apierror(jsonin, "json") > 0) 941 | return "" 942 | jsonout = json2varUcontribs(jsonin) 943 | continuecode = getcontinue(jsonin,"uccontinue") 944 | 945 | while ( continuecode != "-1-1!!-1-1" ) { 946 | url = G["apiURL"] 
"action=query&list=usercontribs&ucuser=" urlencodeawk(entity) "&uclimit=500&continue=" urlencodeawk("-||") "&uccontinue=" urlencodeawk(continuecode) "&ucstart=" urlencodeawk(sdate) "&ucend=" urlencodeawk(edate) "&ucdir=newer&ucnamespace=" urlencodeawk(G["namespace"]) "&ucprop=" urlencodeawk("title|parsedcomment") "&format=json&formatversion=2&maxlag=" G["maxlag"] 947 | jsonin = http2var(url) 948 | jsonout = jsonout "\n" json2varUcontribs(jsonin) 949 | continuecode = getcontinue(jsonin,"uccontinue") 950 | } 951 | 952 | return jsonout 953 | } 954 | 955 | # ___ Forward links (-F) 956 | 957 | # 958 | # MediaWiki API:Parsing_wikitext 959 | # https://www.mediawiki.org/wiki/API:Parsing_wikitext 960 | # 961 | function forlinks(entity,sdate,edate, url,jsonin,jsonout) { 962 | 963 | url = G["apiURL"] "action=parse&prop=" urlencodeawk("links") "&page=" urlencodeawk(entity) "&format=json&formatversion=2&maxlag=" G["maxlag"] 964 | jsonin = http2var(url) 965 | if (apierror(jsonin, "json") > 0) 966 | return "" 967 | jsonout = json2var(jsonin) 968 | if ( length(jsonout) > 0) 969 | print jsonout 970 | return length(jsonout) 971 | } 972 | 973 | # ___ Redirects (-B) 974 | # Note: Must set namespace - will only return for the given namespace 975 | 976 | # 977 | # MediaWiki API:Redirects 978 | # https://www.mediawiki.org/wiki/API:Redirects 979 | # 980 | function redirects(entity, url, results) { 981 | 982 | url = G["apiURL"] "action=query&prop=redirects&titles=" urlencodeawk(entity) "&rdprop=title&rdnamespace=" urlencodeawk(G["namespace"]) "&format=json&formatversion=2&rdlimit=500&maxlag=" G["maxlag"] 983 | 984 | results = uniq( getrdchanges(url, entity) ) 985 | 986 | if ( length(results) > 0) 987 | print results 988 | return length(results) 989 | } 990 | function getrdchanges(url, entity, jsonin, jsonout, continuecode) { 991 | 992 | jsonin = http2var(url) 993 | if (apierror(jsonin, "json") > 0) 994 | return "" 995 | jsonout = json2varRd(jsonin) 996 | continuecode = getcontinue(jsonin,"rdcontinue") 997 | 998 | while ( continuecode != "-1-1!!-1-1" ) { 999 | url = G["apiURL"] "action=query&prop=redirects&rdprop=title&rdcontinue=" urlencodeawk(continuecode) "&titles=" urlencodeawk(entity) "&rdnamespace=" urlencodeawk(G["namespace"]) "&format=json&formatversion=2&rdlimit=500&maxlag=" G["maxlag"] 1000 | jsonin = http2var(url) 1001 | jsonout = jsonout "\n" json2varRd(jsonin) 1002 | continuecode = getcontinue(jsonin,"rdcontinue") 1003 | } 1004 | return jsonout 1005 | } 1006 | 1007 | 1008 | # ___ Backlinks (-b) 1009 | 1010 | # 1011 | # MediaWiki API:Backlinks 1012 | # https://www.mediawiki.org/wiki/API:Backlinks 1013 | # 1014 | function backlinks(entity, url, blinks) { 1015 | 1016 | if (G["bltypes"] ~ /n/) { 1017 | url = G["apiURL"] "action=query&list=backlinks&bltitle=" urlencodeawk(entity) "&blnamespace=" urlencodeawk(G["namespace"]) "&blredirect&bllimit=250&continue=&blfilterredir=nonredirects&format=json&formatversion=2&maxlag=" G["maxlag"] 1018 | blinks = getbacklinks(url, entity, "blcontinue") # normal backlinks 1019 | } 1020 | 1021 | if ( entity ~ /^[Tt]emplate[:]/ && G["bltypes"] ~ /t/) { # transclusion backlinks 1022 | url = G["apiURL"] "action=query&list=embeddedin&eititle=" urlencodeawk(entity) "&einamespace=" urlencodeawk(G["namespace"]) "&continue=&eilimit=500&format=json&formatversion=2&maxlag=" G["maxlag"] 1023 | if (length(blinks) > 0) 1024 | blinks = blinks "\n" getbacklinks(url, entity, "eicontinue") 1025 | else 1026 | blinks = getbacklinks(url, entity, "eicontinue") 1027 | } 1028 | else if ( entity ~ 
/^[Ff]ile[:]/ && G["bltypes"] ~ /f/) { # file backlinks 1029 | url = G["apiURL"] "action=query&list=imageusage&iutitle=" urlencodeawk(entity) "&iunamespace=" urlencodeawk(G["namespace"]) "&iuredirect&iulimit=250&continue=&iufilterredir=nonredirects&format=json&formatversion=2&maxlag=" G["maxlag"] 1030 | if (length(blinks) > 0) 1031 | blinks = blinks "\n" getbacklinks(url, entity, "iucontinue") 1032 | else 1033 | blinks = getbacklinks(url, entity, "iucontinue") 1034 | } 1035 | 1036 | blinks = uniq(blinks) 1037 | if (length(blinks) > 0) 1038 | print blinks 1039 | 1040 | close(outfile) 1041 | return length(blinks) 1042 | } 1043 | function getbacklinks(url, entity, method, jsonin, jsonout, continuecode) { 1044 | 1045 | jsonin = http2var(url) 1046 | if (apierror(jsonin, "json") > 0) 1047 | return "" 1048 | jsonout = json2var(jsonin) 1049 | continuecode = getcontinue(jsonin, method) 1050 | 1051 | while ( continuecode != "-1-1!!-1-1" ) { 1052 | 1053 | if ( method == "eicontinue" ) 1054 | url = G["apiURL"] "action=query&list=embeddedin&eititle=" urlencodeawk(entity) "&einamespace=" urlencodeawk(G["namespace"]) "&eilimit=500&continue=" urlencodeawk("-||") "&eicontinue=" urlencodeawk(continuecode) "&format=json&formatversion=2&maxlag=" G["maxlag"] 1055 | if ( method == "iucontinue" ) 1056 | url = G["apiURL"] "action=query&list=imageusage&iutitle=" urlencodeawk(entity) "&iunamespace=" urlencodeawk(G["namespace"]) "&iuredirect&iulimit=250&continue=" urlencodeawk("-||") "&iucontinue=" urlencodeawk(continuecode) "&iufilterredir=nonredirects&format=json&formatversion=2&maxlag=" G["maxlag"] 1057 | if ( method == "blcontinue" ) 1058 | url = G["apiURL"] "action=query&list=backlinks&bltitle=" urlencodeawk(entity) "&blnamespace=" urlencodeawk(G["namespace"]) "&blredirect&bllimit=250&continue=" urlencodeawk("-||") "&blcontinue=" urlencodeawk(continuecode) "&blfilterredir=nonredirects&format=json&formatversion=2&maxlag=" G["maxlag"] 1059 | 1060 | jsonin = http2var(url) 1061 | jsonout = jsonout "\n" json2var(jsonin) 1062 | continuecode = getcontinue(jsonin, method) 1063 | } 1064 | 1065 | return jsonout 1066 | } 1067 | 1068 | # ___ Print wiki text (-w) 1069 | 1070 | # 1071 | # Print wiki text (-w) with the plain text option (-p) 1072 | # MediaWiki API Extension:TextExtracts 1073 | # https://www.mediawiki.org/wiki/Extension:TextExtracts 1074 | # 1075 | function wikitextplain(namewiki, command,f,r,redirurl,xmlin,i,c,b,k) { 1076 | 1077 | command = "https://" G["lang"] "." 
G["project"] ".org/w/index.php?title=" urlencodeawk(strip(namewiki)) "&action=raw" 1078 | f = http2var(command) 1079 | if (length(f) < 5) 1080 | return "" 1081 | if (tolower(f) ~ /[#][ ]{0,}redirect[ ]{0,}[[]/ && G["followredirect"] == "true") { 1082 | match(f, /[#][ ]{0,}[Rr][Ee][^]]*[]]/, r) 1083 | gsub(/[#][ ]{0,}[Rr][Ee][Dd][Ii][^[]*[[]/,"",r[0]) 1084 | redirurl = strip(substr(r[0], 2, length(r[0]) - 2)) 1085 | command = G["apiURL"] "format=xml&action=query&prop=extracts&exlimit=1&explaintext&titles=" urlencodeawk(redirurl) 1086 | xmlin = http2var(command) 1087 | } 1088 | else { 1089 | command = G["apiURL"] "format=xml&action=query&prop=extracts&exlimit=1&explaintext&titles=" urlencodeawk(namewiki) 1090 | xmlin = http2var(command) 1091 | } 1092 | 1093 | if (apierror(xmlin, "xml") > 0) { 1094 | return "" 1095 | } 1096 | else { 1097 | c = split(convertxml(xmlin), b, "<extract[^>]*>") 1098 | i = 1 1099 | while (i++ < c) { 1100 | k = substr(b[i], 1, match(b[i], "</extract>") - 1) 1101 | return strip(k) 1102 | } 1103 | } 1104 | } 1105 | 1106 | function wikitext(namewiki, command,f,r,redirurl) { 1107 | 1108 | command = "https://" G["lang"] "." G["project"] ".org/w/index.php?title=" urlencodeawk(strip(namewiki)) "&action=raw" 1109 | f = http2var(command) 1110 | if (length(f) < 5) 1111 | return "" 1112 | 1113 | if (tolower(f) ~ /[#][ ]{0,}redirect[ ]{0,}[[]/ && G["followredirect"] == "true") { 1114 | match(f, /[#][ ]{0,}[Rr][Ee][^]]*[]]/, r) 1115 | gsub(/[#][ ]{0,}[Rr][Ee][Dd][Ii][^[]*[[]/,"",r[0]) 1116 | redirurl = strip(substr(r[0], 2, length(r[0]) - 2)) 1117 | command = "https://" G["lang"] "." G["project"] ".org/w/index.php?title=" urlencodeawk(redirurl) "&action=raw" 1118 | f = http2var(command) 1119 | } 1120 | if (length(f) < 5) 1121 | return "" 1122 | else 1123 | return f 1124 | } 1125 | 1126 | # ___ All pages (-A) 1127 | 1128 | # 1129 | # MediaWiki API: Allpages 1130 | # https://www.mediawiki.org/wiki/API:Allpages 1131 | # 1132 | function allPages(redirtype, url,results,apfilterredir,aplimit) { 1133 | 1134 | if (redirtype == "1") 1135 | apfilterredir = "all" 1136 | else if (redirtype == "2") 1137 | apfilterredir = "nonredirects" 1138 | else if (redirtype == "3") 1139 | apfilterredir = "redirects" 1140 | else 1141 | apfilterredir = "nonredirects" 1142 | 1143 | if (G["maxpages"] < 500 && G["maxpages"] > 0) 1144 | aplimit = G["maxpages"] + G["maxpages"] # get extra in case redirs are filtered 1145 | else 1146 | aplimit = 500 1147 | 1148 | url = G["apiURL"] "action=query&list=allpages&aplimit=" aplimit "&apfilterredir=" apfilterredir "&apnamespace=" urlencodeawk(G["namespace"], "rawphp") "&format=json&formatversion=2&maxlag=" G["maxlag"] 1149 | 1150 | getallpages(url, apfilterredir, aplimit) 1151 | } 1152 | function getallpages(url,apfilterredir,aplimit, jsonin, jsonout, continuecode, count, pages_printed, pages_in_buffer, a, i) { 1153 | 1154 | pages_printed = 0 1155 | 1156 | jsonin = http2var(url) 1157 | if ( apierror(jsonin, "json") > 0) 1158 | return 1159 | continuecode = getcontinue(jsonin,"apcontinue") 1160 | 1161 | jsonout = json2var(jsonin) 1162 | if ( ! empty(jsonout)) { 1163 | pages_in_buffer = split(jsonout, a, "\n") 1164 | if (G["maxpages"] > 0) { 1165 | if (pages_printed + pages_in_buffer >= G["maxpages"]) { 1166 | for(i = 1; i <= G["maxpages"] - pages_printed; i++) { 1167 | if ( ! 
empty(a[i])) print a[i] 1168 | } 1169 | return 1170 | } 1171 | } 1172 | print jsonout 1173 | pages_printed += pages_in_buffer 1174 | } 1175 | 1176 | while ( continuecode != "-1-1!!-1-1" ) { 1177 | if (G["maxpages"] > 0 && pages_printed >= G["maxpages"]) { 1178 | return 1179 | } 1180 | 1181 | url = G["apiURL"] "action=query&list=allpages&aplimit=" aplimit "&apfilterredir=" apfilterredir "&apnamespace=" urlencodeawk(G["namespace"], "rawphp") "&apcontinue=" urlencodeawk(continuecode, "rawphp") "&continue=" urlencodeawk("-||") "&format=json&formatversion=2&maxlag=" G["maxlag"] 1182 | jsonin = http2var(url) 1183 | continuecode = getcontinue(jsonin,"apcontinue") 1184 | jsonout = json2var(jsonin) 1185 | 1186 | if ( ! empty(jsonout)) { 1187 | pages_in_buffer = split(jsonout, a, "\n") 1188 | if (G["maxpages"] > 0) { 1189 | if (pages_printed + pages_in_buffer >= G["maxpages"]) { 1190 | for(i = 1; i <= G["maxpages"] - pages_printed; i++) { 1191 | if ( ! empty(a[i])) print a[i] 1192 | } 1193 | return 1194 | } 1195 | } 1196 | print jsonout 1197 | pages_printed += pages_in_buffer 1198 | } 1199 | jsonin = "" # free memory 1200 | } 1201 | } 1202 | 1203 | 1204 | # ___ Search list (-a) 1205 | 1206 | # 1207 | # MediaWiki API:Search 1208 | # https://www.mediawiki.org/wiki/API:Search 1209 | # 1210 | function search(srchstr, url, results, a) { 1211 | 1212 | if (G["snippet"] == "false") 1213 | G["srprop"] = "timestamp" 1214 | else 1215 | G["srprop"] = "timestamp|snippet" 1216 | 1217 | if (G["searchtarget"] ~ /title/) # Use this instead of &srwhat 1218 | srchstr = "intitle:" srchstr # See https://www.mediawiki.org/wiki/API_talk:Search#title_search_is_disabled 1219 | 1220 | url = G["apiURL"] "action=query&list=search&srsearch=" urlencodeawk(srchstr) "&srprop=" urlencodeawk(G["srprop"]) "&srnamespace=" urlencodeawk(G["namespace"]) "&srlimit=50&continue=" urlencodeawk("-||") "&format=xml&maxlag=" G["maxlag"] 1221 | 1222 | results = strip(getsearch(url, srchstr)) # Don't uniq, confuses ordering and not needed for search results 1223 | 1224 | l = length(results) 1225 | 1226 | if (length(results) > 0) 1227 | print results 1228 | if (split(results,a,"\n") > 9999) 1229 | print "Warning (wikiget): Search results max out at 10000. 
See https://www.mediawiki.org/wiki/API:Search" > "/dev/stderr" 1230 | return length(results) 1231 | } 1232 | function getsearch(url, srchstr, xmlin,xmlout,offset,retrieved) { 1233 | 1234 | xmlin = http2var(url) 1235 | if (apierror(xmlin, "xml") > 0) 1236 | return "" 1237 | xmlout = parsexmlsearch(xmlin) 1238 | offset = getoffsetxml(xmlin) 1239 | 1240 | if (G["numsearch"] == "true") 1241 | return totalhits(xmlin) 1242 | 1243 | retrieved = 50 1244 | if (retrieved > G["maxsearch"] && G["maxsearch"] != 0) 1245 | return trimxmlout(xmlout, G["maxsearch"]) 1246 | 1247 | while ( offset) { 1248 | url = G["apiURL"] "action=query&list=search&srsearch=" urlencodeawk(srchstr) "&srprop=" urlencodeawk(G["srprop"]) "&srnamespace=" urlencodeawk(G["namespace"]) "&srlimit=50&continue=" urlencodeawk("-||") "&format=xml&maxlag=" G["maxlag"] "&sroffset=" offset 1249 | xmlin = http2var(url) 1250 | xmlout = xmlout "\n" parsexmlsearch(xmlin) 1251 | offset = getoffsetxml(xmlin) 1252 | retrieved = retrieved + 50 1253 | if (retrieved > G["maxsearch"] && G["maxsearch"] != 0) 1254 | return trimxmlout(xmlout, G["maxsearch"]) 1255 | } 1256 | 1257 | return xmlout 1258 | } 1259 | function parsexmlsearch(xmlin, f,g,e,c,a,i,out,snippet,title) { 1260 | 1261 | if (xmlin ~ /error code="maxlag"/) { 1262 | stdErr("Max lag (" G["maxlag"] ") exceeded - aborting. Try again when API servers are less busy, or increase Maxlag (-m)") 1263 | exit 1264 | } 1265 | 1266 | f = split(xmlin,e,/<search>|<\/search>/) 1267 | c = split(e[2],a,"/>") 1268 | 1269 | while (++i < c) { 1270 | if (a[i] ~ /title[=]/) { 1271 | match(a[i], /title="[^"]*"/,k) 1272 | split(gensub("title=","","g",k[0]), g, "\"") 1273 | title = convertxml(g[2]) 1274 | match(a[i], /snippet="[^"]*"/,k) 1275 | snippet = gensub("snippet=","","g",k[0]) 1276 | snippet = convertxml(snippet) 1277 | gsub(/<span class[=]"searchmatch">|<\/span>/,"",snippet) 1278 | snippet = convertxml(snippet) 1279 | gsub(/^"|"$/,"",snippet) 1280 | if (G["snippet"] != "false") 1281 | out = out title " <snippet>" snippet "</snippet>\n" 1282 | else 1283 | out = out title "\n" 1284 | } 1285 | } 1286 | return strip(out) 1287 | } 1288 | function getoffsetxml(xmlin, a) { 1289 | 1290 | if ( match(xmlin, /<continue sroffset[=]"[0-9]{1,}"/, offset) > 0) { 1291 | split(offset[0],a,/"/) 1292 | return a[2] 1293 | } 1294 | else 1295 | return "" 1296 | } 1297 | function trimxmlout(xmlout, max, c,a,i) { 1298 | 1299 | if ( split(xmlout, a, "\n") > 0) { 1300 | while (i++ < max) 1301 | out = out a[i] "\n" 1302 | return out 1303 | } 1304 | } 1305 | function totalhits(xmlin) { 1306 | 1307 | # <searchinfo totalhits="40"/> 1308 | if (match(xmlin, /<searchinfo totalhits[=]"[0-9]{1,}"/, a) > 0) { 1309 | if (split(a[0],b,"\"") > 0) 1310 | return b[2] 1311 | else 1312 | return "error" 1313 | } 1314 | else 1315 | return "error" 1316 | } 1317 | 1318 | # [[ ________ Utilities ______________________________________________________ ]] 1319 | 1320 | # 1321 | # readfile() - same as @include "readfile" 1322 | # 1323 | # . 
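#
# Example (illustrative sketch; "titles.txt" is a placeholder file name):
#   raw = readfile("titles.txt")
#   n = split(raw, lines, "\n")   # last element is empty because of the trailing \n
#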
leaves an extra trailing \n just like with the @include readfile 1324 | # 1325 | # Credit: https://www.gnu.org/software/gawk/manual/html_node/Readfile-Function.html by Denis Shirokov 1326 | # 1327 | function readfile(file, tmp, save_rs) { 1328 | save_rs = RS 1329 | RS = "^$" 1330 | getline tmp < file 1331 | close(file) 1332 | RS = save_rs 1333 | return tmp 1334 | } 1335 | 1336 | # 1337 | # json2var - given raw json extract field "title" and convert to \n seperated string 1338 | # 1339 | function json2var(json, jsona,arr) { 1340 | if (query_json(json, jsona) >= 0) { 1341 | splitja(jsona, arr, 3, "title") 1342 | return join(arr, 1, length(arr), "\n") 1343 | } 1344 | } 1345 | 1346 | # 1347 | # Uncontribs version 1348 | # 1349 | function json2varUcontribs(json, jsona, arrTitle, arrComment, arr, i, j) { 1350 | 1351 | delete arr 1352 | if (query_json(json, jsona) >= 0) { 1353 | splitja(jsona, arrTitle, 3, "title") 1354 | splitja(jsona, arrComment, 3, "parsedcomment") 1355 | # awkenough_dump(jsona, "jsona") 1356 | for(i = 1; i <= length(arrComment); i++) { 1357 | 1358 | if(! empty(G["inccomments"]) && ! empty(G["exccomments"]) ) { 1359 | if(arrComment[i] ~ G["inccomments"] && arrComment[i] !~ G["exccomments"]) { 1360 | j++ 1361 | arr[j] = arrTitle[i] 1362 | } 1363 | } 1364 | else if(! empty(G["inccomments"])) { 1365 | if(arrComment[i] ~ G["inccomments"] ) { 1366 | j++ 1367 | arr[j] = arrTitle[i] 1368 | } 1369 | } 1370 | else if(! empty(G["exccomments"])) { 1371 | if(arrComment[i] !~ G["exccomments"] ) { 1372 | j++ 1373 | arr[j] = arrTitle[i] 1374 | } 1375 | } 1376 | else if( empty(G["exccomments"]) && empty(G["inccomments"])) { 1377 | j++ 1378 | arr[j] = arrTitle[i] 1379 | } 1380 | } 1381 | return join(arr, 1, length(arr), "\n") 1382 | } 1383 | 1384 | } 1385 | 1386 | # 1387 | # json2varRd - given raw json extract field "title" and convert to \n seperated string - for API:Redirects 1388 | # 1389 | function json2varRd(json, jsona,arr) { 1390 | if (query_json(json, jsona) >= 0) { 1391 | # jsona["query","pages","1","redirects","4","title"]=Template:Cite-web 1392 | splitja(jsona, arr, 5, "title") 1393 | return join(arr, 1, length(arr), "\n") 1394 | } 1395 | } 1396 | 1397 | # 1398 | # Parse continue code from JSON input 1399 | # 1400 | function getcontinue(jsonin, method, jsona,id) { 1401 | 1402 | if( query_json(jsonin, jsona) >= 0) { 1403 | id = jsona["continue", method] 1404 | if(!empty(id)) 1405 | return id 1406 | } 1407 | return "-1-1!!-1-1" # Random string unlikely to be the name of a Wiki article 1408 | } 1409 | 1410 | # 1411 | # entity_exists - see if a page on Wikipedia exists 1412 | # eg. if ( ! 
entity_exists("Gutenberg author") ) print "Unknown page" 1413 | # 1414 | function entity_exists(entity ,url,jsonin) { 1415 | 1416 | url = G["apiURL"] "action=query&titles=" urlencodeawk(entity) "&format=json" 1417 | jsonin = http2var(url) 1418 | if (jsonin ~ "\"missing\"") 1419 | return 0 1420 | return 1 1421 | } 1422 | 1423 | # 1424 | # Basic check of API results for error 1425 | # 1426 | function apierror(input, type, pre, code) { 1427 | 1428 | pre = "API error: " 1429 | 1430 | if (length(input) < 5) { 1431 | stdErr(pre "Received no response.") 1432 | return 1 1433 | } 1434 | 1435 | if (type == "json") { 1436 | if (match(input, /"error"[:]{"code"[:]"[^"]*","info"[:]"[^"]*"/, code) > 0) { 1437 | stdErr(pre code[0]) 1438 | return 1 1439 | } 1440 | } 1441 | else if (type == "xml") { 1442 | if (match(input, /error code[=]"[^"]*" info[=]"[^"]*"/, code) > 0) { 1443 | stdErr(re code[0]) 1444 | return 1 1445 | } 1446 | } 1447 | else 1448 | return 1449 | } 1450 | 1451 | # 1452 | # Uniq a list of \n separated names 1453 | # 1454 | function uniq(names, b,c,i,x) { 1455 | 1456 | c = split(names, b, "\n") 1457 | names = "" # free memory 1458 | while (i++ < c) { 1459 | gsub(/\\"/,"\"",b[i]) 1460 | if (b[i] ~ "for API usage") { # Max lag exceeded. 1461 | stdErr("\nMax lag (" G["maxlag"] ") exceeded - aborting. Try again when API servers are less busy, or increase Maxlag (-m)") 1462 | exit 1463 | } 1464 | if (b[i] == "") 1465 | continue 1466 | if (x[b[i]] == "") 1467 | x[b[i]] = b[i] 1468 | } 1469 | delete b # free memory 1470 | return join2(x,"\n") 1471 | } 1472 | 1473 | # 1474 | # Webpage to variable. url is assumed to be percent encoded. 1475 | # 1476 | function http2var(url, tries,i,op) { 1477 | 1478 | if (G["debug"]) 1479 | print url > "/dev/stderr" 1480 | 1481 | tries = 3 1482 | if(url ~ "(wikipedia|wikimedia)") 1483 | tries = 20 1484 | 1485 | for(i = 1; i <= tries; i++) { 1486 | if (G["wta"] == "wget") 1487 | op = sys2var("wget --no-check-certificate --user-agent=\"" G["agent"] "\" -q -O- -- " shquote(url) ) 1488 | else if (G["wta"] == "curl") 1489 | op = sys2var("curl -L -s -k --user-agent \"" G["agent"] "\" -- " shquote(url) ) 1490 | else if (G["wta"] == "lynx") 1491 | op = sys2var("lynx -source -- " shquote(url) ) 1492 | if(!empty(op)) return op 1493 | } 1494 | } 1495 | 1496 | 1497 | # [[ ________ Library ________________________________________________________ ]] 1498 | 1499 | # 1500 | # sys2var() - run a system command and store result in a variable 1501 | # 1502 | # . supports pipes inside command string 1503 | # . stderr is sent to null 1504 | # . if command fails (errno) return null 1505 | # 1506 | # Example: 1507 | # googlepage = sys2var("wget -q -O- http://google.com") 1508 | # 1509 | function sys2var(command ,fish, scale, ship) { 1510 | 1511 | # command = command " 2>/dev/null" 1512 | while ( (command | getline fish) > 0 ) { 1513 | if ( ++scale == 1 ) 1514 | ship = fish 1515 | else 1516 | ship = ship "\n" fish 1517 | } 1518 | close(command) 1519 | system("") 1520 | return ship 1521 | } 1522 | 1523 | # 1524 | # sys2varPipe() - supports piping string data into a program eg. echo <data> | <command> 1525 | # 1526 | # . 
<data> is a string not a command 1527 | # 1528 | # Example: 1529 | # replicate 'cat /etc/passwd | wc' 1530 | # print sys2varPipe(readfile("/etc/passwd"), Exe["wc"]) 1531 | # send output of one command to another 1532 | # print sys2varPipe(sys2var("date +\"%s\""), Exe["wc"]) 1533 | # 1534 | function sys2varPipe(data, command, fish, scale, ship) { 1535 | 1536 | printf("%s",data) |& command 1537 | close(command, "to") 1538 | 1539 | while ( (command |& getline fish) > 0 ) { 1540 | if ( ++scale == 1 ) 1541 | ship = fish 1542 | else 1543 | ship = ship "\n" fish 1544 | } 1545 | close(command) 1546 | return ship 1547 | } 1548 | 1549 | 1550 | # 1551 | # urlElement - given a URL, return a sub-portion (scheme, netloc, path, query, fragment) 1552 | # 1553 | # In the URL "https://www.cwi.nl:80/nl?dooda/guido&path.htm#section" 1554 | # scheme = https 1555 | # netloc = www.cwi.nl:80 1556 | # path = /nl 1557 | # query = dooda/guido&path.htm 1558 | # fragment = section 1559 | # 1560 | # Example: 1561 | # uriElement("https://www.cwi.nl:80/nl?", "path") returns "/nl" 1562 | # 1563 | # . URLs have many edge cases. This function works for well-formed URLs. 1564 | # . If a robust solution is needed: 1565 | # "python3 -c \"from urllib.parse import urlsplit; import sys; o = urlsplit(sys.argv[1]); print(o." element ")\" " shquote(url) 1566 | # . returns full url on error 1567 | # 1568 | function urlElement(url,element, a,scheme,netloc,tail,b,fragment,query,path) { 1569 | 1570 | if(url ~ /^\/\//) # Protocol-relative - assume http 1571 | url = "http:" url 1572 | 1573 | split(url, a, /\//) 1574 | 1575 | scheme = substr(a[1], 0, index(a[1], ":") -1) 1576 | netloc = a[3] 1577 | 1578 | tail = subs(scheme "://" netloc, "", url) 1579 | 1580 | splits(tail, b, "#") 1581 | if(!empty(b[2])) 1582 | fragment = b[2] 1583 | 1584 | splits(tail, b, "?") 1585 | if(!empty(b[2])) { 1586 | query = b[2] 1587 | if(!empty(fragment)) 1588 | query = subs("#" fragment, "", query) 1589 | } 1590 | 1591 | path = tail 1592 | if(!empty(fragment)) 1593 | path = subs("#" fragment, "", path) 1594 | if(!empty(query)) 1595 | path = subs("?" query, "", path) 1596 | 1597 | if(element == "scheme") 1598 | return scheme 1599 | else if(element == "netloc") 1600 | return netloc 1601 | else if(element == "path") 1602 | return path 1603 | else if(element == "query") 1604 | return query 1605 | else if(element == "fragment") 1606 | return fragment 1607 | 1608 | } 1609 | 1610 | # 1611 | # urlencodeawk - urlencode a string 1612 | # 1613 | # . if optional 'class' is "url" treat 'str' with best-practice URL encoding 1614 | # see https://perishablepress.com/stop-using-unsafe-characters-in-urls/ 1615 | # . if 'class' is "rawphp" attempt to match behavior of PhP rawurlencode() 1616 | # . 
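#
# Examples (illustrative; outputs follow from the character classes defined below):
#   urlencodeawk("A b/c")           => "A%20b%2Fc"
#   urlencodeawk("A b/c", "url")    => "A%20b/c"
#   urlencodeawk("A b~c", "rawphp") => "A%20b~c"
#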
otherwise encode everything except 0-9A-Za-z
1617 | 
1618 | # Requirement: gawk -b
1619 | # Credit: Rosetta Code May 2015
1620 | # GreenC
1621 | #
1622 | function urlencodeawk(str,class, c, len, res, i, ord, re) {
1623 | 
1624 | if (class == "url")
1625 | re = "[$\\-_.+!*'(),,;/?:@=&0-9A-Za-z]"
1626 | else if (class == "rawphp")
1627 | re = "[\\-_.~0-9A-Za-z]"
1628 | else
1629 | re = "[0-9A-Za-z]"
1630 | 
1631 | for (i = 0; i <= 255; i++)
1632 | ord[sprintf("%c", i)] = i
1633 | len = length(str)
1634 | res = ""
1635 | for (i = 1; i <= len; i++) {
1636 | c = substr(str, i, 1)
1637 | if (c ~ re) # don't encode
1638 | res = res c
1639 | else
1640 | res = res "%" sprintf("%02X", ord[c])
1641 | }
1642 | return res
1643 | }
1644 | 
1645 | #
1646 | # concatarray() - merge array a & b into c
1647 | #
1648 | # . if array a & b have the same key eg. a["1"] = 2 and b["1"] = 3
1649 | # then b takes precedence eg. c["1"] = 3
1650 | #
1651 | function concatarray(a,b,c) {
1652 | 
1653 | delete c
1654 | for (i in a)
1655 | c[i]=a[i]
1656 | for (i in b)
1657 | c[i]=b[i]
1658 | }
1659 | 
1660 | #
1661 | # splitx() - split str along re and return num'th element
1662 | #
1663 | # Example:
1664 | # print splitx("a:b:c:d", "[:]", 3) ==> "c"
1665 | #
1666 | function splitx(str, re, num, a){
1667 | if(split(str, a, re))
1668 | return a[num]
1669 | else
1670 | return ""
1671 | }
1672 | 
1673 | #
1674 | # removefile2() - delete a file/directory
1675 | #
1676 | # . no wildcards
1677 | # . return 1 success
1678 | #
1679 | # Requirement: rm
1680 | #
1681 | function removefile2(str) {
1682 | 
1683 | if (str ~ /[*|?]/ || empty(str))
1684 | return 0
1685 | system("") # Flush buffer
1686 | if (exists2(str)) {
1687 | sys2var("rm -r -- " shquote(str) )
1688 | system("")
1689 | if (! exists2(str))
1690 | return 1
1691 | }
1692 | return 0
1693 | }
1694 | 
1695 | #
1696 | # exists2() - check for file existence
1697 | #
1698 | # . return 1 if exists, 0 otherwise.
1699 | # . no dependencies version
1700 | #
1701 | function exists2(file ,line, msg) {
1702 | 
1703 | if ((getline line < file) == -1 ) {
1704 | msg = (ERRNO ~ /Permission denied/ || ERRNO ~ /a directory/) ? 1 : 0
1705 | close(file)
1706 | return msg
1707 | }
1708 | else {
1709 | close(file)
1710 | return 1
1711 | }
1712 | }
1713 | 
1714 | #
1715 | # empty() - return 1 if string is 0-length, otherwise 0
1716 | #
1717 | function empty(s) {
1718 | if (length(s) == 0)
1719 | return 1
1720 | return 0
1721 | }
1722 | 
1723 | #
1724 | # shquote() - make string safe for shell
1725 | #
1726 | # . an alternate is shell_quote.awk in /usr/local/share/awk which uses '"' instead of \'
1727 | #
1728 | # Example:
1729 | # print shquote("Hello' There") produces 'Hello'\'' There'
1730 | # echo 'Hello'\'' There' produces Hello' There
1731 | #
1732 | function shquote(str, safe) {
1733 | safe = str
1734 | gsub(/'/, "'\\''", safe)
1735 | gsub(/’/, "'\\’'", safe)
1736 | return "'" safe "'"
1737 | }
1738 | 
1739 | #
1740 | # convertxml() - convert XML entities to plain text
1741 | #
1742 | function convertxml(str, safe) {
1743 | safe = str
1744 | gsub(/&lt;/,"<",safe)
1745 | gsub(/&gt;/,">",safe)
1746 | gsub(/&quot;/,"\"",safe)
1747 | gsub(/&amp;/,"\\&",safe)
1748 | gsub(/&#039;/,"'",safe)
1749 | gsub(/&nbsp;/," ",safe)
1750 | return safe
1751 | }
1752 | 
1753 | #
1754 | # strip() - strip leading/trailing whitespace
1755 | #
1756 | # . faster than the gsub() or gensub() methods eg. 
1757 | # gsub(/^[[:space:]]+|[[:space:]]+$/,"",s) 1758 | # gensub(/^[[:space:]]+|[[:space:]]+$/,"","g",s) 1759 | # 1760 | # Credit: https://github.com/dubiousjim/awkenough by Jim Pryor 2012 1761 | # 1762 | function strip(str) { 1763 | if (match(str, /[^ \t\n].*[^ \t\n]/)) 1764 | return substr(str, RSTART, RLENGTH) 1765 | else if (match(str, /[^ \t\n]/)) 1766 | return substr(str, RSTART, 1) 1767 | else 1768 | return "" 1769 | } 1770 | 1771 | # 1772 | # join() - merge an array of strings into a single string. Array indice are numbers. 1773 | # 1774 | # Credit: /usr/local/share/awk/join.awk by Arnold Robbins 1999 1775 | # 1776 | function join(arr, start, end, sep, result, i) { 1777 | if (length(arr) == 0) 1778 | return "" 1779 | 1780 | result = arr[start] 1781 | 1782 | for (i = start + 1; i <= end; i++) 1783 | result = result sep arr[i] 1784 | 1785 | return result 1786 | } 1787 | 1788 | # 1789 | # join2() - merge an array of strings into a single string. Array indice are strings. 1790 | # 1791 | # . optional third argument 'sortkey' informs how to sort: 1792 | # https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html 1793 | # . spliti() does reverse 1794 | # 1795 | function join2(arr, sep, sortkey, i,lobster,result) { 1796 | 1797 | if (!empty(sortkey)) { 1798 | if ("sorted_in" in PROCINFO) 1799 | save_sorted = PROCINFO["sorted_in"] 1800 | PROCINFO["sorted_in"] = sortkey 1801 | } 1802 | 1803 | for ( lobster in arr ) { 1804 | if (++i == 1) { 1805 | result = lobster 1806 | continue 1807 | } 1808 | result = result sep lobster 1809 | } 1810 | 1811 | if (save_sorted) 1812 | PROCINFO["sorted_in"] = save_sorted 1813 | else 1814 | PROCINFO["sorted_in"] = "" 1815 | 1816 | return result 1817 | } 1818 | 1819 | # 1820 | # subs() - like sub() but literal non-regex 1821 | # 1822 | # Example: 1823 | # s = "*field" 1824 | # print subs("*", "-", s) #=> -field 1825 | # 1826 | # Credit: adapted from lsub() by Daniel Mills https://github.com/e36freak/awk-libs 1827 | # 1828 | function subs(pat, rep, str, len, i) { 1829 | 1830 | if (!length(str)) 1831 | return 1832 | 1833 | # get the length of pat, in order to know how much of the string to remove 1834 | if (!(len = length(pat))) 1835 | return str 1836 | 1837 | # substitute str for rep 1838 | if (i = index(str, pat)) 1839 | str = substr(str, 1, i - 1) rep substr(str, i + len) 1840 | 1841 | return str 1842 | } 1843 | 1844 | # 1845 | # splits() - literal version of split() 1846 | # 1847 | # . the "sep" is a literal string not re 1848 | # . see also subs() and gsubs() 1849 | # 1850 | # Credit: https://github.com/e36freak/awk-libs (Daniel Mills) 1851 | # 1852 | function splits(str, arr, sep, len, slen, i) { 1853 | 1854 | delete arr 1855 | 1856 | # if "sep" is empty, just do a normal split 1857 | if (!(slen = length(sep))) { 1858 | return split(str, arr, "") 1859 | } 1860 | 1861 | # loop while "sep" is matched 1862 | while (i = index(str, sep)) { 1863 | # append field to array 1864 | arr[++len] = substr(str, 1, i - 1) 1865 | # remove that portion (with the sep) from the string 1866 | str = substr(str, i + slen) 1867 | } 1868 | arr[++len] = str 1869 | return len 1870 | } 1871 | 1872 | 1873 | # 1874 | # asplit() - given a string of "key=value SEP key=value" pairs, break it into array[key]=value 1875 | # 1876 | # . 
can optionally supply "re" for equals, space; if they're the same or equals is "", array will be setlike 1877 | # 1878 | # Example 1879 | # asplit(arr, "action=query&format=json&meta=tokens", "=", "&") 1880 | # arr["action"] = "query" 1881 | # arr["format"] = "json" 1882 | # arr["meta"] = "tokens" 1883 | # 1884 | # . join() does the inverse eg. join(arr, 0, length(arr) - 1, "&") == "action=query&format=json&meta=tokens" 1885 | # 1886 | # Credit: https://github.com/dubiousjim/awkenough 1887 | # 1888 | function asplit(array, str, equals, space, aux, i, n) { 1889 | 1890 | n = split(str, aux, (space == "") ? "[ \n]+" : space) 1891 | if (space && equals == space) 1892 | equals = "" 1893 | else if (!length(equals)) 1894 | equals = "=" 1895 | delete array 1896 | for (i = 1; i <= n; i++) { 1897 | if (equals && match(aux[i], equals)) 1898 | array[substr(aux[i], 1, RSTART-1)] = substr(aux[i], RSTART+RLENGTH) 1899 | else 1900 | array[aux[i]] 1901 | } 1902 | delete aux 1903 | return n 1904 | } 1905 | 1906 | # 1907 | # readfile2() - similar to readfile but no trailing \n 1908 | # 1909 | # Credit: https://github.com/dubiousjim/awkenough getfile() 1910 | # 1911 | function readfile2(path, v, p, res) { 1912 | res = p = "" 1913 | while (0 < (getline v < path)) { 1914 | res = res p v 1915 | p = "\n" 1916 | } 1917 | close(path) 1918 | return res 1919 | } 1920 | 1921 | # 1922 | # mktemp() - make a temporary unique file or directory and/or returns its name 1923 | # 1924 | # . the last six characters of 'template' must be "XXXXXX" which will be replaced by a uniq string 1925 | # . if template is not a pathname, the file will be created in ENVIRON["TMPDIR"] if set otherwise /tmp 1926 | # . if template not provided defaults to "tmp.XXXXXX" 1927 | # . recommend don't use spaces or " or ' in pathname 1928 | # . if type == f create a file 1929 | # . if type == d create a directory 1930 | # . if type == u return the name but create nothing 1931 | # 1932 | # Example: 1933 | # outfile = mktemp(meta "index.XXXXXX", "u") 1934 | # 1935 | # Credit: https://github.com/e36freak/awk-libs 1936 | # 1937 | function mktemp(template, type, 1938 | c, chars, len, dir, dir_esc, rstring, i, out, out_esc, umask, 1939 | cmd) { 1940 | 1941 | # portable filename characters 1942 | c = "012345689ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 1943 | len = split(c, chars, "") 1944 | 1945 | # make sure template is valid 1946 | if (length(template)) { 1947 | if (template !~ /XXXXXX$/) { 1948 | return -1 1949 | } 1950 | 1951 | # template was not supplied, use the default 1952 | } else { 1953 | template = "tmp.XXXXXX" 1954 | } 1955 | # make sure type is valid 1956 | if (length(type)) { 1957 | if (type !~ /^[fdu]$/) { 1958 | return -1 1959 | } 1960 | # type was not supplied, use the default 1961 | } else { 1962 | type = "f" 1963 | } 1964 | # if template is a path... 1965 | if (template ~ /\//) { 1966 | dir = template 1967 | sub(/\/[^/]*$/, "", dir) 1968 | sub(/.*\//, "", template) 1969 | # template is not a path, determine base dir 1970 | } else { 1971 | if (length(ENVIRON["TMPDIR"])) { 1972 | dir = ENVIRON["TMPDIR"] 1973 | } else { 1974 | dir = "/tmp" 1975 | } 1976 | } 1977 | 1978 | # if this is not a dry run, make sure the dir exists 1979 | if (type != "u" && ! 
exists2(dir)) { 1980 | return -1 1981 | } 1982 | 1983 | # get the base of the template, sans Xs 1984 | template = substr(template, 0, length(template) - 6) 1985 | 1986 | # generate the filename 1987 | do { 1988 | rstring = "" 1989 | for (i=0; i<6; i++) { 1990 | c = chars[int(rand() * len) + 1] 1991 | rstring = rstring c 1992 | } 1993 | out = dir "/" template rstring 1994 | } while( exists2(out) ) 1995 | 1996 | if (type == "f") { 1997 | printf "" > out 1998 | close(out) 1999 | } 2000 | 2001 | # removed for wikiget 2002 | #else if (type == "d") { 2003 | # mkdir(out) 2004 | #} 2005 | 2006 | return out 2007 | } 2008 | 2009 | # 2010 | # isanumber() - return 1 if str is a positive whole number or 0 2011 | # 2012 | # Example: 2013 | # "1234" == 1 / "0fr123" == 0 / 1.1 == 0 / -1 == 0 / 0 == 1 2014 | # 2015 | function isanumber(str, safe,i) { 2016 | 2017 | if (length(str) == 0) return 0 2018 | safe = str 2019 | while ( i++ < length(safe) ) { 2020 | if ( substr(safe,i,1) !~ /[0-9]/ ) 2021 | return 0 2022 | } 2023 | return 1 2024 | } 2025 | 2026 | # 2027 | # randomnumber() - return a random number between 1 to max 2028 | # 2029 | # . robust awk random number generator works at nano-second speed and parallel simultaneous invocation 2030 | # . requires global variable _cliff_seed ie: 2031 | # _cliff_seed = "0.00" splitx(sprintf("%f", systime() * 0.000001), ".", 2) 2032 | # should be defined one-time only eg. in the BEGIN{} section 2033 | # 2034 | function randomnumber(max, i,randomArr) { 2035 | 2036 | # if missing _cliff_seed fallback to less-robust rand() method 2037 | if (empty(_cliff_seed)) 2038 | return randomnumber1(max) 2039 | 2040 | # create array of 1000 random numbers made by cliff_rand() method seeded by systime() 2041 | for (i = 0; i <= 1002; i++) 2042 | randomArr[i] = randomnumber2(max) 2043 | 2044 | # choose one at random using rand() method seeded by PROCINFO["pid"] 2045 | return randomArr[randomnumber1(1000)] 2046 | 2047 | } 2048 | function randomnumber1(max) { 2049 | srand(PROCINFO["pid"]) 2050 | return int( rand() * max) 2051 | } 2052 | function randomnumber2(max) { 2053 | int( cliff_rand() * max) # bypass first call 2054 | return int( cliff_rand() * max) 2055 | } 2056 | # 2057 | # cliff_rand() 2058 | # 2059 | # Credit: https://www.gnu.org/software/gawk/manual/html_node/Cliff-Random-Function.html 2060 | # 2061 | function cliff_rand() { 2062 | _cliff_seed = (100 * log(_cliff_seed)) % 1 2063 | if (_cliff_seed < 0) 2064 | _cliff_seed = - _cliff_seed 2065 | return _cliff_seed 2066 | } 2067 | 2068 | # 2069 | # stdErr() - print s to /dev/stderr 2070 | # 2071 | # . if flag = "n" no newline 2072 | # 2073 | function stdErr(s, flag) { 2074 | if (flag == "n") 2075 | printf("%s",s) > "/dev/stderr" 2076 | else 2077 | printf("%s\n",s) > "/dev/stderr" 2078 | close("/dev/stderr") 2079 | } 2080 | 2081 | # [[ ________ JSON ___________________________________________________________ ]] 2082 | 2083 | # 2084 | # query_json() and associate routines 2085 | # 2086 | # From 'awkenough' 2087 | # 2088 | # https://github.com/dubiousjim/awkenough 2089 | # 2090 | # Copyright MIT license 2091 | # Copyright (c) 2007-2011 Aleksey Cheusov <vle@gmx.net> 2092 | # Copyright (c) 2012 Jim Pryor <dubiousjim@gmail.com> 2093 | # Copyright (c) 2018 GreenC (User:GreenC at en.wikipedia.org) 2094 | # 2095 | 2096 | # 2097 | # Sample usage 2098 | # 2099 | # 1. Create a JSON file eg. 
2100 | # wget -q -O- "https://en.wikipedia.org/w/api.php?action=query&titles=Public opinion on global warming|Pertussis&prop=info&format=json&utf8=&redirects" > o 2101 | # 2. In a test, view what the json-array looks like with dump() eg. 2102 | # query_json(readfile("o"), jsona) 2103 | # awkenough_dump(jsona, "jsona") 2104 | # 3. Use the created json-array (jsona) in a program 2105 | # if( query_json(readfile("o"), jsona) >= 0) 2106 | # id = jsona["query","pages","25428398","pageid"] 2107 | # 2108 | 2109 | function awkenough_die(msg) { 2110 | printf("awkenough: %s\n", msg) > "/dev/stderr" 2111 | # exit 1 2112 | } 2113 | 2114 | function awkenough_assert(test, msg) { 2115 | if (!test) awenough_die(msg ? msg : "assertion failed") 2116 | } 2117 | 2118 | # unitialized scalar 2119 | function ismissing(u) { 2120 | return u == 0 && u == "" 2121 | } 2122 | 2123 | # explicit "" 2124 | function isnull(s, u) { 2125 | if (u) return s == "" # accept missing as well 2126 | return !s && s != 0 2127 | } 2128 | 2129 | # populate array from str="key key=value key=value" 2130 | # can optionally supply "re" for equals, space; if they're the same or equals is "", array will be setlike 2131 | function awkenough_asplit(str, array, equals, space, aux, i, n) { 2132 | n = split(str, aux, (space == "") ? "[ \n]+" : space) 2133 | if (space && equals == space) 2134 | equals = "" 2135 | else if (ismissing(equals)) 2136 | equals = "=" 2137 | split("", array) # delete array 2138 | for (i=1; i<=n; i++) { 2139 | if (equals && match(aux[i], equals)) 2140 | array[substr(aux[i], 1, RSTART-1)] = substr(aux[i], RSTART+RLENGTH) 2141 | else 2142 | array[aux[i]] 2143 | } 2144 | split("", aux) # does it help to delete the aux array? 2145 | return n 2146 | } 2147 | 2148 | # behaves like gawk's split; special cases re == "" and " " 2149 | # unlike split, will honor 0-length matches 2150 | function awkenough_gsplit(str, items, re, seps, n, i, start, stop, sep1, sep2, sepn) { 2151 | n = 0 2152 | # find separators that don't occur in str 2153 | i = 1 2154 | do 2155 | sep1 = sprintf("%c", i++) 2156 | while (index(str, sep1)) 2157 | do 2158 | sep2 = sprintf("%c", i++) 2159 | while (index(str, sep2)) 2160 | sepn = 1 2161 | split("", seps) # delete array 2162 | if (ismissing(re)) 2163 | re = FS 2164 | if (re == "") { 2165 | split(str, items, "") 2166 | n = length(str) 2167 | for (i=1; i<n; i++) 2168 | seps[i] 2169 | return n 2170 | } 2171 | split("", items) # delete array 2172 | if (re == " ") { 2173 | re = "[ \t\n]+" 2174 | if (match(str, /^[ \t\n]+/)) { 2175 | seps[0] = substr(str, 1, RLENGTH) 2176 | str = substr(str, RLENGTH+1) 2177 | } 2178 | if (match(str, /[ \t\n]+$/)) { 2179 | sepn = substr(str, RSTART, RLENGTH) 2180 | str = substr(str, 1, RSTART-1) 2181 | } 2182 | } 2183 | i = gsub(re, sep1 "&" sep2, str) 2184 | while (i--) { 2185 | start = index(str, sep1) 2186 | stop = index(str, sep2) - 1 2187 | seps[++n] = substr(str, start + 1, stop - start) 2188 | items[n] = substr(str, 1, start - 1) 2189 | str = substr(str, stop + 2) 2190 | } 2191 | items[++n] = str 2192 | if (sepn != 1) seps[n] = sepn 2193 | return n 2194 | } 2195 | 2196 | 2197 | function parse_json(str, T, V, slack, c,s,n,a,A,b,B,C,U,W,i,j,k,u,v,w,root) { 2198 | # use strings, numbers, booleans as separators 2199 | # c = "[^\"\\\\[:cntrl:]]|\\\\[\"\\\\/bfnrt]|\\u[[:xdigit:]][[:xdigit:]][[:xdigit:]][[:xdigit:]]" 2200 | c = "[^\"\\\\\001-\037]|\\\\[\"\\\\/bfnrt]|\\\\u[[:xdigit:]A-F][[:xdigit:]A-F][[:xdigit:]A-F][[:xdigit:]A-F]" 2201 | s ="\"(" c ")*\"" 2202 | n = 
"-?(0|[1-9][[:digit:]]*)([.][[:digit:]]+)?([eE][+-]?[[:digit:]]+)?" 2203 | 2204 | root = awkenough_gsplit(str, A, s "|" n "|true|false|null", T) 2205 | awkenough_assert(root > 0, "unexpected") 2206 | 2207 | # rejoin string using value indices 2208 | str = "" 2209 | for (i=1; i<root; i++) 2210 | str = str A[i] i 2211 | str = str A[root] 2212 | 2213 | # cleanup types and values 2214 | for (i=1; i<root; i++) { 2215 | if (T[i] ~ /^"/) { 2216 | b = split(substr(T[i], 2, length(T[i])-2), B, /\\/) 2217 | if (b == 0) v = "" 2218 | else { 2219 | v = B[1] 2220 | k = 0 2221 | for (j=2; j <= b; j++) { 2222 | u = B[j] 2223 | if (u == "") { 2224 | if (++k % 2 == 1) v = v "\\" 2225 | } else { 2226 | w = substr(u, 1, 1) 2227 | if (w == "b") v = v "\b" substr(u, 2) 2228 | else if (w == "f") v = v "\f" substr(u, 2) 2229 | else if (w == "n") v = v "\n" substr(u, 2) 2230 | else if (w == "r") v = v "\r" substr(u, 2) 2231 | else if (w == "t") v = v "\t" substr(u, 2) 2232 | else v = v u 2233 | } 2234 | } 2235 | } 2236 | V[i] = v 2237 | T[i] = "string" 2238 | } else if (T[i] !~ /true|false|null/) { 2239 | V[i] = T[i] + 0 2240 | T[i] = "number" 2241 | } else { 2242 | V[i] = T[i] 2243 | } 2244 | } 2245 | 2246 | # sanitize string 2247 | gsub(/[[:space:]]+/, "", str) 2248 | if (str !~ /^[][{}[:digit:],:]+$/) { 2249 | if (slack !~ /:/) return -1 2250 | # handle ...unquoted:... 2251 | a = awkenough_gsplit(str, A, "[[:alpha:]_][[:alnum:]_]*:", B) 2252 | str = "" 2253 | for (i=1; i < a; i++) { 2254 | T[root] = "string" 2255 | V[root] = substr(B[i], 1, length(B[i])-1) 2256 | str = str A[i] root ":" 2257 | root++ 2258 | } 2259 | str = str A[a] 2260 | if (str !~ /^[][{}[:digit:],:]+$/) return -10 2261 | } 2262 | 2263 | # atomic value? 2264 | a = awkenough_gsplit(str, A, "[[{]", B) 2265 | if (A[1] != "") { 2266 | if (a > 1) return -2 2267 | else if (A[1] !~ /^[[:digit:]]+$/) return -3 2268 | else return A[1]+0 2269 | } 2270 | 2271 | # parse objects and arrays 2272 | k = root 2273 | C[0] = 0 2274 | for (i=2; i<=a; i++) { 2275 | T[k] = (B[i-1] ~ /\{/) ? "object" : "array" 2276 | C[k] = C[0] 2277 | C[0] = k 2278 | u = awkenough_gsplit(A[i], U, "[]}]", W) 2279 | awkenough_assert(u > 0, "unexpected") 2280 | V[k++] = U[1] 2281 | if (i < a && A[i] != "" && U[u] !~ /[,:]$/) 2282 | return -4 2283 | for (j=1; j<u; j++) { 2284 | if (C[0] == 0 || T[C[0]] != ((W[j] == "}") ? "object" : "array")) return -5 2285 | v = C[0] 2286 | w = C[v] 2287 | C[0] = w 2288 | delete C[v] 2289 | if (w) V[w] = V[w] v U[j+1] 2290 | } 2291 | } 2292 | if (C[0] != 0) return -6 2293 | 2294 | # check contents 2295 | for (i=root; i<k; i++) { 2296 | if (T[i] == "object") { 2297 | # check object contents 2298 | b = split(V[i], B, /,/) 2299 | for (j=1; j <= b; j++) { 2300 | if (B[j] !~ /^[[:digit:]]+:[[:digit:]]+$/) 2301 | return -7 2302 | if (T[substr(B[j], 1, index(B[j],":")-1)] != "string") 2303 | return -8 2304 | } 2305 | } else if (V[i] != "") { 2306 | # check array contents 2307 | if (slack ~ /,/ && V[i] ~ /,$/) 2308 | V[i] = substr(V[i], 1, length(V[i] -1)) 2309 | if (V[i] !~ /^[[:digit:]]+(,[[:digit:]]+)*$/) 2310 | return -9 2311 | } 2312 | } 2313 | return root 2314 | } 2315 | 2316 | # 2317 | # Return a number < 0 on failure. 
Zero on success 2318 | # 2319 | function query_json(str, X, root, slack, T, V, A, B, C, i, j, k) { 2320 | 2321 | delete X 2322 | k = parse_json(str, T, V, slack) 2323 | if (k < 1) return k 2324 | split(root, C, ".") 2325 | j = 1 2326 | while (j in C) { 2327 | if (T[k] == "array") 2328 | split(V[k], A, ",") 2329 | else { 2330 | split("", A) 2331 | awkenough_asplit(V[k], B, ":", ",") 2332 | for (i in B) 2333 | A[V[i]] = B[i] 2334 | } 2335 | if (C[j] in A) { 2336 | k = A[C[j]] 2337 | j++ 2338 | } else return -11 # can't find requested root 2339 | } 2340 | # split("", B) 2341 | # split("", C) 2342 | split("", X) 2343 | B[k] = "" 2344 | C[k] = 0 2345 | C[0] = k 2346 | do { 2347 | C[0] = C[k] 2348 | delete C[k] 2349 | j = T[k] 2350 | if (j == "array") { 2351 | j = split(V[k], A, ",") 2352 | k = B[k] ? B[k] SUBSEP : "" 2353 | X[k 0] = j 2354 | for (i=1; i<=j; i++) { 2355 | # push A[i] to C, (B[k],i) to B 2356 | C[A[i]] = C[0] 2357 | B[A[i]] = k i 2358 | C[0] = A[i] 2359 | } 2360 | } else if (j == "object") { 2361 | awkenough_asplit(V[k], A, ":", ",") 2362 | k = B[k] ? B[k] SUBSEP : "" 2363 | for (i in A) { 2364 | # push A[i] to C, (B[k],V[i]) to B 2365 | C[A[i]] = C[0] 2366 | B[A[i]] = k V[i] 2367 | C[0] = A[i] 2368 | } 2369 | } else if (j == "number") { 2370 | X[B[k]] = V[k] 2371 | } else if (j == "true") { 2372 | X[B[k]] = 1 2373 | } else if (j == "false") { 2374 | X[B[k]] = 0 2375 | } else if (j == "string") { 2376 | X[B[k]] = V[k] 2377 | } else { 2378 | # null will satisfy ismissing() 2379 | X[B[k]] 2380 | } 2381 | k = C[0] 2382 | } while (k) 2383 | return 0 2384 | } 2385 | 2386 | # 2387 | # Visually inspect array created by query_json() 2388 | # 2389 | function awkenough_dump(array, prefix, i,j,c,a,k,s,sep) { 2390 | 2391 | for (i in array) { 2392 | j = i 2393 | c = split(i, a, SUBSEP, sep) 2394 | for (k = 1; k <= length(sep); k++) { 2395 | gsub(/\\/, "\\", sep[k]) 2396 | gsub(/\//, "\\/", sep[k]) 2397 | gsub(/\t/, "\\t", sep[k]) 2398 | gsub(/\n/, "\\n", sep[k]) 2399 | gsub(/\r/, "\\r", sep[k]) 2400 | gsub(/\b/, "\\b", sep[k]) 2401 | gsub(/\f/, "\\f", sep[k]) 2402 | gsub(SUBSEP, ",", sep[k]) 2403 | gsub(/[\001-\037]/, "¿", sep[k]) # TODO: convert to octal? 2404 | } 2405 | 2406 | s = "" 2407 | for (k = 1; k <= c; k++) 2408 | s = s "\"" a[k] "\"" sep[k] 2409 | printf "%s[%s]=%s\n", prefix, s, array[i] 2410 | } 2411 | } 2412 | 2413 | # 2414 | # Given a JSON-array (jsonarr) created by query_json() producing: 2415 | # 2416 | # jsona["query","pages","4035","pageid"]=8978 2417 | # 2418 | # Populate arr[] such that: 2419 | # 2420 | # splitja(jsonarr, arr, 3, "pageid") ==> arr["4035"]=8978 2421 | # 2422 | # indexn is the field # counting from left=>right - this becomes the index of arr 2423 | # value is the far-right (last) field name of the record for which the 8978 is assigned to arr[] 2424 | # 2425 | function splitja(jsonarr, arr, indexn, value) { 2426 | 2427 | delete arr 2428 | for (ja in jsonarr) { 2429 | c = split(ja, a, SUBSEP) 2430 | if (a[c] == value) { 2431 | arr[a[indexn]] = jsonarr[ja] 2432 | } 2433 | } 2434 | return length(arr) 2435 | } 2436 | 2437 | # [[ ________ Edit ___________________________________________________________ ]] 2438 | 2439 | 2440 | function setupEdit( cookiejar) { 2441 | 2442 | # OAuth credentials 2443 | 2444 | if (empty(G["consumerKey"])) { 2445 | stdErr("No account. 
See EDITSETUP for login/authentication info.") 2446 | exit 2447 | } 2448 | 2449 | # Where to store cookies 2450 | 2451 | cookiejar = "/tmp/cookiejar" 2452 | cookieopt = " --save-cookies=\"" cookiejar "\" --load-cookies=\"" cookiejar "\"" 2453 | 2454 | # Initialize random number generator 2455 | 2456 | if (empty(_cliff_seed)) # randomnumber() seed - initialize once 2457 | _cliff_seed = "0.00" splitx(sprintf("%f", systime() * 0.000001), ".", 2) 2458 | 2459 | # Initialize external program dependencies 2460 | 2461 | setup("openssl") 2462 | 2463 | # Web agent support for Edit requests 2464 | 2465 | if (G["wta"] != "wget") { 2466 | stdErr("Edit requires wget. Curl may be supported in a future version.") 2467 | exit 2468 | } 2469 | 2470 | # Adjust API URL for Oauth 2471 | 2472 | sub(/[?]$/, "", G["apiURL"]) 2473 | 2474 | } 2475 | 2476 | function editPage(title,summary,page, sp,jsona,data,command,postfile,fp,line,outfile,text) { 2477 | 2478 | if (page == "STDIN") { 2479 | while ( (getline line < "/dev/stdin") > 0) 2480 | fp = fp line "\n" 2481 | outfile = mktemp("wikigetstdinfile.XXXXXX", "f") 2482 | print fp > outfile 2483 | close(outfile) 2484 | page = outfile 2485 | } 2486 | 2487 | # Don't blank page 2488 | text = urlencodeawk(readfile2(page), "rawphp") 2489 | if (empty(text)) { 2490 | print "No change (empty text)" 2491 | exit 2492 | } 2493 | 2494 | data = strip("action=edit&bot=&format=json&text=" text "&title=" urlencodeawk(title, "rawphp") "&summary=" urlencodeawk(summary, "rawphp") "&token=" urlencodeawk(getEditToken()) ) 2495 | postfile = genPostfile(data) 2496 | command = "wget --user-agent=" shquote(G["agent"]) " " cookieopt " --header=" shquote("Content-Type: application/x-www-form-urlencoded") " --header=" shquote(strip(oauthHeader(data))) " --post-file=" shquote(postfile) " -q -O- " shquote(G["apiURL"]) 2497 | sp = sys2var(command) 2498 | 2499 | # Sometimes when sending large files or when the Wikimedia servers are very busy, sp will come back blank even though the edit went through. 2500 | # Your calling application should be prepared for getting a blank result string and try again. 2501 | # It may fail on the second try due to "nochange" since it worked on the first round (but returned a blank result string) 2502 | 2503 | if (G["debug"]) { 2504 | print "\nEDITARTICLE\n------" 2505 | print command 2506 | print " ---JSON---" 2507 | query_json(sp, jsona) 2508 | awkenough_dump(jsona, "jsona") 2509 | print " ---RAW---" 2510 | print sp 2511 | } 2512 | if (! 
G["debug"]) { 2513 | removefile2(postfile) 2514 | removefile2(outfile) 2515 | } 2516 | 2517 | printResult(sp) 2518 | 2519 | } 2520 | 2521 | function getEditToken( sp,jsona,command,data) { 2522 | 2523 | setupEdit() 2524 | data = "action=query&format=json&meta=tokens" 2525 | sp = sys2var(apiurl(data)) 2526 | query_json(sp, jsona) 2527 | 2528 | if (G["debug"] ) { 2529 | print "\nGET TOKEN\n-------" 2530 | print command 2531 | print " ---JSON---" 2532 | awkenough_dump(jsona, "jsona") 2533 | print " ---RAW---" 2534 | print sp 2535 | } 2536 | 2537 | return jsona["query","tokens","csrftoken"] 2538 | 2539 | } 2540 | 2541 | function movePage(from,to,reason, sp,jsona,data,command) { 2542 | 2543 | setupEdit() 2544 | data = strip("action=move&bot&format=json&from=" urlencodeawk(from, "rawphp") "&to=" urlencodeawk(to, "rawphp") "&reason=" urlencodeawk(reason, "rawphp") "&movetalk=&token=" urlencodeawk(getEditToken()) ) 2545 | sp = sys2var(apiurl(data)) 2546 | 2547 | if (G["debug"]) { 2548 | print "\nMOVEARTICLE\n------" 2549 | print command 2550 | print " ---JSON---" 2551 | query_json(sp, jsona) 2552 | awkenough_dump(jsona, "jsona") 2553 | print " ---RAW---" 2554 | print sp 2555 | } 2556 | 2557 | printResult(sp) 2558 | 2559 | } 2560 | 2561 | # 2562 | # purgePage() - issue a purge on a page title 2563 | # https://www.mediawiki.org/wiki/API:Purge 2564 | # 2565 | function purgePage(title, sp,jsona,data,command) { 2566 | 2567 | setupEdit() 2568 | data = strip("action=purge&titles=" urlencodeawk(title, "rawphp") "&format=json") 2569 | postfile = genPostfile(data) 2570 | command = "wget --user-agent=" shquote(G["agent"]) " " cookieopt " --header=" shquote("Content-Type: application/x-www-form-urlencoded") " --header=" shquote(strip(oauthHeader(data))) " --post-file=" shquote(postfile) " -q -O- " shquote(G["apiURL"]) 2571 | sp = sys2var(command) 2572 | 2573 | if (G["debug"]) { 2574 | print "\nPURGEARTICLE\n------" 2575 | print command 2576 | print " ---JSON---" 2577 | query_json(sp, jsona) 2578 | awkenough_dump(jsona, "jsona") 2579 | print " ---RAW---" 2580 | print sp 2581 | } 2582 | if (! G["debug"]) { 2583 | removefile2(postfile) 2584 | removefile2(outfile) 2585 | } 2586 | 2587 | printResult(sp) 2588 | 2589 | } 2590 | 2591 | # 2592 | # userInfo() - user info via API 2593 | # https://www.mediawiki.org/wiki/API:userinfo 2594 | # 2595 | function userInfo( sp,jsona,command,data) { 2596 | 2597 | setupEdit() 2598 | data = "action=query&meta=userinfo&uiprop=" urlencodeawk("rights|groups|blockinfo") "&format=json" 2599 | sp = sys2var(apiurl(data)) 2600 | query_json(sp, jsona) 2601 | awkenough_dump(jsona, "jsona") 2602 | } 2603 | 2604 | # 2605 | # printResult() - print result of action 2606 | # 2607 | function printResult(json, jsona,nc,sc) { 2608 | 2609 | query_json(json, jsona) 2610 | 2611 | for (k in jsona) { 2612 | if(k ~ "nochange") 2613 | nc++ 2614 | } 2615 | if (jsona["edit","result"] ~ /[Ss]uccess/) 2616 | sc++ 2617 | 2618 | if (sc && nc) 2619 | print "No change" 2620 | else if(sc) 2621 | print jsona["edit","result"] 2622 | else { 2623 | if(! empty(jsona["error","info"])) 2624 | print jsona["error","info"] 2625 | else if(! 
empty(jsona["edit","spamblacklist"])) 2626 | print jsona["edit","spamblacklist"] 2627 | else if( !empty(jsona["move","from"]) && !empty(jsona["move","to"]) ) 2628 | print "Page moved from " shquote(jsona["move","from"]) " -> " shquote(jsona["move","to"]) 2629 | else if( !empty(jsona["purge","1","title"]) && empty(jsona["purge","1","missing"]) ) 2630 | print "Page purged: " jsona["purge","1","title"] 2631 | else 2632 | print "Unknown error" 2633 | } 2634 | } 2635 | 2636 | # 2637 | # genPostfile() - generate postfile wget 2638 | # 2639 | function genPostfile(data, outfile) { 2640 | 2641 | outfile = mktemp("wikigetpostfile.XXXXXX", "f") 2642 | printf("%s", data) > outfile 2643 | close(outfile) 2644 | return outfile 2645 | } 2646 | 2647 | # 2648 | # apiurl() - build a URL to the API using given post data 2649 | # 2650 | function apiurl(data, command,wget_opts) { 2651 | 2652 | command = "wget --user-agent=" shquote(G["agent"]) " " cookieopts " --header=" shquote("Content-Type: application/x-www-form-urlencoded") " --header=" shquote(strip(oauthHeader(data))) " --post-data=" shquote(data) " -q -O- " shquote(G["apiURL"]) 2653 | if (G["debug"]) 2654 | stdErr(command) 2655 | return command 2656 | } 2657 | 2658 | # 2659 | # oauthHeader() - retrieve OAuth header 2660 | # 2661 | function oauthHeader(data, sp) { 2662 | 2663 | sp = MWOAuthGenerateHeader(G["consumerKey"], G["consumerSecret"], G["accessKey"], G["accessSecret"], G["apiURL"], "POST", data) 2664 | if (empty(sp)) 2665 | stdErr("oauthHeader(): unable to determine header") 2666 | return sp 2667 | } 2668 | 2669 | # 2670 | # MWOAuthGenerateHeader() - MediaWiki Generate OAuth Header 2671 | # 2672 | # . requires openssl 2673 | # 2674 | # Credit: translation of PhP script https://www.mediawiki.org/wiki/OAuth/Owner-only_consumers#Algorithm 2675 | # 2676 | function MWOAuthGenerateHeader(consumerKey, consumerSecret, accessKey, accessSecret, url, method, data, 2677 | 2678 | nonce,headerParams,dataArr,allParams,allParamsJoined,k,i,j,url2, 2679 | signatureBaseParts,signatureBaseString,hmac,header,save_sorted) { 2680 | 2681 | # sort associative arrays by index string ascending 2682 | if ("sorted_in" in PROCINFO) 2683 | save_sorted = PROCINFO["sorted_in"] 2684 | PROCINFO["sorted_in"] = "@ind_str_asc" 2685 | 2686 | nonce = strip(splitx(sys2varPipe(systime() randomnumber(1000000), "openssl md5"), "= ", 2)) 2687 | 2688 | asplit(headerParams, "oauth_consumer_key=" consumerKey " oauth_token=" accessKey " oauth_signature_method=HMAC-SHA1 oauth_timestamp=" systime() " oauth_nonce=" nonce " oauth_version=1.0") 2689 | asplit(dataArr, data, "=", "&") 2690 | concatarray(headerParams,dataArr,allParams) 2691 | for (k in allParams) 2692 | allParamsJoined[i++] = k "=" allParams[k] 2693 | 2694 | url2 = urlElement(url, "scheme") "://" tolower(urlElement(url, "netloc")) urlElement(url, "path") 2695 | asplit(signatureBaseParts, "0=" toupper(method) " 1=" url " 2=" join(allParamsJoined, 0, length(allParamsJoined) - 1, "&")) 2696 | signatureBaseString = urlencodeawk(signatureBaseParts[0], "rawphp") "&" urlencodeawk(signatureBaseParts[1], "rawphp") "&" urlencodeawk(signatureBaseParts[2], "rawphp") 2697 | 2698 | # printf "value" | openssl dgst -sha1 -hmac 'key' -binary 2699 | hmac = sys2varPipe(signatureBaseString, "openssl sha1 -hmac " shquote(urlencodeawk(consumerSecret, "rawphp") "&" urlencodeawk(accessSecret, "rawphp")) " -binary") 2700 | 2701 | # printf "hmac" | openssl base64 2702 | headerParams["oauth_signature"] = strip(sys2varPipe(hmac, "openssl base64") ) 2703 | 2704 | 
for (k in headerParams) 2705 | header[j++] = urlencodeawk(k, "rawphp") "=" urlencodeawk(headerParams[k], "rawphp") 2706 | 2707 | if (save_sorted) 2708 | PROCINFO["sorted_in"] = save_sorted 2709 | else 2710 | PROCINFO["sorted_in"] = "" 2711 | 2712 | return sprintf("%s", "Authorization: OAuth " join(header, 0, length(header) - 1, ", ")) 2713 | } 2714 | 2715 | --------------------------------------------------------------------------------