├── .gitignore ├── README.md ├── create_index └── rss /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # elasticsearch-rss 2 | 3 | RSS feed and management script for ElasticSearch 4 | 5 | ### TODO 6 | 7 | * Add a -t parameter for target index (?) 8 | * Add a ELASTICSEARCH_RSS_INDEX_NAME or similar for default index name (?) 9 | * Add index name as parameter to create_index script (?) 10 | * Clean up output 11 | * Better exception handling 12 | * Bulk indexing of new items 13 | 14 | ## Installation 15 | 16 | Pre-requisites: 17 | 18 | Python 2.6 with modules feedparser, elasticsearch and requests: 19 | 20 | pip install feedparser elasticsearch requests 21 | 22 | ### Setup 23 | 24 | First run the create_index script to create your RSS index in ElasticSearch with some basic mappings. 25 | Edit the script first if you want to change index name and/or particular mappings. 26 | 27 | If you changed the index name, this must also be changed within the script, in the constant STORE=indexname. 28 | 29 | ### Usage 30 | 31 | Suggest doing `rss -h` for the most current version. 
32 | 33 | rss info [-v] [-s ] [] Show feed/channel info 34 | rss add Add feed 35 | rss remove [-c] [] Remove feed 36 | rss list [-v] [-d] [-l ] [-s ] [] List items in index 37 | rss clean [-b ] [] Clean items in index 38 | rss fetch [-v] [-d] [-f] [] Fetch items 39 | 40 | -v = verbose 41 | -d = debug (even more verbose) 42 | -c = cascading, i.e. items will also be removed if you remove a feed with this option 43 | -f = force, i.e. fetch all available items regardless of timestamps and last fetch 44 | -l limit = number of items to show 45 | -s since = for listing items since 'since', with format #unit, where # is a number and unit is 46 | one of 's', 'm', 'h', 'd', 'w', 'M', 'y' (second, minute, etc) 47 | -s ago = for deleting items older than 'ago', same format as for 'since' 48 | 49 | `rss add` can also take a batch fed from standard input. I.e. with a file containing name and URL pairs (space between the two and one feed specification on each line), do `rss add \d)+\s*(?P\w)( ago)?$") 30 | 31 | 32 | def getUUID(fromStr): 33 | return str(uuid.uuid3(uuid.NAMESPACE_URL, fromStr.encode("ascii", "ignore"))) 34 | 35 | 36 | def fromISODateString(isoStr): 37 | if isoStr == None: 38 | return None 39 | if "." in isoStr: 40 | return datetime.datetime.strptime(isoStr, "%Y-%m-%dT%H:%M:%S.%fZ") 41 | else: 42 | return datetime.datetime.strptime(isoStr, "%Y-%m-%dT%H:%M:%SZ") 43 | 44 | def toISODateString(dateObj): 45 | return dateObj.strftime("%Y-%m-%dT%H:%M:%SZ") # Screw the %.f ... 46 | 47 | def fromAgoString(agoStr): 48 | # not these: minutes, minute, seconds, second, hours, hour, etc... 
49 | # only these "^(?P\d)+\s*(?P\w)( ago)?$" where unit in 50 | # s=second, m=minute, h=hour, d=day, w=week, M=month, y=year 51 | m = agoRegex.match(agoStr) 52 | if not m: 53 | raise SyntaxError("illegal 'ago' string: %s" % agoStr) 54 | number = int(m.group("number")) 55 | unit = m.group("unit") 56 | delta = None 57 | if unit == "s": delta = datetime.timedelta(seconds= number) 58 | elif unit == "m": delta = datetime.timedelta(minutes= number) 59 | elif unit == "h": delta = datetime.timedelta(hours= number) 60 | elif unit == "d": delta = datetime.timedelta(days= number) 61 | elif unit == "w": delta = datetime.timedelta(weeks= number) 62 | elif unit == "M": delta = datetime.timedelta(days= number*30) 63 | elif unit == "y": delta = datetime.timedelta(days= number*365) 64 | else: 65 | raise SyntaxError("illegal unit for 'ago' string in: %s" % agoStr) 66 | return datetime.datetime.utcnow() - delta; 67 | 68 | 69 | def addIf(target, fromDict, fromName, toName=None): 70 | 71 | if toName == None: toName = fromName 72 | 73 | if fromName.startswith("*"): 74 | searchkey = fromName[1:] 75 | for key in fromDict: 76 | if key.endswith(searchkey): 77 | target.update({toName: fromDict[key]}) 78 | break 79 | elif fromName in fromDict: 80 | target.update({toName: fromDict[fromName]}) 81 | 82 | 83 | def getChannelUpdatedIsoString(name, channel): 84 | t = time.gmtime() 85 | if "updated_parsed" in channel and type(channel["updated_parsed"]) is time.struct_time: 86 | t = channel["updated_parsed"] 87 | elif DEBUG: 88 | print "Warning: '%s' missing update time in channel meta data. Using processing time instead." 
% name 89 | return time.strftime("%Y-%m-%dT%H:%M:%SZ", t) 90 | 91 | def getItemUpdatedIsoString(feedname, item): 92 | t = time.gmtime() 93 | if "updated_parsed" in item and type(item["updated_parsed"]) is time.struct_time: 94 | t = item["updated_parsed"] 95 | elif "published_parsed" in item and type(item["published_parsed"]) is time.struct_time: 96 | t = item["published_parsed"] 97 | elif DEBUG: 98 | print "Warning: An item in '%s' is missing both update and publish time in item data. Using processing time." % feedname 99 | return time.strftime("%Y-%m-%dT%H:%M:%SZ", t) 100 | 101 | def getRSSChannelInfo(feedname, url): 102 | rss = getRSS(feedname, url, skipItems=True) 103 | if rss: return rss[0] 104 | return None 105 | 106 | 107 | def getRSS(feedname, url, skipItems=False): 108 | 109 | try: 110 | feed = feedparser.parse(url) 111 | except: 112 | print "Error: Failed to read feed from: %s" % url 113 | return None 114 | 115 | channel = feed["channel"] 116 | items = feed["items"] 117 | 118 | if not url == feed["url"]: 119 | print "Warning: Given URL and URL in channel meta differs:" 120 | print " given : %s" % url 121 | print " channel: %s" % feed["url"] 122 | 123 | # Create channel info part 124 | cinfo = {"feedname": feedname, "url": url, "updated": getChannelUpdatedIsoString(feedname, channel)} 125 | addIf(cinfo, feed , "version") 126 | addIf(cinfo, feed , "url") 127 | addIf(cinfo, channel, "title") 128 | addIf(cinfo, channel, "link") 129 | addIf(cinfo, channel, "subtitle", "description") 130 | addIf(cinfo, channel, "language") 131 | addIf(cinfo, channel, "generator") 132 | #addIf(cinfo, channel, "*_updateperiod", "update_period") 133 | #addIf(cinfo, channel, "*_updatefrequency", "update_frequency") 134 | #addIf(cinfo, channel, "ttl") 135 | 136 | #print "Debug: PROCESSED JSON FOR FEED [%s] %s" % (cinfo["version"], cinfo["title"]) 137 | #print json.dumps(cinfo, indent=2) 138 | 139 | 140 | if skipItems: 141 | return (cinfo, None) 142 | 143 | # Create items info part 
144 | iinfos = [] 145 | for i in items: 146 | 147 | # Prefer "id", alternative "link" as "id", or skip (missing ID is too serious) 148 | # Our IDs must be globally unique, so we add "#" prefix to the ID. 149 | rid = feedname + "#" 150 | if "id" in i: 151 | rid += i["id"] 152 | elif "link" in i: 153 | rid += i["link"] 154 | if DEBUG: 155 | print "Debug: Found item without 'id', using 'link' instead." 156 | else: 157 | if DEBUG: 158 | print "Warning: Dropping item with neither 'id' nor 'link'." 159 | continue 160 | 161 | # Prefer "updated", alternative "published", alternative current processing time 162 | updatedStr = getItemUpdatedIsoString(feedname, i) 163 | 164 | # Extract categories from "tags/term" 165 | categories = [] 166 | if "tags" in i: 167 | for t in i["tags"]: 168 | categories.append(t["term"]) 169 | 170 | iinfo = {"feedname": feedname, "id": getUUID(rid), "updated": updatedStr, "categories": categories} 171 | addIf(iinfo, i, "link") 172 | addIf(iinfo, i, "title") 173 | addIf(iinfo, i, "author") 174 | addIf(iinfo, i, "comments") 175 | 176 | #addIf(iinfo, i, "summary") #, "description" 177 | #addIf(iinfo, i, "content") 178 | # "content" comes with sub elements ("base", "type", "value"). Simplifying for now by extracting only value. 179 | # This actually again comes from the non-list field "description" in RSS, AFAIK. So calling it "description" here.. 
180 | # But there is not always "content", so use "summary" if it does not exist 181 | if "content" in i and i["content"]: 182 | iinfo.update({"description": i["content"][0]["value"]}) 183 | else: 184 | addIf(iinfo, i, "summary", "description") 185 | 186 | # Note: Skip "location" for now (in ES mapping) 187 | # Note: Skip "enclosures" (with "url", "type", "length") for now (in ES mapping) 188 | 189 | iinfos.append(iinfo) 190 | 191 | return (cinfo, iinfos) 192 | 193 | 194 | def createQueryFilter(filter): 195 | return {"query":{"filtered":{"filter":filter}}} 196 | 197 | 198 | def getChannel(es, feedname): 199 | try: 200 | res = es.get(index=STORE, doc_type=CHANNEL, id=feedname); 201 | 202 | channel = res["_source"] 203 | return channel 204 | 205 | except elasticsearch.exceptions.NotFoundError: 206 | #print "Warning: No channel info stored for feed '%s'." % feedname 207 | return None 208 | 209 | 210 | def getChannels(es, *feednames): 211 | 212 | # Get channels from ES 213 | body = None 214 | if feednames: 215 | body = createQueryFilter({"terms": {"feedname": feednames}}) 216 | else: 217 | body = {"query": {"match_all": {}}} 218 | body.update({"size": MAX_CHANNELS}) 219 | res = es.search(index=STORE, doc_type=CHANNEL, body=body); 220 | 221 | channels = {} 222 | for hit in res["hits"]["hits"]: 223 | name = hit["_id"] 224 | channel = hit["_source"] 225 | channels.update({name: channel}) 226 | return channels 227 | 228 | 229 | def putChannel(es, channel): 230 | # TODO: try/except 231 | res = es.index(index=STORE, doc_type=CHANNEL, id=channel["feedname"], body=channel); 232 | return 233 | 234 | 235 | def deleteChannel(es, feedname): 236 | try: 237 | res = es.delete(index=STORE, doc_type=CHANNEL, id=feedname) 238 | except: 239 | print "Warning: Failed to delete channel info for feed '%s' in ES." 
% feedname 240 | 241 | 242 | 243 | def putItem(es, feedname, item, verbose): 244 | 245 | id = item["id"] 246 | link = item["link"] 247 | body = item.copy() # shallow, intentional 248 | del body["id"] 249 | 250 | if link: 251 | # TODO: try/except; for now, let it fail here 252 | res = requests.get(link, verify=False) 253 | page = res.text 254 | 255 | body.update({"page": page}) 256 | #print "Debug: In feed '%s', read item in linked-to page; size = %d bytes." % (feedname, len(page)) 257 | 258 | # TODO: try/except; for now, let it fail here 259 | #print json.dumps(body, indent=2) # DEBUG 260 | res = es.index(index=STORE, doc_type=ITEM, id=id, body=body) 261 | 262 | new = 0; replaced = 0 263 | if res["created"]: 264 | if verbose: 265 | print "New item in %s: %s" % (feedname, item["title"]) 266 | new = 1 267 | else: 268 | replaced = 1 269 | return (new, replaced) 270 | 271 | 272 | #region Commands 273 | 274 | 275 | def listFeeds(es, verbose, since, *feednames): 276 | 277 | channels = getChannels(es, *feednames) 278 | 279 | partial = False 280 | total = 0 281 | 282 | if not verbose: 283 | print "%5s %s" % ("ITEMS", "NAME") 284 | 285 | for name,channel in channels.items(): 286 | 287 | # Get item count from ES 288 | 289 | namePart = {"term": {"feedname": name}} 290 | sincePart = None 291 | if since: 292 | isoSince = toISODateString(since) 293 | sincePart = {"range": {"updated": { "from" : isoSince }}} 294 | 295 | query = None 296 | if sincePart: 297 | query = createQueryFilter({"and" : [namePart, sincePart]}) 298 | else: 299 | query = createQueryFilter(namePart) 300 | 301 | res = es.count(index=STORE, doc_type=ITEM, body=query) 302 | count = res["count"] 303 | total += count 304 | if res["_shards"]["successful"] < res["_shards"]["total"]: partial = True 305 | 306 | # Show it 307 | 308 | if verbose: 309 | print "%s (%d)" % (name, count) 310 | if channel: 311 | fields = [ "version", "url", "title", "link", "updated", "description", \ 312 | "language", "generator", 
"lastFetch" ] 313 | for f in fields: 314 | if f in channel: 315 | print " %-11s : %s" % (f, channel[f]) 316 | else: 317 | print " %-11s :" % f 318 | else: 319 | print " ** Error: Channel not registered." 320 | print 321 | else: 322 | print "%5d %s" % (count, name) 323 | 324 | print "SUM ITEMS: %d" % total 325 | 326 | if partial: 327 | print "\n** RESULT WAS ONLY PARTIAL (ElasticSearch problem?)" 328 | 329 | 330 | def addFeed(es, name, url): 331 | 332 | channel = getRSSChannelInfo(name, url) 333 | if not channel: 334 | print "Error: Failed to read channel from: %s" % url 335 | return 336 | 337 | existingChannel = getChannel(es, name) 338 | if existingChannel: 339 | print "Channel '%s' already exists. Replacing." % name 340 | 341 | putChannel(es, channel) 342 | 343 | print "Channel '%s' registered." % name 344 | 345 | # TODO: Force a feed update(?) 346 | 347 | 348 | def addBulk(es): 349 | 350 | totalCount = 0 351 | replaceCount = 0 352 | 353 | existingChannels = getChannels(es) 354 | 355 | for line in sys.stdin: 356 | line = line.strip() 357 | if line == "" or line.startswith("#"): continue 358 | (name, url) = filter(None, line.split(" ")) 359 | 360 | totalCount += 1 361 | found = False 362 | if name in existingChannels: 363 | replaceCount += 1 364 | found = True 365 | 366 | channel = getRSSChannelInfo(name, url) 367 | if not channel: 368 | print "Error: Failed to read channel from: %s" % url 369 | continue 370 | 371 | if not found: 372 | print "Adding : %s" % name 373 | else: 374 | print "Updating: %s" % name 375 | 376 | putChannel(es, channel) 377 | 378 | # TODO: Force a feed update(?) 379 | 380 | print "%d feeds registered. 
(%d new)" % (totalCount, totalCount - replaceCount) 381 | 382 | 383 | def updateChannelInfo(es, verbose, *feednames): 384 | 385 | existingChannels = getChannels(es, *feednames) 386 | 387 | # Check and report if given feednames are not registered 388 | if feednames: 389 | for name in feednames: 390 | if not name in existingChannels: 391 | print "Warning: Channel '%s' not registered." % name 392 | 393 | # Fetch RSS channel info from each RSS feed and send new data to ES 394 | for name,existingChannel in existingChannels.items(): 395 | url = existingChannel["url"] 396 | channel = getRSSChannelInfo(name, url) 397 | channel.update({"lastFetch": existingChannel["lastFetch"]}) # Keep lastFetch 398 | putChannel(es, channel) 399 | print "Channel info updated for feed '%s'." % name 400 | 401 | if verbose: 402 | fields = [ "version", "url", "title", "link", "updated", "description", \ 403 | "language", "generator", "lastFetch" ] 404 | for f in fields: 405 | if f in channel: print " %-11s : %s" % (f, channel[f]) 406 | 407 | 408 | def listItems(es, verbose, debug, since, limit, *feednames): 409 | 410 | partial = False 411 | body = {} 412 | 413 | desc = {"order": "desc"} 414 | body.update({"size": limit, "sort": [{"updated": desc}, {"_timestamp": desc}]}) 415 | 416 | andParts = [] 417 | 418 | if feednames: 419 | andParts.append({"terms": {"feedname": feednames}}) 420 | 421 | if since: 422 | isoSince = toISODateString(since) 423 | andParts.append({"range": {"updated": { "from": isoSince }}}) 424 | 425 | if andParts: 426 | qf = createQueryFilter({"and": andParts}) 427 | body.update(createQueryFilter({"and": andParts})) 428 | else: 429 | body.update({"query": {"match_all": {}}}) 430 | 431 | if debug: 432 | print json.dumps(body,indent=2) 433 | 434 | res = es.search(index=STORE, doc_type=ITEM, body=body) 435 | 436 | if res["_shards"]["successful"] < res["_shards"]["total"]: partial = True 437 | total = res["hits"]["total"] 438 | 439 | print "%d HITS:" % total 440 | 441 | for item in 
res["hits"]["hits"]: 442 | id = item["_id"] 443 | source = item["_source"] 444 | feedname = source["feedname"] 445 | title = source.get("title", "") 446 | description = source.get("description", "") 447 | page = source.get("page", "") 448 | comments = source.get("comments", "") 449 | author = source.get("author", "") 450 | link = source.get("link", "") 451 | updatedDateIso = source.get("updated", "") 452 | updatedDate = None 453 | if updatedDateIso: updatedDate = fromISODateString(updatedDateIso) 454 | categories = [] 455 | if "categories" in source: categories = source["categories"] 456 | # Info gathered, now show it: 457 | if verbose: 458 | #print json.dumps(source, indent=2) 459 | dateStr = "" 460 | if updatedDate: dateStr = updatedDate.strftime("%Y-%m-%d %H:%M:%S z") 461 | print "-"*78 462 | print "ID = %s" % id 463 | print "FEED = %s" % feedname 464 | print "TITLE = %s" % title 465 | print "UPDATED = %s" % dateStr 466 | print "AUTHOR = %s" % author 467 | print "LINK = %s" % link 468 | print "COMMENTS = %s" % id 469 | print "CATEGORIES = %s" % " | ".join(categories) 470 | print "DESCRIPTION = (%d bytes)" % len(description) 471 | print "PAGE = (%d bytes)" % len(page) 472 | else: 473 | dateStr = "" 474 | if updatedDate: dateStr = updatedDate.strftime("%m-%d %H:%M") 475 | print "[%-10s] %s %s" % (feedname, dateStr, title.replace("\n", "\\n")) 476 | 477 | if partial: 478 | print "\n** RESULT WAS ONLY PARTIAL (ElasticSearch problem?)" 479 | 480 | 481 | def fetchItems(es, verbose, dontfeed, force, *feednames): 482 | 483 | existingChannels = getChannels(es, *feednames) 484 | 485 | # Check and report if given feednames are not registered 486 | if feednames: 487 | for name in feednames: 488 | if not name in existingChannels: 489 | print "Warning: Channel '%s' not registered." 
% name 490 | 491 | # Fetch RSS feeds and write items and new channel info with new lastFetch time (== now) 492 | for name,existingChannel in existingChannels.items(): 493 | url = existingChannel["url"] 494 | lastFetchDate = None 495 | if "lastFetch" in existingChannel and existingChannel["lastFetch"]: 496 | lastFetchDate = fromISODateString(existingChannel["lastFetch"]) 497 | 498 | # Get RSS feed 499 | rss = getRSS(name, url) 500 | if not rss: 501 | print "Warning: Failed to read RSS feed '%s'. (skipping)" % name 502 | continue 503 | channel, items = rss 504 | 505 | # Check if it is time to process the feed (or 'force' is specified) 506 | updated = fromISODateString(channel["updated"]) 507 | #======== 508 | #print "LAST_FETCH ", lastFetchDate 509 | #print "CHANNEL UPDATE", updated 510 | #======== 511 | qualified = False 512 | if not updated or not lastFetchDate or updated > lastFetchDate: 513 | qualified = True 514 | 515 | if not qualified: 516 | if force: 517 | if verbose: 518 | print "Debug: Nothing new to fetch in '%s', but proceeding since 'force' was specified." % name 519 | else: 520 | if verbose: 521 | print "Debug: Nothing new to fetch in '%s'." % name 522 | continue 523 | 524 | # Feed items to ES 525 | nItems = len(items) 526 | nNewItems = 0 527 | nReplacedItems = 0 528 | for item in items: 529 | if verbose and dontfeed: 530 | print "Fetched item from %s: %s" % (name, item["title"]) 531 | else: 532 | # Is it new? 
533 | itemUpdatedDate = fromISODateString(item.get("updated", None)) 534 | itemQualified = False 535 | if not itemUpdatedDate or not lastFetchDate or itemUpdatedDate > lastFetchDate: 536 | itemQualified = True 537 | if not itemQualified and not force: 538 | continue 539 | 540 | new,replaced = putItem(es, name, item, verbose) 541 | nNewItems += new 542 | nReplacedItems += replaced 543 | 544 | # Update channel info with new lastFetch time (== now) 545 | channel.update({"lastFetch": toISODateString(datetime.datetime.utcnow())}) 546 | if not dontfeed: 547 | putChannel(es, channel) 548 | if verbose: 549 | print "%-10s : %3d new, %3d replaced" % (name, nNewItems, nReplacedItems) 550 | 551 | 552 | def deleteItems(es, before, *feednames): 553 | 554 | andParts = [] 555 | 556 | if feednames: 557 | andParts.append({"terms": {"feedname": feednames}}) 558 | 559 | if before: 560 | isoBefore = toISODateString(before) 561 | andParts.append({"range": {"updated": { "to": isoBefore }}}) 562 | 563 | body = None 564 | if andParts: 565 | body = createQueryFilter({"and": andParts}) 566 | else: 567 | body = {"query": {"match_all": {}}} 568 | 569 | #print "***DELETING"; print json.dumps(body) 570 | es.delete_by_query(index=STORE, doc_type=ITEM, body=body) 571 | if not feednames: 572 | print "Items deleted." 573 | else: 574 | print "Items deleted from: %s" % ", ".join(feednames) 575 | 576 | 577 | def deleteFeeds(es, cascading, *feednames): 578 | 579 | channels = getChannels(es, *feednames) 580 | 581 | # Check and report if given feednames are not registered 582 | if feednames: 583 | for name in feednames: 584 | if not name in channels: 585 | print "Warning: Channel '%s' not found." % name 586 | 587 | # Delete 588 | count = 0 589 | for name,channel in channels.items(): 590 | if cascading: 591 | deleteItems(es, None, name) 592 | deleteChannel(es, name) 593 | count += 1 594 | 595 | print "%d feed(s) removed." 
% count 596 | 597 | 598 | #endregion Commands 599 | 600 | 601 | def usage(err = None, rich= False): 602 | if err: 603 | print "Argument error: %s" % err 604 | 605 | p = os.path.basename(sys.argv[0]) 606 | print "Usage:" 607 | print " %s -h More help" % p 608 | print " %s info [-v] [-s ] [] Show feed info" % p 609 | print " %s add Add feed" % p 610 | print " %s remove [-c] [] Remove feed" % p 611 | # Not that important any more... maybe more valuable to remove it to keep the script simpler 612 | #print " %s update [-v] [] Update feed channel info" % p 613 | print " %s list [-v] [-d] [-l ] [-s ] [] List items in index" % p 614 | print " %s clean [-b ] [] Clean items in index" % p 615 | print " %s fetch [-v] [-d] [-f] [] Fetch items" % p 616 | 617 | if rich: 618 | print 619 | print " -v = verbose" 620 | print " -d = debug (even more verbose)" 621 | print " -c = cascading, i.e. items will also be removed if you remove a feed with" 622 | print " this option" 623 | print " -f = force, i.e. fetch all available items regardless of last fetch time" 624 | print " -l limit = number of items to show" 625 | print " -s since = for listing items since 'since', with format ," 626 | print " where is a number and unit is one of 's', 'm', 'h', 'd', 'w'," 627 | print " 'M', 'y' (second, minute, etc), e.g. 
'3w'" 628 | print " -s ago = for deleting items older than 'ago', same format as for 'since'" 629 | 630 | 631 | if err: 632 | sys.exit(-1) 633 | else: 634 | sys.exit(0) 635 | 636 | 637 | def main(): 638 | 639 | # Default values 640 | verbose = False 641 | debug = False 642 | cascade = False 643 | beforeStr = None 644 | sinceStr = None 645 | before = None 646 | since = None 647 | limit = 10 648 | force = False 649 | 650 | # Parse command line input 651 | if len(sys.argv) == 1: usage() 652 | try: 653 | optlist, args = getopt.gnu_getopt(sys.argv[1:], ':l:s:b:fhcvd') 654 | except: 655 | usage() 656 | for (o, a) in optlist: 657 | if o == "-h": usage(rich=True) 658 | elif o == "-v": verbose = True 659 | elif o == "-c": cascade = True 660 | elif o == "-s": sinceStr = a 661 | elif o == "-b": beforeStr = a 662 | elif o == "-l": limit = int(a) 663 | elif o == "-d": debug = True 664 | elif o == "-f": force = True 665 | if len(args) < 1: usage("missing command") 666 | cmd = args[0] 667 | args = args[1:] 668 | 669 | # Time validation conversion and checks 670 | if beforeStr: 671 | try: 672 | before = fromAgoString(beforeStr) 673 | except: 674 | usage("illegal 'ago' time format to 'before' argument, '%s'" % beforeStr) 675 | if sinceStr: 676 | try: 677 | since = fromAgoString(sinceStr) 678 | except: 679 | usage("illegal 'ago' time format to 'since' argument, '%s'" % sinceStr) 680 | 681 | # Create ElasticSearch proxy 682 | es = elasticsearch.Elasticsearch() 683 | 684 | if cmd == "info": 685 | listFeeds(es, verbose, since, *args) 686 | elif cmd == "add": 687 | if len(args) == 0: 688 | print "Reading name and url pairs from lines from stdin..." 
689 | addBulk(es) 690 | else: 691 | if len(args) < 2: usage("too few arguments") 692 | elif len(args) > 2: usage("too many arguments") 693 | name = args[0] 694 | url = args[1] 695 | addFeed(es, name, url) 696 | elif cmd == "remove": 697 | deleteFeeds(es, cascade, *args) 698 | elif cmd == "update": 699 | updateChannelInfo(es, verbose, *args) 700 | elif cmd == "list": 701 | listItems(es, verbose, debug, since, limit, *args) 702 | elif cmd == "clean": 703 | deleteItems(es, before, *args) 704 | elif cmd == "fetch": 705 | fetchItems(es, verbose, debug, force, *args) 706 | else: 707 | usage("unknown command '%s'" % cmd) 708 | 709 | return 710 | 711 | 712 | if __name__ == "__main__": main() 713 | 714 | --------------------------------------------------------------------------------