s > 9 correctly.
547 | if li['name'] == "ul": self.o(self.ul_item_mark + " ")
548 | elif li['name'] == "ol":
549 | li['num'] += 1
550 | self.o(str(li['num'])+". ")
551 | self.start = 1
552 |
553 | if tag in ["table", "tr"] and start: self.p()
554 | if tag == 'td': self.pbr()
555 |
556 | if tag == "pre":
557 | if start:
558 | self.startpre = 1
559 | self.pre = 1
560 | else:
561 | self.pre = 0
562 | self.p()
563 |
564 | def pbr(self):
565 | if self.p_p == 0:
566 | self.p_p = 1
567 |
568 | def p(self):
569 | self.p_p = 2
570 |
571 | def soft_br(self):
572 | self.pbr()
573 | self.br_toggle = ' '
574 |
    def o(self, data, puredata=0, force=0):
        """Append *data* to the output, honouring pending breaks/prefixes.

        puredata=1 collapses whitespace runs (outside <pre>).
        force may be 1 (emit even when data is empty) or the string
        'end' (final flush: terminate output, dump pending links and
        abbreviation definitions).
        """
        # Accumulate text while inside an <abbr> element.
        if self.abbr_data is not None:
            self.abbr_data += data

        if not self.quiet:
            if self.google_doc:
                # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
                lstripped_data = data.lstrip()
                if self.drop_white_space and not (self.pre or self.code):
                    data = lstripped_data
                if lstripped_data != '':
                    self.drop_white_space = 0

            if puredata and not self.pre:
                # Collapse whitespace runs; remember a stripped leading
                # space in self.space so it can be emitted later.
                data = re.sub('\s+', ' ', data)
                if data and data[0] == ' ':
                    self.space = 1
                    data = data[1:]
            if not data and not force: return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                if not data.startswith("\n"): # stuff...
                    data = "\n" + data

            # Blockquote prefix prepended to every emitted line.
            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote: bq += " "

            if self.pre:
                # <pre> content is indented 4 spaces, plus 4 per list level.
                if not self.list:
                    bq += "    "
                #else: list content is already partially indented
                # for i in xrange(len(self.list)): # no python 3
                for i in range(len(self.list)):
                    bq += "    "
                data = data.replace("\n", "\n"+bq)

            if self.startpre:
                self.startpre = 0
                if self.list:
                    data = data.lstrip("\n") # use existing initial indentation

            if self.start:
                # First real output: drop any pending space/paragraph breaks.
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                # Emit the pending line/paragraph breaks, each carrying the
                # blockquote prefix (and an optional soft-break space).
                self.out((self.br_toggle+'\n'+bq)*self.p_p)
                self.space = 0
                self.br_toggle = ''

            if self.space:
                if not self.lastWasNL: self.out(' ')
                self.space = 0

            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
                # Flush reference-style link definitions that have already
                # appeared in the emitted output.
                if force == "end": self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
                        if has_key(link, 'title'): self.out(" ("+link['title']+")")
                        self.out("\n")
                    else:
                        # Not referenced yet; keep it for a later flush.
                        newa.append(link)

                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                # Emit abbreviation definitions at the very end.
                for abbr, definition in self.abbr_list.items():
                    self.out("  *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.outcount += 1
660 |
661 | def handle_data(self, data):
662 | if r'\/script>' in data: self.quiet -= 1
663 |
664 | if self.style:
665 | self.style_def.update(dumb_css_parser(data))
666 |
667 | if not self.maybe_automatic_link is None:
668 | href = self.maybe_automatic_link
669 | if href == data and self.absolute_url_matcher.match(href):
670 | self.o("<" + data + ">")
671 | return
672 | else:
673 | self.o("[")
674 | self.maybe_automatic_link = None
675 |
676 | if not self.code and not self.pre:
677 | data = escape_md_section(data, snob=self.escape_snob)
678 | self.o(data, 1)
679 |
680 | def unknown_decl(self, data): pass
681 |
682 | def charref(self, name):
683 | if name[0] in ['x','X']:
684 | c = int(name[1:], 16)
685 | else:
686 | c = int(name)
687 |
688 | if not self.unicode_snob and c in unifiable_n.keys():
689 | return unifiable_n[c]
690 | else:
691 | try:
692 | return unichr(c)
693 | except NameError: #Python3
694 | return chr(c)
695 |
696 | def entityref(self, c):
697 | if not self.unicode_snob and c in unifiable.keys():
698 | return unifiable[c]
699 | else:
700 | try: name2cp(c)
701 | except KeyError: return "&" + c + ';'
702 | else:
703 | try:
704 | return unichr(name2cp(c))
705 | except NameError: #Python3
706 | return chr(name2cp(c))
707 |
708 | def replaceEntities(self, s):
709 | s = s.group(1)
710 | if s[0] == "#":
711 | return self.charref(s[1:])
712 | else: return self.entityref(s)
713 |
    # Matches one entity reference: numeric ("&#65;", "&#x41;") or named
    # ("&amp;"), capturing the body without the surrounding "&" and ";".
    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
    def unescape(self, s):
        """Replace all entity references in *s* with their expansions."""
        return self.r_unescape.sub(self.replaceEntities, s)
717 |
718 | def google_nest_count(self, style):
719 | """calculate the nesting count of google doc lists"""
720 | nest_count = 0
721 | if 'margin-left' in style:
722 | nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
723 | return nest_count
724 |
725 |
    def optwrap(self, text):
        """Wrap all paragraphs in the provided text."""
        if not self.body_width:
            # Wrapping disabled (body_width == 0).
            return text

        assert wrap, "Requires Python 2.3."
        result = ''
        newlines = 0
        for para in text.split("\n"):
            if len(para) > 0:
                if not skipwrap(para):
                    result += "\n".join(wrap(para, self.body_width))
                    if para.endswith('  '):
                        # Preserve a markdown hard line break (trailing spaces).
                        result += "  \n"
                        newlines = 1
                    else:
                        result += "\n\n"
                        newlines = 2
                else:
                    # Unwrappable content (code, lists, rules): pass through,
                    # dropping whitespace-only lines.
                    if not onlywhite(para):
                        result += para + "\n"
                        newlines = 1
            else:
                # Collapse runs of blank lines to at most two newlines.
                if newlines < 2:
                    result += "\n"
                    newlines += 1
        return result
753 |
# Matches "1. "-style ordered-list item prefixes.
ordered_list_matcher = re.compile(r'\d+\.\s')
# Matches "- ", "* ", "+ " unordered-list item prefixes.
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# Characters escaped inside other markdown constructs.
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
# Full escape set, used when escape_snob is enabled.
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
md_dot_matcher = re.compile(r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """, re.MULTILINE | re.VERBOSE)
md_plus_matcher = re.compile(r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """, flags=re.MULTILINE | re.VERBOSE)
md_dash_matcher = re.compile(r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """, flags=re.MULTILINE | re.VERBOSE)
slash_chars = r'\`*_{}[]()#+-.!'
md_backslash_matcher = re.compile(r'''
    (\\)          # match one slash
    (?=[%s])      # followed by a char that requires escaping
    ''' % re.escape(slash_chars),
    flags=re.VERBOSE)

def skipwrap(para):
    """Return True when *para* must not be re-wrapped (code blocks,
    list items, rules); False when normal paragraph wrapping is safe.

    Accepts the empty string (returns False) instead of raising
    IndexError as the previous bare para[0] check did.
    """
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap. startswith() is safe on short or empty strings.
    if para.startswith('    ') or para.startswith('\t'):
        return True
    # If the text begins with only two "--" (not a "---" rule), possibly
    # preceded by whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a "-inside-" case in one of the tests that also
    # depends upon it.
    if stripped[0:1] == '-' or stripped[0:1] == '*':
        return True
    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either case
    # optionally preceded by whitespace), it's a list; don't wrap.
    return bool(ordered_list_matcher.match(stripped)
                or unordered_list_matcher.match(stripped))
803 |
def wrapwrite(text):
    """Write *text* to stdout as UTF-8 encoded bytes.

    Works on both Python 3 (via the binary buffer attribute) and
    Python 2 (where stdout accepts bytes directly).
    """
    payload = text.encode('utf-8')
    try:
        stream = sys.stdout.buffer   # Python 3
    except AttributeError:
        stream = sys.stdout          # Python 2
    stream.write(payload)
810 |
def html2text(html, baseurl=''):
    """Convenience wrapper: convert an HTML string to markdown text."""
    converter = HTML2Text(baseurl=baseurl)
    return converter.handle(html)
814 |
def unescape(s, unicode_snob=False):
    """Module-level helper: expand entity references in *s* using a
    throwaway HTML2Text instance."""
    converter = HTML2Text()
    converter.unicode_snob = unicode_snob
    return converter.unescape(s)
819 |
def escape_md(text):
    """Backslash-escape markdown-sensitive characters (backslash and
    brackets/parens) within other markdown constructs."""
    escaped = md_chars_matcher.sub(r"\\\1", text)
    return escaped
823 |
def escape_md_section(text, snob=False):
    """Backslash-escape markdown-sensitive characters across whole
    document sections (literal backslashes, then line-leading list and
    rule markers)."""
    text = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        # Aggressive mode: escape the full markdown special set.
        text = md_chars_matcher_all.sub(r"\\\1", text)
    # Escape line-leading "1.", "+" and "-" so they don't become lists/rules.
    for line_matcher in (md_dot_matcher, md_plus_matcher, md_dash_matcher):
        text = line_matcher.sub(r"\1\\\2", text)
    return text
833 |
834 |
def main():
    """Command-line entry point.

    Parses options, reads HTML from a file, a URL or stdin, converts it
    with HTML2Text and writes the markdown result to stdout.
    """
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
                 default=False, help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            # Input is a URL: fetch it and use it as the link base.
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            # NOTE(review): encoding is initialised to "utf-8" above, so this
            # None check looks unreachable -- confirm before relying on the
            # feedparser-based charset detection below.
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            # NOTE(review): same apparently-unreachable None check as above;
            # chardet detection is never triggered with the current default.
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        # No file/URL argument: read raw HTML from stdin.
        data = sys.stdin.read()

    data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash: h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
912 |
913 |
# Script entry: convert the given file/URL (or stdin) and print markdown.
if __name__ == "__main__":
    main()
916 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Author: Aziz Alto
# email: iamaziz.alto@gmail.com

# Prefer setuptools; fall back to distutils on minimal installations.
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup


# Package metadata for PyDataset 0.2.0 (note: download_url tarball tag
# matches the version field below).
setup(
    name='pydataset',
    description=("Provides instant access to many popular datasets right from "
                 "Python (in dataframe structure)."),
    author='Aziz Alto',
    url='https://github.com/iamaziz/PyDataset',
    download_url='https://github.com/iamaziz/PyDataset/tarball/0.2.0',
    license = 'MIT',
    author_email='iamaziz.alto@gmail.com',
    version='0.2.0',
    install_requires=['pandas'],
    packages=['pydataset', 'pydataset.utils'],
    package_data={'pydataset': ['*.gz', 'resources.tar.gz']}
)
--------------------------------------------------------------------------------