├── pakman
├── test
│ ├── pages
│ │ ├── empty.txt
│ │ ├── text.txt
│ │ ├── page2.txt
│ │ ├── page3.txt
│ │ └── page1.txt
│ ├── liquid
│ │ ├── pak
│ │ │ ├── hello.txt
│ │ │ ├── test.txt
│ │ │ ├── s9logo.png
│ │ │ ├── hello.doc
│ │ │ ├── testbin.txt
│ │ │ └── test.html
│ │ └── test.html
│ ├── erb
│ │ └── pak
│ │ │ ├── test.txt
│ │ │ └── test.html.erb
│ ├── helper.rb
│ ├── data
│ │ └── test.yml
│ ├── test_page.rb
│ ├── test_erb.rb
│ ├── test_liquid_drops.rb
│ ├── test_liquid_binaries.rb
│ └── test_liquid.rb
├── History.md
├── lib
│ ├── pakman
│ │ ├── version.rb
│ │ ├── utils.rb
│ │ ├── erb
│ │ │ ├── template.rb
│ │ │ └── templater.rb
│ │ ├── cli
│ │ │ ├── commands
│ │ │ │ ├── fetch.rb
│ │ │ │ ├── list.rb
│ │ │ │ └── gen.rb
│ │ │ ├── ctx.rb
│ │ │ ├── helpers.rb
│ │ │ ├── opts.rb
│ │ │ └── runner.rb
│ │ ├── copier.rb
│ │ ├── finder.rb
│ │ ├── page.rb
│ │ ├── liquid
│ │ │ ├── template.rb
│ │ │ └── templater.rb
│ │ ├── manifest.rb
│ │ └── fetcher.rb
│ └── pakman.rb
├── .gitignore
├── bin
│ └── pakman
├── TODOS.md
├── Rakefile
├── Manifest.txt
└── README.md
├── linkto
├── NOTES.md
├── HISTORY.md
├── lib
│ ├── linkto
│ │ ├── version.rb
│ │ ├── bing.rb
│ │ ├── untappd.rb
│ │ ├── wikipedia.rb
│ │ ├── google.rb
│ │ └── flickr.rb
│ └── linkto.rb
├── .gitignore
├── Manifest.txt
├── test
│ ├── test_wikipedia.rb
│ ├── helper.rb
│ └── test_google.rb
├── Rakefile
└── README.md
├── textutils-more
├── README.md
├── .gitignore
└── lib
│ └── textutils
│ ├── reader
│ └── markdown_reader.rb
│ └── table
│ └── table_reader.rb
├── textutils
├── HISTORY.md
├── TODO.md
├── test
│ ├── data
│ │ ├── de-deutschland
│ │ │ ├── orte.txt
│ │ │ └── 3--by-bayern
│ │ │ │ └── 4--oberfranken
│ │ │ │ ├── orte_ii.txt
│ │ │ │ └── orte.txt
│ │ ├── feedburner.txt
│ │ └── cl_all.txt
│ ├── helper.rb
│ ├── test_tree_reader_ii.rb
│ ├── test_unicode_helper.rb
│ ├── test_fixture_reader.rb
│ ├── test_taglist.rb
│ ├── test_tree_reader.rb
│ ├── test_block_reader.rb
│ ├── test_title_mapper2.rb
│ ├── test_slugify.rb
│ ├── test_asciify.rb
│ ├── test_title_mapper.rb
│ ├── test_title_finder.rb
│ ├── test_title_helper.rb
│ ├── test_address_helper.rb
│ └── test_hypertext_helper.rb
├── lib
│ ├── textutils
│ │ ├── filter
│ │ │ ├── erb_filter.rb
│ │ │ ├── code_filter.rb
│ │ │ ├── comment_filter.rb
│ │ │ └── erb_django_filter.rb
│ │ ├── core_ext
│ │ │ ├── time.rb
│ │ │ ├── file.rb
│ │ │ └── array.rb
│ │ ├── version.rb
│ │ ├── helper
│ │ │ ├── xml_helper.rb
│ │ │ ├── tag_helper.rb
│ │ │ ├── unicode_helper.rb
│ │ │ ├── date_helper.rb
│ │ │ ├── value_helper_iii_numbers.rb
│ │ │ ├── value_helper_ii.rb
│ │ │ ├── value_helper_i.rb
│ │ │ ├── title_helper.rb
│ │ │ ├── address_helper.rb
│ │ │ └── hypertext_helper.rb
│ │ ├── reader
│ │ │ ├── code_reader.rb
│ │ │ ├── block_reader.rb
│ │ │ ├── line_reader.rb
│ │ │ ├── fixture_reader.rb
│ │ │ ├── tree_reader.rb
│ │ │ └── hash_reader.rb
│ │ ├── utils.rb
│ │ ├── sanitizier.rb
│ │ ├── parser
│ │ │ ├── name_tokenizer.rb
│ │ │ └── name_parser.rb
│ │ ├── patterns.rb
│ │ ├── classifier.rb
│ │ ├── title_mapper.rb
│ │ ├── page.rb
│ │ ├── title_mapper2.rb
│ │ └── title.rb
│ └── textutils.rb
├── .gitignore
├── Rakefile
├── Manifest.txt
└── README.md
├── README.md
├── attic
├── fixture_reader.rb
├── line_reader_v2.rb
├── values_reader_v2.rb
├── hash_reader_v2.rb
└── values_reader.rb
└── NOTES.md
/pakman/test/pages/empty.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/linkto/NOTES.md:
--------------------------------------------------------------------------------
1 | # Notes n Tips
2 |
3 |
--------------------------------------------------------------------------------
/textutils-more/README.md:
--------------------------------------------------------------------------------
1 | # textutils-more
2 |
3 |
4 |
--------------------------------------------------------------------------------
/pakman/test/pages/text.txt:
--------------------------------------------------------------------------------
1 | just some text
2 | no headers
3 |
4 |
--------------------------------------------------------------------------------
/pakman/test/liquid/pak/hello.txt:
--------------------------------------------------------------------------------
1 |
2 | just some text
3 | no front matter
4 |
--------------------------------------------------------------------------------
/pakman/test/pages/page2.txt:
--------------------------------------------------------------------------------
1 | ---
2 | ---
3 |
4 | try empty front matter
5 |
6 |
--------------------------------------------------------------------------------
/pakman/test/pages/page3.txt:
--------------------------------------------------------------------------------
1 | ---
2 | # try empty front matter with comments
3 | ---
--------------------------------------------------------------------------------
/pakman/History.md:
--------------------------------------------------------------------------------
1 | ## 0.0.1 / 2012-07-17
2 |
3 | * Everything is new. First release
4 |
--------------------------------------------------------------------------------
/pakman/test/pages/page1.txt:
--------------------------------------------------------------------------------
1 | ---
2 | title: hello
3 | ---
4 |
5 | some text here
6 |
--------------------------------------------------------------------------------
/textutils/HISTORY.md:
--------------------------------------------------------------------------------
1 | ### 0.1.0 / 2012-06-09
2 |
3 | * Everything is new. First release
--------------------------------------------------------------------------------
/linkto/HISTORY.md:
--------------------------------------------------------------------------------
1 | ### 0.0.1 / 2014-03-15
2 |
3 | * Everything is new. First release.
4 |
5 |
--------------------------------------------------------------------------------
/linkto/lib/linkto/version.rb:
--------------------------------------------------------------------------------
1 |
2 | module Linkto
3 | VERSION = '0.1.1'
4 | end
5 |
6 |
7 |
--------------------------------------------------------------------------------
/pakman/test/erb/pak/test.txt:
--------------------------------------------------------------------------------
1 | ######
2 | # simple test manifest
3 |
4 | __file__.html test.html.erb
--------------------------------------------------------------------------------
/pakman/test/liquid/pak/test.txt:
--------------------------------------------------------------------------------
1 | ######
2 | # simple test manifest
3 |
4 | __file__.html test.html
5 |
--------------------------------------------------------------------------------
/pakman/lib/pakman/version.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module Pakman
4 | VERSION = '1.1.0'
5 | end
6 |
--------------------------------------------------------------------------------
/pakman/test/liquid/pak/s9logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rubycocos/text/master/pakman/test/liquid/pak/s9logo.png
--------------------------------------------------------------------------------
/pakman/.gitignore:
--------------------------------------------------------------------------------
1 | # ignore generated folders
2 | pkg/
3 | doc/
4 | tmp/
5 |
6 | # ignore jekyll generated output
7 | site/_site/
8 |
9 |
--------------------------------------------------------------------------------
/pakman/test/helper.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | # minitest setup
4 | require 'minitest/autorun'
5 |
6 |
7 | ## our own code
8 | require 'pakman'
9 |
10 |
--------------------------------------------------------------------------------
/textutils/TODO.md:
--------------------------------------------------------------------------------
1 | # TODOs
2 |
3 | - [ ] add line number to unicode dash warning e.g. *** warning: found ndash U+2013 (-) in file >at-austria/2013_14/cup.txt<; converting to plain ascii hyphen_minus (-)
4 |
5 |
--------------------------------------------------------------------------------
/pakman/test/liquid/pak/hello.doc:
--------------------------------------------------------------------------------
1 | ---
2 | front matter here
3 | ---
4 |
5 | try "unkown extension"
6 | just some text here
7 |
8 | note: front matter will not matter, that is, will get ignored (e.g. not checked)
9 |
--------------------------------------------------------------------------------
/pakman/test/liquid/pak/testbin.txt:
--------------------------------------------------------------------------------
1 | ######
2 | # test manifest with binary files e.g. graphics
3 | # and "unknown" extensions (will get handled like binary e.g. copied 1:1)
4 |
5 | s9logo.png
6 | hello.txt
7 | hello.doc
8 |
--------------------------------------------------------------------------------
/linkto/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | *.rbc
3 | .bundle
4 | .config
5 | coverage
6 | InstalledFiles
7 | lib/bundler/man
8 | pkg
9 | rdoc
10 | spec/reports
11 | test/tmp
12 | test/version_tmp
13 | tmp
14 |
15 | # YARD artifacts
16 | .yardoc
17 | _yardoc
18 | doc/
19 |
--------------------------------------------------------------------------------
/pakman/test/data/test.yml:
--------------------------------------------------------------------------------
1 | headers:
2 | title: test title
3 | author: test author
4 |
5 |
6 | slides:
7 | - header: test header 1
8 | content: test content 1
9 | - header: test header 2
10 | content: test content 2
11 | - content: test content 3
12 |
13 |
--------------------------------------------------------------------------------
/textutils/test/data/de-deutschland/orte.txt:
--------------------------------------------------------------------------------
1 | 2 Bayern
2 | 24 .. Oberfranken
3 | 241 .... Bamberg (Stadt) ## Kreisfreie Stadt
4 | ...... Bamberg
5 | ........ Bamberg
6 |
7 | #####
8 | # todo: for testing add berlin and some more
9 |
10 | 9 Berlin
11 | 91 .. Berlin
12 |
13 |
--------------------------------------------------------------------------------
/pakman/bin/pakman:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | ###################
4 | # == DEV TIPS:
5 | #
6 | # For local testing run like:
7 | #
8 | # ruby -Ilib bin/pakman
9 | #
10 | # Set the executable bit in Linux. Example:
11 | #
12 | # % chmod a+x bin/pakman
13 | #
14 |
15 | require 'pakman'
16 |
17 | Pakman.main
18 |
--------------------------------------------------------------------------------
/linkto/Manifest.txt:
--------------------------------------------------------------------------------
1 | HISTORY.md
2 | Manifest.txt
3 | README.md
4 | Rakefile
5 | lib/linkto.rb
6 | lib/linkto/bing.rb
7 | lib/linkto/flickr.rb
8 | lib/linkto/google.rb
9 | lib/linkto/untappd.rb
10 | lib/linkto/version.rb
11 | lib/linkto/wikipedia.rb
12 | test/helper.rb
13 | test/test_google.rb
14 | test/test_wikipedia.rb
15 |
--------------------------------------------------------------------------------
/textutils/test/helper.rb:
--------------------------------------------------------------------------------
1 |
2 | ## $:.unshift(File.dirname(__FILE__))
3 |
4 | ## minitest setup
5 |
6 | require 'minitest/autorun'
7 |
8 |
9 | ## make sure activesupport gets included/required
10 | # note: just activesupport or active_support will NOT work
11 | # require 'active_support/all' # -- now included in textutils itself
12 |
13 |
14 | ## our own code
15 |
16 | require 'textutils'
17 |
--------------------------------------------------------------------------------
/pakman/lib/pakman/utils.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module Pakman
4 |
5 | # Derive a template pack name from a manifest file path.
6 | # Takes the basename, downcases it and removes every '.txt' (anywhere in the name)
7 | # e.g. welcome.quick.txt becomes welcome.quick
8 | # welcome.txt.quick becomes welcome.quick
9 | # s6blank.txt becomes s6blank
10 | # Returns the resulting name as a string.
11 | def self.pakname_from_file( path )
12 | File.basename( path ).downcase.gsub( '.txt', '' )
13 | end
14 |
15 | end # module Pakman
16 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/filter/erb_filter.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module TextUtils
4 | module Filter
5 | # Render content as an ERB template and return the rendered text.
6 | # allow plugins/helpers; process source (including header) using erb; options is accepted but not used here
7 | def erb( content, options={} )
8 | puts " Running embedded Ruby (erb) code/helpers..."
9 | # evaluated with this method's binding, so the template sees self, content and options
10 | content = ERB.new( content ).result( binding() )
11 | content
12 | end
13 | # NOTE(review): ERB runs any Ruby embedded in content - presumably only trusted sources are passed in; confirm
14 | end # module Filter
15 | end # module TextUtils
--------------------------------------------------------------------------------
/textutils/test/data/feedburner.txt:
--------------------------------------------------------------------------------
1 | ####################################
2 | # feedburner text pattern (regex)
3 | #
4 | # pattern (regex)
5 | # ---
6 | # test1
7 | # ---
8 | # test2
9 | # ---
10 | # etc.
11 |
12 |
13 | ]*?
14 | src=("|')(:?http:)?//feeds\.feedburner\.com/~r/[^>]+?\1
15 | .*?>
16 |
17 | ---
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/linkto/lib/linkto/bing.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module Linkto
4 | module BingHelper
5 |
6 |
7 | def link_to_bing_search_images( q, opts={} )
8 | link_to q, "http://www.bing.com/images/search?q=#{q}", opts
9 | end
10 |
11 | ############################
12 | # shortcuts / aliases
13 |
14 | def bing_search_images( q, opts={} ) link_to_bing_search_images( q, opts) end
15 |
16 |
17 | end # module BingHelper
18 | end # module Linkto
19 |
--------------------------------------------------------------------------------
/linkto/lib/linkto/untappd.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module Linkto
4 | module UntappdHelper
5 |
6 |
7 | def link_to_untappd_search( q, opts={} )
8 | link_to q, "https://untappd.com/search?q=#{q}", opts
9 | end
10 |
11 |
12 | ###############################
13 | # shortcuts / aliases
14 |
15 | def untappd_search( q, opts={} ) link_to_untappd_search( q, opts ) end
16 |
17 |
18 | end # module UntappdHelper
19 | end # module Linkto
20 |
--------------------------------------------------------------------------------
/textutils/test/data/cl_all.txt:
--------------------------------------------------------------------------------
1 | #####################################
2 | # test data for fixture reader
3 |
4 |
5 | # -- leagues
6 |
7 | europe-champions-league!/leagues
8 |
9 | # -- 2011_12
10 |
11 | europe-champions-league!/2011_12/cl
12 | europe-champions-league!/2011_12/el
13 |
14 | # -- 2012_13
15 |
16 | europe-champions-league!/2012_13/cl
17 | europe-champions-league!/2012_13/el
18 |
19 | # -- 2013_14
20 |
21 | europe-champions-league!/2013_14/cl
22 |
23 |
--------------------------------------------------------------------------------
/pakman/TODOS.md:
--------------------------------------------------------------------------------
1 | # Todos
2 |
3 | - [ ] check file for front matter; use more "efficient" way
4 |
5 | e.g. do NOT load complete file; just a look-a-head;
6 | try to make it work for binary file too? why? why not?
7 | check how jekyll checks for front matter; does jekyll also
8 | check binary files? does the file extension matter (e.g. png, gif, html, css, etc)??
9 |
10 |
11 | ## robots.txt
12 |
13 | - [ ] see osm blogs templates; uses robots.txt template - do NOT use as manifest; add to exclude list !!!!
14 |
--------------------------------------------------------------------------------
/textutils/test/test_tree_reader_ii.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | ###
4 | # to run use
5 | # ruby -I ./lib -I ./test test/test_tree_reader_ii.rb
6 |
7 |
8 | require 'helper'
9 |
10 | class TestTreeReaderIi < MiniTest::Test
11 |
12 | def test_at_n
13 | reader = TreeReader.from_file( "#{TextUtils.root}/test/data/at-austria/1--n-niederoesterreich/orte.txt" )
14 |
15 | reader.check
16 |
17 | assert true ## assume everything ok if we get here
18 | end
19 |
20 | end # class TestTreeReaderIi
21 |
--------------------------------------------------------------------------------
/linkto/test/test_wikipedia.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 |
4 | require 'helper'
5 |
6 |
7 | class TestWikipedia < MiniTest::Unit::TestCase
8 | # Exercises the wikipedia search shortcuts mixed in via LinktoHelper.
9 | include LinktoHelper
10 |
11 | def test_search
12 | # NOTE(review): both expectations are the bare query text - presumably link_to is stubbed in helper.rb; confirm
13 | assert_equal "ottakringer", wikipedia_search( 'ottakringer' )
14 | assert_equal "ottakringer", wikipedia_de_search( 'ottakringer' )
15 |
16 | end
17 |
18 | end # class TestWikipedia
19 |
--------------------------------------------------------------------------------
/pakman/test/liquid/test.html:
--------------------------------------------------------------------------------
1 |
2 |
41 | # end
42 | end
43 |
44 | end # class CodeReader
45 |
--------------------------------------------------------------------------------
/pakman/lib/pakman/copier.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module Pakman
4 | # Copies the files listed in a template pack manifest to an output folder (plain file copy).
5 | class Copier
6 |
7 | include LogUtils::Logging
8 | # manifestsrc - path of the manifest file; the pack name is derived from it
9 | # pakpath - output folder; manifest destinations are resolved relative to it
10 | def copy_pak( manifestsrc, pakpath )
11 | # start time is only used for the elapsed-time log line at the end
12 | start = Time.now
13 |
14 | pakname = Pakman.pakname_from_file( manifestsrc )
15 |
16 | logger.info "Copying template pack '#{pakname}'"
17 |
18 | ## todo: after deprecation change back to just load_file
19 | manifest = Manifest.load_file_v2( manifestsrc )
20 | # each manifest entry is used as a pair: [0] destination, [1] source
21 | manifest.each do |entry|
22 | dest = entry[0]
23 | source = entry[1]
24 |
25 | # get full (absolute) path and make sure path exists
26 | destfull = File.expand_path( dest, pakpath )
27 | destpath = File.dirname( destfull )
28 | FileUtils.makedirs( destpath ) unless File.directory?( destpath )
29 |
30 | logger.debug "destfull=>#{destfull}<"
31 | logger.debug "destpath=>#{destpath}<"
32 |
33 | logger.info " Copying to #{dest} from #{source}..."
34 | FileUtils.copy( source, destfull )
35 | end
36 |
37 | logger.info "Done (in #{Time.now-start} s)."
38 | end # method copy_pak
39 |
40 | end # class Copier
41 | end # module Pakman
42 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/helper/tag_helper.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module TextUtils
4 | module TagHelper
5 |
6 | ####
7 | # - todo: use new additional sub module ???
8 | # e.g. TextUtils::Reader::TagHelper
9 | # lets us use "classic" web helpers a la rails
10 | # find a good name for sub module - Reader? Fixtures? Values? Parser?
11 |
12 | # Split a '|'-separated tag string into an array of cleaned-up tag keys.
13 | def find_tags( value )
14 | # logger.debug " found tags: >>#{value}<<"
15 |
16 | tag_keys = value.split('|')
17 |
18 | ## unify; replace _ w/ space; remove leading and trailing whitespace
19 | tag_keys = tag_keys.map do |key|
20 | key = key.gsub( '_', ' ' )
21 | key = key.strip
22 | key
23 | end
24 |
25 | tag_keys # return tag keys as ary
26 | end
27 | # Extract the tag keys from attribs[:tags]; returns [] if no tags are present.
28 | def find_tags_in_attribs!( attribs )
29 | # NB: will remove :tags from attribs hash
30 | # note: present? is not core ruby; presumably provided by activesupport - confirm
31 | if attribs[:tags].present?
32 | tag_keys = find_tags( attribs[:tags] )
33 | attribs.delete(:tags)
34 | tag_keys # return tag keys as ary of strings
35 | else
36 | [] # nothing found; return empty ary
37 | end
38 | end
39 |
40 | end # module TagHelper
41 | end # module TextUtils
42 |
--------------------------------------------------------------------------------
/pakman/Manifest.txt:
--------------------------------------------------------------------------------
1 | History.md
2 | Manifest.txt
3 | README.md
4 | Rakefile
5 | bin/pakman
6 | lib/pakman.rb
7 | lib/pakman/cli/commands/fetch.rb
8 | lib/pakman/cli/commands/gen.rb
9 | lib/pakman/cli/commands/list.rb
10 | lib/pakman/cli/ctx.rb
11 | lib/pakman/cli/helpers.rb
12 | lib/pakman/cli/opts.rb
13 | lib/pakman/cli/runner.rb
14 | lib/pakman/copier.rb
15 | lib/pakman/erb/template.rb
16 | lib/pakman/erb/templater.rb
17 | lib/pakman/fetcher.rb
18 | lib/pakman/finder.rb
19 | lib/pakman/liquid/template.rb
20 | lib/pakman/liquid/templater.rb
21 | lib/pakman/manifest.rb
22 | lib/pakman/page.rb
23 | lib/pakman/utils.rb
24 | lib/pakman/version.rb
25 | test/data/test.yml
26 | test/erb/pak/test.html.erb
27 | test/erb/pak/test.txt
28 | test/helper.rb
29 | test/liquid/pak/hello.doc
30 | test/liquid/pak/hello.txt
31 | test/liquid/pak/s9logo.png
32 | test/liquid/pak/test.html
33 | test/liquid/pak/test.txt
34 | test/liquid/pak/testbin.txt
35 | test/liquid/test.html
36 | test/pages/empty.txt
37 | test/pages/page1.txt
38 | test/pages/page2.txt
39 | test/pages/page3.txt
40 | test/pages/text.txt
41 | test/test_erb.rb
42 | test/test_liquid.rb
43 | test/test_liquid_binaries.rb
44 | test/test_liquid_drops.rb
45 | test/test_page.rb
46 |
--------------------------------------------------------------------------------
/attic/fixture_reader.rb:
--------------------------------------------------------------------------------
1 |
2 | if @path.ends_with?( '.yml' ) || @path.ends_with?( '.yaml' )
3 | ### fix/todo: remove later on!!! - do not use!!
4 | puts "deprecated api - FixtureReader w/ yaml format - will get removed; please use new plain text manifest format"
5 | @ary = old_deprecated_yaml_reader( text )
6 | else
7 | ..
8 | end
9 |
10 |
11 | def old_deprecated_yaml_reader( text )
12 | hash = YAML.load( text )
13 | # NOTE(review): YAML.load can build arbitrary objects; consider YAML.safe_load if text may be untrusted
14 | ### build up array for fixtures from hash
15 | ary = []
16 | # the keys are only used for logging; the values (string or array of strings) are flattened into ary
17 | hash.each do |key_wild, value_wild|
18 | key = key_wild.to_s.strip
19 |
20 | logger.debug "yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value_wild}<<"
21 |
22 | if value_wild.kind_of?( String ) # assume single fixture name
23 | ary << value_wild
24 | elsif value_wild.kind_of?( Array ) # assume array of fixture names as strings
25 | ary = ary + value_wild
26 | else
27 | logger.error "unknow fixture type in setup (yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value_wild}<<); skipping"
28 | end
29 | end
30 | ary # return fixture ary
31 | end
32 |
--------------------------------------------------------------------------------
/textutils/test/test_title_mapper2.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | ###
4 | # to run use
5 | # ruby -I ./lib -I ./test test/test_title_mapper2.rb
6 |
7 |
8 | require 'helper'
9 |
10 |
11 | class TestTitleMapper2 < Minitest::Test
12 |
13 | ClubStruct = Struct.new(:key, :title, :synonyms)
14 |
15 | def test_title_table
16 |
17 | titles_in = [
18 | ClubStruct.new( 'barcelona', 'Barcelona', 'FC Barcelona' ),
19 | ClubStruct.new( 'espanyol', 'Espanyol', 'RCD Espanyol|Espanyol Barcelona' ),
20 | ClubStruct.new( 'sevilla', 'Sevilla', 'Sevilla FC' )
21 | ]
22 |
23 | mapper = TextUtils::TitleMapper2.new( titles_in, 'club' )
24 | titles_out = mapper.known_titles
25 |
26 | puts 'titles_out:'
27 | pp titles_out
28 |
29 | line = "Espanyol Barcelona 1-0 FC Barcelona"
30 | mapper.map_titles!( line )
31 | puts "=> #{line}"
32 |
33 | club1 = mapper.find_key!( line )
34 | club2 = mapper.find_key!( line )
35 | puts "=> #{line}"
36 |
37 | assert_equal 'espanyol', club1
38 | assert_equal 'barcelona', club2
39 |
40 | assert true ## assume everything ok if we get here
41 |
42 | end # method test_title_table
43 |
44 |
45 | end # class TestTitleMapper2
46 |
--------------------------------------------------------------------------------
/pakman/lib/pakman/cli/opts.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module Pakman
4 | # Holds the command line option values; readers fall back to a default when a value was never set.
5 | class Opts
6 |
7 | def list=(value)
8 | @list = value
9 | end
10 | # true only if the list flag was set to exactly true
11 | def list?
12 | return false if @list.nil? # default list flag is false
13 | @list == true
14 | end
15 |
16 |
17 | def generate=(value)
18 | @generate = value
19 | end
20 | # true only if the generate flag was set to exactly true
21 | def generate?
22 | return false if @generate.nil? # default generate flag is false
23 | @generate == true
24 | end
25 |
26 |
27 | def fetch_uri=(value)
28 | @fetch_uri = value
29 | end
30 | # returns a placeholder text if no fetch uri was set
31 | def fetch_uri
32 | @fetch_uri || '-fetch uri required-'
33 | end
34 | # fetch is requested whenever a fetch uri was set (non-nil)
35 | def fetch?
36 | @fetch_uri.nil? ? false : true
37 | end
38 |
39 |
40 | def manifest=(value)
41 | @manifest = value
42 | end
43 |
44 | ## fix:/todo: use a different default manifest
45 | def manifest
46 | @manifest || 's6.txt'
47 | end
48 |
49 |
50 | def config_path=(value)
51 | @config_path = value
52 | end
53 | # default is the literal string '~/.pak' (not expanded here)
54 | def config_path
55 | @config_path || '~/.pak'
56 | end
57 |
58 |
59 | def output_path=(value)
60 | @output_path = value
61 | end
62 | # default is the current working folder
63 | def output_path
64 | @output_path || '.'
65 | end
66 |
67 | end # class Opts
68 | end # module Pakman
69 |
--------------------------------------------------------------------------------
/pakman/test/test_liquid_drops.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | ###
4 | # to run use
5 | # ruby -I ./lib -I ./test test/test_liquid_drops.rb
6 |
7 |
8 | require 'helper'
9 |
10 |
11 | class TestLiquidDrops < MiniTest::Test
12 |
13 | class HeadersDrop < Liquid::Drop
14 |
15 | def initialize( h )
16 | @h = h
17 | end
18 |
19 | def author() puts "call author"; @h['author']; end
20 | def title() puts "call title"; @h['title']; end
21 | end
22 |
23 | class SlideDrop < Liquid::Drop
24 |
25 | def initialize( h )
26 | @h = h
27 | end
28 |
29 | def content() puts "call content"; @h['content']; end
30 | def header() puts "call header"; @h['header']; end
31 | end
32 |
33 | def setup
34 | Liquid::Template.error_mode = :strict
35 | end
36 |
37 |
38 | def test_template
39 | hash = YAML.load_file( "#{Pakman.root}/test/data/test.yml" )
40 | headers = HeadersDrop.new( hash['headers'] )
41 | slides = hash['slides'].map { |h| SlideDrop.new( h ) }
42 | ctx= { 'headers' => headers, 'slides' => slides }
43 | pp ctx
44 |
45 | path = "#{Pakman.root}/test/liquid/test.html"
46 | t = Pakman::LiquidTemplate.from_file( path )
47 | pp t.render( ctx )
48 |
49 | assert true
50 | end
51 |
52 | end # class TestLiquidDrops
53 |
54 |
--------------------------------------------------------------------------------
/attic/line_reader_v2.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 |
4 | # fix: move into TextUtils namespace/module!!
5 |
6 |
7 | class LineReaderV2
8 | include LogUtils::Logging
9 | # name - fixture name (may contain a virtual path prefix ending in !/); include_path - base folder
10 | def initialize( name, include_path )
11 | @name = name
12 | @include_path = include_path
13 |
14 | # map name to name_real_path
15 | # name might include !/ for virtual path (gets cut off)
16 | # e.g. at-austria!/w-wien/beers becomes w-wien/beers
17 |
18 | pos = @name.index( '!/')
19 | if pos.nil?
20 | @name_real_path = @name # not found; real path is the same as name
21 | else
22 | # cut off everything until !/ e.g.
23 | # at-austria!/w-wien/beers becomes
24 | # w-wien/beers
25 | @name_real_path = @name[ (pos+2)..-1 ]
26 | end
27 | end
28 |
29 | attr_reader :name
30 | attr_reader :name_real_path
31 | attr_reader :include_path
32 | # Yield every line that LineReader produces for <include_path>/<name_real_path>.txt
33 | def each_line
34 | path = "#{include_path}/#{name_real_path}.txt"
35 | reader = LineReader.from_file( path )
36 |
37 | logger.info "parsing data '#{name}' (#{path})..."
38 |
39 | reader.each_line do |line|
40 | yield( line )
41 | end
42 | # runs after all lines were yielded; presumably records the fixture as loaded - confirm
43 | ConfDb::Model::Prop.create_from_fixture!( name, path )
44 | end
45 |
46 | end # class LineReaderV2
47 |
48 |
--------------------------------------------------------------------------------
/pakman/lib/pakman/cli/commands/gen.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module Pakman
4 | # Generate command: copies a template pack, or merges it once per data file passed in args.
5 | class Gen
6 |
7 | include LogUtils::Logging
8 |
9 | include ManifestHelper
10 |
11 | def initialize( opts )
12 | @opts = opts
13 | end
14 |
15 | attr_reader :opts
16 | # args - paths of yaml data files; if empty the pack is copied as is
17 | def run( args )
18 | manifest_name = opts.manifest
19 | manifest_name = manifest_name.downcase.gsub('.txt', '' ) # remove .txt if present
20 |
21 | logger.debug "manifest=#{manifest_name}"
22 |
23 | # check for matching manifests
24 | manifests = installed_template_manifests.select { |m| m[0] == manifest_name+'.txt' }
25 | # unknown pack: print an error and terminate the process with exit status 2
26 | if manifests.empty?
27 | puts "*** error: unknown template pack '#{manifest_name}'; use pakman -l to list installed template packs"
28 | exit 2
29 | end
30 | # use the first match; entry [1] is passed on as the manifest source path
31 | manifestsrc = manifests[0][1]
32 | pakpath = opts.output_path
33 |
34 | if args.empty?
35 | Copier.new.copy_pak( manifestsrc, pakpath )
36 | else
37 | args.each do |arg|
38 | data = YAML.load_file( arg )
39 | name = File.basename( arg, '.*' )
40 | puts "#{name}:"
41 | pp data
42 | Templater.new.merge_pak( manifestsrc, pakpath, Ctx.new(data).ctx, name )
43 | end
44 | end
45 |
46 | end
47 |
48 | end # class Gen
49 | end # module Pakman
50 |
--------------------------------------------------------------------------------
/pakman/test/test_liquid_binaries.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | ###
4 | # to run use
5 | # ruby -I ./lib -I ./test test/test_liquid_binaries.rb
6 |
7 |
8 | require 'helper'
9 |
10 |
11 | class TestLiquidBinaries < MiniTest::Test
12 |
13 |
14 | def setup
15 | Liquid::Template.error_mode = :strict
16 | end
17 |
18 |
19 | def test_rx
20 | rx = Pakman::LiquidTemplater::REGEX_EXT
21 |
22 | pp rx
23 |
24 | ## todo: check why assert rx.match( 'test.html' ) == true doesn't work
25 | ## (note: regex.match will return MatchData or nil)
26 |
27 | assert rx.match( 'test.html' ).nil? == false
28 | assert rx.match( 'TEST.HTML' ).nil? == false
29 | assert rx.match( 'test.js' ).nil? == false
30 | assert rx.match( 'test.json' ).nil? == false
31 | assert rx.match( 'test.gif' ).nil? == true
32 | end
33 |
34 |
35 | def test_merge
36 | hash = YAML.load_file( "#{Pakman.root}/test/data/test.yml" )
37 | ctx= { 'headers' => hash['headers'], 'slides' => hash['slides'] }
38 | pp ctx
39 |
40 | manifestsrc = "#{Pakman.root}/test/liquid/pak/testbin.txt"
41 | outpath = "#{Pakman.root}/tmp/#{Time.now.to_i}" ## pakpath/output path
42 |
43 | Pakman::LiquidTemplater.new.merge_pak( manifestsrc, outpath, ctx, 'test' )
44 |
45 | assert true
46 | end # method test_merge
47 |
48 | end # class TestLiquidBinaries
49 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/filter/code_filter.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module TextUtils
4 | module Filter
5 | # Patch {{{ / }}} code block markers in content (in place) and return content.
6 | def code_block_curly_style( content, options={} )
7 | # replace {{{ w/ the opening replacement text (an empty string here - NOTE(review): presumably markup was intended; confirm)
8 | # replace }}} w/ the closing replacement text (an empty string here - NOTE(review): same as above; confirm)
9 | # use 4-6 { or } to escape back to literal value (e.g. {{{{ or {{{{{{ => {{{ )
10 | # note: {{{ / }}} are anchored to beginning of line ( spaces and tabs before {{{/}}} allowed )
11 |
12 | # track statistics
13 | code_begin = 0
14 | code_begin_esc = 0
15 | code_end = 0
16 | code_end_esc = 0
17 | # note: the leading spaces/tabs are part of the match, so they get replaced too
18 | content.gsub!( /^[ \t]*(\{{3,6})/ ) do |match|
19 | escaped = ($1.length > 3)
20 | if escaped
21 | code_begin_esc += 1
22 | "{{{"
23 | else
24 | code_begin += 1
25 | ""
26 | end
27 | end
28 |
29 | content.gsub!( /^[ \t]*(\}{3,6})/ ) do |match|
30 | escaped = ($1.length > 3)
31 | if escaped
32 | code_end_esc += 1
33 | "}}}"
34 | else
35 | code_end += 1
36 | ""
37 | end
38 | end
39 |
40 | puts " Patching {{{/}}}-code blocks (#{code_begin}/#{code_end} blocks, " +
41 | "#{code_begin_esc}/#{code_end_esc} escaped blocks)..."
42 |
43 | content
44 | end
45 |
46 | end # module Filter
47 | end # module TextUtils
--------------------------------------------------------------------------------
/linkto/README.md:
--------------------------------------------------------------------------------
1 | # linkto
2 |
3 | linkto gem - link_to helpers for google search, bing search, flickr photo search, flickr photo tag, etc.
4 |
5 | * home :: [github.com/rubylibs/linkto](https://github.com/rubylibs/linkto)
6 | * bugs :: [github.com/rubylibs/linkto/issues](https://github.com/rubylibs/linkto/issues)
7 | * gem :: [rubygems.org/gems/linkto](https://rubygems.org/gems/linkto)
8 | * rdoc :: [rubydoc.info/gems/linkto](http://rubydoc.info/gems/linkto)
9 |
10 |
11 | ## Usage
12 |
13 | link_to_google_search 'open mundi'
14 |
15 | will become
16 |
17 | https://www.google.com/search?q=open+mundi
18 |
19 |
20 | ### Google
21 |
22 | - `link_to_google_search`
23 | - `link_to_google_de_search`
24 | - `link_to_google_search_images`
25 |
26 | ### Bing
27 |
28 | - `link_to_bing_search_images`
29 |
30 | ### Flickr
31 |
32 | - `link_to_flickr_tags`
33 | - `link_to_flickr_search`
34 |
35 | ### Wikipedia
36 |
37 | - `link_to_wikipedia_search`
38 | - `link_to_wikipedia_de_search`
39 |
40 | ### Untappd
41 |
42 | - `link_to_untappd_search`
43 |
44 |
45 |
46 | ## Real World Usage
47 |
48 | - [beer.db.admin](https://github.com/geraldb/beer.db.admin) - open source world beer guide; beer.db browser
49 |
50 |
51 | ## Alternatives
52 |
53 |
54 | ## License
55 |
56 | The `linkto` scripts are dedicated to the public domain.
57 | Use them as you please with no restrictions whatsoever.
58 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/utils.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
module TextUtils
  # Make the helpers available as class-style methods,
  # e.g. TextUtils.convert_unicode_dashes_to_plain_ascii
  #
  # Note: Object#extend handles its arguments last-to-first, so this single
  # call mixes the modules in the order UnicodeHelper, TitleHelper,
  # AddressHelper, StringFilter (StringFilter adds asciify and slugify).
  extend StringFilter, AddressHelper, TitleHelper, UnicodeHelper
end
11 |
12 |
13 |
# Deprecated top-level shim kept for old callers.
#
# Prints a deprecation notice to stdout and delegates to
# TextUtils.title_esc_regex.
#
# title_unescaped - passed through unchanged to TextUtils.title_esc_regex
#
# Returns whatever TextUtils.title_esc_regex returns.
def title_esc_regex( title_unescaped )
  # message fixed: spelling of "deprecated"; module name matches the
  # TitleHelper mixin that TextUtils extends
  puts "*** warn: deprecated fn call: use TextUtils.title_esc_regex() or include TextUtils::TitleHelper"
  TextUtils.title_esc_regex( title_unescaped )
end
18 |
19 |
# Finds the root folder of a library by scanning $LOAD_PATH for an entry
# that ends in /<name>/lib or /<name>-<12 char gitref>/lib
# (e.g. .../beer.db-b7d1c9619a54/lib).
#
# name - library/folder name; matched literally (all regex metacharacters
#        are escaped, not only dots)
#
# Prints the load path and the candidates found to stdout.
# Returns the first matching path with the trailing /lib cut off,
# or nil if nothing matches.
def find_data_path_from_gemfile_gitref( name )
  puts "[textutils] find_data_path( name='#{name}' )..."
  puts "load path:"
  pp $LOAD_PATH

  # escape all chars that are special in a regex e.g. . becomes \.
  name_esc = Regexp.escape( name )

  # note:
  #  - hexdigest must be 12 chars e.g. b7d1c9619a54 or similar
  #  - \z (not $) anchors at the very end of the path
  name_regex = /\/(#{name_esc}-[a-z0-9]{12}|#{name_esc})\/lib\z/

  # load path entries are not guaranteed to be Strings (e.g. Pathname)
  candidates = $LOAD_PATH.map { |path| path.to_s }
                         .select { |path| path =~ name_regex }
                         .map { |path| path[0..-5] }   # cutoff trailing /lib

  puts 'found candidates:'
  pp candidates

  ## use first candidate
  candidates[0]
end
49 |
50 |
--------------------------------------------------------------------------------
/textutils/test/test_slugify.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | require 'helper'
4 |
class TestSlugify < Minitest::Test

  # Checks TextUtils.slugify against a table of [ input, expected slug ]
  # pairs covering accented and other non-ascii letters.
  def test_slugify
    samples = [
      [ 'São Paulo', 'sao-paulo' ],
      [ 'São Gonçalo', 'sao-goncalo' ],
      [ 'Výčepní', 'vycepni' ],
      [ 'Żubr', 'zubr' ],
      [ 'Żywiec', 'zywiec' ],
      [ 'Lomża Export', 'lomza-export' ],
      [ 'Nogne Ø Imperial Stout', 'nogne-o-imperial-stout' ],
      [ 'Xyauyù', 'xyauyu' ],
      [ 'Águila', 'aguila' ],
      [ 'Arena Amazônia', 'arena-amazonia' ],
      [ 'Tōkyō', 'tokyo' ],
      [ 'Ōsaka', 'osaka' ],
      [ 'El Djazaïr', 'el-djazair' ],
      [ 'Al-Kharṭūm', 'al-khartum' ],
      [ 'Ṭarābulus', 'tarabulus' ],
      [ 'Al-Iskandarīyah', 'al-iskandariyah' ],
      [ 'Pishōr', 'pishor' ],
      [ 'Pishāwar', 'pishawar' ],
      [ 'Islām ābād', 'islam-abad' ],
      [ 'Thành Phố Hồ Chí Minh', 'thanh-pho-ho-chi-minh' ],
      [ 'Åland Islands', 'aland-islands' ],
      [ "Pe\u{030C}awar", 'pexawar'] ## note: use unicode literal; Pex̌awar -- see en.wikipedia.org/wiki/Peshawar
    ]

    samples.each do |(text, expected_slug)|
      assert_equal expected_slug, TextUtils.slugify( text )
    end
  end # method test_slugify

end # class TestSlugify
41 |
--------------------------------------------------------------------------------
/attic/values_reader_v2.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | # fix: move into TextUtils namespace/module!!
4 |
5 | ## todo/fix: find a better name than HashReaderV2 (HashReaderPlus?) ??
6 |
class ValuesReaderV2
  include LogUtils::Logging

  attr_reader :name, :name_real_path, :include_path, :more_attribs

  # name         - fixture name; may carry a virtual path prefix that ends
  #                in !/ (e.g. at-austria!/w-wien/beers)
  # include_path - folder the fixture file is looked up in
  # more_attribs - passed through to ValuesReader.new
  def initialize( name, include_path, more_attribs={} )
    @name         = name
    @include_path = include_path
    @more_attribs = more_attribs

    # map name to name_real_path: everything up to and including !/ gets
    # cut off e.g. at-austria!/w-wien/beers becomes w-wien/beers;
    # without a !/ marker the real path is the name itself
    marker_at = @name.index( '!/' )
    @name_real_path = marker_at.nil? ? @name : @name[ (marker_at+2)..-1 ]
  end

  # Reads <include_path>/<name_real_path>.txt with a ValuesReader, yields
  # every ( attribs, values ) pair to the block and afterwards registers
  # the fixture via ConfDb::Model::Prop.create_from_fixture!.
  def each_line
    path   = "#{include_path}/#{name_real_path}.txt"
    reader = ValuesReader.new( path, more_attribs )

    logger.info "parsing data '#{name}' (#{path})..."

    reader.each_line { |attribs, values| yield( attribs, values ) }

    ConfDb::Model::Prop.create_from_fixture!( name, path )
  end

end # class ValuesReaderV2
49 |
50 |
--------------------------------------------------------------------------------
/pakman/lib/pakman.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | ###
4 | # Note: for local testing run like:
5 | #
6 | # 1.9.x: ruby -Ilib lib/pakman.rb
7 |
8 | # core and stlibs
9 |
10 | require 'yaml'
11 | require 'pp'
12 | require 'erb'
13 | require 'logger'
14 | require 'optparse'
15 | require 'fileutils'
16 |
17 | # rubygems
18 |
19 | require 'logutils'
20 | require 'fetcher' # fetch (download) files
21 |
22 |
23 | # 3rd party rubygems
24 | require 'liquid'
25 |
26 | # our own code
27 |
28 | require 'pakman/copier'
29 | require 'pakman/fetcher'
30 | require 'pakman/finder'
31 | require 'pakman/manifest'
32 |
33 | require 'pakman/erb/template'
34 | require 'pakman/erb/templater'
35 |
36 | require 'pakman/liquid/template'
37 | require 'pakman/liquid/templater'
38 |
39 | require 'pakman/page'
40 | require 'pakman/utils'
41 | require 'pakman/version'
42 |
43 | require 'pakman/cli/ctx'
44 | require 'pakman/cli/helpers'
45 | require 'pakman/cli/opts'
46 | require 'pakman/cli/runner'
47 | require 'pakman/cli/commands/fetch'
48 | require 'pakman/cli/commands/gen'
49 | require 'pakman/cli/commands/list'
50 |
51 |
module Pakman

  class << self

    # One-line banner: pakman version plus Ruby version, release date
    # and platform.
    def banner
      "pakman #{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
    end

    # Absolute path of the folder one level above this file's folder.
    def root
      lib_dir = File.dirname( __FILE__ )
      File.expand_path( File.dirname( lib_dir ) )
    end

    # Command line entry point; hands ARGV to a new Runner.
    def main
      Runner.new.run(ARGV)
    end

  end # class << self

end # module Pakman
67 |
68 |
69 | Pakman.main if __FILE__ == $0
70 |
--------------------------------------------------------------------------------
/pakman/test/test_liquid.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | ###
4 | # to run use
5 | # ruby -I ./lib -I ./test test/test_liquid.rb
6 |
7 |
8 | require 'helper'
9 |
10 |
# Smoke tests for the liquid templates/templater; every test passes as long
# as rendering/merging the fixtures does not raise.
#
# Uses Minitest::Test (the MiniTest spelling is only a legacy alias that
# newer minitest releases no longer define by default).
class TestLiquid < Minitest::Test

  def setup
    Liquid::Template.error_mode = :strict
  end

  # Renders the stand-alone liquid template test/liquid/test.html.
  def test_template
    ctx = load_ctx

    path = "#{Pakman.root}/test/liquid/test.html"
    t = Pakman::LiquidTemplate.from_file( path )
    pp t.render( ctx )

    assert true
  end

  # Renders the page template test/liquid/pak/test.html.
  def test_page_template
    ctx = load_ctx

    path = "#{Pakman.root}/test/liquid/pak/test.html"
    t = Pakman::LiquidPageTemplate.from_file( path )
    pp t.render( ctx )

    assert true
  end

  # Merges the pack listed in test/liquid/pak/test.txt into a timestamped
  # folder below tmp/.
  def test_merge
    ctx = load_ctx

    manifestsrc = "#{Pakman.root}/test/liquid/pak/test.txt"
    outpath     = "#{Pakman.root}/tmp/#{Time.now.to_i}"   ## pakpath/output path

    Pakman::LiquidTemplater.new.merge_pak( manifestsrc, outpath, ctx, 'test' )

    assert true
  end # method test_merge

  private

  # Loads test/data/test.yml, builds the template context from its
  # 'headers' and 'slides' entries and pretty-prints it.
  def load_ctx
    hash = YAML.load_file( "#{Pakman.root}/test/data/test.yml" )
    ctx  = { 'headers' => hash['headers'], 'slides' => hash['slides'] }
    pp ctx
    ctx
  end

end # class TestLiquid
57 |
58 |
--------------------------------------------------------------------------------
/pakman/lib/pakman/finder.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
module Pakman

  # Collects manifest files matching glob patterns.
  class Finder

    include LogUtils::Logging

    # Expands every glob pattern and collects the matching files.
    #
    # patterns - Array of glob patterns; backslashes are treated as path
    #            separators. The pattern strings are NOT modified
    #            (so frozen strings are fine too).
    # excludes - Array of File.fnmatch patterns; matching files get skipped
    #
    # Returns an Array of [ basename, path ] pairs; directories are never
    # included.
    def find_manifests( patterns, excludes=[] )
      manifests = []

      patterns.each do |pattern|
        # normalize path; make sure all paths use / only
        # (work on a copy - do not change the caller's string)
        glob = pattern.gsub( '\\', '/' )
        logger.debug "Checking >#{glob}<"

        Dir.glob( glob ) do |file|
          logger.debug " Found manifest candidate >#{file}<"

          if File.directory?( file ) # NB: do not include directories
            logger.debug " Skipping match; it's a directory"
            next
          end

          next if exclude?( file, excludes ) # check for excludes; skip if excluded

          logger.debug " Adding match >#{file}<"

          ## todo/fix:
          #   array first entry - downcase and gsub('.txt','') ??
          #   use Pakman.pakname_from_file()

          manifests << [ File.basename( file ), file ]
        end
      end

      manifests
    end

    private

    # Returns true if file matches any of the exclude patterns
    # (case-insensitive; wildcards match leading dots too).
    def exclude?( file, excludes )
      excludes.each do |pattern|
        ## todo: FNM_DOTMATCH helps or not?? (make up some tests??)
        if File.fnmatch?( pattern, file, File::FNM_CASEFOLD | File::FNM_DOTMATCH )
          logger.debug " Skipping match; it's excluded by pattern >#{pattern}<"
          return true
        end
      end
      false
    end

  end # class Finder
end # module Pakman
51 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/sanitizier.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
require 'cgi'   # CGI.unescapeHTML is used by handle_entities

module TextUtils

  # Converts hypertext (html source) into plain text by dropping or
  # unwrapping tags and unescaping entities.
  class Sanitizier

    include LogUtils::Logging

    @@ignore_tags = %w{ head script style }   # removed together with their content
    @@inline_tags = %w{ span b i u }          # unwrapped; content followed by a space
    @@block_tags  = %w{ p div ul ol }         # unwrapped; content put on its own line


    # ht - hypertext (html source); never modified by this class
    def initialize( ht )
      @ht = ht
    end

    # Runs all tag rules and finally unescapes entities.
    #
    # Returns a new String; the source passed to the constructor is left
    # untouched.
    def to_plain_text
      ht = handle_ignore_tags( @ht )

      ## handle_pre_tags ?? - special rule for preformatted (keep whitespace)

      ht = handle_inline_tags( ht )
      ht = handle_block_tags( ht )
      ht = handle_other_tags( ht )  # rules for remain/left over tags

      handle_entities( ht )
    end

    # Unescapes html entities using CGI.unescapeHTML.
    def handle_entities( ht )
      CGI.unescapeHTML( ht )
    end

    # Builds the regex for a complete <tag ...>content</tag> element
    # (content captured as group 1).
    def tag_regex( tag )
      # note: use non-greedy .*? for content
      #       \b stops a short tag name from matching a longer one
      #       (e.g. u must not match <ul>, p must not match <pre>)
      /<#{tag}\b[^>]*>(.*?)<\/#{tag}>/mi
    end

    # Removes ignored elements including their content.
    def handle_ignore_tags( ht )
      @@ignore_tags.inject( ht ) { |text, tag| text.gsub( tag_regex(tag), '' ) }
    end

    # Replaces inline elements by their content plus a trailing space.
    def handle_inline_tags( ht )
      @@inline_tags.inject( ht ) { |text, tag| text.gsub( tag_regex(tag), '\1 ' ) }
    end

    # Replaces block elements by their content wrapped in newlines.
    def handle_block_tags( ht )
      # note: the backslash must be doubled inside a double-quoted string;
      #       a plain "\1" is the control char \x01, not a backreference
      @@block_tags.inject( ht ) { |text, tag| text.gsub( tag_regex(tag), "\n\\1\n" ) }
    end

    # Removes any tag markup that is still left over.
    #
    # NOTE(review): to_plain_text called this method but no definition was
    # present in this file; this minimal rule just strips the remaining
    # tags - confirm no other definition is expected.
    def handle_other_tags( ht )
      ht.gsub( /<[^>]+>/, '' )
    end


  end # class Sanitizier

end # module TextUtils
73 |
--------------------------------------------------------------------------------
/attic/hash_reader_v2.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | # fix: move into TextUtils namespace/module!!
4 |
5 | ## todo/fix: find a better name than HashReaderV2 (HashReaderPlus?) ??
6 |
class HashReaderV2
  include LogUtils::Logging

  attr_reader :name, :name_real_path, :include_path

  # name         - fixture name; may carry a virtual path prefix that ends
  #                in !/ (e.g. at-austria!/w-wien/beers)
  # include_path - folder the fixture file is looked up in
  def initialize( name, include_path )
    @name         = name
    @include_path = include_path

    # map name to name_real_path: everything up to and including !/ gets
    # cut off e.g. at-austria!/w-wien/beers becomes w-wien/beers;
    # without a !/ marker the real path is the name itself
    marker_at = @name.index( '!/' )
    @name_real_path = marker_at.nil? ? @name : @name[ (marker_at+2)..-1 ]
  end

  # Reads <include_path>/<name_real_path>.yml with a HashReader, yields
  # every ( key, value ) pair of HashReader#each to the block and afterwards
  # registers the fixture via ConfDb::Model::Prop.create_from_fixture!.
  def each
    path   = "#{include_path}/#{name_real_path}.yml"
    reader = HashReader.from_file( path )

    logger.info "parsing data '#{name}' (#{path})..."

    reader.each { |key, value| yield( key, value ) }

    ConfDb::Model::Prop.create_from_fixture!( name, path )
  end


  # Same as each, but yields the pairs of HashReader#each_typed.
  def each_typed
    path   = "#{include_path}/#{name_real_path}.yml"
    reader = HashReader.from_file( path )

    logger.info "parsing data '#{name}' (#{path})..."

    reader.each_typed { |key, value| yield( key, value ) }

    ConfDb::Model::Prop.create_from_fixture!( name, path )
  end


end # class HashReaderV2
62 |
--------------------------------------------------------------------------------
/pakman/lib/pakman/page.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
module Pakman


  ## Jekyll-style page
  ##   with optional front-matter (yaml block)

  class Page

    # Reads the page from a file (utf-8; a leading byte order mark gets
    # stripped) and prints a progress line to stdout.
    #
    # Returns a new Page.
    def self.from_file( path )
      puts " Loading page (from file) >#{path}<..."
      # block form makes sure the file handle gets closed
      text = File.open( path, 'r:bom|utf-8' ) { |f| f.read }  ## note: assume utf8
      new( text, path: path )   ## note: pass along path as an option
    end

    # Builds the page from a string.
    def self.from_string( text )   ### use parse as alias - why?? why not??
      new( text )
    end

    attr_reader :contents   # page text without the front matter block
    attr_reader :headers    # Hash of front matter entries or nil if there is no block

    ## has headers/metadata (front matter block) - yes/no - use hash for check for now
    def headers?() @headers.kind_of?( Hash ); end

    ## front matter block: a first line of --- up to (and including)
    ##   the next line of ---
    ##
    ## note: \A anchors the block at the very start of the text
    ##   (^ would also match a --- line later on and drop everything before it)
    ##
    ## note: include --- in headers
    ##   e.g. --- results in nil
    ##   empty string (without leading ---) results in false! (we want nil if no headers for empty block)
    HEADERS_PATTERN = /
                       \A(?<headers>---\s*\n
                       .*?)
                       ^(---\s*$\n?)
                      /xm

    # text - page source with optional front matter block
    # opts - options hash (e.g. path: from from_file); not used for now
    def initialize( text, opts={} )
      ## todo/fix: check regex in jekyll (add link to source etc.)
      if m=HEADERS_PATTERN.match( text )
        @contents = m.post_match
        ## NOTE(review): YAML.load on page content - switch to YAML.safe_load
        ##   if pages may come from untrusted sources
        @headers  = YAML.load( m[:headers] )
        @headers  = {} if @headers.nil?   ## empty block: use/assign empty hash
      else
        @contents = text
        @headers  = nil
      end
    end

  end # class Page
end # module Pakman
56 |
--------------------------------------------------------------------------------
/pakman/lib/pakman/erb/templater.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
module Pakman

  ### todo:
  ##   rename to ErbTemplater (or RubyTemplater) - why? why not?


  # Merges a template pack: erb templates listed in the manifest get
  # rendered, all other files get copied as is.
  class Templater

    include LogUtils::Logging


    # manifestsrc - path of the manifest file listing the pack entries
    # pakpath     - output folder; destinations are resolved relative to it
    # binding     - passed to ErbTemplate#render for every template
    # name        - replaces every __file__ placeholder in destination names
    def merge_pak( manifestsrc, pakpath, binding, name )

      start = Time.now

      pakname = Pakman.pakname_from_file( manifestsrc )

      logger.info "Merging template pack '#{pakname}'"

      # todo: rename to load_file once deprecated API got removed
      manifest = Manifest.load_file_v2( manifestsrc )

      manifest.each do |entry|
        source = entry[1]
        dest   = entry[0].gsub( '__file__', name )   # replace w/ name

        # get full (absolute) path and make sure path exists
        destfull = File.expand_path( dest, pakpath )
        destpath = File.dirname( destfull )
        FileUtils.makedirs( destpath ) unless File.directory?( destpath )

        logger.debug "destfull=>#{destfull}<"
        logger.debug "destpath=>#{destpath}<"

        # template if .erb is an inner or the last extension
        # note: the dot is escaped - a name like superb must not match
        if source =~ /\.erb(\.|\z)/
          logger.info " Merging to #{dest}..."

          # render first; the block form closes the file even on errors
          text = ErbTemplate.from_file( source ).render( binding )
          File.open( destfull, 'w+:utf-8' ) do |out|   ## note: use utf8 (by default)
            out << text
          end
        else
          logger.info " Copying to #{dest} from #{source}..."

          FileUtils.copy( source, destfull )
        end
      end # each entry in manifest

      logger.info "Done (in #{Time.now-start} s)."
    end # method merge_pak

  end # class Templater
end # module Pakman
60 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/filter/comment_filter.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module TextUtils
4 | module Filter
5 |
6 | def comments_percent_style( content, options={} )
7 |
8 | # remove comments
9 | # % comments
10 | # %begin multiline comment
11 | # %end multiline comment
12 |
13 | # track statistics
14 | comments_multi = 0
15 | comments_single = 0
16 | comments_end = 0
17 |
18 | # remove multi-line comments
19 | content.gsub!(/^%(begin|comment|comments).*?%end/m) do |match|
20 | comments_multi += 1
21 | ""
22 | end
23 |
24 | # remove everyting starting w/ %end (note, can only be once in file)
25 | content.sub!(/^%end.*/m) do |match|
26 | comments_end += 1
27 | ""
28 | end
29 |
30 | # hack/note:
31 | # note multi-line erb expressions/stmts might cause trouble
32 | #
33 | # %> gets escaped as special case (not treated as comment)
34 | # <%
35 | # whatever
36 | # %> /, '' )
161 | end
162 |
163 | def remove_leading_spaces( text )
164 | # remove leading spaces if less than four !!!
165 | text.gsub( /^[ \t]+(?![ \t])/, '' ) # use negative regex lookahead e.g. (?!)
166 | end
167 |
168 | def remove_blanks( text )
169 | # remove lines only with ..
170 | text.gsub( /^[ \t]*\.{2}[ \t]*\n/, '' )
171 | end
172 |
173 | def cleanup_newlines( text )
174 | # remove all blank lines that go over three
175 | text.gsub( /\n{4,}/, "\n\n\n" )
176 | end
177 |
178 |
179 | def concat_lines( text )
180 | # lines ending with ++ will get newlines get removed
181 | # e.g.
182 | # >| hello1 ++
183 | # >1 hello2
184 | # becomes
185 | # >| hello1 hello2
186 |
187 | #
188 | # note: do NOT use \s - will include \n (newline) ??
189 |
190 | text.gsub( /[ \t]+\+{2}[ \t]*\n[ \t]*/, ' ' ) # note: replace with single space
191 | end
192 |
193 |
194 | end # class PageTemplate
195 |
196 | end # module TextUtils
197 |
198 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/reader/hash_reader.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | # fix: move into TextUtils namespace/module!!
4 |
5 |
# Reads a yaml text into a hash and yields its key/value pairs.
#
# Before parsing, the text gets patched for common yaml gotchas:
# tabs are replaced by two spaces and implicit booleans (on, no, y, ...)
# are quoted so that they stay strings.
class HashReader

  include LogUtils::Logging

  # Builds a reader from an entry of an (already opened) zip file.
  #
  # zip_file   - must respond to find_entry( entry_path )
  # entry_path - path of the entry inside the zip file
  def self.from_zip( zip_file, entry_path )
    entry = zip_file.find_entry( entry_path )

    ## todo/fix: add force encoding to utf-8 ??
    ##  check!!!
    ##  clean/prepprocess lines
    ##  e.g. CR/LF (/r/n) to LF (e.g. /n)
    text = entry.get_input_stream().read()

    ## NOTE: needs logger ref; only available in instance methods; use global logger for now
    logger = LogUtils::Logger.root
    logger.debug "text.encoding.name (before): #{text.encoding.name}"
    #####
    # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
    ## NB:
    # for now "hardcoded" to utf8 - what else can we do?
    # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
    text = text.force_encoding( Encoding::UTF_8 )
    logger.debug "text.encoding.name (after): #{text.encoding.name}"

    ## todo:
    # NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
    ## text = TextUtils.convert_unicode_dashes_to_plain_ascii( text, path: path )

    from_string( text )
  end

  # Builds a reader from a file.
  def self.from_file( path )
    ## nb: assume/enfore utf-8 encoding (with or without BOM - byte order mark)
    ##  - see textutils/utils.rb
    text = File.read_utf8( path )
    from_string( text )
  end

  # Builds a reader from a yaml string.
  def self.from_string( text )
    HashReader.new( text: text )
  end

  # arg - Hash with the yaml source in :text, or (deprecated) a String
  #       holding the path of the file to read
  def initialize( arg )

    path = nil   # only known for the deprecated file path form
    if arg.is_a?( String )  ## old style (deprecated) - pass in filepath as string
      path = arg
      logger.info "HashReader.new - deprecated API - use HashReader.from_file() instead"
      text = File.read_utf8( path )
    else  ## assume it's a hash
      text = arg[:text]
    end

    ### hack for syck yaml parser (e.g.ruby 1.9.2) (cannot handle !!null)
    ##   change it to !null to get plain nil
    ##   w/ both syck and psych/libyml

    text = text.gsub( '!!null', '!null' )

    ### hacks for yaml

    ### see yaml gotschas
    ##  - http://www.perlmonks.org/?node_id=738671

    ## replace all tabs w/ two spaces and issue a warning
    ##  nb: yaml does NOT support tabs see why here -> yaml.org/faq.html

    text = text.gsub( "\t" ) do |_|
      # note: \\t so that the message shows the text \t (not a raw tab)
      logger.warn "hash reader - found tab (\\t) replacing w/ two spaces; yaml forbids tabs; see yaml.org/faq.html (path=#{path})"
      '  '   # replace w/ two spaces
    end

    ## quote implicit boolean types on,no,n,y

    ## nb: escape only if key e.g. no: or "free standing" value on its own line e.g.
    ##  no: no

    text = text.gsub( /^([ ]*)(ON|On|on|OFF|Off|off|YES|Yes|yes|NO|No|no|Y|y|N|n)[ ]*:/ ) do |_|
      logger.warn "hash reader - found implicit bool (#{$1}#{$2}) for key; adding quotes to turn into string; see yaml.org/refcard.html (path=#{path})"
      # nb: preserve leading spaces for structure - might be significant
      "#{$1}'#{$2}':"   # add quotes to turn it into a string (not bool e.g. true|false)
    end

    ## nb: value must be freestanding (only allow optional eol comment)
    ##  do not escape if part of string sequence e.g.
    ##  key: nb,nn,no,se   =>  nb,nn,'no',se  -- avoid!!
    #
    # check: need we add true|false too???

    text = text.gsub( /:[ ]+(ON|On|on|OFF|Off|off|YES|Yes|yes|NO|No|no|Y|y|N|n)[ ]*($| #.*$)/ ) do |_|
      logger.warn "hash reader - found implicit bool (#{$1}) for value; adding quotes to turn into string; see yaml.org/refcard.html (path=#{path})"
      ": '#{$1}'"   # add quotes to turn it into a string (not bool e.g. true|false)
    end

    ## NOTE(review): YAML.load - use YAML.safe_load if the text may come
    ##   from untrusted sources
    @hash = YAML.load( text )
  end

  ###
  # nb: returns all values as strings
  #
  # Yields every ( key, value ) pair with both converted to a String and
  # stripped of leading/trailing whitespace.
  def each
    @hash.each do |key_wild, value_wild|
      # normalize
      # - key n value as string (not symbols, bool? int? array?)
      # - remove leading and trailing whitespace
      key   = key_wild.to_s.strip
      value = value_wild.to_s.strip

      logger.debug "yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value}<<"

      yield( key, value )
    end
  end # method each

  ###
  # todo: what name to use: each_object or each_typed ???
  #   or use new TypedHashReader class or similar??
  #
  # Yields every ( key, value ) pair; the key as a stripped String, the
  # value with its yaml type kept (String values get stripped).
  def each_typed
    @hash.each do |key_wild, value_wild|
      # normalize
      # - key as string (not symbols, bool? int? array?)
      # - remove leading and trailing whitespace
      key   = key_wild.to_s.strip
      value = value_wild.is_a?( String ) ? value_wild.strip : value_wild

      logger.debug "yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value}<<"

      yield( key, value )
    end
  end # method each_typed

end # class HashReader
147 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/title_mapper2.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 |
4 | ## see textutils/title.rb
5 | ## for existing code
6 | ## move over here
7 |
8 |
module TextUtils

  # Maps known titles (plus synonyms and codes) found in a line to their
  # record keys using @@oo<key>oo@@ markers, and later extracts the keys
  # from the markers again.
  class TitleMapper2   ## todo/check: rename to NameMapper ? why? why not??

    include LogUtils::Logging

    attr_reader :known_titles   ## rename to mapping or mappings or just titles - why? why not?

    ##
    ##  key:   e.g. augsburg
    ##  title: e.g. FC Augsburg
    ##  length (of title - not pattern):  e.g. 11   -- do not count dots (e.g. U.S.A. => 3 or 6) why? why not?
    ##  pattern: regex source (String) used to find the title in a line
    MappingStruct = Struct.new( :key, :title, :length, :pattern)   ## todo/check: use (rename to) TitleStruct - why? why not??


    # records - must respond to each_with_index; every record is asked for
    #           key, title, synonyms (|-separated) and optionally code
    # tag     - name used for the [TAG] placeholders e.g. team
    def initialize( records, tag )
      @known_titles = build_title_table_for( records )   ## build mapping lookup table

      ## todo: rename tag to attrib or attrib_name - why ?? why not ???
      @tag = tag   # e.g. tag name use for @@brewery@@ @@team@@ etc.
    end


    # Replaces (in place) every known title in line with a @@oo<key>oo@@
    # marker; repeats until no known title is left. Returns nil.
    def map_titles!( line )   ## rename to just map! - why?? why not???
      loop do
        break unless map_title_for!( @tag, line, @known_titles )
      end
    end

    # Replaces (in place) the first marker in line with [TAG].
    # Returns the key or nil if there is no marker.
    def find_key!( line )
      find_key_for!( @tag, line )
    end

    # Replaces (in place) the markers in line with [TAG1], [TAG2], etc.
    # Returns the keys as an Array.
    def find_keys!( line )  # NB: keys (plural!) - will return array
      counter = 1
      keys = []

      key = find_key_for!( "#{@tag}#{counter}", line )
      while key.present?
        keys << key
        counter += 1
        key = find_key_for!( "#{@tag}#{counter}", line )
      end
      keys
    end


    private

    # Builds the lookup table: one MappingStruct per title, synonym and
    # code; longest titles go first.
    def build_title_table_for( records )

      ## build known tracks table w/ synonyms e.g.
      #
      # [[ 'wolfsbrug', 'VfL Wolfsburg'],
      #  [ 'augsburg',  'FC Augsburg'],
      #  [ 'augsburg',  'Augi2'],
      #  [ 'augsburg',  'Augi3' ],
      #  [ 'stuttgart', 'VfB Stuttgart']]

      known_titles = []

      records.each_with_index do |rec,index|

        title_candidates = [ rec.title ]
        title_candidates += rec.synonyms.split('|') if rec.synonyms.present?

        ## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
        #    make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan

        titles = []
        title_candidates.each do |t|
          titles << t
          if t =~ /\(.+\)/
            # remove/delete subtitles
            # note: strip leading n trailing withspaces too!
            #  -- todo: add squish or something if () is inline e.g. leaves two spaces?
            titles << t.gsub( /\(.+\)/, '' ).strip
          end
        end

        titles.each do |t|
          ## note: escape for regex plus allow subs for special chars/accents
          known_titles << MappingStruct.new( rec.key, t, t.length, TextUtils.title_esc_regex( t ) )
        end

        logger.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"

        ## NB: only include code field - if defined
        if rec.respond_to?(:code) && rec.code.present?
          ## note: use code as is (no variants allowed for now); escaped so
          ##   that chars special in a regex (e.g. the dots in U.S.) match literally
          known_titles << MappingStruct.new( rec.key, rec.code, rec.code.length, Regexp.escape( rec.code ) )
        end
      end

      ## note: sort here by length (largest goes first - best match)
      #   exclude code and key (key should always go last)
      known_titles.sort { |left,right| right.length <=> left.length }
    end


    # Replaces (in place) the first known title found in line with its
    # @@oo<key>oo@@ marker.
    # Returns true after the first replacement, false if nothing matched.
    def map_title_for!( tag, line, mappings )

      downcase_tag = tag.downcase

      mappings.each do |mapping|

        key   = mapping.key
        value = mapping.pattern
        ## nb: \b does NOT include space or newline for word boundry (only alphanums e.g. a-z0-9)
        ## (thus add it, allows match for Benfica Lis. for example - note . at the end)

        ## check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
        regex = /\b#{value}(\b| |\t|$)/   # wrap with world boundry (e.g. match only whole words e.g. not wac in wacker)
        if line =~ regex
          logger.debug " match for #{downcase_tag} >#{key}< >#{value}<"
          # make sure @@oo{key}oo@@ doesn't match itself with other key e.g. wacker, wac, etc.
          line.sub!( regex, "@@oo#{key}oo@@ " )   # NB: add one space char at end
          return true   # break out after first match (do NOT continue)
        end
      end
      false
    end


    # Replaces (in place) the first @@oo<key>oo@@ marker in line with
    # [TAG] (tag in upcase).
    # Returns the key or nil if there is no marker.
    def find_key_for!( tag, line )
      regex = /@@oo([^@]+?)oo@@/   # e.g. everything in @@ .... @@ (use non-greedy +? plus all chars but not @, that is [^@])

      m = regex.match( line )
      return nil if m.nil?

      # keep the key in a local - do not rely on $1 still being valid after sub!
      key = m[1]
      logger.debug " #{tag.downcase}: >#{key}<"

      line.sub!( regex, "[#{tag.upcase}]" )

      key
    end # method find_key_for!


  end # class TitleMapper2
end # module TextUtils
169 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/title.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 |
4 | ## todo: rename to TitleFinder or TitleMapper ??
5 | # other options TitleMatcher?
6 | # TitleMapping? TitleMappings?
7 | # or rename to KeyMapping?, KeyMapper?, KeyTable? etc.
8 |
9 |
10 | ######
11 | ## todo/check:
12 | ### remove - use TitleMapper instead
13 | ## deprecated/obsolete - do NOT use will get removed
14 |
15 |
16 | module TextUtils
17 | module TitleTable
18 |
19 | ####
20 | ## fix: turn it into a class w/ methods
21 | #
22 | #e.g t =TitleMapper.new( records, name ) # e.g. name='team'
23 | # t.map!( line )
24 | # t.find_key!( line )
25 | # etc.
26 | #
27 | # see textutils/title_mapper.rb
28 | #
29 | # deprecate code here!!! - move to new TitleMapper class
30 |
31 |
# Deprecated - use TitleMapper.new instead.
#
# Builds the lookup table of known titles (incl. synonyms) for the records
# passed in; one [key, titles] pair per record e.g.
#
#   [[ 'wolfsburg', [ 'VfL Wolfsburg' ]],
#    [ 'augsburg',  [ 'FC Augsburg', 'Augi2', 'Augi3' ]],
#    [ 'stuttgart', [ 'VfB Stuttgart' ]]]
#
# records - objects responding to key, title and synonyms
#           (synonyms is a |-separated string or blank); code is optional
#
# Returns an array of [key, titles] pairs; the titles are regex-escaped
# (see TextUtils.title_esc_regex) and sorted longest first (best match first).
def build_title_table_for( records )
  LogUtils::Logger.root.info " build_title_table_for - deprecated API - use TitleMapper.new instead"

  records.each_with_index.map do |rec, index|
    candidates = [rec.title]
    candidates.concat( rec.synonyms.split('|') ) if rec.synonyms.present?

    ## titles with a subtitle e.g. Grand Prix Japan (Suzuka Circuit)
    #   also get listed w/o the subtitle e.g. Grand Prix Japan
    titles = candidates.flat_map do |candidate|
      if candidate =~ /\(.+\)/
        [candidate, candidate.gsub( /\(.+\)/, '' ).strip]
      else
        [candidate]
      end
    end

    ## longest title goes first (best match); escape only after sorting
    titles = titles.sort { |left, right| right.length <=> left.length }
    titles = titles.map  { |title| TextUtils.title_esc_regex( title ) }

    ## the (optional) code always goes last and is added as is (not escaped)
    titles << rec.code   if rec.respond_to?(:code) && rec.code.present?

    LogUtils::Logger.root.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"

    [rec.key, titles]
  end
end
83 |
84 |
85 |
# Deprecated - use TitleMapper.find_key! instead.
#
# Looks for the first @@oo{key}oo@@ marker in line, replaces it in place
# with the placeholder [NAME] (name upcased) and hands back the key.
#
# name - tag name e.g. 'team1'
# line - text line; gets changed in place on a match
#
# Returns the key found between the markers or nil if there is no marker.
def find_key_for!( name, line )
  LogUtils::Logger.root.info " find_key_for! #{name} - deprecated API - use TitleMapper.find_key! instead"

  ## everything in @@oo .... oo@@ (non-greedy and no @ allowed inside)
  marker      = /@@oo([^@]+?)oo@@/
  placeholder = "[#{name.upcase}]"
  label       = name.downcase

  found = marker.match( line )
  return nil if found.nil?

  key = found[1]
  LogUtils::Logger.root.debug " #{label}: >#{key}<"

  line.sub!( marker, placeholder )
  key
end
106 |
107 |
# Deprecated - use TitleMapper.find_keys! instead.
#
# Plural version of find_key_for! - keeps pulling markers out of line
# (using the numbered names name1, name2, name3, ...) until no more
# key is found.
#
# name - base tag name e.g. 'team'; gets downcased and numbered
# line - text line; gets changed in place
#
# Returns an array with all keys found (empty if none).
def find_keys_for!( name, line )
  LogUtils::Logger.root.info " find_keys_for! #{name} - deprecated API - use TitleMapper.find_keys! instead"

  prefix  = name.downcase
  keys    = []
  counter = 1

  loop do
    key = find_key_for!( "#{prefix}#{counter}", line )
    break unless key.present?

    keys << key
    counter += 1
  end

  keys
end
125 |
126 |
# Deprecated - use TitleMapper.map_titles! instead.
#
# Runs map_title_worker_for! for every [key, values] entry of the
# title table, marking known titles in line (changed in place).
#
# name        - tag name e.g. 'team' (used for logging only by the worker)
# line        - text line to mark up
# title_table - array of [key, titles] pairs
#
# Returns title_table (the result of each).
def map_titles_for!( name, line, title_table )
  LogUtils::Logger.root.info " map_titles_for! #{name} - deprecated API - use TitleMapper.map_titles! instead"

  title_table.each do |key, values|
    map_title_worker_for!( name, line, key, values )
  end
end
136 |
137 |
# Replaces the first title (from values) found in line with the marker
# @@oo{key}oo@@ plus one trailing space; line gets changed in place.
# Stops after the first title that matches.
#
# name   - tag name e.g. 'team' (used for logging only)
# line   - text line to mark up
# key    - key to put into the marker
# values - titles (regex sources, already escaped); tried in order
#
# Returns true if a title matched, false otherwise.
def map_title_worker_for!( name, line, key, values )
  label = name.downcase

  values.each do |value|
    ## match whole words only (e.g. not wac in wacker)
    ##  nb: \b does not fire after a non-word char such as the dot in Benfica Lis.
    ##      thus, space, tab or end of line are accepted as well
    pattern = /\b#{value}(\b| |\t|$)/
    next unless line =~ pattern

    LogUtils::Logger.root.debug " match for #{label} >#{key}< >#{value}<"

    # the @@oo..oo@@ wrapper makes sure a key will not get matched again by another title
    line.sub!( pattern, "@@oo#{key}oo@@ " )
    return true   # first match wins; do NOT continue
  end

  false
end
158 |
159 | end # module TitleTable
160 | end # module TextUtils
161 |
162 |
163 | ## auto-include methods
164 |
module TextUtils
  # Extend the TextUtils module object itself with TitleTable, so that the
  # (deprecated) helpers can be called as module-level methods
  #   e.g. TextUtils.build_title_table_for, TextUtils.find_key_for! etc.
  extend TitleTable
end
169 |
170 |
171 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/helper/title_helper.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 |
4 | ###
5 | #
6 | # fix: move to filter!!!!
7 | # follows fn( content ) pattern!!!
8 |
9 |
10 | module TextUtils
11 | module TitleHelper
12 |
13 | ####
14 | # - todo: use new additional sub module ???
15 | # e.g. TextUtils::Reader::TagHelper
16 | # lets us use "classic" web helpers a la rails
17 | # find a good name for sub module - Reader? Fixtures? Values? Parser?
18 |
# Removes the optional part markers (<, >, ‹ and ›) and keeps the
# enclosed text e.g.
#   Bock ‹Damm›               => Bock Damm
#   ‹Estrella› ‹Damm› Inedit  => Estrella Damm Inedit
#
# Returns a new string; title is left untouched.
def strip_part_markers( title )   # use different name e.g. strip_name_markers/strip_name_enclosure etc.??
  title.delete( '<>‹›' )
end
26 |
# Removes optional (english) translations in square brackets incl.
# the brackets e.g.
#   Wien [Vienna]  =>  Wien
#
# note: surrounding spaces are kept (e.g. the space after Wien);
#       empty brackets [] are left alone
def strip_translations( title )
  translation = /\[[^\]]+\]/
  title.gsub( translation, '' )
end
33 |
# Removes the optional longer title part in round brackets incl.
# the brackets e.g.
#   Las Palmas (de Gran Canaria)  =>  Las Palmas
#   Palma (de Mallorca)           =>  Palma
#
# note: surrounding spaces are kept; empty brackets () are left alone
def strip_subtitles( title )
  subtitle = /\([^\)]+\)/
  title.gsub( subtitle, '' )
end
41 |
# Removes optional tags in curly brackets incl. the brackets e.g.
#   Ottakringer {Bio}          =>  Ottakringer
#   Ottakringer {Alkoholfrei}  =>  Ottakringer
#
# note: surrounding spaces are kept; empty brackets {} are left alone
#
# todo: use for autotags? e.g. {Bio} => bio
#
# NOTE(review): HypertextHelper defines a strip_tags method too (for markup tags);
#   if both modules get mixed into the same object the one mixed in last wins - confirm
def strip_tags( title )   # todo: use an alias or rename for better name ??
  curly_tag = /\{[^\}]+\}/
  title.gsub( curly_tag, '' )
end
51 |
# Removes spaces, tabs and (selected) punctuation, that is,
#   _ - . ! ( ) [ ] ' " ’ and /
#
# note: newlines are NOT removed
def strip_whitespaces( title )
  spaces_and_punctuation = /[ \t_\-\.!()\[\]'"’\/]/
  title.gsub( spaces_and_punctuation, '' )
end
56 |
# Removes the special chars % & ° + : and $ e.g.
#   +Malta          =>  Malta
#   Minerva 8:60    =>  Minerva 860
#   $Alianz$ Arena  =>  Alianz Arena
def strip_special_chars( title )
  title.delete( '%&°+:$' )
end
64 |
# Turns a title into a key: downcases the title, strips part markers,
# [translations], (subtitles), {tags}, whitespace/punctuation and
# special chars (in that order) and, last, runs the result through
# TextUtils.asciify (see filter/string_filter) and downcases again.
#
# NB: used in/moved from readers/values_reader.rb
# NB: downcase does NOT work for accented chars (thus, the second
#     downcase after asciify)
def title_to_key( title )
  key = title.downcase

  ## e.g. ‹Estrella› ‹Damm› Inedit becomes => Estrella Damm Inedit
  key = strip_tags( strip_subtitles( strip_translations( strip_part_markers( key ) ) ) )
  key = strip_special_chars( strip_whitespaces( key ) )

  TextUtils.asciify( key ).downcase
end # method title_to_key
88 |
89 |
# Turns a title into a regex source string that matches the title
# literally - plus some spelling alternatives.
#
# step 1: escape the regex special chars, that is,
#           . ( ) ? * + $ ^ [ ] { } | and \
#         e.g. Benfica Lis.        =>  Benfica Lis\.
#              Bauer Anton (????)  =>  Bauer Anton \(\?\?\?\?\)
#              Wien [Vienna]       =>  Wien \[Vienna\]
#
#         NB: cannot use Regexp.escape! will escape space ' ' to '\ '
#         NB: without escaping [ ] { } | and \ too a title such as Wien [Vienna]
#             ends up as a character class and an unbalanced [ or a trailing \
#             results in an invalid regex once interpolated
#
# step 2: match accented chars with or without accents e.g. (ü|ue)
#         and make the dash optional e.g. Blau-Weiss == Blau Weiss
#
# title_unescaped - plain title
#
# Returns a new string (regex source); title_unescaped is left untouched.
def title_esc_regex( title_unescaped )

  ## single pass; the block form keeps the backslash handling simple
  title = title_unescaped.gsub( /[.()?*+$^\[\]{}|\\]/ ) { |char| "\\#{char}" }

  ## todo: add some more
  ##   see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
  ##
  ## reuse for all readers!

  alternatives = {
    '-' => '(-| )',    ## e.g. Blau-Weiß Linz
    'æ' => '(æ|ae)',
    'ä' => '(ä|ae)',
    'Ö' => '(Ö|Oe)',   ## e.g. Österreich
    'ö' => '(ö|oe)',   ## e.g. Mönchengladbach
    'ß' => '(ß|ss)',   ## e.g. Blau-Weiß Linz
    'ü' => '(ü|ue)',

    'á' => '(á|a)',    ## e.g. Bogotá, Sársfield
    'ã' => '(ã|a)',    ## e.g. São Paulo
    'ç' => '(ç|c)',    ## e.g. Fenerbahçe
    'é' => '(é|e)',    ## e.g. Vélez
    'ê' => '(ê|e)',    ## e.g. Grêmio
    'ï' => '(ï|i)',    ## e.g. El Djazaïr
    'ñ' => '(ñ|n)',    ## e.g. Porteño
    'ň' => '(ň|n)',    ## e.g. Plzeň
    'ó' => '(ó|o)',    ## e.g. Colón
    'ō' => '(ō|o)',    ## e.g. Tōkyō
    'ș' => '(ș|s)',    ## e.g. Bucarești
    'ú' => '(ú|u)'     ## e.g. Fútbol
  }

  ### fix/todo: check for dot+space e.g. . and make dot optional
  ##
  #   e.g. make dot (.) optional plus allow alternative optional space e.g.
  #     -- for U.S.A. => allow USA or U S A
  #
  ##  e.g. U. de G. or U de G or U.de G. ??
  ##   collect some more (real-world) examples first!!!!!

  ## single pass is safe - no replacement includes a char that is a key itself
  ##   (other than the char getting replaced)
  title.gsub( Regexp.union( alternatives.keys ), alternatives )
end
168 |
169 |
170 | end # module TitleHelper
171 | end # module TextUtils
172 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/helper/address_helper.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 |
4 | module TextUtils
5 | module AddressHelper
6 |
# Puts the postal code + city line first for two-line addresses
# (lines separated by //) e.g.
#
#   Alte Plauener Straße 24 // 95028 Hof    becomes
#   95028 Hof // Alte Plauener Straße 24
#
# for now only checks the german (de) 5-digit zip code and
#   the austrian (at) 4-digit zip code
#
# old_address - address line(s)
# country_key - country key e.g. 'at' or 'de'; calling without
#               a country key is deprecated (address gets passed through as is)
#
# Returns the switched address (both lines stripped and joined with ' // ')
# or old_address as is if nothing needs to get switched.
def normalize_addr( old_address, country_key=nil )

  if country_key.nil?
    puts "TextUtils.normalize_addr drepreciated call - country_key now required; please add !!"
    return old_address
  end

  parts = old_address.split( '//' )
  return old_address   unless parts.size == 2     # only two lines may get switched

  postal_code = case country_key
                when 'at' then /^[0-9]{4}\s+/     # four digits postal code
                when 'de' then /^[0-9]{5}\s+/     # five digits postal code
                end
  return old_address   if postal_code.nil?        # country not supported - pass through

  line1, line2 = parts.map { |part| part.strip }

  if line2 =~ postal_code
    "#{line2} // #{line1}"
  else
    old_address
  end
end
40 |
41 |
# Generic rule (not country-specific; no postal code/zip code or state):
# the address must have exactly one non-blank line e.g.
#
#   // London    or
#   London //
#
# and that line gets assumed to be the city - unless it includes
#   - numbers (e.g. assumes zip/postal code etc.) or
#   - a pipe (|) or a comma (,) or
#   - two or more uppercase letters in a row (e.g. TX, NY etc.)
#
# Returns the (stripped) city line or nil if the rule does not match.
def find_city_in_addr_without_postal_code( address )

  # fix: use blank?
  return nil if address.nil? || address.empty?   # do NOT process nil or empty address lines; sorry

  ## note: London // will get split into an array with size 1 e.g. ['London ']
  ##   thus, strip all lines and drop the blank ones before counting
  lines = address.split( '//' ).map { |line| line.strip }.reject { |line| line.empty? }

  return nil unless lines.size == 1

  candidate = lines[0]

  return nil if candidate =~ /[0-9|,]/

  # check if city exists with two uppercase letters??
  return nil if candidate =~ /[A-Z]{2,}/

  candidate   # bingo!!! assume candidate line is a city name
end
89 |
90 |
# Finds the city using the country-specific postal code/state patterns;
# the first address line (lines separated by //) that matches wins and
# the postal code/state part gets cut off. Supported for now:
#
#   at, be - line starts w/ four digits  e.g. 2018 Antwerpen or 2870 Breendonk-Puurs
#   de     - line starts w/ five digits  e.g. 95028 Hof
#   cz, sk - line starts w/ nnn nn       e.g. 284 15 Kutná Hora or 036 42 Martin
#   us     - line ends w/ state and (optional) zip code
#                                        e.g. Brooklyn | NY 11249 or Brooklyn, NY
#
# Returns the city or nil if nothing found (or country not supported).
def find_city_in_addr_with_postal_code( address, country_key )

  # fix: use blank?
  return nil if address.nil? || address.empty?   # do NOT process nil or empty address lines; sorry

  patterns = case country_key
             when 'at', 'be' then [/^[0-9]{4}\s+/]
             when 'de'       then [/^[0-9]{5}\s+/]
             when 'cz', 'sk' then [/^[0-9]{3}\s[0-9]{2}\s+/]
             when 'us'       then [/\s*[|,]\s+[A-Z]{2}\s+[0-9]{5}\s*$/,
                                   /\s*[|,]\s+[A-Z]{2}\s*$/]
             else                 []   # unsupported country/address schema for now; sorry
             end

  address.split( '//' ).each do |line|
    candidate = line.strip
    patterns.each do |pattern|
      # cut off postal code (and state); assume rest is city
      return candidate.sub( pattern, '' )  if candidate =~ pattern
    end
  end

  nil   # sorry nothing found
end
148 |
149 |
# Finds the city in an address (lines separated by //).
#
# Tries the generic rule first (e.g. w/o postal code/zip code or state),
# see find_city_in_addr_without_postal_code; next the country-specific
# patterns, see find_city_in_addr_with_postal_code.
#
# Returns the city or nil if no city found (using known patterns).
def find_city_in_addr( address, country_key )

  # fix: use blank?
  return nil if address.nil? || address.empty?   # do NOT process nil or empty address lines; sorry

  find_city_in_addr_without_postal_code( address ) ||
    find_city_in_addr_with_postal_code( address, country_key )
end
164 |
165 |
166 | end # module AddressHelper
167 | end # module TextUtils
168 |
--------------------------------------------------------------------------------
/textutils/lib/textutils/helper/hypertext_helper.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | module TextUtils
4 | module HypertextHelper
5 |
6 |
# Strips markup tags and returns the plain text; brute force for now.
# Only strips tags that start with a proper tag name, that is, a text
# like a < b > c is left alone.
#
# note: tag names follow the official xml spec
#   - allows for first char:     (Letter | '_' | ':')
#   - allows for followup chars: (Letter | Digit | '_' | ':' | '.' | '-')
#   (makes sure we cover the h1/h2/h3/h4/h5/h6 tags w/ number!!)
#
# note: comments e.g. <!-- --> are NOT stripped
#   todo: add strip comments or use new strip_comments( ht )
#
# ht - hypertext
#
# Returns a new string.
def strip_tags( ht )
  ### ht.gsub( /<[^>]+>/, '' ) - old simple

  tag_name_pattern = "[a-z_:][a-z0-9_:.\\-]*"

  empty_tag_pattern   = "<#{tag_name_pattern}\\s*/>"
  opening_tag_pattern = "<#{tag_name_pattern}(\\s+[^>]*)?>"
  ## note: must start with </ - otherwise any word followed by > gets swallowed
  closing_tag_pattern = "</#{tag_name_pattern}\\s*>"

  ht = ht.gsub( /#{empty_tag_pattern}/i, '' )     # remove xml-style empty tags e.g. <br/> or <br />
  ht = ht.gsub( /#{opening_tag_pattern}/i, '' )   # opening tag e.g. <p> or <a href='...'>
  ht = ht.gsub( /#{closing_tag_pattern}/i, '' )   # closing tag e.g. </p>
  ht
end
35 |
36 |
# Keeps the whitelisted tags (w/o attributes and downcased) and strips
# all other tags.
#
# note: assumes properly escaped <> in ht/hypertext
# note: tags get interpolated into the regexes as is (not escaped)
#
# ht   - hypertext
# tags - tag names to keep e.g. [:br, :p]
# opts - :skip_restore => true skips step three (for debugging), that is,
#        whitelisted tags stay in the ‹tag› form
#
# Returns a new string.
def whitelist( ht, tags, opts={} )

  ###############################################
  # step one - save whitelisted tags use ‹tag›
  tags.each do |tag|
    # note: we strip all attributes
    # note: match all tags case insensitive e.g. allow a,A or br,BR,bR etc.
    #         downcase all tags

    # convert xml-style empty tags to simple html empty tags
    #   e.g. <br/> or <br /> becomes ‹br›
    ht = ht.gsub( /<(#{tag})\s*\/>/i )       { |_| "‹#{$1.downcase}›" }

    # make sure we won't swallow <pre> for <p> for example, thus use \s+ before [^>]
    ht = ht.gsub( /<(#{tag})(\s+[^>]*)?>/i ) { |_| "‹#{$1.downcase}›" }    # opening tag
    ht = ht.gsub( /<\/(#{tag})\s*>/i )       { |_| "‹/#{$1.downcase}›" }   # closing tag e.g. </p>
  end

  ############################
  # step two - clean tags

  # strip images - special treatment for debugging
  ht = ht.gsub( /<img[^>]*>/i, '♦' )   # for debugging use black diamond e.g. ♦
  ht = ht.gsub( /<\/img>/i, '' )       # should not exist

  # strip all remaining tags
  #  -- note: will NOT strip comments for now e.g. <!-- -->
  ht = strip_tags( ht )

  ############################################
  # step three - restore whitelisted tags

  return ht if opts[:skip_restore]   # skip step 3 for debugging

  tags.each do |tag|
    ht = ht.gsub( /‹(#{tag})›/ )   { |_| "<#{$1}>" }    # opening tag e.g. <p>
    ht = ht.gsub( /‹\/(#{tag})›/ ) { |_| "</#{$1}>" }   # closing tag e.g. </p>
  end

  ht
end # method whitelist
84 |
85 |
86 |
87 |
88 | ## change to simple_hypertext or
89 | # hypertext_simple or
90 | # sanitize ???
91 |
# Sanitizes the hypertext: keeps the basic block and quote tags only
# (see whitelist) and strips the protocol (http:// or https://) from all urls.
#
# ht   - hypertext
# opts - options passed along to whitelist
#
# todo: add options for
#   keep links, images, lists (?too), code, codeblocks
def sanitize( ht, opts={} )   # ht -> hypertext
  allowed_tags = [:br, :p, :ul, :ol, :li, :pre, :code, :blockquote, :q, :cite]

  # clean (prettify) literal urls (strip protocol)
  whitelist( ht, allowed_tags, opts ).gsub( /(http|https):\/\//, '' )
end
102 |
103 |
# Turns the hypertext into plain (or markdown/wiki-style) text - to be done.
# For now returns the result of sanitize( ht, opts ) as is; all follow-up
# steps below are commented out.
def textify( ht, opts={} ) # ht -> hypertext
  ## turn into plain (or markdown/wiki-style) text - to be done

  sanitize( ht, opts ) # step 1 - sanitize html
  # to be done

  ## NOTE(review): the patterns in the commented-out lines below arrived damaged
  ##   (markup fragments were lost); the <b / <em / &nbsp; parts are a best guess - confirm

  # strip bold
  # ht = ht.gsub( /<b[^>]*>/, '**' ) # fix: will also swallow bxxx tags - add b space
  # ht = ht.gsub( /<\/b>/, '**' )

  # strip em
  # ht = ht.gsub( /<em[^>]*>/, '__' )
  # ht = ht.gsub( /<\/em>/, '__' )

  # ht = ht.gsub( /&nbsp;/, ' ' )

  # # try to cleanup whitespaces
  # # -- keep no more than two spaces
  # ht = ht.gsub( /[ \t]{3,}/, ' ' )
  # # -- keep no more than two new lines
  # ht = ht.gsub( /\n{2,}/m, "\n\n" )
  # # -- remove all trailing spaces
  # ht = ht.gsub( /[ \t\n]+$/m, '' )
  # # -- remove all leading spaces
  # ht = ht.gsub( /^[ \t\n]+/m, '' )
end
130 |
131 |
132 | ##############################
133 | # rails-style asset, url tag helpers and friends
134 | #
135 | # todo: move into different helper module/modules?? why? why not?
136 |
# Builds an empty tag (no content and no closing tag e.g. <br>, <img src='logo.png'> etc.)
#
# tag  - tag name e.g. :br or :img
# opts - attributes; get added in hash order as key='value'
#
# note: attribute values get added as is (not escaped), that is,
#       a value must NOT include a single quote (')
#
# Returns the tag as a string.
def tag( tag, opts={} )
  attribs = opts.map { |key, value| "#{key}='#{value}'" }

  if attribs.empty?
    "<#{tag}>"
  else
    "<#{tag} #{attribs.join(' ')}>"
  end
end
149 |
# Builds a content tag (e.g. <p>hello</p> - w/ opening and closing tag)
#
# tag     - tag name e.g. :p or :a
# content - text/markup to put between the opening and the closing tag (added as is)
# opts    - attributes for the opening tag; get added in hash order as key='value'
#
# note: attribute values get added as is (not escaped), that is,
#       a value must NOT include a single quote (')
#
# Returns the tag as a string.
def content_tag( tag, content, opts={} )
  attribs = opts.map { |key, value| "#{key}='#{value}'" }

  opening_tag = attribs.empty? ? "<#{tag}>" : "<#{tag} #{attribs.join(' ')}>"

  "#{opening_tag}#{content}</#{tag}>"
end
162 |
163 |
# Builds a stylesheet link tag; the attributes rel, type and href
# are the defaults and get overwritten by opts (if present).
#
# href - stylesheet path; .css gets auto-added if not present
# opts - more attributes (or overwrites)
def stylesheet_link_tag( href, opts={} )
  href = "#{href}.css"   unless href.end_with?( '.css' )   # auto-add .css if not present

  defaults = { rel: 'stylesheet', type: 'text/css', href: href }

  ### fix/todo: use reverse merge e.g. overwrite only if not present
  tag( :link, defaults.merge( opts ) )
end
172 |
# Builds an image tag e.g. <img src='logo.png'>; the src attribute
# gets overwritten by opts (if present).
#
# src  - image path/url
# opts - more attributes (or overwrites)
def image_tag( src, opts={} )
  attribs = { src: src }
  attribs = attribs.merge( opts )   ### fix/todo: use reverse merge e.g. overwrite only if not present
  tag( :img, attribs )
end
178 |
# Builds a link e.g. <a href='index.html'>home</a>; the href attribute
# gets overwritten by opts (if present).
#
# content - link text/markup (added as is)
# href    - link target
# opts    - more attributes (or overwrites)
def link_to( content, href, opts={} )
  ### fix/todo: use reverse merge e.g. overwrite only if not present
  content_tag( :a, content, { href: href }.merge( opts ) )
end
184 |
185 |
186 | end # module HypertextHelper
187 | end # module TextUtils
188 |
--------------------------------------------------------------------------------